def test_to_encode_features(pd_es):
    f1 = IdentityFeature(pd_es["log"].ww["product_id"])
    f2 = IdentityFeature(pd_es["log"].ww["value"])
    f3 = IdentityFeature(pd_es["log"].ww["datetime"])

    features = [f1, f2, f3]
    feature_matrix = calculate_feature_matrix(features,
                                              pd_es,
                                              instance_ids=[0, 1, 2, 3, 4, 5])

    feature_matrix_encoded, _ = encode_features(feature_matrix, features)
    feature_matrix_encoded_shape = feature_matrix_encoded.shape

    # to_encode should keep product_id as a string and datetime as a date,
    # and not have the same shape as previous encoded matrix due to fewer encoded features
    to_encode = []
    feature_matrix_encoded, _ = encode_features(feature_matrix,
                                                features,
                                                to_encode=to_encode)
    assert feature_matrix_encoded_shape != feature_matrix_encoded.shape
    assert feature_matrix_encoded['datetime'].dtype == "datetime64[ns]"
    assert feature_matrix_encoded['product_id'].dtype == "category"

    to_encode = ['value']
    feature_matrix_encoded, _ = encode_features(feature_matrix,
                                                features,
                                                to_encode=to_encode)
    assert feature_matrix_encoded_shape != feature_matrix_encoded.shape
    assert feature_matrix_encoded['datetime'].dtype == "datetime64[ns]"
    assert feature_matrix_encoded['product_id'].dtype == "category"
def test_encode_features_handles_pass_columns(pd_es):
    f1 = IdentityFeature(pd_es["log"].ww["product_id"])
    f2 = IdentityFeature(pd_es["log"].ww["value"])

    features = [f1, f2]
    cutoff_time = pd.DataFrame(
        {
            "instance_id": range(6),
            "time": pd_es["log"]["datetime"][0:6],
            "label": [i % 2 for i in range(6)],
        },
        columns=["instance_id", "time", "label"],
    )
    feature_matrix = calculate_feature_matrix(features, pd_es, cutoff_time)

    assert "label" in feature_matrix.columns

    feature_matrix_encoded, _ = encode_features(feature_matrix, features)
    feature_matrix_encoded_shape = feature_matrix_encoded.shape

    # to_encode should keep product_id as a string, and not create 3 additional columns
    to_encode = []
    feature_matrix_encoded, _ = encode_features(
        feature_matrix, features, to_encode=to_encode
    )
    assert feature_matrix_encoded_shape != feature_matrix_encoded.shape

    to_encode = ["value"]
    feature_matrix_encoded, _ = encode_features(
        feature_matrix, features, to_encode=to_encode
    )
    assert feature_matrix_encoded_shape != feature_matrix_encoded.shape

    assert "label" in feature_matrix_encoded.columns
def test_encode_features_handles_pass_columns(entityset):
    f1 = IdentityFeature(entityset["log"]["product_id"])
    f2 = IdentityFeature(entityset["log"]["value"])

    features = [f1, f2]
    cutoff_time = pd.DataFrame(
        {
            'instance_id': range(6),
            'time': entityset['log'].df['datetime'][0:6],
            'label': [i % 2 for i in range(6)]
        },
        columns=["instance_id", "time", "label"])
    feature_matrix = calculate_feature_matrix(features, entityset, cutoff_time)

    assert 'label' in feature_matrix.columns

    feature_matrix_encoded, features_encoded = encode_features(
        feature_matrix, features)
    feature_matrix_encoded_shape = feature_matrix_encoded.shape

    # to_encode should keep product_id as a string, and not create 3 additional columns
    to_encode = []
    feature_matrix_encoded, features_encoded = encode_features(
        feature_matrix, features, to_encode=to_encode)
    assert feature_matrix_encoded_shape != feature_matrix_encoded.shape

    to_encode = ['value']
    feature_matrix_encoded, features_encoded = encode_features(
        feature_matrix, features, to_encode=to_encode)
    assert feature_matrix_encoded_shape != feature_matrix_encoded.shape

    assert 'label' in feature_matrix_encoded.columns
def test_encode_features_handles_pass_columns(entityset):
    f1 = IdentityFeature(entityset["log"]["product_id"])
    f2 = IdentityFeature(entityset["log"]["value"])

    features = [f1, f2]
    cutoff_time = pd.DataFrame({'instance_id': range(6),
                                'time': entityset['log'].df['datetime'][0:6],
                                'label': [i % 2 for i in range(6)]},
                               columns=["instance_id", "time", "label"])
    feature_matrix = calculate_feature_matrix(features, entityset, cutoff_time)

    assert 'label' in feature_matrix.columns

    feature_matrix_encoded, features_encoded = encode_features(feature_matrix, features)
    feature_matrix_encoded_shape = feature_matrix_encoded.shape

    # to_encode should keep product_id as a string, and not create 3 additional columns
    to_encode = []
    feature_matrix_encoded, features_encoded = encode_features(feature_matrix, features, to_encode=to_encode)
    assert feature_matrix_encoded_shape != feature_matrix_encoded.shape

    to_encode = ['value']
    feature_matrix_encoded, features_encoded = encode_features(feature_matrix, features, to_encode=to_encode)
    assert feature_matrix_encoded_shape != feature_matrix_encoded.shape

    assert 'label' in feature_matrix_encoded.columns
def test_encode_features_handles_dictionary_input(pd_es):
    f1 = IdentityFeature(pd_es["log"].ww["product_id"])
    f2 = IdentityFeature(pd_es["log"].ww["purchased"])
    f3 = IdentityFeature(pd_es["log"].ww["session_id"])

    features = [f1, f2, f3]
    feature_matrix = calculate_feature_matrix(features,
                                              pd_es,
                                              instance_ids=range(16))
    feature_matrix_encoded, features_encoded = encode_features(
        feature_matrix, features)
    true_values = [
        'product_id = coke zero', 'product_id = toothpaste',
        'product_id = car', 'product_id = brown bag',
        'product_id = taco clock',
        'product_id = Haribo sugar-free gummy bears', 'product_id is unknown',
        'purchased', 'session_id = 0', 'session_id = 1', 'session_id = 4',
        'session_id = 3', 'session_id = 5', 'session_id = 2',
        'session_id is unknown'
    ]
    assert len(features_encoded) == 15
    for col in true_values:
        assert col in list(feature_matrix_encoded.columns)

    top_n_dict = {}
    feature_matrix_encoded, features_encoded = encode_features(
        feature_matrix, features, top_n=top_n_dict)
    assert len(features_encoded) == 15
    for col in true_values:
        assert col in list(feature_matrix_encoded.columns)

    top_n_dict = {f1.get_name(): 4, f3.get_name(): 3}
    feature_matrix_encoded, features_encoded = encode_features(
        feature_matrix, features, top_n=top_n_dict)
    assert len(features_encoded) == 10
    true_values = [
        'product_id = coke zero', 'product_id = toothpaste',
        'product_id = car', 'product_id = brown bag', 'product_id is unknown',
        'purchased', 'session_id = 0', 'session_id = 1', 'session_id = 4',
        'session_id is unknown'
    ]
    for col in true_values:
        assert col in list(feature_matrix_encoded.columns)

    feature_matrix_encoded, features_encoded = encode_features(
        feature_matrix, features, top_n=top_n_dict, include_unknown=False)
    true_values = [
        'product_id = coke zero', 'product_id = toothpaste',
        'product_id = car', 'product_id = brown bag', 'purchased',
        'session_id = 0', 'session_id = 1', 'session_id = 4'
    ]
    assert len(features_encoded) == 8
    for col in true_values:
        assert col in list(feature_matrix_encoded.columns)
Esempio n. 6
0
def test_encode_features_drop_first():
    df = pd.DataFrame({'category': ['ao', 'b', 'c', 'd', 'e']})
    es = EntitySet('test')
    es.entity_from_dataframe(entity_id='a', dataframe=df, index='index', make_index=True)
    features, feature_defs = dfs(entityset=es, target_entity='a')
    features_enc, feature_defs_enc = encode_features(features, feature_defs,
                                                     drop_first=True, include_unknown=False)
    assert len(features_enc.columns) == 4

    features_enc, feature_defs = encode_features(features, feature_defs, top_n=3, drop_first=True,
                                                 include_unknown=False)

    assert len(features_enc.columns) == 2
def test_dask_errors_encode_features(dask_es):
    f1 = IdentityFeature(dask_es["log"].ww["product_id"])
    f2 = IdentityFeature(dask_es["log"].ww["purchased"])
    f3 = IdentityFeature(dask_es["log"].ww["value"])

    features = [f1, f2, f3]
    feature_matrix = calculate_feature_matrix(features,
                                              dask_es,
                                              instance_ids=[0, 1, 2, 3, 4, 5])
    error_text = "feature_matrix must be a Pandas DataFrame"

    with pytest.raises(TypeError, match=error_text):
        encode_features(feature_matrix, features)
Esempio n. 8
0
def test_inplace_encodes_features(es):
    f1 = IdentityFeature(es["log"]["product_id"])

    features = [f1]
    feature_matrix = calculate_feature_matrix(features, es, instance_ids=[0, 1, 2, 3, 4, 5])

    feature_matrix_shape = feature_matrix.shape
    feature_matrix_encoded, features_encoded = encode_features(feature_matrix, features)
    assert feature_matrix_encoded.shape != feature_matrix_shape
    assert feature_matrix.shape == feature_matrix_shape

    # inplace they should be the same
    feature_matrix_encoded, features_encoded = encode_features(feature_matrix, features, inplace=True)
    assert feature_matrix_encoded.shape == feature_matrix.shape
def test_inplace_encodes_features(entityset):
    f1 = IdentityFeature(entityset["log"]["product_id"])

    features = [f1]
    feature_matrix = calculate_feature_matrix(features, entityset, instance_ids=[0, 1, 2, 3, 4, 5])

    feature_matrix_shape = feature_matrix.shape
    feature_matrix_encoded, features_encoded = encode_features(feature_matrix, features)
    assert feature_matrix_encoded.shape != feature_matrix_shape
    assert feature_matrix.shape == feature_matrix_shape

    # inplace they should be the same
    feature_matrix_encoded, features_encoded = encode_features(feature_matrix, features, inplace=True)
    assert feature_matrix_encoded.shape == feature_matrix.shape
Esempio n. 10
0
def test_encode_features_catches_features_mismatch(entityset):
    f1 = IdentityFeature(entityset["log"]["product_id"])
    f2 = IdentityFeature(entityset["log"]["value"])
    f3 = IdentityFeature(entityset["log"]["session_id"])

    features = [f1, f2]
    cutoff_time = pd.DataFrame({'instance_id': range(6),
                                'time': entityset['log'].df['datetime'][0:6],
                                'label': [i % 2 for i in range(6)]},
                               columns=["instance_id", "time", "label"])
    feature_matrix = calculate_feature_matrix(features, entityset, cutoff_time)

    assert 'label' in feature_matrix.columns

    with pytest.raises(AssertionError):
        encode_features(feature_matrix, [f1, f3])
def test_encode_features_catches_features_mismatch(entityset):
    f1 = IdentityFeature(entityset["log"]["product_id"])
    f2 = IdentityFeature(entityset["log"]["value"])
    f3 = IdentityFeature(entityset["log"]["session_id"])

    features = [f1, f2]
    cutoff_time = pd.DataFrame({'instance_id': range(6),
                                'time': entityset['log'].df['datetime'][0:6],
                                'label': [i % 2 for i in range(6)]},
                               columns=["instance_id", "time", "label"])
    feature_matrix = calculate_feature_matrix(features, cutoff_time)

    assert 'label' in feature_matrix.columns

    with pytest.raises(AssertionError):
        encode_features(feature_matrix, [f1, f3])
def test_encodes_features(entityset):
    f1 = IdentityFeature(entityset["log"]["product_id"])
    f2 = IdentityFeature(entityset["log"]["purchased"])
    f3 = IdentityFeature(entityset["log"]["value"])

    features = [f1, f2, f3]
    feature_matrix = calculate_feature_matrix(features, entityset, instance_ids=[0, 1, 2, 3, 4, 5])

    feature_matrix_encoded, features_encoded = encode_features(feature_matrix, features)
    assert len(features_encoded) == 6

    feature_matrix_encoded, features_encoded = encode_features(feature_matrix, features, top_n=2)
    assert len(features_encoded) == 5

    feature_matrix_encoded, features_encoded = encode_features(feature_matrix, features,
                                                               include_unknown=False)
    assert len(features_encoded) == 5
Esempio n. 13
0
def test_encodes_features(es):
    f1 = IdentityFeature(es["log"]["product_id"])
    f2 = IdentityFeature(es["log"]["purchased"])
    f3 = IdentityFeature(es["log"]["value"])

    features = [f1, f2, f3]
    feature_matrix = calculate_feature_matrix(features, es, instance_ids=[0, 1, 2, 3, 4, 5])

    feature_matrix_encoded, features_encoded = encode_features(feature_matrix, features)
    assert len(features_encoded) == 6

    feature_matrix_encoded, features_encoded = encode_features(feature_matrix, features, top_n=2)
    assert len(features_encoded) == 5

    feature_matrix_encoded, features_encoded = encode_features(feature_matrix, features,
                                                               include_unknown=False)
    assert len(features_encoded) == 5
def test_to_encode_features(entityset):
    f1 = IdentityFeature(entityset["log"]["product_id"])
    f2 = IdentityFeature(entityset["log"]["value"])

    features = [f1, f2]
    feature_matrix = calculate_feature_matrix(features, entityset, instance_ids=[0, 1, 2, 3, 4, 5])

    feature_matrix_encoded, features_encoded = encode_features(feature_matrix, features)
    feature_matrix_encoded_shape = feature_matrix_encoded.shape

    # to_encode should keep product_id as a string, and not create 3 additional columns
    to_encode = []
    feature_matrix_encoded, features_encoded = encode_features(feature_matrix, features, to_encode=to_encode)
    assert feature_matrix_encoded_shape != feature_matrix_encoded.shape

    to_encode = ['value']
    feature_matrix_encoded, features_encoded = encode_features(feature_matrix, features, to_encode=to_encode)
    assert feature_matrix_encoded_shape != feature_matrix_encoded.shape
Esempio n. 15
0
def test_to_encode_features(es):
    f1 = IdentityFeature(es["log"]["product_id"])
    f2 = IdentityFeature(es["log"]["value"])

    features = [f1, f2]
    feature_matrix = calculate_feature_matrix(features, es, instance_ids=[0, 1, 2, 3, 4, 5])

    feature_matrix_encoded, features_encoded = encode_features(feature_matrix, features)
    feature_matrix_encoded_shape = feature_matrix_encoded.shape

    # to_encode should keep product_id as a string, and not create 3 additional columns
    to_encode = []
    feature_matrix_encoded, features_encoded = encode_features(feature_matrix, features, to_encode=to_encode)
    assert feature_matrix_encoded_shape != feature_matrix_encoded.shape

    to_encode = ['value']
    feature_matrix_encoded, features_encoded = encode_features(feature_matrix, features, to_encode=to_encode)
    assert feature_matrix_encoded_shape != feature_matrix_encoded.shape
def test_encode_features_drop_first():
    df = pd.DataFrame({"category": ["ao", "b", "c", "d", "e"]}).astype(
        {"category": "category"}
    )
    pd_es = EntitySet("test")
    pd_es.add_dataframe(
        dataframe_name="a", dataframe=df, index="index", make_index=True
    )
    features, feature_defs = dfs(entityset=pd_es, target_dataframe_name="a")
    features_enc, _ = encode_features(
        features, feature_defs, drop_first=True, include_unknown=False
    )
    assert len(features_enc.columns) == 4

    features_enc, feature_defs = encode_features(
        features, feature_defs, top_n=3, drop_first=True, include_unknown=False
    )

    assert len(features_enc.columns) == 2
def test_encode_features_catches_features_mismatch(pd_es):
    f1 = IdentityFeature(pd_es["log"].ww["product_id"])
    f2 = IdentityFeature(pd_es["log"].ww["value"])
    f3 = IdentityFeature(pd_es["log"].ww["session_id"])

    features = [f1, f2]
    cutoff_time = pd.DataFrame(
        {
            'instance_id': range(6),
            'time': pd_es['log']['datetime'][0:6],
            'label': [i % 2 for i in range(6)]
        },
        columns=["instance_id", "time", "label"])
    feature_matrix = calculate_feature_matrix(features, pd_es, cutoff_time)

    assert 'label' in feature_matrix.columns

    error_text = 'Feature session_id not found in feature matrix'
    with pytest.raises(AssertionError, match=error_text):
        encode_features(feature_matrix, [f1, f3])
def test_encode_unknown_features():
    # Dataframe with categorical column with "unknown" string
    df = pd.DataFrame({'category': ['unknown', 'b', 'c', 'd', 'e']})

    es = EntitySet('test')
    es.entity_from_dataframe(entity_id='a', dataframe=df, index='index', make_index=True)
    features, feature_defs = dfs(entityset=es, target_entity='a')

    # Specify unknown token for replacement
    features_enc, feature_defs_enc = encode_features(features, feature_defs,
                                                     include_unknown=True)
    assert list(features_enc.columns) == ['category = unknown', 'category = e', 'category = d',
                                          'category = c', 'category = b', 'category is unknown']
Esempio n. 19
0
def test_encode_unknown_features():
    # Dataframe with categorical column with "unknown" string
    df = pd.DataFrame({'category': ['unknown', 'b', 'c', 'd', 'e']})

    es = EntitySet('test')
    es.entity_from_dataframe(entity_id='a', dataframe=df, index='index', make_index=True)
    features, feature_defs = dfs(entityset=es, target_entity='a')

    # Specify unknown token for replacement
    features_enc, feature_defs_enc = encode_features(features, feature_defs,
                                                     include_unknown=True)
    assert list(features_enc.columns) == ['category = unknown', 'category = e', 'category = d',
                                          'category = c', 'category = b', 'category is unknown']
def test_encode_features_drop_first():
    df = pd.DataFrame({
        'category': ['ao', 'b', 'c', 'd', 'e']
    }).astype({'category': 'category'})
    pd_es = EntitySet('test')
    pd_es.add_dataframe(dataframe_name='a',
                        dataframe=df,
                        index='index',
                        make_index=True)
    features, feature_defs = dfs(entityset=pd_es, target_dataframe_name='a')
    features_enc, _ = encode_features(features,
                                      feature_defs,
                                      drop_first=True,
                                      include_unknown=False)
    assert len(features_enc.columns) == 4

    features_enc, feature_defs = encode_features(features,
                                                 feature_defs,
                                                 top_n=3,
                                                 drop_first=True,
                                                 include_unknown=False)

    assert len(features_enc.columns) == 2
Esempio n. 21
0
def test_encode_features_topn(entityset):
    topn = Feature(entityset['log']['product_id'],
                   parent_entity=entityset['customers'],
                   primitive=NMostCommon(n=3))
    features, feature_defs = dfs(entityset=entityset,
                                 instance_ids=[0, 1, 2],
                                 target_entity="customers",
                                 agg_primitives=[NMostCommon(n=3)])
    features_enc, feature_defs_enc = encode_features(features,
                                                     feature_defs,
                                                     include_unknown=True)
    assert topn.hash() in [feat.hash() for feat in feature_defs_enc]
    for name in topn.get_feature_names():
        assert name in features_enc.columns
def test_encode_features_topn(pd_es):
    topn = Feature(Feature(pd_es['log'].ww['product_id']),
                   parent_dataframe_name='customers',
                   primitive=NMostCommon(n=3))
    features, feature_defs = dfs(entityset=pd_es,
                                 instance_ids=[0, 1, 2],
                                 target_dataframe_name="customers",
                                 agg_primitives=[NMostCommon(n=3)])
    features_enc, feature_defs_enc = encode_features(features,
                                                     feature_defs,
                                                     include_unknown=True)
    assert topn.unique_name() in [
        feat.unique_name() for feat in feature_defs_enc
    ]
    for name in topn.get_feature_names():
        assert name in features_enc.columns
        assert features_enc.columns.tolist().count(name) == 1
def test_encode_features_matches_calculate_feature_matrix():
    df = pd.DataFrame({"category": ["b", "c", "d", "e"]}).astype(
        {"category": "category"}
    )

    pd_es = EntitySet("test")
    pd_es.add_dataframe(
        dataframe_name="a", dataframe=df, index="index", make_index=True
    )
    features, feature_defs = dfs(entityset=pd_es, target_dataframe_name="a")

    features_enc, feature_defs_enc = encode_features(
        features, feature_defs, to_encode=["category"]
    )

    features_calc = calculate_feature_matrix(feature_defs_enc, entityset=pd_es)

    pd.testing.assert_frame_equal(features_enc, features_calc)
    assert features_calc.ww._schema == features_enc.ww._schema
Esempio n. 24
0
def test_encode_features_matches_calculate_feature_matrix():
    df = pd.DataFrame({'category': ['b', 'c', 'd', 'e']})

    pd_es = EntitySet('test')
    pd_es.entity_from_dataframe(entity_id='a',
                                dataframe=df,
                                index='index',
                                make_index=True)
    features, feature_defs = dfs(entityset=pd_es, target_entity='a')

    features_enc, feature_defs_enc = encode_features(features,
                                                     feature_defs,
                                                     to_encode=['category'])

    features_calc = calculate_feature_matrix(feature_defs_enc, entityset=pd_es)

    assert features_enc['category = e'].dtypes == bool
    assert features_enc['category = e'].dtypes == features_calc[
        'category = e'].dtypes
def test_encode_features_matches_calculate_feature_matrix():
    df = pd.DataFrame({
        'category': ['b', 'c', 'd', 'e']
    }).astype({'category': 'category'})

    pd_es = EntitySet('test')
    pd_es.add_dataframe(dataframe_name='a',
                        dataframe=df,
                        index='index',
                        make_index=True)
    features, feature_defs = dfs(entityset=pd_es, target_dataframe_name='a')

    features_enc, feature_defs_enc = encode_features(features,
                                                     feature_defs,
                                                     to_encode=['category'])

    features_calc = calculate_feature_matrix(feature_defs_enc, entityset=pd_es)

    pd.testing.assert_frame_equal(features_enc, features_calc)
    assert features_calc.ww._schema == features_enc.ww._schema
def test_encode_unknown_features():
    # Dataframe with categorical column with "unknown" string
    df = pd.DataFrame({"category": ["unknown", "b", "c", "d", "e"]}).astype(
        {"category": "category"}
    )

    pd_es = EntitySet("test")
    pd_es.add_dataframe(
        dataframe_name="a", dataframe=df, index="index", make_index=True
    )
    features, feature_defs = dfs(entityset=pd_es, target_dataframe_name="a")

    # Specify unknown token for replacement
    features_enc, _ = encode_features(features, feature_defs, include_unknown=True)
    assert list(features_enc.columns) == [
        "category = unknown",
        "category = e",
        "category = d",
        "category = c",
        "category = b",
        "category is unknown",
    ]