Example #1
0
def test_feature_trie_with_needs_full_entity(diamond_es):
    """Trie nodes whose features require the full entity are flagged True."""
    es = diamond_es
    base_amount = ft.IdentityFeature(es['transactions']['amount'])

    customers_path = backward_path(
        es, ['regions', 'customers', 'transactions'])
    mean_amount = ft.AggregationFeature(
        base_amount,
        es['regions'],
        primitive=ft.primitives.Mean,
        relationship_path=customers_path)
    # CumSum on top of the aggregation forces the root node to need full entity.
    cumsum_of_mean = ft.TransformFeature(mean_amount, ft.primitives.CumSum)

    stores_path = backward_path(
        es, ['regions', 'stores', 'transactions'])
    cumsum_amount = ft.TransformFeature(base_amount, ft.primitives.CumSum)
    mean_of_cumsum = ft.AggregationFeature(
        cumsum_amount,
        es['regions'],
        primitive=ft.primitives.Mean,
        relationship_path=stores_path)

    trie = FeatureSet([mean_amount, cumsum_of_mean, mean_of_cumsum]).feature_trie

    assert trie.value == (
        True,
        {mean_amount.unique_name(), cumsum_of_mean.unique_name()},
        {mean_of_cumsum.unique_name()},
    )
    assert trie.get_node(customers_path).value == (
        True, {base_amount.unique_name()}, set())
    assert trie.get_node(customers_path[:1]).value == (True, set(), set())
    assert trie.get_node(stores_path).value == (
        True, {base_amount.unique_name(), cumsum_amount.unique_name()}, set())
    assert trie.get_node(stores_path[:1]).value == (False, set(), set())
Example #2
0
def load_feature_plots():
    """Regenerate the .dot feature-graph files used by the getting-started docs."""
    es = ft.demo.load_mock_customer(return_entityset=True)
    out_dir = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), 'getting_started/graphs/')

    # One example of each feature shape shown in the docs.
    count_sessions = ft.AggregationFeature(
        ft.IdentityFeature(es['sessions'].ww['session_id']),
        'customers', ft.primitives.Count)
    time_since_join = ft.TransformFeature(
        ft.IdentityFeature(es['customers'].ww['join_date']),
        ft.primitives.TimeSincePrevious)
    weekday_mode = ft.AggregationFeature(
        ft.TransformFeature(
            ft.IdentityFeature(es['transactions'].ww['transaction_time']),
            ft.primitives.Weekday),
        'sessions', ft.primitives.Mode)

    ft.graph_feature(count_sessions, to_file=os.path.join(out_dir, 'agg_feat.dot'))
    ft.graph_feature(time_since_join, to_file=os.path.join(out_dir, 'trans_feat.dot'))
    ft.graph_feature(weekday_mode, to_file=os.path.join(out_dir, 'demo_feat.dot'))
Example #3
0
def test_feature_trie_with_needs_full_entity_direct(es):
    """Full-entity flags propagate correctly through a direct feature chain."""
    log_value = ft.IdentityFeature(es['log']['value'])
    session_mean = ft.AggregationFeature(
        log_value, es['sessions'], primitive=ft.primitives.Mean)
    customer_sum = ft.AggregationFeature(
        session_mean, es['customers'], primitive=ft.primitives.Sum)
    direct_sum = ft.DirectFeature(customer_sum, es['sessions'])
    # CumSum over the direct feature makes the root require the full entity.
    cumsum = ft.TransformFeature(direct_sum, ft.primitives.CumSum)

    feature_set = FeatureSet([cumsum, session_mean])
    trie = feature_set.feature_trie

    assert trie.value == (
        True,
        {direct_sum.unique_name(), cumsum.unique_name()},
        {session_mean.unique_name()},
    )

    assert trie.get_node(session_mean.relationship_path).value == (
        False, set(), {log_value.unique_name()})

    parent = trie.get_node(direct_sum.relationship_path)
    assert parent.value == (True, {customer_sum.unique_name()}, set())

    grandchild = parent.get_node(customer_sum.relationship_path)
    assert grandchild.value == (True, {session_mean.unique_name()}, set())

    assert grandchild.get_node(session_mean.relationship_path).value == (
        True, {log_value.unique_name()}, set())
def test_base_features_not_in_list(es):
    """Deserializing a feature list rebuilds base features not in the list."""
    max_prim = Max()
    mult_prim = MultiplyNumericScalar(value=2)
    value = ft.IdentityFeature(es["log"].ww["value"])
    doubled = ft.TransformFeature(value, mult_prim)
    session_max = ft.AggregationFeature(doubled, "sessions", max_prim)

    definitions = {
        feat.unique_name(): feat.to_dictionary()
        for feat in (session_max, doubled, value)
    }
    dictionary = {
        "ft_version": ft.__version__,
        "schema_version": SCHEMA_VERSION,
        "entityset": es.to_dictionary(),
        "feature_list": [session_max.unique_name()],
        "feature_definitions": definitions,
        "primitive_definitions": {
            "0": serialize_primitive(max_prim),
            "1": serialize_primitive(mult_prim),
        },
    }
    # Point each feature definition at its serialized primitive by key.
    definitions[session_max.unique_name()]["arguments"]["primitive"] = "0"
    definitions[doubled.unique_name()]["arguments"]["primitive"] = "1"

    deserializer = FeaturesDeserializer(dictionary)
    assert deserializer.to_list() == [session_max]
Example #5
0
def test_serialized_renamed_features(es):
    """Renamed features keep their new names through a serialize round trip."""
    def assert_name_survives_round_trip(original):
        new_name = 'MyFeature'
        n_outputs = len(original.get_feature_names())
        renamed = original.rename(new_name)
        # Multi-output features get an index suffix per output column.
        if n_outputs == 1:
            expected_names = [new_name]
        else:
            expected_names = [
                '{}[{}]'.format(new_name, i) for i in range(n_outputs)
            ]
        check_names(renamed, new_name, expected_names)

        serialized = FeaturesSerializer([renamed]).to_dict()
        round_tripped = FeaturesDeserializer(serialized).to_list()[0]
        check_names(round_tripped, new_name, expected_names)

    identity_original = ft.IdentityFeature(es['log'].ww['value'])
    assert identity_original.get_name() == 'value'

    value = ft.IdentityFeature(es['log'].ww['value'])

    agg_original = ft.AggregationFeature(
        value, 'customers', ft.primitives.Max())
    assert agg_original.get_name() == 'MAX(log.value)'

    direct_original = ft.DirectFeature(
        ft.IdentityFeature(es['customers'].ww['age']), 'sessions')
    assert direct_original.get_name() == 'customers.age'

    transform_original = ft.TransformFeature(
        value, ft.primitives.MultiplyNumericScalar(value=2))
    assert transform_original.get_name() == 'value * 2'

    zipcode = ft.IdentityFeature(es['log'].ww['zipcode'])
    groupby_original = ft.feature_base.GroupByTransformFeature(
        value, CumSum(), zipcode)
    assert groupby_original.get_name() == 'CUM_SUM(value) by zipcode'

    multioutput_original = ft.Feature(
        es['log'].ww['product_id'],
        parent_dataframe_name='customers',
        primitive=NMostCommon(n=2))
    assert multioutput_original.get_name() == 'N_MOST_COMMON(log.product_id, n=2)'

    featureslice_original = ft.feature_base.FeatureOutputSlice(
        multioutput_original, 0)
    assert featureslice_original.get_name() == 'N_MOST_COMMON(log.product_id, n=2)[0]'

    # One feature of every type must survive the rename round trip.
    for feature in (identity_original, agg_original, direct_original,
                    transform_original, groupby_original,
                    multioutput_original, featureslice_original):
        assert_name_survives_round_trip(feature)
Example #6
0
def test_feature_trie_with_needs_full_dataframe(diamond_es):
    """Trie nodes whose features require the full dataframe are flagged True."""
    es = diamond_es
    base_amount = ft.IdentityFeature(es["transactions"].ww["amount"])

    customers_path = backward_path(es, ["regions", "customers", "transactions"])
    mean_amount = ft.AggregationFeature(
        base_amount,
        "regions",
        primitive=ft.primitives.Mean,
        relationship_path=customers_path,
    )
    # CumSum on top of the aggregation forces the root to need the full dataframe.
    cumsum_of_mean = ft.TransformFeature(mean_amount, ft.primitives.CumSum)

    stores_path = backward_path(es, ["regions", "stores", "transactions"])
    cumsum_amount = ft.TransformFeature(base_amount, ft.primitives.CumSum)
    mean_of_cumsum = ft.AggregationFeature(
        cumsum_amount,
        "regions",
        primitive=ft.primitives.Mean,
        relationship_path=stores_path,
    )

    trie = FeatureSet([mean_amount, cumsum_of_mean, mean_of_cumsum]).feature_trie

    assert trie.value == (
        True,
        {mean_amount.unique_name(), cumsum_of_mean.unique_name()},
        {mean_of_cumsum.unique_name()},
    )
    assert trie.get_node(customers_path).value == (
        True,
        {base_amount.unique_name()},
        set(),
    )
    assert trie.get_node(customers_path[:1]).value == (True, set(), set())
    assert trie.get_node(stores_path).value == (
        True,
        {base_amount.unique_name(), cumsum_amount.unique_name()},
        set(),
    )
    assert trie.get_node(stores_path[:1]).value == (False, set(), set())
Example #7
0
def load_feature_plots():
    """Regenerate the .dot files for the automated-feature-engineering docs."""
    es = ft.demo.load_mock_customer(return_entityset=True)
    out_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                           'automated_feature_engineering/graphs/')

    count_sessions = ft.AggregationFeature(
        es['sessions']['session_id'], es['customers'], ft.primitives.Count)
    time_since_join = ft.TransformFeature(
        es['customers']['join_date'], ft.primitives.TimeSincePrevious)

    ft.graph_feature(count_sessions, to_file=os.path.join(out_dir, 'agg_feat.dot'))
    ft.graph_feature(time_since_join, to_file=os.path.join(out_dir, 'trans_feat.dot'))
Example #8
0
def test_feature_trie_without_needs_full_entity(diamond_es):
    """With no full-entity primitives, every trie node's flag stays False."""
    es = diamond_es
    country_name = ft.IdentityFeature(es['countries']['name'])
    direct_name = ft.DirectFeature(country_name, es['regions'])
    amount = ft.IdentityFeature(es['transactions']['amount'])

    path_through_customers = backward_path(
        es, ['regions', 'customers', 'transactions'])
    through_customers = ft.AggregationFeature(
        amount, es['regions'],
        primitive=ft.primitives.Mean,
        relationship_path=path_through_customers)

    path_through_stores = backward_path(
        es, ['regions', 'stores', 'transactions'])
    through_stores = ft.AggregationFeature(
        amount, es['regions'],
        primitive=ft.primitives.Mean,
        relationship_path=path_through_stores)

    customers_to_transactions = backward_path(
        es, ['customers', 'transactions'])
    customers_mean = ft.AggregationFeature(
        amount, es['customers'],
        primitive=ft.primitives.Mean,
        relationship_path=customers_to_transactions)

    negation = ft.TransformFeature(customers_mean, ft.primitives.Negate)
    regions_to_customers = backward_path(es, ['regions', 'customers'])
    mean_of_mean = ft.AggregationFeature(
        negation, es['regions'],
        primitive=ft.primitives.Mean,
        relationship_path=regions_to_customers)

    features = [direct_name, through_customers, through_stores, mean_of_mean]
    trie = FeatureSet(features).feature_trie

    assert trie.value == (False, set(), {f.unique_name() for f in features})
    assert trie.get_node(direct_name.relationship_path).value == (
        False, set(), {country_name.unique_name()})
    assert trie.get_node(regions_to_customers).value == (
        False, set(), {negation.unique_name(), customers_mean.unique_name()})
    regions_to_stores = backward_path(es, ['regions', 'stores'])
    assert trie.get_node(regions_to_stores).value == (False, set(), set())
    # Both transaction paths only need the identity amount feature.
    for path in (path_through_customers, path_through_stores):
        assert trie.get_node(path).value == (
            False, set(), {amount.unique_name()})
Example #9
0
def test_serialization(es):
    """A TransformFeature round-trips through get_arguments/from_dictionary."""
    value = ft.IdentityFeature(es['log']['value'])
    primitive = ft.primitives.MultiplyNumericScalar(value=2)
    doubled = ft.TransformFeature(value, primitive)

    expected = {
        'base_features': [value.unique_name()],
        'primitive': serialize_primitive(primitive),
    }
    assert doubled.get_arguments() == expected

    dependencies = {value.unique_name(): value}
    rebuilt = ft.TransformFeature.from_dictionary(
        expected, es, dependencies, PrimitivesDeserializer())
    assert rebuilt == doubled
def test_base_features_not_in_list(es):
    """Listing only the top feature still deserializes its base features."""
    value = ft.IdentityFeature(es["log"].ww["value"])
    doubled = ft.TransformFeature(
        value, ft.primitives.MultiplyNumericScalar(value=2))
    session_max = ft.AggregationFeature(doubled, "sessions", ft.primitives.Max)

    dictionary = {
        "ft_version": ft.__version__,
        "schema_version": SCHEMA_VERSION,
        "entityset": es.to_dictionary(),
        "feature_list": [session_max.unique_name()],
        # Definitions include every dependency, not just the listed feature.
        "feature_definitions": {
            feat.unique_name(): feat.to_dictionary()
            for feat in (session_max, doubled, value)
        },
    }

    deserializer = FeaturesDeserializer(dictionary)
    assert deserializer.to_list() == [session_max]
def test_base_features_not_in_list(es):
    """Listing only the top feature still deserializes its base features."""
    value = ft.IdentityFeature(es['log']['value'])
    doubled = ft.TransformFeature(
        value, ft.primitives.MultiplyNumericScalar(value=2))
    session_max = ft.AggregationFeature(
        doubled, es['sessions'], ft.primitives.Max)

    dictionary = {
        'ft_version': ft.__version__,
        'schema_version': SCHEMA_VERSION,
        'entityset': es.to_dictionary(),
        'feature_list': [session_max.unique_name()],
        # Definitions include every dependency, not just the listed feature.
        'feature_definitions': {
            feat.unique_name(): feat.to_dictionary()
            for feat in (session_max, doubled, value)
        }
    }

    deserializer = FeaturesDeserializer(dictionary)
    assert deserializer.to_list() == [session_max]
Example #12
0
def test_serialized_renamed_features(es):
    """Renamed features keep their new name after a serialize/deserialize cycle."""
    def assert_name_survives_round_trip(original):
        renamed = original.rename('MyFeature')
        assert renamed.get_name() == 'MyFeature'

        serialized = FeaturesSerializer([renamed]).to_dict()
        round_tripped = FeaturesDeserializer(serialized).to_list()[0]
        assert round_tripped.get_name() == 'MyFeature'

    identity_original = ft.IdentityFeature(es['log']['value'])
    assert identity_original.get_name() == 'value'

    value = ft.IdentityFeature(es['log']['value'])

    agg_original = ft.AggregationFeature(
        value, es['customers'], ft.primitives.Max())
    assert agg_original.get_name() == 'MAX(log.value)'

    direct_original = ft.DirectFeature(es['customers']['age'], es['sessions'])
    assert direct_original.get_name() == 'customers.age'

    transform_original = ft.TransformFeature(
        value, ft.primitives.MultiplyNumericScalar(value=2))
    assert transform_original.get_name() == 'value * 2'

    zipcode = ft.IdentityFeature(es['log']['zipcode'])
    groupby_original = ft.feature_base.GroupByTransformFeature(
        value, CumSum(), zipcode)
    assert groupby_original.get_name() == 'CUM_SUM(value) by zipcode'

    # One feature of every type must survive the rename round trip.
    for feature in (identity_original, agg_original, direct_original,
                    transform_original, groupby_original):
        assert_name_survives_round_trip(feature)
Example #13
0
def test_base_features_not_in_list(es):
    """Serializing one feature also emits definitions for its base features."""
    value = ft.IdentityFeature(es['log'].ww['value'])
    doubled = ft.TransformFeature(
        value, ft.primitives.MultiplyNumericScalar(value=2))
    session_max = ft.AggregationFeature(
        doubled, 'sessions', ft.primitives.Max)
    serializer = FeaturesSerializer([session_max])

    expected = {
        'ft_version': ft.__version__,
        'schema_version': SCHEMA_VERSION,
        'entityset': es.to_dictionary(),
        'feature_list': [session_max.unique_name()],
        # Definitions include every dependency, not just the listed feature.
        'feature_definitions': {
            feat.unique_name(): feat.to_dictionary()
            for feat in (session_max, doubled, value)
        }
    }

    _compare_feature_dicts(expected, serializer.to_dict())
Example #14
0
def test_relationship_path(es):
    """A transform feature on its own dataframe has an empty relationship path."""
    hour_feat = ft.TransformFeature(es['log']['datetime'], Hour)
    assert not len(hour_feat.relationship_path)
Example #15
0
def test_serialized_renamed_features(es):
    """Renamed features keep their new names through a serialize round trip."""
    def assert_name_survives_round_trip(original):
        new_name = "MyFeature"
        n_outputs = len(original.get_feature_names())
        renamed = original.rename(new_name)
        # Multi-output features get an index suffix per output column.
        if n_outputs == 1:
            expected_names = [new_name]
        else:
            expected_names = [
                "{}[{}]".format(new_name, i) for i in range(n_outputs)
            ]
        check_names(renamed, new_name, expected_names)

        serialized = FeaturesSerializer([renamed]).to_dict()
        round_tripped = FeaturesDeserializer(serialized).to_list()[0]
        check_names(round_tripped, new_name, expected_names)

    identity_original = ft.IdentityFeature(es["log"].ww["value"])
    assert identity_original.get_name() == "value"

    value = ft.IdentityFeature(es["log"].ww["value"])

    agg_original = ft.AggregationFeature(
        value, "customers", ft.primitives.Max())
    assert agg_original.get_name() == "MAX(log.value)"

    direct_original = ft.DirectFeature(
        ft.IdentityFeature(es["customers"].ww["age"]), "sessions")
    assert direct_original.get_name() == "customers.age"

    transform_original = ft.TransformFeature(
        value, ft.primitives.MultiplyNumericScalar(value=2))
    assert transform_original.get_name() == "value * 2"

    zipcode = ft.IdentityFeature(es["log"].ww["zipcode"])
    groupby_original = ft.feature_base.GroupByTransformFeature(
        value, CumSum(), zipcode)
    assert groupby_original.get_name() == "CUM_SUM(value) by zipcode"

    multioutput_original = ft.Feature(
        es["log"].ww["product_id"],
        parent_dataframe_name="customers",
        primitive=NMostCommon(n=2),
    )
    assert multioutput_original.get_name() == "N_MOST_COMMON(log.product_id, n=2)"

    featureslice_original = ft.feature_base.FeatureOutputSlice(
        multioutput_original, 0)
    assert featureslice_original.get_name() == "N_MOST_COMMON(log.product_id, n=2)[0]"

    # One feature of every type must survive the rename round trip.
    for feature in (
        identity_original,
        agg_original,
        direct_original,
        transform_original,
        groupby_original,
        multioutput_original,
        featureslice_original,
    ):
        assert_name_survives_round_trip(feature)
Example #16
0
def test_feature_trie_without_needs_full_dataframe(diamond_es):
    """With no full-dataframe primitives, every trie node's flag stays False."""
    es = diamond_es
    country_name = ft.IdentityFeature(es["countries"].ww["name"])
    direct_name = ft.DirectFeature(country_name, "regions")
    amount = ft.IdentityFeature(es["transactions"].ww["amount"])

    path_through_customers = backward_path(
        es, ["regions", "customers", "transactions"])
    through_customers = ft.AggregationFeature(
        amount,
        "regions",
        primitive=ft.primitives.Mean,
        relationship_path=path_through_customers,
    )

    path_through_stores = backward_path(
        es, ["regions", "stores", "transactions"])
    through_stores = ft.AggregationFeature(
        amount,
        "regions",
        primitive=ft.primitives.Mean,
        relationship_path=path_through_stores,
    )

    customers_to_transactions = backward_path(
        es, ["customers", "transactions"])
    customers_mean = ft.AggregationFeature(
        amount,
        "customers",
        primitive=ft.primitives.Mean,
        relationship_path=customers_to_transactions,
    )

    negation = ft.TransformFeature(customers_mean, ft.primitives.Negate)
    regions_to_customers = backward_path(es, ["regions", "customers"])
    mean_of_mean = ft.AggregationFeature(
        negation,
        "regions",
        primitive=ft.primitives.Mean,
        relationship_path=regions_to_customers,
    )

    features = [direct_name, through_customers, through_stores, mean_of_mean]
    trie = FeatureSet(features).feature_trie

    assert trie.value == (False, set(), {f.unique_name() for f in features})
    assert trie.get_node(direct_name.relationship_path).value == (
        False,
        set(),
        {country_name.unique_name()},
    )
    assert trie.get_node(regions_to_customers).value == (
        False,
        set(),
        {negation.unique_name(), customers_mean.unique_name()},
    )
    regions_to_stores = backward_path(es, ["regions", "stores"])
    assert trie.get_node(regions_to_stores).value == (False, set(), set())
    # Both transaction paths only need the identity amount feature.
    for path in (path_through_customers, path_through_stores):
        assert trie.get_node(path).value == (
            False,
            set(),
            {amount.unique_name()},
        )