Esempio n. 1
0
def test_two_relationships_to_single_entity(games_es):
    """Check direct features built over each of two relationships to ``teams``.

    A mean of the home scores and a mean of the away scores are aggregated
    onto ``teams`` along separate relationship paths, pulled back onto
    ``games`` as direct features, and the calculated columns are compared
    with hard-coded expected values.
    """
    # The fixture must expose exactly two relationships for this unpacking to
    # succeed; they are treated as (home, away) in that order.
    home_rel, away_rel = games_es.relationships

    home_score_mean = ft.AggregationFeature(
        games_es['games']['home_team_score'],
        games_es['teams'],
        relationship_path=RelationshipPath([(False, home_rel)]),
        primitive=ft.primitives.Mean)
    away_score_mean = ft.AggregationFeature(
        games_es['games']['away_team_score'],
        games_es['teams'],
        relationship_path=RelationshipPath([(False, away_rel)]),
        primitive=ft.primitives.Mean)

    # Bring each team-level mean back to the games entity through the
    # matching relationship.
    home_direct = ft.DirectFeature(home_score_mean,
                                   games_es['games'],
                                   relationship=home_rel)
    away_direct = ft.DirectFeature(away_score_mean,
                                   games_es['games'],
                                   relationship=away_rel)

    calculator = FeatureSetCalculator(
        games_es,
        time_last=datetime(2011, 8, 28),
        feature_set=FeatureSet([home_direct, away_direct]))
    instance_ids = np.array(range(3))
    result = to_pandas(calculator.run(instance_ids),
                       index='id',
                       sort_index=True)

    expectations = [
        (home_direct.get_name(), [1.5, 1.5, 2.5]),
        (away_direct.get_name(), [1, 0.5, 2]),
    ]
    for column, expected_values in expectations:
        assert (result[column] == expected_values).all()
def test_direct_with_no_path(diamond_es):
    """DirectFeature must raise RuntimeError when no relationship is found.

    Two cases are exercised: "regions" as the child entity of a feature
    defined on "customers", and "customers" used as both the base feature's
    entity and the child entity.
    """
    # Function-scope import: only needed to escape the expected messages.
    import re

    # ``pytest.raises(match=...)`` performs a regex search, so the literal
    # messages are escaped; otherwise the trailing "." would match any
    # character.
    error_text = 'No relationship from "regions" to "customers" found.'
    with pytest.raises(RuntimeError, match=re.escape(error_text)):
        ft.DirectFeature(diamond_es['customers']['name'], diamond_es['regions'])

    error_text = 'No relationship from "customers" to "customers" found.'
    with pytest.raises(RuntimeError, match=re.escape(error_text)):
        ft.DirectFeature(diamond_es['customers']['name'], diamond_es['customers'])
def test_direct_with_multiple_possible_paths(games_es):
    """DirectFeature must demand a relationship when several are possible.

    Without ``relationship`` the constructor is expected to raise
    RuntimeError; with the relationship whose child variable is
    ``home_team_id`` it must succeed and include that variable in the
    generated names.
    """
    # Function-scope import: only needed to escape the expected message.
    import re

    error_text = "There are multiple relationships to the base entity. " \
                 "You must specify a relationship."
    # ``match`` is a regex search; escape the message so that "." is matched
    # literally rather than as a wildcard.
    with pytest.raises(RuntimeError, match=re.escape(error_text)):
        ft.DirectFeature(games_es['teams']['name'], games_es['games'])

    # Does not raise if path specified.
    relationship = next(r for r in games_es.get_forward_relationships('games')
                        if r.child_variable.id == 'home_team_id')
    feat = ft.DirectFeature(games_es['teams']['name'], games_es['games'],
                            relationship=relationship)
    assert feat.relationship_path_name() == 'teams[home_team_id]'
    assert feat.get_name() == 'teams[home_team_id].name'
def test_direct_with_invalid_init_args(diamond_es):
    """DirectFeature must reject a relationship that does not fit its arguments.

    Both checks expect an AssertionError: one for a child entity that is not
    the relationship's child, one for a base feature that is not defined on
    the relationship's parent entity.
    """
    # Case 1: first forward relationship of "customers", combined with
    # "stores" as the child entity.
    forward_from_customers = diamond_es.get_forward_relationships('customers')
    customers_rel = forward_from_customers[0]
    with pytest.raises(
            AssertionError,
            match='child_entity must be the relationship child entity'):
        ft.DirectFeature(diamond_es['regions']['name'],
                         diamond_es['stores'],
                         relationship=customers_rel)

    # Case 2: the transactions relationship whose parent is "stores",
    # combined with a base feature taken from "regions".
    stores_rel = next(
        rel for rel in diamond_es.get_forward_relationships('transactions')
        if rel.parent_entity.id == 'stores')
    with pytest.raises(
            AssertionError,
            match='Base feature must be defined on the relationship parent entity'):
        ft.DirectFeature(diamond_es['regions']['name'],
                         diamond_es['transactions'],
                         relationship=stores_rel)
Esempio n. 5
0
def test_feature_trie_with_needs_full_entity_direct(es):
    """Check the feature trie when a CumSum transform sits on a direct feature.

    The feature chain is: log value -> mean per session -> sum per customer
    -> direct feature on sessions -> CumSum.  Each trie node value is compared
    with a 3-tuple of (bool, set of names, set of names).
    """
    log_value = ft.IdentityFeature(es['log']['value'])
    session_mean = ft.AggregationFeature(log_value,
                                         es['sessions'],
                                         primitive=ft.primitives.Mean)
    customer_sum = ft.AggregationFeature(session_mean,
                                         es['customers'],
                                         primitive=ft.primitives.Sum)
    direct = ft.DirectFeature(customer_sum, es['sessions'])
    cum_sum = ft.TransformFeature(direct, ft.primitives.CumSum)

    root = FeatureSet([cum_sum, session_mean]).feature_trie

    # Root node.
    expected_root = (True,
                     {direct.unique_name(), cum_sum.unique_name()},
                     {session_mean.unique_name()})
    assert root.value == expected_root

    # Node reached directly along the session-level aggregation's path.
    mean_node = root.get_node(session_mean.relationship_path)
    assert mean_node.value == (False, set(), {log_value.unique_name()})

    # Node reached along the direct feature's path.
    direct_node = root.get_node(direct.relationship_path)
    assert direct_node.value == (True, {customer_sum.unique_name()}, set())

    # Continuing from there along the customer-level aggregation's path.
    sum_node = direct_node.get_node(customer_sum.relationship_path)
    assert sum_node.value == (True, {session_mean.unique_name()}, set())

    # And one level further, along the session-level aggregation's path.
    leaf_node = sum_node.get_node(session_mean.relationship_path)
    assert leaf_node.value == (True, {log_value.unique_name()}, set())
Esempio n. 6
0
def test_feature_trie_ignores_approximate_features(es):
    """Features listed in the approximate trie must not enter the feature trie.

    The customer-level sum is registered in ``approximate_feature_trie`` at
    the direct feature's relationship path; the resulting feature trie is
    then checked to contain no feature names at or below that path.
    """
    log_value = ft.IdentityFeature(es['log']['value'])
    session_mean = ft.AggregationFeature(log_value,
                                         es['sessions'],
                                         primitive=ft.primitives.Mean)
    customer_sum = ft.AggregationFeature(session_mean,
                                         es['customers'],
                                         primitive=ft.primitives.Sum)
    direct = ft.DirectFeature(customer_sum, es['sessions'])
    requested = [direct, session_mean]

    # Mark the customer-level sum as approximated at the direct feature's path.
    approx_trie = Trie(default=list, path_constructor=RelationshipPath)
    approx_node = approx_trie.get_node(direct.relationship_path)
    approx_node.value = [customer_sum]

    feature_set = FeatureSet(requested, approximate_feature_trie=approx_trie)
    trie = feature_set.feature_trie

    # Since the approximated feature is ignored, it and its dependencies
    # should not be in the trie.
    below_direct = trie.get_node(direct.relationship_path)
    for _path, (_, _, names) in below_direct:
        assert not names

    expected_root = (False, set(),
                     {direct.unique_name(), session_mean.unique_name()})
    assert trie.value == expected_root

    mean_node = trie.get_node(session_mean.relationship_path)
    assert mean_node.value == (False, set(), {log_value.unique_name()})
Esempio n. 7
0
def test_serialized_renamed_features(es):
    """Renamed features must keep their new names across serialization.

    One feature of each kind (identity, aggregation, direct, transform,
    groupby transform, multi-output, output slice) is built, its default
    name is asserted, and it is then renamed and round-tripped through
    FeaturesSerializer / FeaturesDeserializer.
    """

    def serialize_name_unchanged(original):
        # Rename, then verify names before and after a serialize/deserialize
        # round trip via check_names.
        new_name = 'MyFeature'
        output_count = len(original.get_feature_names())
        renamed = original.rename(new_name)
        if output_count == 1:
            new_names = [new_name]
        else:
            # Multi-output features get one indexed name per output.
            new_names = [
                new_name + '[{}]'.format(i) for i in range(output_count)
            ]
        check_names(renamed, new_name, new_names)

        serialized = FeaturesSerializer([renamed]).to_dict()
        deserialized = FeaturesDeserializer(serialized).to_list()[0]
        check_names(deserialized, new_name, new_names)

    identity_feature = ft.IdentityFeature(es['log'].ww['value'])
    assert identity_feature.get_name() == 'value'

    # Shared base feature for the aggregation, transform and groupby cases.
    log_value = ft.IdentityFeature(es['log'].ww['value'])

    agg_feature = ft.AggregationFeature(log_value, 'customers',
                                        ft.primitives.Max())
    assert agg_feature.get_name() == 'MAX(log.value)'

    customer_age = ft.IdentityFeature(es['customers'].ww['age'])
    direct_feature = ft.DirectFeature(customer_age, 'sessions')
    assert direct_feature.get_name() == 'customers.age'

    times_two = ft.primitives.MultiplyNumericScalar(value=2)
    transform_feature = ft.TransformFeature(log_value, times_two)
    assert transform_feature.get_name() == 'value * 2'

    zipcode = ft.IdentityFeature(es['log'].ww['zipcode'])
    groupby_feature = ft.feature_base.GroupByTransformFeature(
        log_value, CumSum(), zipcode)
    assert groupby_feature.get_name() == 'CUM_SUM(value) by zipcode'

    multioutput_feature = ft.Feature(es['log'].ww['product_id'],
                                     parent_dataframe_name='customers',
                                     primitive=NMostCommon(n=2))
    multioutput_name = multioutput_feature.get_name()
    assert multioutput_name == 'N_MOST_COMMON(log.product_id, n=2)'

    slice_feature = ft.feature_base.FeatureOutputSlice(
        multioutput_feature, 0)
    slice_name = slice_feature.get_name()
    assert slice_name == 'N_MOST_COMMON(log.product_id, n=2)[0]'

    all_kinds = (
        identity_feature,
        agg_feature,
        direct_feature,
        transform_feature,
        groupby_feature,
        multioutput_feature,
        slice_feature,
    )
    for feature in all_kinds:
        serialize_name_unchanged(feature)
def test_direct_with_multiple_possible_paths(diamond_es):
    """DirectFeature must demand a relationship path when several exist.

    Without ``relationship_path`` the constructor is expected to raise
    RuntimeError for a "regions" feature placed on "transactions"; with an
    explicit path through "customers" it must succeed and produce the name
    'customers.regions.name'.
    """
    # Function-scope import: only needed to escape the expected message.
    import re

    error_text = "There are multiple possible paths to the base entity. " \
                 "You must specify a relationship path."
    # ``match`` is a regex search; escape the message so that "." is matched
    # literally rather than as a wildcard.
    with pytest.raises(RuntimeError, match=re.escape(error_text)):
        ft.DirectFeature(diamond_es['regions']['name'],
                         diamond_es['transactions'])

    transaction_relationships = diamond_es.get_forward_relationships(
        'transactions')
    transaction_to_customer = next(r for r in transaction_relationships
                                   if r.parent_entity.id == 'customers')
    customer_to_region = diamond_es.get_forward_relationships('customers')[0]
    # Does not raise if path specified.
    feat = ft.DirectFeature(
        diamond_es['regions']['name'],
        diamond_es['transactions'],
        relationship_path=[transaction_to_customer, customer_to_region])
    assert feat.get_name() == 'customers.regions.name'
def test_direct_with_single_possible_path(diamond_es):
    """A DirectFeature with one possible path must resolve it automatically.

    diamond_es is used on purpose: a cycle somewhere in the entity graph
    must not cause an error when only one path applies.
    """
    # The only expected path element: the transactions relationship whose
    # parent is "customers".
    expected_relationship = next(
        rel for rel in diamond_es.get_forward_relationships('transactions')
        if rel.parent_entity.id == 'customers')

    customer_name = ft.DirectFeature(diamond_es['customers']['name'],
                                     diamond_es['transactions'])
    assert customer_name.relationship_path == [expected_relationship]
Esempio n. 10
0
def test_feature_trie_without_needs_full_entity(diamond_es):
    """Check the feature trie for features that reach regions by several paths.

    Builds a direct feature, two means of the transaction amount aggregated
    onto regions along different paths, and a mean of a negated per-customer
    mean.  Every inspected trie node is expected to hold False, an empty set,
    and the set of feature names for that path.
    """
    es = diamond_es
    mean = ft.primitives.Mean

    # Relationship paths used by the aggregations and the trie lookups.
    path_through_customers = backward_path(
        es, ['regions', 'customers', 'transactions'])
    path_through_stores = backward_path(
        es, ['regions', 'stores', 'transactions'])
    customers_to_transactions = backward_path(
        es, ['customers', 'transactions'])
    regions_to_customers = backward_path(es, ['regions', 'customers'])
    regions_to_stores = backward_path(es, ['regions', 'stores'])

    country_name = ft.IdentityFeature(es['countries']['name'])
    direct_name = ft.DirectFeature(country_name, es['regions'])
    amount = ft.IdentityFeature(es['transactions']['amount'])

    through_customers = ft.AggregationFeature(
        amount, es['regions'],
        primitive=mean,
        relationship_path=path_through_customers)
    through_stores = ft.AggregationFeature(
        amount, es['regions'],
        primitive=mean,
        relationship_path=path_through_stores)
    customers_mean = ft.AggregationFeature(
        amount, es['customers'],
        primitive=mean,
        relationship_path=customers_to_transactions)
    negation = ft.TransformFeature(customers_mean, ft.primitives.Negate)
    mean_of_mean = ft.AggregationFeature(
        negation, es['regions'],
        primitive=mean,
        relationship_path=regions_to_customers)

    features = [direct_name, through_customers, through_stores, mean_of_mean]
    trie = FeatureSet(features).feature_trie

    # Root node holds all requested features.
    root_names = {f.unique_name() for f in features}
    assert trie.value == (False, set(), root_names)

    country_node = trie.get_node(direct_name.relationship_path)
    assert country_node.value == (False, set(), {country_name.unique_name()})

    customers_node = trie.get_node(regions_to_customers)
    assert customers_node.value == (
        False, set(),
        {negation.unique_name(), customers_mean.unique_name()})

    # The intermediate stores node carries no features of its own.
    stores_node = trie.get_node(regions_to_stores)
    assert stores_node.value == (False, set(), set())

    # Both routes to transactions need the amount feature.
    for transactions_path in (path_through_customers, path_through_stores):
        node = trie.get_node(transactions_path)
        assert node.value == (False, set(), {amount.unique_name()})
Esempio n. 11
0
def test_serialization(es):
    """Round-trip a DirectFeature through get_arguments / from_dictionary.

    The expected arguments consist of the base feature's unique name and the
    child entity id 'log'.
    """
    base = ft.IdentityFeature(es['log']['value'])
    direct = ft.DirectFeature(base, es['log'])
    base_name = base.unique_name()

    expected_arguments = {
        'base_feature': base_name,
        'child_entity_id': 'log',
    }
    assert expected_arguments == direct.get_arguments()

    # Rebuild the feature from its arguments, supplying the base feature
    # through the name -> feature mapping.
    restored = ft.DirectFeature.from_dictionary(expected_arguments,
                                                es,
                                                {base_name: base},
                                                PrimitivesDeserializer())
    assert direct == restored
def test_serialization(es):
    """Round-trip a DirectFeature whose arguments carry a relationship path.

    The expected arguments consist of the base feature's unique name and a
    one-element relationship path: the serialized 'log' relationship whose
    parent is 'products'.
    """
    base = ft.IdentityFeature(es['products']['rating'])
    direct = ft.DirectFeature(base, es['log'])
    base_name = base.unique_name()

    products_rel = next(rel for rel in es.get_forward_relationships('log')
                        if rel.parent_entity.id == 'products')
    expected_arguments = {
        'base_feature': base_name,
        'relationship_path': [products_rel.to_dictionary()],
    }
    assert expected_arguments == direct.get_arguments()

    # Rebuild the feature from its arguments, supplying the base feature
    # through the name -> feature mapping.
    restored = ft.DirectFeature.from_dictionary(expected_arguments,
                                                es,
                                                {base_name: base},
                                                PrimitivesDeserializer())
    assert direct == restored
Esempio n. 13
0
def test_serialized_renamed_features(es):
    """Renamed features must keep their new name across serialization.

    One feature of each kind (identity, aggregation, direct, transform,
    groupby transform) is built, its default name is asserted, and it is
    then renamed to 'MyFeature' and round-tripped through FeaturesSerializer
    / FeaturesDeserializer.
    """

    def serialize_name_unchanged(original):
        # The new name must hold both right after renaming and after a
        # serialize/deserialize round trip.
        renamed = original.rename('MyFeature')
        assert renamed.get_name() == 'MyFeature'

        serialized = FeaturesSerializer([renamed]).to_dict()
        restored = FeaturesDeserializer(serialized).to_list()
        assert restored[0].get_name() == 'MyFeature'

    identity_feature = ft.IdentityFeature(es['log']['value'])
    assert identity_feature.get_name() == 'value'

    # Shared base feature for the aggregation, transform and groupby cases.
    log_value = ft.IdentityFeature(es['log']['value'])

    agg_feature = ft.AggregationFeature(log_value, es['customers'],
                                        ft.primitives.Max())
    assert agg_feature.get_name() == 'MAX(log.value)'

    direct_feature = ft.DirectFeature(es['customers']['age'], es['sessions'])
    assert direct_feature.get_name() == 'customers.age'

    times_two = ft.primitives.MultiplyNumericScalar(value=2)
    transform_feature = ft.TransformFeature(log_value, times_two)
    assert transform_feature.get_name() == 'value * 2'

    zipcode = ft.IdentityFeature(es['log']['zipcode'])
    groupby_feature = ft.feature_base.GroupByTransformFeature(
        log_value, CumSum(), zipcode)
    assert groupby_feature.get_name() == 'CUM_SUM(value) by zipcode'

    all_kinds = (
        identity_feature,
        agg_feature,
        direct_feature,
        transform_feature,
        groupby_feature,
    )
    for feature in all_kinds:
        serialize_name_unchanged(feature)
Esempio n. 14
0
def test_feature_trie_without_needs_full_dataframe(diamond_es):
    """Check the feature trie for features that reach regions by several paths.

    Builds a direct feature, two means of the transaction amount aggregated
    onto regions along different paths, and a mean of a negated per-customer
    mean.  Every inspected trie node is expected to hold False, an empty set,
    and the set of feature names for that path.
    """
    es = diamond_es
    mean = ft.primitives.Mean

    # Relationship paths used by the aggregations and the trie lookups.
    path_through_customers = backward_path(
        es, ["regions", "customers", "transactions"]
    )
    path_through_stores = backward_path(es, ["regions", "stores", "transactions"])
    customers_to_transactions = backward_path(es, ["customers", "transactions"])
    regions_to_customers = backward_path(es, ["regions", "customers"])
    regions_to_stores = backward_path(es, ["regions", "stores"])

    country_name = ft.IdentityFeature(es["countries"].ww["name"])
    direct_name = ft.DirectFeature(country_name, "regions")
    amount = ft.IdentityFeature(es["transactions"].ww["amount"])

    through_customers = ft.AggregationFeature(
        amount, "regions", primitive=mean, relationship_path=path_through_customers
    )
    through_stores = ft.AggregationFeature(
        amount, "regions", primitive=mean, relationship_path=path_through_stores
    )
    customers_mean = ft.AggregationFeature(
        amount,
        "customers",
        primitive=mean,
        relationship_path=customers_to_transactions,
    )
    negation = ft.TransformFeature(customers_mean, ft.primitives.Negate)
    mean_of_mean = ft.AggregationFeature(
        negation, "regions", primitive=mean, relationship_path=regions_to_customers
    )

    features = [direct_name, through_customers, through_stores, mean_of_mean]
    trie = FeatureSet(features).feature_trie

    # Root node holds all requested features.
    root_names = {f.unique_name() for f in features}
    assert trie.value == (False, set(), root_names)

    country_node = trie.get_node(direct_name.relationship_path)
    assert country_node.value == (False, set(), {country_name.unique_name()})

    customers_node = trie.get_node(regions_to_customers)
    expected_customer_names = {negation.unique_name(), customers_mean.unique_name()}
    assert customers_node.value == (False, set(), expected_customer_names)

    # The intermediate stores node carries no features of its own.
    stores_node = trie.get_node(regions_to_stores)
    assert stores_node.value == (False, set(), set())

    # Both routes to transactions need the amount feature.
    for transactions_path in (path_through_customers, path_through_stores):
        node = trie.get_node(transactions_path)
        assert node.value == (False, set(), {amount.unique_name()})
Esempio n. 15
0
def test_serialized_renamed_features(es):
    """Renamed features must keep their new names across serialization.

    One feature of each kind (identity, aggregation, direct, transform,
    groupby transform, multi-output, output slice) is built, its default
    name is asserted, and it is then renamed and round-tripped through
    FeaturesSerializer / FeaturesDeserializer.
    """

    def serialize_name_unchanged(original):
        # Rename, then verify names before and after a serialize/deserialize
        # round trip via check_names.
        new_name = "MyFeature"
        output_count = len(original.get_feature_names())
        renamed = original.rename(new_name)
        if output_count == 1:
            new_names = [new_name]
        else:
            # Multi-output features get one indexed name per output.
            new_names = [new_name + "[{}]".format(i) for i in range(output_count)]
        check_names(renamed, new_name, new_names)

        serialized = FeaturesSerializer([renamed]).to_dict()
        deserialized = FeaturesDeserializer(serialized).to_list()[0]
        check_names(deserialized, new_name, new_names)

    identity_feature = ft.IdentityFeature(es["log"].ww["value"])
    assert identity_feature.get_name() == "value"

    # Shared base feature for the aggregation, transform and groupby cases.
    log_value = ft.IdentityFeature(es["log"].ww["value"])

    agg_feature = ft.AggregationFeature(log_value, "customers", ft.primitives.Max())
    assert agg_feature.get_name() == "MAX(log.value)"

    customer_age = ft.IdentityFeature(es["customers"].ww["age"])
    direct_feature = ft.DirectFeature(customer_age, "sessions")
    assert direct_feature.get_name() == "customers.age"

    times_two = ft.primitives.MultiplyNumericScalar(value=2)
    transform_feature = ft.TransformFeature(log_value, times_two)
    assert transform_feature.get_name() == "value * 2"

    zipcode = ft.IdentityFeature(es["log"].ww["zipcode"])
    groupby_feature = ft.feature_base.GroupByTransformFeature(
        log_value, CumSum(), zipcode
    )
    assert groupby_feature.get_name() == "CUM_SUM(value) by zipcode"

    multioutput_feature = ft.Feature(
        es["log"].ww["product_id"],
        parent_dataframe_name="customers",
        primitive=NMostCommon(n=2),
    )
    assert multioutput_feature.get_name() == "N_MOST_COMMON(log.product_id, n=2)"

    slice_feature = ft.feature_base.FeatureOutputSlice(multioutput_feature, 0)
    assert slice_feature.get_name() == "N_MOST_COMMON(log.product_id, n=2)[0]"

    all_kinds = (
        identity_feature,
        agg_feature,
        direct_feature,
        transform_feature,
        groupby_feature,
        multioutput_feature,
        slice_feature,
    )
    for feature in all_kinds:
        serialize_name_unchanged(feature)
Esempio n. 16
0
def test_direct_with_single_possible_path(es):
    """A customers feature placed on sessions needs no explicit relationship.

    Both the relationship path name and the feature name are checked.
    """
    customer_age = ft.DirectFeature(es['customers']['age'], es['sessions'])

    path_name = customer_age.relationship_path_name()
    assert path_name == 'customers'

    feature_name = customer_age.get_name()
    assert feature_name == 'customers.age'
def test_get_name_skips_relationships_when_single_possible_path(es):
    """The name of a customers feature placed on log is just 'customers.age'."""
    customer_age_on_log = ft.DirectFeature(es['customers']['age'], es['log'])
    generated_name = customer_age_on_log.get_name()
    assert generated_name == 'customers.age'