def test_two_relationships_to_single_entity(games_es):
    """Calculate direct features reaching ``teams`` through two relationships.

    A mean of ``home_team_score`` and a mean of ``away_team_score`` are
    aggregated onto ``teams``, each along its own single-step relationship
    path, then brought back onto ``games`` as direct features. The calculated
    columns for instance ids 0-2 are compared with fixed expected values.
    """
    es = games_es
    home_team, away_team = es.relationships
    games, teams = es['games'], es['teams']

    home_path = RelationshipPath([(False, home_team)])
    mean_at_home = ft.AggregationFeature(games['home_team_score'],
                                         teams,
                                         relationship_path=home_path,
                                         primitive=ft.primitives.Mean)
    away_path = RelationshipPath([(False, away_team)])
    mean_at_away = ft.AggregationFeature(games['away_team_score'],
                                         teams,
                                         relationship_path=away_path,
                                         primitive=ft.primitives.Mean)

    home_team_mean = ft.DirectFeature(mean_at_home, games,
                                      relationship=home_team)
    away_team_mean = ft.DirectFeature(mean_at_away, games,
                                      relationship=away_team)

    calculator = FeatureSetCalculator(
        es,
        time_last=datetime(2011, 8, 28),
        feature_set=FeatureSet([home_team_mean, away_team_mean]),
    )
    result = calculator.run(np.array(range(3)))
    result = to_pandas(result, index='id', sort_index=True)

    # One expected column per direct feature, checked in declaration order.
    expected_columns = [
        (home_team_mean.get_name(), [1.5, 1.5, 2.5]),
        (away_team_mean.get_name(), [1, 0.5, 2]),
    ]
    for column, expected in expected_columns:
        assert (result[column] == expected).all()
def test_direct_with_no_path(diamond_es):
    """Expect ``RuntimeError`` when no relationship leads to the base entity.

    Covers two child entities for a ``customers`` base feature: ``regions``
    and ``customers`` itself.
    """
    cases = [
        ('regions',
         'No relationship from "regions" to "customers" found.'),
        ('customers',
         'No relationship from "customers" to "customers" found.'),
    ]
    for child_id, error_text in cases:
        with pytest.raises(RuntimeError, match=error_text):
            ft.DirectFeature(diamond_es['customers']['name'],
                             diamond_es[child_id])
def test_direct_with_multiple_possible_paths(games_es):
    """Require an explicit relationship when several reach the base entity.

    Without a relationship the constructor must raise ``RuntimeError``; with
    the ``home_team_id`` relationship supplied, the feature is built and its
    path name and feature name carry the child variable in brackets.
    """
    error_text = (
        "There are multiple relationships to the base entity. "
        "You must specify a relationship."
    )
    with pytest.raises(RuntimeError, match=error_text):
        ft.DirectFeature(games_es['teams']['name'], games_es['games'])

    # Does not raise if path specified.
    forward = games_es.get_forward_relationships('games')
    relationship = next(
        rel for rel in forward if rel.child_variable.id == 'home_team_id'
    )
    feat = ft.DirectFeature(games_es['teams']['name'],
                            games_es['games'],
                            relationship=relationship)

    assert feat.relationship_path_name() == 'teams[home_team_id]'
    assert feat.get_name() == 'teams[home_team_id].name'
def test_direct_with_invalid_init_args(diamond_es):
    """Expect ``AssertionError`` for a relationship that does not fit.

    Two mismatches are exercised: a child entity that is not the
    relationship's child, and a base feature that is not defined on the
    relationship's parent entity.
    """
    # Case 1: 'stores' is passed as child entity with a relationship taken
    # from the forward relationships of 'customers'.
    customer_to_region = diamond_es.get_forward_relationships('customers')[0]
    child_error = 'child_entity must be the relationship child entity'
    with pytest.raises(AssertionError, match=child_error):
        ft.DirectFeature(diamond_es['regions']['name'],
                         diamond_es['stores'],
                         relationship=customer_to_region)

    # Case 2: the relationship's parent is 'stores' but the base feature
    # comes from 'regions'.
    transaction_to_store = next(
        rel
        for rel in diamond_es.get_forward_relationships('transactions')
        if rel.parent_entity.id == 'stores'
    )
    parent_error = (
        'Base feature must be defined on the relationship parent entity'
    )
    with pytest.raises(AssertionError, match=parent_error):
        ft.DirectFeature(diamond_es['regions']['name'],
                         diamond_es['transactions'],
                         relationship=transaction_to_store)
def test_feature_trie_with_needs_full_entity_direct(es):
    """Pin feature-trie node values when ``CumSum`` wraps a direct feature.

    The feature set contains ``CumSum(direct)`` and ``agg``. Each assertion
    compares a node's ``value`` with an expected ``(bool, set, set)`` tuple.
    NOTE(review): the test name suggests the bool is a "needs full entity"
    flag and the two sets split features by it; confirm against FeatureSet.
    """
    value = ft.IdentityFeature(es['log']['value'])
    agg = ft.AggregationFeature(value,
                                es['sessions'],
                                primitive=ft.primitives.Mean)
    agg_of_agg = ft.AggregationFeature(agg,
                                       es['customers'],
                                       primitive=ft.primitives.Sum)
    direct = ft.DirectFeature(agg_of_agg, es['sessions'])
    trans = ft.TransformFeature(direct, ft.primitives.CumSum)

    trie = FeatureSet([trans, agg]).feature_trie

    # Root node.
    assert trie.value == (
        True,
        {direct.unique_name(), trans.unique_name()},
        {agg.unique_name()},
    )

    # Node reached directly along agg's path.
    assert trie.get_node(agg.relationship_path).value == (
        False,
        set(),
        {value.unique_name()},
    )

    # Chain reached via the direct feature's path.
    parent_node = trie.get_node(direct.relationship_path)
    assert parent_node.value == (True, {agg_of_agg.unique_name()}, set())

    child_through_parent_node = parent_node.get_node(
        agg_of_agg.relationship_path)
    assert child_through_parent_node.value == (
        True,
        {agg.unique_name()},
        set(),
    )

    grandchild_node = child_through_parent_node.get_node(
        agg.relationship_path)
    assert grandchild_node.value == (True, {value.unique_name()}, set())
def test_feature_trie_ignores_approximate_features(es):
    """Check that features listed in the approximate trie are left out.

    ``agg_of_agg`` is registered in an approximate-feature trie at the
    direct feature's relationship path; the feature trie built from
    ``[direct, agg]`` must then hold no features at or below that path.
    """
    value = ft.IdentityFeature(es['log']['value'])
    agg = ft.AggregationFeature(value,
                                es['sessions'],
                                primitive=ft.primitives.Mean)
    agg_of_agg = ft.AggregationFeature(agg,
                                       es['customers'],
                                       primitive=ft.primitives.Sum)
    direct = ft.DirectFeature(agg_of_agg, es['sessions'])
    features = [direct, agg]

    approximate_feature_trie = Trie(default=list,
                                    path_constructor=RelationshipPath)
    approx_node = approximate_feature_trie.get_node(direct.relationship_path)
    approx_node.value = [agg_of_agg]

    feature_set = FeatureSet(
        features, approximate_feature_trie=approximate_feature_trie)
    trie = feature_set.feature_trie

    # Since agg_of_agg is ignored it and its dependencies should not be in the
    # trie.
    sub_trie = trie.get_node(direct.relationship_path)
    for _path, (_, _, node_features) in sub_trie:
        assert not node_features

    assert trie.value == (
        False,
        set(),
        {direct.unique_name(), agg.unique_name()},
    )
    assert trie.get_node(agg.relationship_path).value == (
        False,
        set(),
        {value.unique_name()},
    )
def test_serialized_renamed_features(es):
    """Check that a renamed feature keeps its name through serialization.

    One feature of each kind (identity, aggregation, direct, transform,
    groupby transform, multi-output, output slice) is built, its default
    name is asserted, and it is then renamed and round-tripped through
    ``FeaturesSerializer`` / ``FeaturesDeserializer``.
    """

    def serialize_name_unchanged(original):
        # Rename, then verify names before and after a serialize/deserialize
        # round trip.
        new_name = 'MyFeature'
        original_names = original.get_feature_names()
        renamed = original.rename(new_name)

        if len(original_names) == 1:
            new_names = [new_name]
        else:
            # Multi-output features get one indexed name per output.
            new_names = [
                new_name + '[{}]'.format(i)
                for i in range(len(original_names))
            ]
        check_names(renamed, new_name, new_names)

        serialized = FeaturesSerializer([renamed]).to_dict()
        deserialized = FeaturesDeserializer(serialized).to_list()[0]
        check_names(deserialized, new_name, new_names)

    identity_original = ft.IdentityFeature(es['log'].ww['value'])
    assert identity_original.get_name() == 'value'

    value = ft.IdentityFeature(es['log'].ww['value'])

    agg_original = ft.AggregationFeature(value, 'customers',
                                         ft.primitives.Max())
    assert agg_original.get_name() == 'MAX(log.value)'

    customer_age = ft.IdentityFeature(es['customers'].ww['age'])
    direct_original = ft.DirectFeature(customer_age, 'sessions')
    assert direct_original.get_name() == 'customers.age'

    transform_original = ft.TransformFeature(
        value, ft.primitives.MultiplyNumericScalar(value=2))
    assert transform_original.get_name() == 'value * 2'

    zipcode = ft.IdentityFeature(es['log'].ww['zipcode'])
    groupby_original = ft.feature_base.GroupByTransformFeature(
        value, CumSum(), zipcode)
    assert groupby_original.get_name() == 'CUM_SUM(value) by zipcode'

    multioutput_original = ft.Feature(es['log'].ww['product_id'],
                                      parent_dataframe_name='customers',
                                      primitive=NMostCommon(n=2))
    multioutput_name = multioutput_original.get_name()
    assert multioutput_name == 'N_MOST_COMMON(log.product_id, n=2)'

    featureslice_original = ft.feature_base.FeatureOutputSlice(
        multioutput_original, 0)
    slice_name = featureslice_original.get_name()
    assert slice_name == 'N_MOST_COMMON(log.product_id, n=2)[0]'

    for feature_type in (
            identity_original,
            agg_original,
            direct_original,
            transform_original,
            groupby_original,
            multioutput_original,
            featureslice_original,
    ):
        serialize_name_unchanged(feature_type)
def test_direct_with_multiple_possible_paths(diamond_es):
    """Require an explicit relationship path when several paths exist.

    Building ``regions.name`` on ``transactions`` without a path must raise
    ``RuntimeError``; passing the transactions->customers->regions path
    succeeds and the feature is named ``customers.regions.name``.
    """
    error_text = (
        "There are multiple possible paths to the base entity. "
        "You must specify a relationship path."
    )
    with pytest.raises(RuntimeError, match=error_text):
        ft.DirectFeature(diamond_es['regions']['name'],
                         diamond_es['transactions'])

    transaction_to_customer = next(
        rel
        for rel in diamond_es.get_forward_relationships('transactions')
        if rel.parent_entity.id == 'customers'
    )
    customer_to_region = diamond_es.get_forward_relationships('customers')[0]
    explicit_path = [transaction_to_customer, customer_to_region]

    # Does not raise if path specified.
    feat = ft.DirectFeature(diamond_es['regions']['name'],
                            diamond_es['transactions'],
                            relationship_path=explicit_path)
    assert feat.get_name() == 'customers.regions.name'
def test_direct_with_single_possible_path(diamond_es):
    """Check the path is inferred when only one relationship fits.

    The direct feature from ``customers.name`` onto ``transactions`` must end
    up with a one-element relationship path: the forward relationship whose
    parent entity is ``customers``.
    """
    # This uses diamond_es to test that there being a cycle somewhere in the
    # graph doesn't cause an error.
    feat = ft.DirectFeature(diamond_es['customers']['name'],
                            diamond_es['transactions'])

    to_customers = next(
        rel
        for rel in diamond_es.get_forward_relationships('transactions')
        if rel.parent_entity.id == 'customers'
    )
    assert feat.relationship_path == [to_customers]
def test_feature_trie_without_needs_full_entity(diamond_es):
    """Pin feature-trie node values for a mix of direct and mean features.

    Features on ``regions`` are built along several backward paths (through
    customers, through stores, and a mean of a negated per-customer mean),
    plus one direct feature from ``countries``. Every checked node must carry
    ``False`` and an empty first set, with the expected names in the last set.
    """
    es = diamond_es

    def mean_along(base, parent, path):
        # Mean aggregation of `base` onto `parent` along an explicit path.
        return ft.AggregationFeature(base,
                                     parent,
                                     primitive=ft.primitives.Mean,
                                     relationship_path=path)

    country_name = ft.IdentityFeature(es['countries']['name'])
    direct_name = ft.DirectFeature(country_name, es['regions'])
    amount = ft.IdentityFeature(es['transactions']['amount'])

    path_through_customers = backward_path(
        es, ['regions', 'customers', 'transactions'])
    through_customers = mean_along(amount, es['regions'],
                                   path_through_customers)

    path_through_stores = backward_path(
        es, ['regions', 'stores', 'transactions'])
    through_stores = mean_along(amount, es['regions'], path_through_stores)

    customers_to_transactions = backward_path(
        es, ['customers', 'transactions'])
    customers_mean = mean_along(amount, es['customers'],
                                customers_to_transactions)

    negation = ft.TransformFeature(customers_mean, ft.primitives.Negate)
    regions_to_customers = backward_path(es, ['regions', 'customers'])
    mean_of_mean = mean_along(negation, es['regions'], regions_to_customers)

    features = [direct_name, through_customers, through_stores, mean_of_mean]
    trie = FeatureSet(features).feature_trie

    assert trie.value == (
        False,
        set(),
        {f.unique_name() for f in features},
    )
    assert trie.get_node(direct_name.relationship_path).value == (
        False,
        set(),
        {country_name.unique_name()},
    )
    assert trie.get_node(regions_to_customers).value == (
        False,
        set(),
        {negation.unique_name(), customers_mean.unique_name()},
    )

    regions_to_stores = backward_path(es, ['regions', 'stores'])
    assert trie.get_node(regions_to_stores).value == (False, set(), set())

    assert trie.get_node(path_through_customers).value == (
        False,
        set(),
        {amount.unique_name()},
    )
    assert trie.get_node(path_through_stores).value == (
        False,
        set(),
        {amount.unique_name()},
    )
def test_serialization(es):
    """Round-trip a ``DirectFeature`` through its argument dictionary.

    ``get_arguments`` must return the base feature's unique name and the
    child entity id, and ``from_dictionary`` must rebuild an equal feature
    from that dictionary.
    """
    value = ft.IdentityFeature(es['log']['value'])
    direct = ft.DirectFeature(value, es['log'])

    dictionary = {
        'base_feature': value.unique_name(),
        'child_entity_id': 'log',
    }
    assert dictionary == direct.get_arguments()

    dependencies = {value.unique_name(): value}
    restored = ft.DirectFeature.from_dictionary(dictionary,
                                                es,
                                                dependencies,
                                                PrimitivesDeserializer())
    assert direct == restored
def test_serialization(es):
    """Round-trip a ``DirectFeature`` through its argument dictionary.

    ``get_arguments`` must return the base feature's unique name and the
    serialized log->products relationship path, and ``from_dictionary`` must
    rebuild an equal feature from that dictionary.
    """
    value = ft.IdentityFeature(es['products']['rating'])
    direct = ft.DirectFeature(value, es['log'])

    log_to_products = next(
        rel
        for rel in es.get_forward_relationships('log')
        if rel.parent_entity.id == 'products'
    )
    dictionary = {
        'base_feature': value.unique_name(),
        'relationship_path': [log_to_products.to_dictionary()],
    }
    assert dictionary == direct.get_arguments()

    dependencies = {value.unique_name(): value}
    restored = ft.DirectFeature.from_dictionary(dictionary,
                                                es,
                                                dependencies,
                                                PrimitivesDeserializer())
    assert direct == restored
def test_serialized_renamed_features(es):
    """Check that a renamed feature keeps its name through serialization.

    One feature of each kind (identity, aggregation, direct, transform,
    groupby transform) is built, its default name is asserted, and it is
    then renamed to ``MyFeature`` and round-tripped through
    ``FeaturesSerializer`` / ``FeaturesDeserializer``.
    """

    def serialize_name_unchanged(original):
        # The new name must hold both right after renaming and after a
        # serialize/deserialize round trip.
        renamed = original.rename('MyFeature')
        assert renamed.get_name() == 'MyFeature'

        serialized = FeaturesSerializer([renamed]).to_dict()
        deserialized = FeaturesDeserializer(serialized).to_list()[0]
        assert deserialized.get_name() == 'MyFeature'

    identity_original = ft.IdentityFeature(es['log']['value'])
    assert identity_original.get_name() == 'value'

    value = ft.IdentityFeature(es['log']['value'])

    agg_original = ft.AggregationFeature(value, es['customers'],
                                         ft.primitives.Max())
    assert agg_original.get_name() == 'MAX(log.value)'

    direct_original = ft.DirectFeature(es['customers']['age'],
                                       es['sessions'])
    assert direct_original.get_name() == 'customers.age'

    transform_original = ft.TransformFeature(
        value, ft.primitives.MultiplyNumericScalar(value=2))
    assert transform_original.get_name() == 'value * 2'

    zipcode = ft.IdentityFeature(es['log']['zipcode'])
    groupby_original = ft.feature_base.GroupByTransformFeature(
        value, CumSum(), zipcode)
    assert groupby_original.get_name() == 'CUM_SUM(value) by zipcode'

    for feature_type in (
            identity_original,
            agg_original,
            direct_original,
            transform_original,
            groupby_original,
    ):
        serialize_name_unchanged(feature_type)
def test_feature_trie_without_needs_full_dataframe(diamond_es):
    """Pin feature-trie node values for a mix of direct and mean features.

    Features on ``regions`` are built along several backward paths (through
    customers, through stores, and a mean of a negated per-customer mean),
    plus one direct feature from ``countries``. Every checked node must carry
    ``False`` and an empty first set, with the expected names in the last set.
    """
    es = diamond_es

    def mean_along(base, parent_name, path):
        # Mean aggregation of `base` onto `parent_name` along an explicit path.
        return ft.AggregationFeature(
            base,
            parent_name,
            primitive=ft.primitives.Mean,
            relationship_path=path,
        )

    def node_value(path):
        # Value stored at the trie node addressed by `path`.
        return trie.get_node(path).value

    country_name = ft.IdentityFeature(es["countries"].ww["name"])
    direct_name = ft.DirectFeature(country_name, "regions")
    amount = ft.IdentityFeature(es["transactions"].ww["amount"])

    path_through_customers = backward_path(
        es, ["regions", "customers", "transactions"]
    )
    through_customers = mean_along(amount, "regions", path_through_customers)

    path_through_stores = backward_path(es, ["regions", "stores", "transactions"])
    through_stores = mean_along(amount, "regions", path_through_stores)

    customers_to_transactions = backward_path(es, ["customers", "transactions"])
    customers_mean = mean_along(amount, "customers", customers_to_transactions)

    negation = ft.TransformFeature(customers_mean, ft.primitives.Negate)
    regions_to_customers = backward_path(es, ["regions", "customers"])
    mean_of_mean = mean_along(negation, "regions", regions_to_customers)

    features = [direct_name, through_customers, through_stores, mean_of_mean]
    trie = FeatureSet(features).feature_trie

    assert trie.value == (False, set(), {f.unique_name() for f in features})
    assert node_value(direct_name.relationship_path) == (
        False,
        set(),
        {country_name.unique_name()},
    )
    assert node_value(regions_to_customers) == (
        False,
        set(),
        {negation.unique_name(), customers_mean.unique_name()},
    )

    regions_to_stores = backward_path(es, ["regions", "stores"])
    assert node_value(regions_to_stores) == (False, set(), set())

    assert node_value(path_through_customers) == (
        False,
        set(),
        {amount.unique_name()},
    )
    assert node_value(path_through_stores) == (
        False,
        set(),
        {amount.unique_name()},
    )
def test_serialized_renamed_features(es):
    """Check that a renamed feature keeps its name through serialization.

    One feature of each kind (identity, aggregation, direct, transform,
    groupby transform, multi-output, output slice) is built, its default
    name is asserted, and it is then renamed and round-tripped through
    ``FeaturesSerializer`` / ``FeaturesDeserializer``.
    """

    def serialize_name_unchanged(original):
        # Rename, then verify names before and after a serialize/deserialize
        # round trip.
        new_name = "MyFeature"
        original_names = original.get_feature_names()
        renamed = original.rename(new_name)

        if len(original_names) == 1:
            new_names = [new_name]
        else:
            # Multi-output features get one indexed name per output.
            new_names = [
                new_name + "[{}]".format(i) for i in range(len(original_names))
            ]
        check_names(renamed, new_name, new_names)

        serialized = FeaturesSerializer([renamed]).to_dict()
        deserialized = FeaturesDeserializer(serialized).to_list()[0]
        check_names(deserialized, new_name, new_names)

    identity_original = ft.IdentityFeature(es["log"].ww["value"])
    assert identity_original.get_name() == "value"

    value = ft.IdentityFeature(es["log"].ww["value"])

    agg_original = ft.AggregationFeature(value, "customers", ft.primitives.Max())
    assert agg_original.get_name() == "MAX(log.value)"

    customer_age = ft.IdentityFeature(es["customers"].ww["age"])
    direct_original = ft.DirectFeature(customer_age, "sessions")
    assert direct_original.get_name() == "customers.age"

    transform_original = ft.TransformFeature(
        value, ft.primitives.MultiplyNumericScalar(value=2)
    )
    assert transform_original.get_name() == "value * 2"

    zipcode = ft.IdentityFeature(es["log"].ww["zipcode"])
    groupby_original = ft.feature_base.GroupByTransformFeature(
        value, CumSum(), zipcode
    )
    assert groupby_original.get_name() == "CUM_SUM(value) by zipcode"

    multioutput_original = ft.Feature(
        es["log"].ww["product_id"],
        parent_dataframe_name="customers",
        primitive=NMostCommon(n=2),
    )
    multioutput_name = multioutput_original.get_name()
    assert multioutput_name == "N_MOST_COMMON(log.product_id, n=2)"

    featureslice_original = ft.feature_base.FeatureOutputSlice(
        multioutput_original, 0
    )
    slice_name = featureslice_original.get_name()
    assert slice_name == "N_MOST_COMMON(log.product_id, n=2)[0]"

    for feature_type in (
        identity_original,
        agg_original,
        direct_original,
        transform_original,
        groupby_original,
        multioutput_original,
        featureslice_original,
    ):
        serialize_name_unchanged(feature_type)
def test_direct_with_single_possible_path(es):
    """Check naming of a direct feature built without an explicit relationship.

    ``customers.age`` pulled onto ``sessions`` must report ``customers`` as
    its relationship path name and ``customers.age`` as its feature name.
    """
    customer_age = es['customers']['age']
    feat = ft.DirectFeature(customer_age, es['sessions'])

    path_name = feat.relationship_path_name()
    assert path_name == 'customers'

    feature_name = feat.get_name()
    assert feature_name == 'customers.age'
def test_get_name_skips_relationships_when_single_possible_path(es):
    """Check the feature name of ``customers.age`` pulled onto ``log``.

    The direct feature is built without an explicit relationship and its name
    must be exactly ``customers.age``.
    """
    customer_age = es['customers']['age']
    feat = ft.DirectFeature(customer_age, es['log'])

    feature_name = feat.get_name()
    assert feature_name == 'customers.age'