def test_feature_trie_with_needs_full_entity(diamond_es):
    pd_es = diamond_es
    amount = ft.IdentityFeature(pd_es['transactions']['amount'])

    path_through_customers = backward_path(
        pd_es, ['regions', 'customers', 'transactions'])
    agg = ft.AggregationFeature(amount, pd_es['regions'],
                                primitive=ft.primitives.Mean,
                                relationship_path=path_through_customers)
    trans_of_agg = ft.TransformFeature(agg, ft.primitives.CumSum)

    path_through_stores = backward_path(pd_es, ['regions', 'stores', 'transactions'])
    trans = ft.TransformFeature(amount, ft.primitives.CumSum)
    agg_of_trans = ft.AggregationFeature(trans, pd_es['regions'],
                                         primitive=ft.primitives.Mean,
                                         relationship_path=path_through_stores)

    features = [agg, trans_of_agg, agg_of_trans]
    feature_set = FeatureSet(features)
    trie = feature_set.feature_trie

    assert trie.value == \
        (True, {agg.unique_name(), trans_of_agg.unique_name()},
         {agg_of_trans.unique_name()})
    assert trie.get_node(path_through_customers).value == \
        (True, {amount.unique_name()}, set())
    assert trie.get_node(path_through_customers[:1]).value == (True, set(), set())
    assert trie.get_node(path_through_stores).value == \
        (True, {amount.unique_name(), trans.unique_name()}, set())
    assert trie.get_node(path_through_stores[:1]).value == (False, set(), set())

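# Reading the assertions above and in the tests below: each feature-trie node's
# value appears to be a 3-tuple of (needs-full-dataframe flag, names of features
# that must be computed on the full dataframe, names of the remaining features
# at that node).
#
# `backward_path` is a shared test helper not shown in this section. The sketch
# below is a plausible minimal version, assuming featuretools' RelationshipPath
# takes (is_forward, relationship) pairs; `_parent_dataframe_name` is an internal
# attribute, and the lookup logic is an assumption, not the library's own helper.
from featuretools.entityset.relationship import RelationshipPath


def backward_path(es, dataframe_names):
    """Build a backward RelationshipPath through `dataframe_names`, parent first."""
    relationships = []
    for parent, child in zip(dataframe_names[:-1], dataframe_names[1:]):
        # Find the relationship connecting this parent/child pair.
        relationship = next(
            r for r in es.get_forward_relationships(child)
            if r._parent_dataframe_name == parent
        )
        relationships.append(relationship)

    # False marks each step as a backward (parent -> child) traversal.
    return RelationshipPath([(False, r) for r in relationships])
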
def load_feature_plots():
    es = ft.demo.load_mock_customer(return_entityset=True)
    path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                        'getting_started/graphs/')

    agg_feat = ft.AggregationFeature(
        ft.IdentityFeature(es['sessions'].ww['session_id']),
        'customers',
        ft.primitives.Count)
    trans_feat = ft.TransformFeature(
        ft.IdentityFeature(es['customers'].ww['join_date']),
        ft.primitives.TimeSincePrevious)
    demo_feat = ft.AggregationFeature(
        ft.TransformFeature(
            ft.IdentityFeature(es['transactions'].ww['transaction_time']),
            ft.primitives.Weekday),
        'sessions',
        ft.primitives.Mode)

    ft.graph_feature(agg_feat, to_file=os.path.join(path, 'agg_feat.dot'))
    ft.graph_feature(trans_feat, to_file=os.path.join(path, 'trans_feat.dot'))
    ft.graph_feature(demo_feat, to_file=os.path.join(path, 'demo_feat.dot'))

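# The calls above emit Graphviz source files. To render them as images you can
# run, for example, `dot -Tpng agg_feat.dot -o agg_feat.png` (requires Graphviz
# to be installed); passing a path ending in `.png` or `.svg` to `to_file`
# should also render the image directly.
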
def test_feature_trie_with_needs_full_entity_direct(es):
    value = ft.IdentityFeature(es['log']['value'])
    agg = ft.AggregationFeature(value, es['sessions'],
                                primitive=ft.primitives.Mean)
    agg_of_agg = ft.AggregationFeature(agg, es['customers'],
                                       primitive=ft.primitives.Sum)
    direct = ft.DirectFeature(agg_of_agg, es['sessions'])
    trans = ft.TransformFeature(direct, ft.primitives.CumSum)

    features = [trans, agg]
    feature_set = FeatureSet(features)
    trie = feature_set.feature_trie

    assert trie.value == \
        (True, {direct.unique_name(), trans.unique_name()}, {agg.unique_name()})
    assert trie.get_node(agg.relationship_path).value == \
        (False, set(), {value.unique_name()})

    parent_node = trie.get_node(direct.relationship_path)
    assert parent_node.value == (True, {agg_of_agg.unique_name()}, set())

    child_through_parent_node = parent_node.get_node(agg_of_agg.relationship_path)
    assert child_through_parent_node.value == (True, {agg.unique_name()}, set())
    assert child_through_parent_node.get_node(agg.relationship_path).value == \
        (True, {value.unique_name()}, set())

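# The True flags in the assertions above show how CumSum's need for the full
# dataframe propagates through the direct feature's forward path and its
# dependencies, while the first aggregation's purely backward path keeps a
# False flag.
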
def test_base_features_not_in_list(es):
    max_primitive = Max()
    mult_primitive = MultiplyNumericScalar(value=2)
    value = ft.IdentityFeature(es["log"].ww["value"])
    value_x2 = ft.TransformFeature(value, mult_primitive)
    max_feat = ft.AggregationFeature(value_x2, "sessions", max_primitive)

    dictionary = {
        "ft_version": ft.__version__,
        "schema_version": SCHEMA_VERSION,
        "entityset": es.to_dictionary(),
        "feature_list": [max_feat.unique_name()],
        "feature_definitions": {
            max_feat.unique_name(): max_feat.to_dictionary(),
            value_x2.unique_name(): value_x2.to_dictionary(),
            value.unique_name(): value.to_dictionary(),
        },
    }
    dictionary["primitive_definitions"] = {
        "0": serialize_primitive(max_primitive),
        "1": serialize_primitive(mult_primitive),
    }
    dictionary["feature_definitions"][max_feat.unique_name()]["arguments"]["primitive"] = "0"
    dictionary["feature_definitions"][value_x2.unique_name()]["arguments"]["primitive"] = "1"

    deserializer = FeaturesDeserializer(dictionary)
    expected = [max_feat]
    assert expected == deserializer.to_list()

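# Note the serialization schema used here: primitive instances are serialized
# once under "primitive_definitions", and each feature's "arguments"["primitive"]
# stores the key ("0", "1", ...) referencing them, rather than an inline
# primitive dictionary.
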
def test_serialized_renamed_features(es):
    def serialize_name_unchanged(original):
        new_name = 'MyFeature'
        original_names = original.get_feature_names()
        renamed = original.rename(new_name)
        new_names = [new_name] if len(original_names) == 1 else [
            new_name + '[{}]'.format(i) for i in range(len(original_names))
        ]
        check_names(renamed, new_name, new_names)

        serializer = FeaturesSerializer([renamed])
        serialized = serializer.to_dict()

        deserializer = FeaturesDeserializer(serialized)
        deserialized = deserializer.to_list()[0]
        check_names(deserialized, new_name, new_names)

    identity_original = ft.IdentityFeature(es['log'].ww['value'])
    assert identity_original.get_name() == 'value'

    value = ft.IdentityFeature(es['log'].ww['value'])
    primitive = ft.primitives.Max()
    agg_original = ft.AggregationFeature(value, 'customers', primitive)
    assert agg_original.get_name() == 'MAX(log.value)'

    direct_original = ft.DirectFeature(
        ft.IdentityFeature(es['customers'].ww['age']), 'sessions')
    assert direct_original.get_name() == 'customers.age'

    primitive = ft.primitives.MultiplyNumericScalar(value=2)
    transform_original = ft.TransformFeature(value, primitive)
    assert transform_original.get_name() == 'value * 2'

    zipcode = ft.IdentityFeature(es['log'].ww['zipcode'])
    primitive = CumSum()
    groupby_original = ft.feature_base.GroupByTransformFeature(
        value, primitive, zipcode)
    assert groupby_original.get_name() == 'CUM_SUM(value) by zipcode'

    multioutput_original = ft.Feature(es['log'].ww['product_id'],
                                      parent_dataframe_name='customers',
                                      primitive=NMostCommon(n=2))
    assert multioutput_original.get_name() == 'N_MOST_COMMON(log.product_id, n=2)'

    featureslice_original = ft.feature_base.FeatureOutputSlice(
        multioutput_original, 0)
    assert featureslice_original.get_name() == 'N_MOST_COMMON(log.product_id, n=2)[0]'

    feature_type_list = [
        identity_original, agg_original, direct_original, transform_original,
        groupby_original, multioutput_original, featureslice_original
    ]

    for feature_type in feature_type_list:
        serialize_name_unchanged(feature_type)

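# `check_names` is a shared test helper not shown in this section. A minimal
# sketch of the checks the calls above rely on; the body is an assumption for
# illustration:
def check_names(feature, new_name, new_names):
    assert feature.get_name() == new_name
    assert feature.get_feature_names() == new_names
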
def test_feature_trie_with_needs_full_dataframe(diamond_es):
    pd_es = diamond_es
    amount = ft.IdentityFeature(pd_es["transactions"].ww["amount"])

    path_through_customers = backward_path(
        pd_es, ["regions", "customers", "transactions"]
    )
    agg = ft.AggregationFeature(
        amount,
        "regions",
        primitive=ft.primitives.Mean,
        relationship_path=path_through_customers,
    )
    trans_of_agg = ft.TransformFeature(agg, ft.primitives.CumSum)

    path_through_stores = backward_path(pd_es, ["regions", "stores", "transactions"])
    trans = ft.TransformFeature(amount, ft.primitives.CumSum)
    agg_of_trans = ft.AggregationFeature(
        trans,
        "regions",
        primitive=ft.primitives.Mean,
        relationship_path=path_through_stores,
    )

    features = [agg, trans_of_agg, agg_of_trans]
    feature_set = FeatureSet(features)
    trie = feature_set.feature_trie

    assert trie.value == (
        True,
        {agg.unique_name(), trans_of_agg.unique_name()},
        {agg_of_trans.unique_name()},
    )
    assert trie.get_node(path_through_customers).value == (
        True,
        {amount.unique_name()},
        set(),
    )
    assert trie.get_node(path_through_customers[:1]).value == (True, set(), set())
    assert trie.get_node(path_through_stores).value == (
        True,
        {amount.unique_name(), trans.unique_name()},
        set(),
    )
    assert trie.get_node(path_through_stores[:1]).value == (False, set(), set())

def load_feature_plots():
    es = ft.demo.load_mock_customer(return_entityset=True)
    path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                        'automated_feature_engineering/graphs/')

    agg_feat = ft.AggregationFeature(es['sessions']['session_id'],
                                     es['customers'],
                                     ft.primitives.Count)
    trans_feat = ft.TransformFeature(es['customers']['join_date'],
                                     ft.primitives.TimeSincePrevious)

    ft.graph_feature(agg_feat, to_file=os.path.join(path, 'agg_feat.dot'))
    ft.graph_feature(trans_feat, to_file=os.path.join(path, 'trans_feat.dot'))

def test_feature_trie_without_needs_full_entity(diamond_es):
    es = diamond_es
    country_name = ft.IdentityFeature(es['countries']['name'])
    direct_name = ft.DirectFeature(country_name, es['regions'])
    amount = ft.IdentityFeature(es['transactions']['amount'])

    path_through_customers = backward_path(
        es, ['regions', 'customers', 'transactions'])
    through_customers = ft.AggregationFeature(
        amount, es['regions'],
        primitive=ft.primitives.Mean,
        relationship_path=path_through_customers)

    path_through_stores = backward_path(es, ['regions', 'stores', 'transactions'])
    through_stores = ft.AggregationFeature(
        amount, es['regions'],
        primitive=ft.primitives.Mean,
        relationship_path=path_through_stores)

    customers_to_transactions = backward_path(es, ['customers', 'transactions'])
    customers_mean = ft.AggregationFeature(
        amount, es['customers'],
        primitive=ft.primitives.Mean,
        relationship_path=customers_to_transactions)
    negation = ft.TransformFeature(customers_mean, ft.primitives.Negate)

    regions_to_customers = backward_path(es, ['regions', 'customers'])
    mean_of_mean = ft.AggregationFeature(
        negation, es['regions'],
        primitive=ft.primitives.Mean,
        relationship_path=regions_to_customers)

    features = [direct_name, through_customers, through_stores, mean_of_mean]
    feature_set = FeatureSet(features)
    trie = feature_set.feature_trie

    assert trie.value == \
        (False, set(), {f.unique_name() for f in features})
    assert trie.get_node(direct_name.relationship_path).value == \
        (False, set(), {country_name.unique_name()})
    assert trie.get_node(regions_to_customers).value == \
        (False, set(), {negation.unique_name(), customers_mean.unique_name()})

    regions_to_stores = backward_path(es, ['regions', 'stores'])
    assert trie.get_node(regions_to_stores).value == (False, set(), set())
    assert trie.get_node(path_through_customers).value == \
        (False, set(), {amount.unique_name()})
    assert trie.get_node(path_through_stores).value == \
        (False, set(), {amount.unique_name()})

def test_serialization(es):
    value = ft.IdentityFeature(es['log']['value'])
    primitive = ft.primitives.MultiplyNumericScalar(value=2)
    value_x2 = ft.TransformFeature(value, primitive)

    dictionary = {
        'base_features': [value.unique_name()],
        'primitive': serialize_primitive(primitive),
    }

    assert dictionary == value_x2.get_arguments()
    assert value_x2 == \
        ft.TransformFeature.from_dictionary(dictionary, es,
                                            {value.unique_name(): value},
                                            PrimitivesDeserializer())

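# Note: `get_arguments` returns only the feature's own arguments, so
# reconstructing via `from_dictionary` also needs the entityset, the
# already-deserialized base features keyed by unique name, and a
# PrimitivesDeserializer to turn the serialized primitive back into an instance.
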
def test_base_features_not_in_list(es):
    value = ft.IdentityFeature(es["log"].ww["value"])
    value_x2 = ft.TransformFeature(
        value, ft.primitives.MultiplyNumericScalar(value=2))
    max_feat = ft.AggregationFeature(value_x2, "sessions", ft.primitives.Max)

    dictionary = {
        "ft_version": ft.__version__,
        "schema_version": SCHEMA_VERSION,
        "entityset": es.to_dictionary(),
        "feature_list": [max_feat.unique_name()],
        "feature_definitions": {
            max_feat.unique_name(): max_feat.to_dictionary(),
            value_x2.unique_name(): value_x2.to_dictionary(),
            value.unique_name(): value.to_dictionary(),
        },
    }

    deserializer = FeaturesDeserializer(dictionary)
    expected = [max_feat]
    assert expected == deserializer.to_list()

def test_base_features_not_in_list(es):
    value = ft.IdentityFeature(es['log']['value'])
    value_x2 = ft.TransformFeature(value,
                                   ft.primitives.MultiplyNumericScalar(value=2))
    max_feat = ft.AggregationFeature(value_x2, es['sessions'], ft.primitives.Max)

    dictionary = {
        'ft_version': ft.__version__,
        'schema_version': SCHEMA_VERSION,
        'entityset': es.to_dictionary(),
        'feature_list': [max_feat.unique_name()],
        'feature_definitions': {
            max_feat.unique_name(): max_feat.to_dictionary(),
            value_x2.unique_name(): value_x2.to_dictionary(),
            value.unique_name(): value.to_dictionary(),
        }
    }

    deserializer = FeaturesDeserializer(dictionary)
    expected = [max_feat]
    assert expected == deserializer.to_list()

def test_serialized_renamed_features(es):
    def serialize_name_unchanged(original):
        renamed = original.rename('MyFeature')
        assert renamed.get_name() == 'MyFeature'

        serializer = FeaturesSerializer([renamed])
        serialized = serializer.to_dict()

        deserializer = FeaturesDeserializer(serialized)
        deserialized = deserializer.to_list()[0]
        assert deserialized.get_name() == 'MyFeature'

    identity_original = ft.IdentityFeature(es['log']['value'])
    assert identity_original.get_name() == 'value'

    value = ft.IdentityFeature(es['log']['value'])
    primitive = ft.primitives.Max()
    agg_original = ft.AggregationFeature(value, es['customers'], primitive)
    assert agg_original.get_name() == 'MAX(log.value)'

    direct_original = ft.DirectFeature(es['customers']['age'], es['sessions'])
    assert direct_original.get_name() == 'customers.age'

    primitive = ft.primitives.MultiplyNumericScalar(value=2)
    transform_original = ft.TransformFeature(value, primitive)
    assert transform_original.get_name() == 'value * 2'

    zipcode = ft.IdentityFeature(es['log']['zipcode'])
    primitive = CumSum()
    groupby_original = ft.feature_base.GroupByTransformFeature(
        value, primitive, zipcode)
    assert groupby_original.get_name() == 'CUM_SUM(value) by zipcode'

    feature_type_list = [
        identity_original, agg_original, direct_original, transform_original,
        groupby_original
    ]

    for feature_type in feature_type_list:
        serialize_name_unchanged(feature_type)

def test_base_features_not_in_list(es):
    value = ft.IdentityFeature(es['log'].ww['value'])
    value_x2 = ft.TransformFeature(
        value, ft.primitives.MultiplyNumericScalar(value=2))
    max_feature = ft.AggregationFeature(value_x2, 'sessions', ft.primitives.Max)
    features = [max_feature]
    serializer = FeaturesSerializer(features)

    expected = {
        'ft_version': ft.__version__,
        'schema_version': SCHEMA_VERSION,
        'entityset': es.to_dictionary(),
        'feature_list': [max_feature.unique_name()],
        'feature_definitions': {
            max_feature.unique_name(): max_feature.to_dictionary(),
            value_x2.unique_name(): value_x2.to_dictionary(),
            value.unique_name(): value.to_dictionary(),
        }
    }

    _compare_feature_dicts(expected, serializer.to_dict())

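# `_compare_feature_dicts` is a test helper not shown in this section. A
# plausible minimal sketch, assuming it exists to report which top-level key
# differs rather than failing on a bare dictionary comparison; the body is an
# assumption:
def _compare_feature_dicts(expected, actual):
    assert expected.keys() == actual.keys()
    for key in expected:
        assert expected[key] == actual[key], "mismatch in '{}'".format(key)
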
def test_relationship_path(es):
    f = ft.TransformFeature(es['log']['datetime'], Hour)

    assert len(f.relationship_path) == 0

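# Transform features stay within a single dataframe, so their relationship path
# is empty; only direct and aggregation features traverse relationships.
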
def test_serialized_renamed_features(es):
    def serialize_name_unchanged(original):
        new_name = "MyFeature"
        original_names = original.get_feature_names()
        renamed = original.rename(new_name)
        new_names = (
            [new_name]
            if len(original_names) == 1
            else [new_name + "[{}]".format(i) for i in range(len(original_names))]
        )
        check_names(renamed, new_name, new_names)

        serializer = FeaturesSerializer([renamed])
        serialized = serializer.to_dict()

        deserializer = FeaturesDeserializer(serialized)
        deserialized = deserializer.to_list()[0]
        check_names(deserialized, new_name, new_names)

    identity_original = ft.IdentityFeature(es["log"].ww["value"])
    assert identity_original.get_name() == "value"

    value = ft.IdentityFeature(es["log"].ww["value"])
    primitive = ft.primitives.Max()
    agg_original = ft.AggregationFeature(value, "customers", primitive)
    assert agg_original.get_name() == "MAX(log.value)"

    direct_original = ft.DirectFeature(
        ft.IdentityFeature(es["customers"].ww["age"]), "sessions"
    )
    assert direct_original.get_name() == "customers.age"

    primitive = ft.primitives.MultiplyNumericScalar(value=2)
    transform_original = ft.TransformFeature(value, primitive)
    assert transform_original.get_name() == "value * 2"

    zipcode = ft.IdentityFeature(es["log"].ww["zipcode"])
    primitive = CumSum()
    groupby_original = ft.feature_base.GroupByTransformFeature(
        value, primitive, zipcode
    )
    assert groupby_original.get_name() == "CUM_SUM(value) by zipcode"

    multioutput_original = ft.Feature(
        es["log"].ww["product_id"],
        parent_dataframe_name="customers",
        primitive=NMostCommon(n=2),
    )
    assert multioutput_original.get_name() == "N_MOST_COMMON(log.product_id, n=2)"

    featureslice_original = ft.feature_base.FeatureOutputSlice(multioutput_original, 0)
    assert featureslice_original.get_name() == "N_MOST_COMMON(log.product_id, n=2)[0]"

    feature_type_list = [
        identity_original,
        agg_original,
        direct_original,
        transform_original,
        groupby_original,
        multioutput_original,
        featureslice_original,
    ]

    for feature_type in feature_type_list:
        serialize_name_unchanged(feature_type)

def test_feature_trie_without_needs_full_dataframe(diamond_es):
    es = diamond_es
    country_name = ft.IdentityFeature(es["countries"].ww["name"])
    direct_name = ft.DirectFeature(country_name, "regions")
    amount = ft.IdentityFeature(es["transactions"].ww["amount"])

    path_through_customers = backward_path(es, ["regions", "customers", "transactions"])
    through_customers = ft.AggregationFeature(
        amount,
        "regions",
        primitive=ft.primitives.Mean,
        relationship_path=path_through_customers,
    )

    path_through_stores = backward_path(es, ["regions", "stores", "transactions"])
    through_stores = ft.AggregationFeature(
        amount,
        "regions",
        primitive=ft.primitives.Mean,
        relationship_path=path_through_stores,
    )

    customers_to_transactions = backward_path(es, ["customers", "transactions"])
    customers_mean = ft.AggregationFeature(
        amount,
        "customers",
        primitive=ft.primitives.Mean,
        relationship_path=customers_to_transactions,
    )
    negation = ft.TransformFeature(customers_mean, ft.primitives.Negate)

    regions_to_customers = backward_path(es, ["regions", "customers"])
    mean_of_mean = ft.AggregationFeature(
        negation,
        "regions",
        primitive=ft.primitives.Mean,
        relationship_path=regions_to_customers,
    )

    features = [direct_name, through_customers, through_stores, mean_of_mean]
    feature_set = FeatureSet(features)
    trie = feature_set.feature_trie

    assert trie.value == (False, set(), {f.unique_name() for f in features})
    assert trie.get_node(direct_name.relationship_path).value == (
        False,
        set(),
        {country_name.unique_name()},
    )
    assert trie.get_node(regions_to_customers).value == (
        False,
        set(),
        {negation.unique_name(), customers_mean.unique_name()},
    )

    regions_to_stores = backward_path(es, ["regions", "stores"])
    assert trie.get_node(regions_to_stores).value == (False, set(), set())
    assert trie.get_node(path_through_customers).value == (
        False,
        set(),
        {amount.unique_name()},
    )
    assert trie.get_node(path_through_stores).value == (
        False,
        set(),
        {amount.unique_name()},
    )