def test_base_features_not_in_list(es):
    """Deserializing restores base features that are absent from feature_list.

    Only the top-level aggregation appears in ``feature_list``; its
    dependencies are present solely in ``feature_definitions``.
    """
    agg_prim = Max()
    scalar_prim = MultiplyNumericScalar(value=2)
    base = ft.IdentityFeature(es["log"].ww["value"])
    doubled = ft.TransformFeature(base, scalar_prim)
    session_max = ft.AggregationFeature(doubled, "sessions", agg_prim)

    dictionary = {
        "ft_version": ft.__version__,
        "schema_version": SCHEMA_VERSION,
        "entityset": es.to_dictionary(),
        "feature_list": [session_max.unique_name()],
        "feature_definitions": {
            session_max.unique_name(): session_max.to_dictionary(),
            doubled.unique_name(): doubled.to_dictionary(),
            base.unique_name(): base.to_dictionary(),
        },
        "primitive_definitions": {
            "0": serialize_primitive(agg_prim),
            "1": serialize_primitive(scalar_prim),
        },
    }
    # Replace in-line primitive dicts with keys into primitive_definitions.
    definitions = dictionary["feature_definitions"]
    definitions[session_max.unique_name()]["arguments"]["primitive"] = "0"
    definitions[doubled.unique_name()]["arguments"]["primitive"] = "1"

    assert [session_max] == FeaturesDeserializer(dictionary).to_list()
def test_feature_use_previous_pd_dateoffset(es):
    """Features with a pd.DateOffset ``use_previous`` round-trip correctly.

    The original test duplicated the whole build/serialize/deserialize
    sequence for two different offsets; that sequence is factored into a
    local helper so both cases share one implementation.
    """

    def _check_roundtrip(offset):
        # Build a Count aggregation windowed by `offset`, hand-assemble its
        # serialized form, and verify the deserializer reproduces it.
        value = ft.IdentityFeature(es["log"].ww["id"])
        count_primitive = Count()
        count_feature = ft.AggregationFeature(
            value, "customers", count_primitive, use_previous=offset
        )
        dictionary = {
            "ft_version": ft.__version__,
            "schema_version": SCHEMA_VERSION,
            "entityset": es.to_dictionary(),
            "feature_list": [count_feature.unique_name(), value.unique_name()],
            "feature_definitions": {
                count_feature.unique_name(): count_feature.to_dictionary(),
                value.unique_name(): value.to_dictionary(),
            },
            "primitive_definitions": {"0": serialize_primitive(count_primitive)},
        }
        dictionary["feature_definitions"][count_feature.unique_name()][
            "arguments"
        ]["primitive"] = "0"
        deserializer = FeaturesDeserializer(dictionary)
        assert [count_feature, value] == deserializer.to_list()

    _check_roundtrip(pd.DateOffset(months=3))
    _check_roundtrip(pd.DateOffset(months=3, days=2, minutes=30))
def test_where_feature_dependency(es):
    """Serializer output includes the where-clause feature's definition."""
    agg_prim = Max()
    base = ft.IdentityFeature(es["log"].ww["value"])
    purchased = ft.IdentityFeature(es["log"].ww["purchased"])
    session_max = ft.AggregationFeature(
        base, "sessions", agg_prim, where=purchased
    )

    expected = {
        "ft_version": ft.__version__,
        "schema_version": SCHEMA_VERSION,
        "entityset": es.to_dictionary(),
        "feature_list": [session_max.unique_name()],
        "feature_definitions": {
            session_max.unique_name(): session_max.to_dictionary(),
            base.unique_name(): base.to_dictionary(),
            purchased.unique_name(): purchased.to_dictionary(),
        },
        "primitive_definitions": {"0": serialize_primitive(agg_prim)},
    }
    # The serializer stores primitive key strings, not in-line dicts.
    expected["feature_definitions"][session_max.unique_name()]["arguments"][
        "primitive"
    ] = "0"

    actual = FeaturesSerializer([session_max]).to_dict()
    _compare_feature_dicts(expected, actual)
def test_unknown_primitive_module(es):
    """Deserializer raises RuntimeError when a primitive's module is unknown."""
    base = ft.IdentityFeature(es["log"].ww["value"])
    session_max = ft.AggregationFeature(base, "sessions", Max)

    bad_primitive = serialize_primitive(Max())
    bad_primitive["module"] = "fake.module"  # point at a nonexistent module

    dictionary = {
        "ft_version": ft.__version__,
        "schema_version": SCHEMA_VERSION,
        "entityset": es.to_dictionary(),
        "feature_list": [session_max.unique_name(), base.unique_name()],
        "feature_definitions": {
            session_max.unique_name(): session_max.to_dictionary(),
            base.unique_name(): base.to_dictionary(),
        },
        "primitive_definitions": {"0": bad_primitive},
    }

    with pytest.raises(RuntimeError) as excinfo:
        FeaturesDeserializer(dictionary)
    assert str(excinfo.value) == 'Primitive "Max" in module "fake.module" not found'
def _feature_definitions(self):
    """Build and cache the feature and primitive definition dictionaries.

    Serializes every feature in ``self.feature_list`` into
    ``self._features_dict`` (via ``self._serialize_feature``), then replaces
    each feature's in-line primitive dict with a string key into
    ``self._primitives_dict`` so that a primitive instance shared by several
    features is serialized only once.

    Returns:
        tuple(dict, dict): ``(features_dict, primitives_dict)``.

    Fixes: membership test used ``not in d.keys()`` instead of the idiomatic
    ``not in d``; the key assignment that was duplicated in both branches is
    now written once.
    """
    if not self._features_dict:
        self._features_dict = {}
        self._primitives_dict = {}
        for feature in self.feature_list:
            self._serialize_feature(feature)

        next_key = 0
        primitive_id_to_key = {}
        for name, feature in self._features_dict.items():
            primitive = feature["arguments"].get("primitive")
            if not primitive:
                continue
            primitive_id = id(primitive)
            if primitive_id not in primitive_id_to_key:
                # First time seeing this primitive instance: serialize it and
                # assign it the next key. Keys are strings because JSON
                # round-trips integer dict *keys* as strings (values are not
                # converted), so string keys keep save/load symmetric.
                key = str(next_key)
                primitive_id_to_key[primitive_id] = key
                self._primitives_dict[key] = serialize_primitive(primitive)
                next_key += 1
            else:
                # Already serialized - reuse the existing key.
                key = primitive_id_to_key[primitive_id]
            self._features_dict[name]["arguments"]["primitive"] = key

    return self._features_dict, self._primitives_dict
def get_arguments(self):
    """Return a dict of constructor arguments for serializing this feature."""
    base_names = [feature.unique_name() for feature in self.base_features]
    return {
        "name": self._name,
        "base_features": base_names,
        "primitive": serialize_primitive(self.primitive),
    }
def test_feature_use_previous_pd_timedelta(es):
    """Serializer handles ``use_previous`` given as a pd.Timedelta."""
    base = ft.IdentityFeature(es["log"].ww["id"])
    window = pd.Timedelta(12, "W")
    count_prim = Count()
    count_feat = ft.AggregationFeature(
        base, "customers", count_prim, use_previous=window
    )

    expected = {
        "ft_version": ft.__version__,
        "schema_version": SCHEMA_VERSION,
        "entityset": es.to_dictionary(),
        "feature_list": [count_feat.unique_name(), base.unique_name()],
        "feature_definitions": {
            count_feat.unique_name(): count_feat.to_dictionary(),
            base.unique_name(): base.to_dictionary(),
        },
        "primitive_definitions": {"0": serialize_primitive(count_prim)},
    }
    expected["feature_definitions"][count_feat.unique_name()]["arguments"][
        "primitive"
    ] = "0"

    actual = FeaturesSerializer([count_feat, base]).to_dict()
    _compare_feature_dicts(expected, actual)
def get_arguments(self):
    """Return a dict of constructor arguments for serializing this feature."""
    names = []
    for feature in self.base_features:
        names.append(feature.unique_name())
    return {
        'name': self._name,
        'base_features': names,
        'primitive': serialize_primitive(self.primitive)
    }
def test_serialization(es):
    """AggregationFeature get_arguments/from_dictionary round-trip (legacy API)."""
    prim_deserializer = PrimitivesDeserializer()
    base = ft.IdentityFeature(es['log']['value'])
    max_prim = ft.primitives.Max()
    max_simple = ft.AggregationFeature(base, es['customers'], max_prim)
    backward_path = next(es.find_backward_paths('customers', 'log'))
    path_dicts = [rel.to_dictionary() for rel in backward_path]

    expected = {
        'name': None,
        'base_features': [base.unique_name()],
        'relationship_path': path_dicts,
        'primitive': serialize_primitive(max_prim),
        'where': None,
        'use_previous': None,
    }
    assert expected == max_simple.get_arguments()
    restored = ft.AggregationFeature.from_dictionary(
        expected, es, {base.unique_name(): base}, prim_deserializer
    )
    assert max_simple == restored

    # Same round-trip with a where clause and a use_previous window.
    purchased = ft.IdentityFeature(es['log']['purchased'])
    window = ft.Timedelta(3, 'd')
    max_filtered = ft.AggregationFeature(
        base, es['customers'], max_prim, where=purchased, use_previous=window
    )
    expected = {
        'name': None,
        'base_features': [base.unique_name()],
        'relationship_path': path_dicts,
        'primitive': serialize_primitive(max_prim),
        'where': purchased.unique_name(),
        'use_previous': window.get_arguments(),
    }
    assert expected == max_filtered.get_arguments()
    dependencies = {
        base.unique_name(): base,
        purchased.unique_name(): purchased,
    }
    restored = ft.AggregationFeature.from_dictionary(
        expected, es, dependencies, prim_deserializer
    )
    assert max_filtered == restored
def test_serialization(es):
    """AggregationFeature get_arguments/from_dictionary round-trip."""
    prim_deserializer = PrimitivesDeserializer()
    base = ft.IdentityFeature(es["log"].ww["value"])
    max_prim = ft.primitives.Max()
    max_simple = ft.AggregationFeature(base, "customers", max_prim)
    backward_path = next(es.find_backward_paths("customers", "log"))
    path_dicts = [rel.to_dictionary() for rel in backward_path]

    expected = {
        "name": None,
        "base_features": [base.unique_name()],
        "relationship_path": path_dicts,
        "primitive": serialize_primitive(max_prim),
        "where": None,
        "use_previous": None,
    }
    assert expected == max_simple.get_arguments()
    restored = ft.AggregationFeature.from_dictionary(
        expected, es, {base.unique_name(): base}, prim_deserializer
    )
    _assert_agg_feats_equal(max_simple, restored)

    # Same round-trip with a where clause and a use_previous window.
    purchased = ft.IdentityFeature(es["log"].ww["purchased"])
    window = ft.Timedelta(3, "d")
    max_filtered = ft.AggregationFeature(
        base, "customers", max_prim, where=purchased, use_previous=window
    )
    expected = {
        "name": None,
        "base_features": [base.unique_name()],
        "relationship_path": path_dicts,
        "primitive": serialize_primitive(max_prim),
        "where": purchased.unique_name(),
        "use_previous": window.get_arguments(),
    }
    assert expected == max_filtered.get_arguments()
    dependencies = {
        base.unique_name(): base,
        purchased.unique_name(): purchased,
    }
    restored = ft.AggregationFeature.from_dictionary(
        expected, es, dependencies, prim_deserializer
    )
    _assert_agg_feats_equal(max_filtered, restored)
def get_arguments(self):
    """Return serializable constructor arguments for this aggregation feature."""
    # `where` / `use_previous` are optional; serialize them only when set.
    where_arg = self.where and self.where.unique_name()
    previous_arg = self.use_previous and self.use_previous.get_arguments()
    path_dicts = [rel.to_dictionary() for _, rel in self.relationship_path]
    return {
        'name': self._name,
        'base_features': [f.unique_name() for f in self.base_features],
        'relationship_path': path_dicts,
        'primitive': serialize_primitive(self.primitive),
        'where': where_arg,
        'use_previous': previous_arg,
    }
def test_serialization(es):
    """AggregationFeature round-trips via parent_entity_id arguments (legacy API)."""
    prim_deserializer = PrimitivesDeserializer()
    base = ft.IdentityFeature(es['log']['value'])
    max_prim = ft.primitives.Max()
    max_simple = ft.AggregationFeature(base, es['sessions'], max_prim)

    expected = {
        'base_features': [base.unique_name()],
        'parent_entity_id': 'sessions',
        'primitive': serialize_primitive(max_prim),
        'where': None,
        'use_previous': None,
    }
    assert expected == max_simple.get_arguments()
    restored = ft.AggregationFeature.from_dictionary(
        expected, es, {base.unique_name(): base}, prim_deserializer
    )
    assert max_simple == restored

    # Same round-trip with a where clause and a use_previous window.
    purchased = ft.IdentityFeature(es['log']['purchased'])
    window = ft.Timedelta(3, 'd')
    max_filtered = ft.AggregationFeature(
        base, es['sessions'], max_prim, where=purchased, use_previous=window
    )
    expected = {
        'base_features': [base.unique_name()],
        'parent_entity_id': 'sessions',
        'primitive': serialize_primitive(max_prim),
        'where': purchased.unique_name(),
        'use_previous': window.get_arguments(),
    }
    assert expected == max_filtered.get_arguments()
    dependencies = {
        base.unique_name(): base,
        purchased.unique_name(): purchased,
    }
    restored = ft.AggregationFeature.from_dictionary(
        expected, es, dependencies, prim_deserializer
    )
    assert max_filtered == restored
def get_arguments(self):
    """Return serializable constructor arguments; groupby is stored separately."""
    groupby_name = self.groupby.unique_name()
    # The groupby feature is excluded from base_features and kept under its
    # own key instead.
    feature_names = []
    for feat in self.base_features:
        name = feat.unique_name()
        if name != groupby_name:
            feature_names.append(name)
    return {
        'name': self._name,
        'base_features': feature_names,
        'primitive': serialize_primitive(self.primitive),
        'groupby': groupby_name,
    }
def test_multioutput_feature(es):
    """Multi-output features and their slices round-trip through deserialization."""
    base = ft.IdentityFeature(es["log"].ww["product_id"])
    three_common = NMostCommon()
    n_unique = NumUnique()
    top3 = ft.Feature(base, parent_dataframe_name="sessions", primitive=three_common)

    features = [top3, base]
    for idx in range(3):
        sliced = top3[idx]
        features.append(
            ft.Feature(
                sliced,
                parent_dataframe_name="customers",
                primitive=n_unique,
            )
        )
        features.append(sliced)

    names = [feat.unique_name() for feat in features]
    definitions = {feat.unique_name(): feat.to_dictionary() for feat in features}

    dictionary = {
        "ft_version": ft.__version__,
        "schema_version": SCHEMA_VERSION,
        "entityset": es.to_dictionary(),
        "feature_list": names,
        "feature_definitions": definitions,
        "primitive_definitions": {
            "0": serialize_primitive(three_common),
            "1": serialize_primitive(n_unique),
        },
    }
    dictionary["feature_definitions"][names[0]]["arguments"]["primitive"] = "0"
    # The NumUnique features sit at positions 2, 4 and 6 of the list.
    for pos in (2, 4, 6):
        dictionary["feature_definitions"][names[pos]]["arguments"]["primitive"] = "1"

    restored = FeaturesDeserializer(dictionary).to_list()
    for original, rebuilt in zip(features, restored):
        assert original.unique_name() == rebuilt.unique_name()
def test_multi_output_features(es):
    """Serializer emits correct definitions for multi-output features and slices."""
    product_id = ft.IdentityFeature(es["log"].ww["product_id"])
    three_common = NMostCommon()
    n_unique = NumUnique()
    top3 = ft.Feature(
        product_id, parent_dataframe_name="sessions", primitive=three_common
    )

    features = [top3, product_id]
    for idx in range(3):
        sliced = top3[idx]
        features.append(
            ft.Feature(
                sliced,
                parent_dataframe_name="customers",
                primitive=n_unique,
            )
        )
        features.append(sliced)

    names = [feat.unique_name() for feat in features]
    definitions = {feat.unique_name(): feat.to_dictionary() for feat in features}

    expected = {
        "ft_version": ft.__version__,
        "schema_version": SCHEMA_VERSION,
        "entityset": es.to_dictionary(),
        "feature_list": names,
        "feature_definitions": definitions,
        "primitive_definitions": {
            "0": serialize_primitive(three_common),
            "1": serialize_primitive(n_unique),
        },
    }
    expected["feature_definitions"][names[0]]["arguments"]["primitive"] = "0"
    # The NumUnique features sit at positions 2, 4 and 6 of the list.
    for pos in (2, 4, 6):
        expected["feature_definitions"][names[pos]]["arguments"]["primitive"] = "1"

    actual = FeaturesSerializer(features).to_dict()
    _compare_feature_dicts(expected, actual)
def get_arguments(self):
    """Return serializable constructor arguments; groupby is stored separately."""
    groupby_name = self.groupby.unique_name()
    all_names = [feat.unique_name() for feat in self.base_features]
    return {
        "name": self._name,
        # The groupby feature is excluded from base_features and kept under
        # its own key instead.
        "base_features": [name for name in all_names if name != groupby_name],
        "primitive": serialize_primitive(self.primitive),
        "groupby": groupby_name,
    }
def get_arguments(self):
    """Return serializable constructor arguments for this aggregation feature."""
    # `where` / `use_previous` are optional; serialize them only when set.
    where_arg = self.where and self.where.unique_name()
    previous_arg = self.use_previous and self.use_previous.get_arguments()
    return {
        'base_features': [f.unique_name() for f in self.base_features],
        'parent_entity_id': self.parent_entity.id,
        'primitive': serialize_primitive(self.primitive),
        'where': where_arg,
        'use_previous': previous_arg,
    }
def get_arguments(self):
    """Return serializable constructor arguments for this aggregation feature."""
    # `where` / `use_previous` are optional; serialize them only when set.
    where_arg = self.where and self.where.unique_name()
    previous_arg = self.use_previous and self.use_previous.get_arguments()
    path_dicts = [rel.to_dictionary() for _, rel in self.relationship_path]
    return {
        "name": self._name,
        "base_features": [f.unique_name() for f in self.base_features],
        "relationship_path": path_dicts,
        "primitive": serialize_primitive(self.primitive),
        "where": where_arg,
        "use_previous": previous_arg,
    }
def test_serialization(es):
    """TransformFeature arguments serialize and deserialize correctly."""
    base = ft.IdentityFeature(es['log']['value'])
    scalar_prim = ft.primitives.MultiplyNumericScalar(value=2)
    doubled = ft.TransformFeature(base, scalar_prim)

    expected = {
        'base_features': [base.unique_name()],
        'primitive': serialize_primitive(scalar_prim),
    }
    assert expected == doubled.get_arguments()

    restored = ft.TransformFeature.from_dictionary(
        expected, es, {base.unique_name(): base}, PrimitivesDeserializer()
    )
    assert doubled == restored
def test_primitive_serialization(universal_sentence_encoder):
    """A serialized-then-deserialized primitive produces the same embeddings."""
    sentences = pd.Series([
        "",
        "I like to eat pizza",
        "The roller coaster was built in 1885.",
        "When will humans go to mars?",
        "Mitochondria is the powerhouse of the cell",
    ])
    round_tripped = PrimitivesDeserializer().deserialize_primitive(
        serialize_primitive(universal_sentence_encoder)
    )
    # Compare mean embedding values (rounded) against known-good output.
    actual = pd.DataFrame(round_tripped(sentences)).mean().round(7).to_numpy()
    expected = np.array([-0.0007475, 0.0032088, 0.0018552, 0.0008256, 0.0028342])
    np.testing.assert_array_almost_equal(actual, expected)
def test_serialization(pd_es):
    """GroupByTransformFeature arguments round-trip through serialization."""
    base = ft.IdentityFeature(pd_es["log"].ww["value"])
    group_col = ft.IdentityFeature(pd_es["log"].ww["zipcode"])
    cumsum_prim = CumSum()
    grouped = ft.feature_base.GroupByTransformFeature(base, cumsum_prim, group_col)

    expected = {
        "name": None,
        "base_features": [base.unique_name()],
        "primitive": serialize_primitive(cumsum_prim),
        "groupby": group_col.unique_name(),
    }
    assert expected == grouped.get_arguments()

    dependencies = {
        base.unique_name(): base,
        group_col.unique_name(): group_col,
    }
    restored = ft.feature_base.GroupByTransformFeature.from_dictionary(
        expected, pd_es, dependencies, PrimitivesDeserializer()
    )
    assert grouped == restored
def test_serialization(es):
    """GroupByTransformFeature round-trips through serialization (legacy API)."""
    base = ft.IdentityFeature(es['log']['value'])
    group_col = ft.IdentityFeature(es['log']['zipcode'])
    cumsum_prim = CumSum()
    grouped = ft.feature_base.GroupByTransformFeature(base, cumsum_prim, group_col)

    expected = {
        'base_features': [base.unique_name()],
        'primitive': serialize_primitive(cumsum_prim),
        'groupby': group_col.unique_name(),
    }
    assert expected == grouped.get_arguments()

    dependencies = {
        base.unique_name(): base,
        group_col.unique_name(): group_col,
    }
    restored = ft.feature_base.GroupByTransformFeature.from_dictionary(
        expected, es, dependencies, PrimitivesDeserializer()
    )
    assert grouped == restored
def test_base_features_in_list(es):
    """Serializer output includes definitions for base features in the list."""
    base = ft.IdentityFeature(es["log"].ww["value"])
    session_max = ft.AggregationFeature(base, "sessions", Max)

    expected = {
        "ft_version": ft.__version__,
        "schema_version": SCHEMA_VERSION,
        "entityset": es.to_dictionary(),
        "feature_list": [session_max.unique_name(), base.unique_name()],
        "feature_definitions": {
            session_max.unique_name(): session_max.to_dictionary(),
            base.unique_name(): base.to_dictionary(),
        },
        "primitive_definitions": {"0": serialize_primitive(Max())},
    }
    # The serializer stores primitive key strings, not in-line dicts.
    expected["feature_definitions"][session_max.unique_name()]["arguments"][
        "primitive"
    ] = "0"

    actual = FeaturesSerializer([session_max, base]).to_dict()
    _compare_feature_dicts(expected, actual)