def test_direct_of_multi_output_transform_feat(es):
    """Compare a direct feature of a six-output transform with six single-output ones.

    A custom primitive splits ``signup_date`` into year, month, day, hour,
    minute and second in one feature. After running DFS on ``sessions``, each
    column of its direct feature must equal the column of the direct feature
    built from the matching single-output primitive.
    """
    class TestTime(TransformPrimitive):
        # Emits one numeric output per datetime component listed in test_f.
        name = "test_time"
        input_types = [Datetime]
        return_type = Numeric
        number_output_features = 6

        def get_function(self):
            def test_f(x):
                times = pd.Series(x)
                outputs = []
                for unit in ["year", "month", "day", "hour", "minute", "second"]:
                    outputs.append(times.apply(lambda stamp: getattr(stamp, unit)))
                return outputs
            return test_f

    signup_date = es["customers"]["signup_date"]
    single_output_primitives = [Year, Month, Day, Hour, Minute, Second]

    join_time_split = Feature(signup_date, primitive=TestTime)
    alt_features = [Feature(signup_date, primitive=primitive)
                    for primitive in single_output_primitives]

    # Only the feature matrix is inspected; the feature list is discarded.
    fm, _ = dfs(
        entityset=es,
        target_entity="sessions",
        trans_primitives=[TestTime] + single_output_primitives)

    # Column names of the multi-output feature and of the single-output features
    sessions = es["sessions"]
    subnames = DirectFeature(join_time_split, sessions).get_feature_names()
    altnames = [DirectFeature(alt, sessions).get_name() for alt in alt_features]

    # Each multi-output column must equal its single-output counterpart
    for multi_col, single_col in zip(subnames, altnames):
        assert (fm[multi_col] == fm[single_col]).all()
def test_make_dfeat_of_agg_feat_on_self(es):
    """
    Direct feature of an aggregation over the target dataframe itself.

    The graph looks like this:

        R       R = Regions, a parent of customers
        |
        C       C = Customers, the dataframe we're trying to predict on
        |
       etc.

    A Count of customers is aggregated onto regions and then brought back
    down to customers as a direct feature; instance 0 is expected to get 3.
    """
    customers_per_region = ft.Feature(
        es["customers"].ww["id"],
        parent_dataframe_name="régions",
        primitive=Count,
    )
    num_customers_feat = DirectFeature(
        customers_per_region, child_dataframe_name="customers"
    )

    calculator = FeatureSetCalculator(
        es, time_last=None, feature_set=FeatureSet([num_customers_feat])
    )
    result = to_pandas(calculator.run(np.array([0])), index="id")

    column = num_customers_feat.get_name()
    assert result[column].values[0] == 3
def test_make_dfeat_of_agg_feat_through_parent(es, backend):
    """
    Direct feature of a sibling aggregation, reached through the shared parent.

    The graph looks like this:

        R       C = Customers, the entity we're trying to predict on
       / \\     R = Regions, a parent of customers
      S   C     S = Stores, a child of regions
          |
         etc.

    A Count of stores is aggregated onto regions and then pulled down to
    customers as a direct feature; instance 0 is expected to get 3.
    """
    stores_per_region = ft.Feature(IdentityFeature(es['stores']['id']),
                                   parent_entity=es[u'régions'],
                                   primitive=Count)
    num_stores_feat = DirectFeature(stores_per_region,
                                    child_entity=es['customers'])

    feature_backend = backend([num_stores_feat])
    result = feature_backend.calculate_all_features(instance_ids=[0],
                                                    time_last=None)

    column = num_stores_feat.get_name()
    assert result[column][0] == 3
def test_direct_description(es):
    """Check ``describe_feature`` output for direct features of growing depth.

    Covers a single direct feature, a direct feature of a direct feature, and
    an aggregation over a direct feature of an aggregation.
    """
    # One hop: customers -> sessions
    loves_ice_cream = IdentityFeature(es["customers"].ww["loves_ice_cream"])
    feature = DirectFeature(loves_ice_cream, "sessions")
    expected_single = (
        'The "loves_ice_cream" for the instance of "customers" associated '
        'with this instance of "sessions".'
    )
    assert describe_feature(feature) == expected_single

    # Two hops: customers -> sessions -> log
    deep_direct = DirectFeature(feature, "log")
    expected_deep = (
        'The "loves_ice_cream" for the instance of "customers" '
        'associated with the instance of "sessions" associated with '
        'this instance of "log".'
    )
    assert describe_feature(deep_direct) == expected_deep

    # Aggregation -> direct -> aggregation
    purchased = IdentityFeature(es["log"].ww["purchased"])
    percent_true_per_session = AggregationFeature(purchased, "sessions", PercentTrue)
    complicated_direct = DirectFeature(percent_true_per_session, "log")
    agg_on_direct = AggregationFeature(complicated_direct, "products", Mean)
    expected_complicated = (
        "The average of the percentage of true values in "
        'the "purchased" of all instances of "log" for each "id" in "sessions" for '
        'the instance of "sessions" associated with this instance of "log" of all '
        'instances of "log" for each "id" in "products".'
    )
    assert describe_feature(agg_on_direct) == expected_complicated
def test_make_dfeat_of_agg_feat_on_self(es):
    """
    Direct feature of an aggregation over the target entity itself.

    The graph looks like this:

        R       R = Regions, a parent of customers
        |
        C       C = Customers, the entity we're trying to predict on
        |
       etc.

    A Count of customers is aggregated onto regions and then brought back
    down to customers as a direct feature; instance 0 is expected to get 3.
    """
    customers = es['customers']
    customers_per_region = ft.Feature(customers['id'],
                                      parent_entity=es[u'régions'],
                                      primitive=Count)
    num_customers_feat = DirectFeature(customers_per_region,
                                       child_entity=customers)

    calculator = FeatureSetCalculator(es,
                                      time_last=None,
                                      feature_set=FeatureSet([num_customers_feat]))
    result = calculator.run(np.array([0]))

    column = num_customers_feat.get_name()
    assert result[column][0] == 3
def test_make_dfeat_of_agg_feat_through_parent(es):
    """
    Direct feature of a sibling aggregation, reached through the shared parent.

    The graph looks like this:

        R       C = Customers, the entity we're trying to predict on
       / \\     R = Regions, a parent of customers
      S   C     S = Stores, a child of regions
          |
         etc.

    A Count of stores is aggregated onto regions and then pulled down to
    customers as a direct feature; instance 0 is expected to get 3.
    """
    stores_per_region = ft.Feature(IdentityFeature(es['stores']['id']),
                                   parent_entity=es[u'régions'],
                                   primitive=Count)
    num_stores_feat = DirectFeature(stores_per_region,
                                    child_entity=es['customers'])

    calculator = FeatureSetCalculator(es,
                                      time_last=None,
                                      feature_set=FeatureSet([num_stores_feat]))
    result = to_pandas(calculator.run(np.array([0])), index='id')

    column = num_stores_feat.get_name()
    assert result[column].values[0] == 3
def test_dfs_builds_on_seed_features_more_than_max_depth(es):
    """Check that DFS builds on seed features but still honours ``max_depth``.

    With ``max_depth=1`` the seed features and a one-step aggregation of a
    seed must be present in the result, while a feature two steps beyond the
    seed must be absent.
    """
    log = es['log']
    sessions = es['sessions']

    seed_feature_sessions = ft.Feature(log["id"],
                                       parent_entity=sessions,
                                       primitive=Count)
    seed_feature_log = ft.Feature(log['datetime'], primitive=Hour)
    session_agg = ft.Feature(seed_feature_log,
                             parent_entity=sessions,
                             primitive=Last)

    # Depth of this feat is 2 relative to session_agg, the seed feature,
    # which is greater than max_depth so it shouldn't be built
    customer_mode = ft.Feature(session_agg,
                               parent_entity=es['customers'],
                               primitive=Mode)
    session_agg_trans = DirectFeature(customer_mode, sessions)

    dfs_obj = DeepFeatureSynthesis(target_entity_id='sessions',
                                   entityset=es,
                                   agg_primitives=[Last, Count],
                                   trans_primitives=[],
                                   max_depth=1,
                                   seed_features=[seed_feature_sessions,
                                                  seed_feature_log])
    built_names = {built.get_name() for built in dfs_obj.build_features()}

    assert seed_feature_sessions.get_name() in built_names
    assert session_agg.get_name() in built_names
    assert session_agg_trans.get_name() not in built_names
def test_make_dfeat(es, backend):
    """Check that customer ``age`` carried onto sessions is 33 for instance 0."""
    age_on_sessions = DirectFeature(es['customers']['age'],
                                    child_entity=es['sessions'])

    feature_backend = backend([age_on_sessions])
    result = feature_backend.calculate_all_features(instance_ids=[0],
                                                    time_last=None)

    column = age_on_sessions.get_name()
    assert result[column][0] == 33
def test_direct_from_identity(es):
    """Check session ``device_type`` carried onto log instances 0 and 5."""
    device_on_log = DirectFeature(base_feature=es['sessions']['device_type'],
                                  child_entity=es['log'])

    calculator = FeatureSetCalculator(es,
                                      feature_set=FeatureSet([device_on_log]),
                                      time_last=None)
    result = calculator.run([0, 5])

    observed = result[device_on_log.get_name()].tolist()
    assert observed == [0, 1]
def test_direct_from_identity(es):
    """Check session ``device_type`` carried onto log instances 0 and 5."""
    device_on_log = DirectFeature(base_feature=es['sessions']['device_type'],
                                  child_entity=es['log'])

    feature_backend = PandasBackend(es, [device_on_log])
    result = feature_backend.calculate_all_features(instance_ids=[0, 5],
                                                    time_last=None)

    observed = result[device_on_log.get_name()].tolist()
    assert observed == [0, 1]
def test_direct_rename(es):
    """Check that renaming a direct feature changes its identity but not its base.

    The renamed copy must differ in hash and name while keeping the same
    generated base-feature name and the same entity.
    """
    # should be same behavior as test_direct_from_identity
    original = DirectFeature(base_feature=es['sessions']['device_type'],
                             child_entity=es['log'])
    renamed = original.rename("session_test")

    assert original.hash() != renamed.hash()
    assert original.get_name() != renamed.get_name()

    original_base = original.base_features[0]
    renamed_base = renamed.base_features[0]
    assert original_base.generate_name() == renamed_base.generate_name()
    assert original.entity == renamed.entity
def test_direct_copy(games_es):
    """Check that ``copy`` keeps entity, base features and relationship path."""
    home_team = next(rel for rel in games_es.relationships
                     if rel.child_variable.id == 'home_team_id')
    original = DirectFeature(games_es['teams']['name'],
                             games_es['games'],
                             relationship=home_team)

    duplicate = original.copy()

    assert duplicate.entity == original.entity
    assert duplicate.base_features == original.base_features
    assert duplicate.relationship_path == original.relationship_path
def test_direct_copy(games_es):
    """Check that ``copy`` keeps dataframe name, base features and relationship path."""
    home_team = next(rel for rel in games_es.relationships
                     if rel._child_column_name == 'home_team_id')
    team_name = IdentityFeature(games_es['teams'].ww['name'])
    original = DirectFeature(team_name, 'games', relationship=home_team)

    duplicate = original.copy()

    assert duplicate.dataframe_name == original.dataframe_name
    assert duplicate.base_features == original.base_features
    assert duplicate.relationship_path == original.relationship_path
def test_direct_with_no_path(diamond_es):
    """Check that DirectFeature raises when no relationship reaches the child.

    Two cases are covered: a child dataframe ("regions") with no relationship
    to "customers", and "customers" used as its own child.
    """
    # Local import keeps this block self-contained; ``match`` is applied as a
    # regular expression, so the expected message is escaped to make "." (and
    # any other metacharacter) match literally.
    import re

    cases = [
        ('regions', 'No relationship from "regions" to "customers" found.'),
        ('customers', 'No relationship from "customers" to "customers" found.'),
    ]
    for child_name, error_text in cases:
        # Build the base feature outside the ``raises`` block so that only the
        # DirectFeature construction is allowed to raise.
        base_feature = IdentityFeature(diamond_es['customers'].ww['name'])
        with pytest.raises(RuntimeError, match=re.escape(error_text)):
            DirectFeature(base_feature, child_name)
def test_direct_copy(games_es):
    """Check that ``copy`` keeps dataframe name, base features and relationship path."""
    home_team = next(
        rel for rel in games_es.relationships
        if rel._child_column_name == "home_team_id"
    )
    original = DirectFeature(
        IdentityFeature(games_es["teams"].ww["name"]),
        "games",
        relationship=home_team,
    )

    duplicate = original.copy()

    assert duplicate.dataframe_name == original.dataframe_name
    assert duplicate.base_features == original.base_features
    assert duplicate.relationship_path == original.relationship_path
def test_direct_rename_multioutput(es):
    """Check renaming of a direct feature built on a multi-output aggregation.

    The renamed copy must differ in unique name and name while keeping the
    same generated base-feature name and the same entity.
    """
    n_common = ft.Feature(es['log']['product_id'],
                          parent_entity=es['customers'],
                          primitive=NMostCommon(n=2))
    original = DirectFeature(n_common, es['sessions'])
    renamed = original.rename("session_test")

    assert original.unique_name() != renamed.unique_name()
    assert original.get_name() != renamed.get_name()

    original_base = original.base_features[0]
    renamed_base = renamed.base_features[0]
    assert original_base.generate_name() == renamed_base.generate_name()
    assert original.entity == renamed.entity
def test_make_dfeat(es):
    """Check that customer ``age`` carried onto sessions is 33 for instance 0."""
    age_on_sessions = DirectFeature(es['customers']['age'],
                                    child_entity=es['sessions'])

    calculator = FeatureSetCalculator(es,
                                      time_last=None,
                                      feature_set=FeatureSet([age_on_sessions]))
    result = calculator.run(np.array([0]))

    column = age_on_sessions.get_name()
    assert result[column][0] == 33
def test_direct_rename_multioutput(es):
    """Check renaming of a direct feature built on a multi-output aggregation.

    The renamed copy must differ in unique name and name while keeping the
    same generated base-feature name and the same dataframe name.
    """
    n_common = Feature(es['log'].ww['product_id'],
                       parent_dataframe_name='customers',
                       primitive=NMostCommon(n=2))
    original = DirectFeature(n_common, 'sessions')
    renamed = original.rename("session_test")

    assert original.unique_name() != renamed.unique_name()
    assert original.get_name() != renamed.get_name()

    original_base = original.base_features[0]
    renamed_base = renamed.base_features[0]
    assert original_base.generate_name() == renamed_base.generate_name()
    assert original.dataframe_name == renamed.dataframe_name
def test_direct_rename(es):
    """Check that renaming a direct feature changes its identity but not its base.

    The renamed copy must differ in unique name and name while keeping the
    same generated base-feature name and the same dataframe name.
    """
    # should be same behavior as test_direct_from_identity
    device_type = IdentityFeature(es['sessions'].ww['device_type'])
    original = DirectFeature(base_feature=device_type,
                             child_dataframe_name='log')
    renamed = original.rename("session_test")

    assert original.unique_name() != renamed.unique_name()
    assert original.get_name() != renamed.get_name()

    original_base = original.base_features[0]
    renamed_base = renamed.base_features[0]
    assert original_base.generate_name() == renamed_base.generate_name()
    assert original.dataframe_name == renamed.dataframe_name
def test_make_dfeat(es):
    """Check that customer ``age`` carried onto sessions is 33 for instance 0."""
    age = ft.Feature(es['customers'].ww['age'])
    age_on_sessions = DirectFeature(age, child_dataframe_name='sessions')

    calculator = FeatureSetCalculator(es,
                                      time_last=None,
                                      feature_set=FeatureSet([age_on_sessions]))
    raw_result = calculator.run(np.array([0]))
    result = to_pandas(raw_result)

    column = age_on_sessions.get_name()
    assert result[column][0] == 33
def test_direct_from_variable(es):
    """Check a direct feature built straight from a variable, for log instances 0 and 5."""
    # should be same behavior as test_direct_from_identity
    device_on_log = DirectFeature(base_feature=es['sessions']['device_type'],
                                  child_entity=es['log'])

    calculator = FeatureSetCalculator(es,
                                      feature_set=FeatureSet([device_on_log]),
                                      time_last=None)
    result = calculator.run(np.array([0, 5]))

    observed = result[device_on_log.get_name()].tolist()
    assert observed == [0, 1]
def test_groupby_transform_direct_groupby(es):
    """Check the graph source for a groupby transform grouped by a direct feature.

    The rendered source must contain every expected node, table label and
    edge, and each dataframe table must contain exactly the expected rows.
    """
    cohort_name = IdentityFeature(es['cohorts'].ww['cohort_name'])
    groupby = DirectFeature(cohort_name, 'customers')
    age = IdentityFeature(es['customers'].ww['age'])
    feat = GroupByTransformFeature(age, CumMax, groupby)
    graph = graph_feature(feat).source

    groupby_name = groupby.get_name()
    feat_name = feat.get_name()

    # Node identifiers and table labels
    join_node = '1_{}_join'.format(groupby_name)
    prim_node = "0_{}_cum_max".format(feat_name)
    groupby_node = '{}_groupby_customers--{}'.format(feat_name, groupby_name)
    customers_table = '\u2605 customers (target)'
    cohorts_table = 'cohorts'

    # Edges between tables and nodes
    join_groupby = '"{}" -> customers:cohort'.format(join_node)
    join_input = 'cohorts:cohort_name -> "{}"'.format(join_node)
    join_out_edge = '"{}" -> customers:"{}"'.format(join_node, groupby_name)
    groupby_edge = 'customers:"{}" -> "{}"'.format(groupby_name, groupby_node)
    groupby_input = 'customers:age -> "{}"'.format(groupby_node)
    prim_input = '"{}" -> "{}"'.format(groupby_node, prim_node)
    feat_edge = '"{}" -> customers:"{}"'.format(prim_node, feat_name)

    expected_fragments = [
        groupby_name, feat_name,
        join_node, prim_node, groupby_node,
        customers_table, cohorts_table,
        join_groupby, join_input, join_out_edge,
        groupby_edge, groupby_input, prim_input, feat_edge,
    ]
    for fragment in expected_fragments:
        assert fragment in graph

    dataframes = {
        'cohorts': [cohorts_table, 'cohort_name'],
        'customers': [customers_table, 'cohort', 'age', groupby_name, feat_name],
    }
    for frame_name, expected_cells in dataframes.items():
        table_pattern = r"{} \[label=<\n<TABLE.*?</TABLE>>".format(frame_name)
        tables = re.findall(table_pattern, graph, re.DOTALL)
        assert len(tables) == 1

        rows = re.findall(r"<TR.*?</TR>", tables[0], re.DOTALL)
        assert len(rows) == len(expected_cells)

        # Each row must contain one not-yet-used expected cell; a used cell is
        # removed so it cannot satisfy a later row.
        for row in rows:
            found = next((cell for cell in expected_cells if cell in row), None)
            assert found is not None
            expected_cells.remove(found)
def test_direct_rename_multioutput(es):
    """Check renaming of a direct feature built on a multi-output aggregation.

    The renamed copy must differ in unique name and name while keeping the
    same generated base-feature name and the same dataframe name.
    """
    product_id = es["log"].ww["product_id"]
    n_common = Feature(
        product_id,
        parent_dataframe_name="customers",
        primitive=NMostCommon(n=2),
    )
    original = DirectFeature(n_common, "sessions")
    renamed = original.rename("session_test")

    assert original.unique_name() != renamed.unique_name()
    assert original.get_name() != renamed.get_name()

    original_base = original.base_features[0]
    renamed_base = renamed.base_features[0]
    assert original_base.generate_name() == renamed_base.generate_name()
    assert original.dataframe_name == renamed.dataframe_name
def test_direct_from_identity(es):
    """Check session ``device_type`` carried onto log instances 0 and 5.

    When the calculator returns a ``dd.DataFrame``, it is computed, indexed
    by ``id`` and sorted before the values are compared.
    """
    device_on_log = DirectFeature(base_feature=es['sessions']['device_type'],
                                  child_entity=es['log'])

    calculator = FeatureSetCalculator(es,
                                      feature_set=FeatureSet([device_on_log]),
                                      time_last=None)
    result = calculator.run(np.array([0, 5]))
    if isinstance(result, dd.DataFrame):
        computed = result.compute()
        result = computed.set_index('id').sort_index()

    observed = result[device_on_log.get_name()].tolist()
    assert observed == [0, 1]
def test_direct_with_multiple_possible_paths(games_es):
    """Check DirectFeature behaviour when several relationships reach the base.

    Without an explicit relationship the constructor must raise a
    RuntimeError; with the ``home_team_id`` relationship supplied it must
    succeed and name the feature after that relationship.
    """
    # Local import keeps this block self-contained; ``match`` is applied as a
    # regular expression, so the expected message is escaped to make "."
    # match literally.
    import re

    error_text = ("There are multiple relationships to the base dataframe. "
                  "You must specify a relationship.")
    # Build the base feature outside the ``raises`` block so that only the
    # DirectFeature construction is allowed to raise.
    ambiguous_base = IdentityFeature(games_es['teams'].ww['name'])
    with pytest.raises(RuntimeError, match=re.escape(error_text)):
        DirectFeature(ambiguous_base, 'games')

    # Does not raise if path specified.
    relationship = next(r for r in games_es.get_forward_relationships('games')
                        if r._child_column_name == 'home_team_id')
    feat = DirectFeature(IdentityFeature(games_es['teams'].ww['name']),
                         'games',
                         relationship=relationship)

    assert feat.relationship_path_name() == 'teams[home_team_id]'
    assert feat.get_name() == 'teams[home_team_id].name'
def test_serialization(es):
    """Check that a DirectFeature round-trips through its argument dictionary.

    ``get_arguments`` must produce the expected dictionary, and
    ``from_dictionary`` must rebuild a feature equal to the original.
    """
    value = ft.IdentityFeature(es["products"].ww["rating"])
    direct = DirectFeature(value, "log")

    log_to_products = next(
        rel for rel in es.get_forward_relationships("log")
        if rel.parent_dataframe.ww.name == "products"
    )

    dictionary = {
        "name": direct.get_name(),
        "base_feature": value.unique_name(),
        "relationship": log_to_products.to_dictionary(),
    }
    assert dictionary == direct.get_arguments()

    dependencies = {value.unique_name(): value}
    rebuilt = DirectFeature.from_dictionary(
        dictionary, es, dependencies, PrimitivesDeserializer()
    )
    assert direct == rebuilt
def test_make_deep_agg_feat_of_dfeat_of_agg_feat(es):
    """
    Aggregation over a direct feature that is itself built on an aggregation.

    The graph looks like this (higher implies parent):

          C     C = Customers, the entity we're trying to predict on
          |     S = Sessions, a child of Customers
      P   S     L = Log, a child of both Sessions and Products
       \\ /     P = Products, a parent of Log which is not a descendant of customers
        L

    A Count of log rows is aggregated onto products, pulled down to log as a
    direct feature, and then averaged onto customers; instance 0 is expected
    to get 38.0 / 10.0.
    """
    log = es['log']
    logs_per_product = ft.Feature(log['id'],
                                  parent_entity=es['products'],
                                  primitive=Count)
    product_purchases_feat = DirectFeature(logs_per_product, child_entity=log)
    purchase_popularity = ft.Feature(product_purchases_feat,
                                     parent_entity=es['customers'],
                                     primitive=Mean)

    calculator = FeatureSetCalculator(es,
                                      time_last=None,
                                      feature_set=FeatureSet([purchase_popularity]))
    result = to_pandas(calculator.run(np.array([0])), index='id')

    column = purchase_popularity.get_name()
    assert result[column].values[0] == 38.0 / 10.0
def test_make_compare_feat(es):
    """
    Feature we're creating is:
    whether the number of logs in a session is greater than the mean number
    of logs per session of that session's customer.

    Sessions 0 and 1 are expected to be truthy and session 2 falsy.
    """
    sessions = es['sessions']
    log_count_feat = ft.Feature(es['log']['id'],
                                parent_entity=sessions,
                                primitive=Count)
    customer_mean = ft.Feature(log_count_feat,
                               parent_entity=es['customers'],
                               primitive=Mean)
    mean_on_sessions = DirectFeature(customer_mean, child_entity=sessions)
    feat = log_count_feat > mean_on_sessions

    calculator = FeatureSetCalculator(es,
                                      time_last=None,
                                      feature_set=FeatureSet([feat]))
    raw_result = calculator.run(np.array([0, 1, 2]))
    result = to_pandas(raw_result, index='id', sort_index=True)

    first, second, third = result[feat.get_name()][0:3]
    assert first
    assert second
    assert not third
def _build_forward_features(self, all_features, parent_entity, child_entity,
                            relationship, max_depth=0):
    """Create DirectFeatures on ``child_entity`` from features of ``parent_entity``.

    Candidate features are selected from ``all_features`` with
    ``self._features_by_type`` (Numeric, Categorical and Ordinal types, within
    ``max_depth``). Each candidate is wrapped in a ``DirectFeature`` and passed
    to ``self._handle_new_feature``, unless it is skipped for one of the
    reasons below.

    Args:
        all_features: feature collection passed through to
            ``_features_by_type`` and ``_handle_new_feature``.
        parent_entity: entity whose features are candidates.
        child_entity: entity the DirectFeatures are created on.
        relationship: relationship between the two entities; candidates for
            which ``self._feature_in_relationship_path`` is true are skipped.
        max_depth: depth limit; nothing is built when it is negative.
            ``None`` disables this early exit.

    Returns:
        None. New features are registered through ``_handle_new_feature``.
    """
    if max_depth is not None and max_depth < 0:
        return

    features = self._features_by_type(
        all_features=all_features,
        entity=parent_entity,
        variable_type=[Numeric, Categorical, Ordinal],
        max_depth=max_depth)

    for f in features:
        if self._feature_in_relationship_path([relationship], f):
            continue

        # limits allowing direct features of agg_feats with where clauses
        # NOTE(review): the previous inner ``for ... continue`` only advanced
        # the inner loop, so this guard never skipped anything. It now skips
        # the candidate when it, or any of its deep dependencies, is an
        # aggregation with a where clause.
        if isinstance(f, AggregationFeature):
            deep_base_features = [f] + f.get_dependencies(deep=True)
            if any(isinstance(feat, AggregationFeature) and feat.where is not None
                   for feat in deep_base_features):
                continue

        new_f = DirectFeature(f, child_entity)
        self._handle_new_feature(all_features=all_features, new_feature=new_f)
def test_make_deep_agg_feat_of_dfeat_of_agg_feat(entityset, backend):
    """
    Aggregation over a direct feature that is itself built on an aggregation.

    The graph looks like this (higher implies parent):

          C     C = Customers, the entity we're trying to predict on
          |     S = Sessions, a child of Customers
      P   S     L = Log, a child of both Sessions and Products
       \\ /     P = Products, a parent of Log which is not a descendant of customers
        L

    A Count of log rows is aggregated onto products, pulled down to log as a
    direct feature, and then averaged onto customers; instance 0 is expected
    to get 38.0 / 10.0.
    """
    log = entityset['log']
    logs_per_product = ft.Feature(log['id'],
                                  parent_entity=entityset['products'],
                                  primitive=Count)
    product_purchases_feat = DirectFeature(logs_per_product, child_entity=log)
    purchase_popularity = ft.Feature(product_purchases_feat,
                                     parent_entity=entityset['customers'],
                                     primitive=Mean)

    feature_backend = backend([purchase_popularity])
    result = feature_backend.calculate_all_features(instance_ids=[0],
                                                    time_last=None)

    column = purchase_popularity.get_name()
    assert result[column][0] == 38.0 / 10.0