def test_direct(es):
    """Graph a DirectFeature stacked on another DirectFeature and check the source.

    Every expected node name, table label and edge must appear in the graph
    source, and each dataframe's table must contain exactly the expected rows.
    """
    inner = DirectFeature(
        IdentityFeature(es['customers'].ww['engagement_level']), 'sessions'
    )
    outer = DirectFeature(inner, 'log')
    graph = graph_feature(outer).source

    inner_name = inner.get_name()
    outer_name = outer.get_name()
    inner_join = '1_{}_join'.format(inner_name)
    outer_join = '0_{}_join'.format(outer_name)

    log_label = '\u2605 log (target)'
    sessions_label = 'sessions'
    customers_label = 'customers'

    expected_components = [
        # feature and primitive node names
        inner_name,
        outer_name,
        inner_join,
        outer_join,
        # table labels
        log_label,
        sessions_label,
        customers_label,
        # edges from each join node to the joining column
        '"{}" -> sessions:customer_id'.format(inner_join),
        '"{}" -> log:session_id'.format(outer_join),
        # edges feeding each join node
        'customers:engagement_level -> "{}"'.format(inner_join),
        'sessions:"{}" -> "{}"'.format(inner_name, outer_join),
        # edges from each join node to the feature it produces
        '"{}" -> sessions:"{}"'.format(inner_join, inner_name),
        '"{}" -> log:"{}"'.format(outer_join, outer_name),
    ]
    for piece in expected_components:
        assert piece in graph

    expected_rows_by_table = {
        'customers': [customers_label, 'engagement_level'],
        'sessions': [sessions_label, 'customer_id', inner_name],
        'log': [log_label, 'session_id', outer_name],
    }
    for table_name, expected_rows in expected_rows_by_table.items():
        pattern = r"{} \[label=<\n<TABLE.*?</TABLE>>".format(table_name)
        tables = re.findall(pattern, graph, re.DOTALL)
        assert len(tables) == 1

        rows = re.findall(r"<TR.*?</TR>", tables[0], re.DOTALL)
        assert len(rows) == len(expected_rows)

        # Each rendered row must consume one still-unmatched expected label.
        for row in rows:
            hit = next((label for label in expected_rows if label in row), None)
            assert hit is not None
            expected_rows.remove(hit)
def test_make_dfeat_of_agg_feat_on_self(es):
    """
    Dataframe graph exercised by this test:

        R       R = Regions, a parent of customers
        |
        C       C = Customers, the dataframe we're trying to predict on
        |
       etc.

    A DFeat from C to R is built on top of an agg_feat of R on C, and the
    value computed for instance 0 is expected to be 3.
    """
    count_per_region = ft.Feature(
        es["customers"].ww["id"],
        parent_dataframe_name="régions",
        primitive=Count,
    )
    direct_count = DirectFeature(
        count_per_region, child_dataframe_name="customers"
    )

    calculator = FeatureSetCalculator(
        es, time_last=None, feature_set=FeatureSet([direct_count])
    )
    result = to_pandas(calculator.run(np.array([0])), index="id")

    assert result[direct_count.get_name()].values[0] == 3
def test_dfs_builds_on_seed_features_more_than_max_depth(es):
    """Run DFS with max_depth=1 and two seed features, then check what was built.

    The seed aggregation and the aggregation stacked on the seed transform
    must be present; the deeper direct feature must be absent.
    """
    sessions_seed = ft.Feature(
        es['log']["id"], parent_entity=es['sessions'], primitive=Count
    )
    log_seed = ft.Feature(es['log']['datetime'], primitive=Hour)
    agg_on_seed = ft.Feature(
        log_seed, parent_entity=es['sessions'], primitive=Last
    )

    # Depth of this feat is 2 relative to agg_on_seed's seed feature,
    # which is greater than max_depth so it shouldn't be built
    customer_mode = ft.Feature(
        agg_on_seed, parent_entity=es['customers'], primitive=Mode
    )
    too_deep = DirectFeature(customer_mode, es['sessions'])

    synthesis = DeepFeatureSynthesis(
        target_entity_id='sessions',
        entityset=es,
        agg_primitives=[Last, Count],
        trans_primitives=[],
        max_depth=1,
        seed_features=[sessions_seed, log_seed],
    )
    built_names = [feature.get_name() for feature in synthesis.build_features()]

    assert sessions_seed.get_name() in built_names
    assert agg_on_seed.get_name() in built_names
    assert too_deep.get_name() not in built_names
def test_make_dfeat_of_agg_feat_through_parent(es, backend):
    """
    Entity graph exercised by this test:

        R       C = Customers, the entity we're trying to predict on
       / \\     R = Regions, a parent of customers
      S   C     S = Stores, a child of regions
          |
         etc.

    A DFeat from C to R is built on top of an agg_feat of R on S, and the
    value computed for instance 0 is expected to be 3.
    """
    store_ids = IdentityFeature(es['stores']['id'])
    stores_per_region = ft.Feature(
        store_ids, parent_entity=es[u'régions'], primitive=Count
    )
    direct_count = DirectFeature(
        stores_per_region, child_entity=es['customers']
    )

    feature_backend = backend([direct_count])
    result = feature_backend.calculate_all_features(
        instance_ids=[0], time_last=None
    )

    assert result[direct_count.get_name()][0] == 3
def test_make_dfeat_of_agg_feat_on_self(es):
    """
    Entity graph exercised by this test:

        R       R = Regions, a parent of customers
        |
        C       C = Customers, the entity we're trying to predict on
        |
       etc.

    A DFeat from C to R is built on top of an agg_feat of R on C, and the
    value computed for instance 0 is expected to be 3.
    """
    count_per_region = ft.Feature(
        es['customers']['id'],
        parent_entity=es[u'régions'],
        primitive=Count,
    )
    direct_count = DirectFeature(count_per_region, child_entity=es['customers'])

    calculator = FeatureSetCalculator(
        es, time_last=None, feature_set=FeatureSet([direct_count])
    )
    result = calculator.run(np.array([0]))

    assert result[direct_count.get_name()][0] == 3
def test_make_dfeat_of_agg_feat_through_parent(es):
    """
    Entity graph exercised by this test:

        R       C = Customers, the entity we're trying to predict on
       / \\     R = Regions, a parent of customers
      S   C     S = Stores, a child of regions
          |
         etc.

    A DFeat from C to R is built on top of an agg_feat of R on S, and the
    value computed for instance 0 is expected to be 3.
    """
    store_ids = IdentityFeature(es['stores']['id'])
    stores_per_region = ft.Feature(
        store_ids, parent_entity=es[u'régions'], primitive=Count
    )
    direct_count = DirectFeature(
        stores_per_region, child_entity=es['customers']
    )

    calculator = FeatureSetCalculator(
        es, time_last=None, feature_set=FeatureSet([direct_count])
    )
    result = to_pandas(calculator.run(np.array([0])), index='id')

    assert result[direct_count.get_name()].values[0] == 3
def test_make_dfeat(es, backend):
    """Compute a DirectFeature of customers' age on sessions; instance 0 must be 33."""
    age_on_sessions = DirectFeature(
        es['customers']['age'], child_entity=es['sessions']
    )

    feature_backend = backend([age_on_sessions])
    result = feature_backend.calculate_all_features(
        instance_ids=[0], time_last=None
    )

    assert result[age_on_sessions.get_name()][0] == 33
def test_direct_from_identity(es):
    """Compute a DirectFeature of sessions' device_type on log for instances 0 and 5."""
    direct_device = DirectFeature(
        base_feature=es['sessions']['device_type'],
        child_entity=es['log'],
    )

    feature_backend = PandasBackend(es, [direct_device])
    result = feature_backend.calculate_all_features(
        instance_ids=[0, 5], time_last=None
    )

    assert result[direct_device.get_name()].tolist() == [0, 1]
def test_direct_rename(es):
    """Rename a DirectFeature and compare the copy against the original.

    Hash and name must differ; the base feature's generated name and the
    entity must be the same.
    """
    # should be same behavior as test_direct_from_identity
    original = DirectFeature(
        base_feature=es['sessions']['device_type'],
        child_entity=es['log'],
    )
    renamed = original.rename("session_test")

    assert original.hash() != renamed.hash()
    assert original.get_name() != renamed.get_name()

    original_base = original.base_features[0]
    renamed_base = renamed.base_features[0]
    assert original_base.generate_name() == renamed_base.generate_name()
    assert original.entity == renamed.entity
def test_direct_from_identity(es):
    """Compute a DirectFeature of sessions' device_type on log for instances 0 and 5."""
    direct_device = DirectFeature(
        base_feature=es['sessions']['device_type'],
        child_entity=es['log'],
    )

    calculator = FeatureSetCalculator(
        es, feature_set=FeatureSet([direct_device]), time_last=None
    )
    result = calculator.run([0, 5])

    assert result[direct_device.get_name()].tolist() == [0, 1]
def test_direct_rename_multioutput(es):
    """Rename a DirectFeature built on an NMostCommon(n=2) aggregation.

    Unique name and name must differ from the original; the base feature's
    generated name and the entity must be the same.
    """
    most_common = ft.Feature(
        es['log']['product_id'],
        parent_entity=es['customers'],
        primitive=NMostCommon(n=2),
    )
    original = DirectFeature(most_common, es['sessions'])
    renamed = original.rename("session_test")

    assert original.unique_name() != renamed.unique_name()
    assert original.get_name() != renamed.get_name()

    original_base = original.base_features[0]
    renamed_base = renamed.base_features[0]
    assert original_base.generate_name() == renamed_base.generate_name()
    assert original.entity == renamed.entity
def test_make_dfeat(es):
    """Compute a DirectFeature of customers' age on sessions; instance 0 must be 33."""
    age_on_sessions = DirectFeature(
        es['customers']['age'], child_entity=es['sessions']
    )

    calculator = FeatureSetCalculator(
        es, time_last=None, feature_set=FeatureSet([age_on_sessions])
    )
    result = calculator.run(np.array([0]))

    assert result[age_on_sessions.get_name()][0] == 33
def test_direct_rename(es):
    """Rename a DirectFeature and compare the copy against the original.

    Unique name and name must differ; the base feature's generated name and
    the dataframe name must be the same.
    """
    # should be same behavior as test_direct_from_identity
    device_type = IdentityFeature(es['sessions'].ww['device_type'])
    original = DirectFeature(base_feature=device_type, child_dataframe_name='log')
    renamed = original.rename("session_test")

    assert original.unique_name() != renamed.unique_name()
    assert original.get_name() != renamed.get_name()

    original_base = original.base_features[0]
    renamed_base = renamed.base_features[0]
    assert original_base.generate_name() == renamed_base.generate_name()
    assert original.dataframe_name == renamed.dataframe_name
def test_direct_rename_multioutput(es):
    """Rename a DirectFeature built on an NMostCommon(n=2) aggregation.

    Unique name and name must differ from the original; the base feature's
    generated name and the dataframe name must be the same.
    """
    most_common = Feature(
        es['log'].ww['product_id'],
        parent_dataframe_name='customers',
        primitive=NMostCommon(n=2),
    )
    original = DirectFeature(most_common, 'sessions')
    renamed = original.rename("session_test")

    assert original.unique_name() != renamed.unique_name()
    assert original.get_name() != renamed.get_name()

    original_base = original.base_features[0]
    renamed_base = renamed.base_features[0]
    assert original_base.generate_name() == renamed_base.generate_name()
    assert original.dataframe_name == renamed.dataframe_name
def test_direct_from_variable(es):
    """Compute a DirectFeature built directly from the sessions device_type variable."""
    # should be same behavior as test_direct_from_identity
    direct_device = DirectFeature(
        base_feature=es['sessions']['device_type'],
        child_entity=es['log'],
    )

    calculator = FeatureSetCalculator(
        es, feature_set=FeatureSet([direct_device]), time_last=None
    )
    result = calculator.run(np.array([0, 5]))

    assert result[direct_device.get_name()].tolist() == [0, 1]
def test_make_dfeat(es):
    """Compute a DirectFeature of customers' age on sessions; instance 0 must be 33."""
    customer_age = ft.Feature(es['customers'].ww['age'])
    age_on_sessions = DirectFeature(customer_age, child_dataframe_name='sessions')

    calculator = FeatureSetCalculator(
        es, time_last=None, feature_set=FeatureSet([age_on_sessions])
    )
    result = to_pandas(calculator.run(np.array([0])))

    assert result[age_on_sessions.get_name()][0] == 33
def test_groupby_transform_direct_groupby(es):
    """Graph a CumMax groupby-transform whose groupby is a DirectFeature.

    Every expected node name, table label and edge must appear in the graph
    source, and each dataframe's table must contain exactly the expected rows.
    """
    cohort_name = DirectFeature(
        IdentityFeature(es['cohorts'].ww['cohort_name']), 'customers'
    )
    cum_max_age = GroupByTransformFeature(
        IdentityFeature(es['customers'].ww['age']), CumMax, cohort_name
    )
    graph = graph_feature(cum_max_age).source

    groupby_name = cohort_name.get_name()
    feat_name = cum_max_age.get_name()
    join_node = '1_{}_join'.format(groupby_name)
    prim_node = "0_{}_cum_max".format(feat_name)
    groupby_node = '{}_groupby_customers--{}'.format(feat_name, groupby_name)

    customers_label = '\u2605 customers (target)'
    cohorts_label = 'cohorts'

    expected_components = [
        # feature, primitive and groupby node names
        groupby_name,
        feat_name,
        join_node,
        prim_node,
        groupby_node,
        # table labels
        customers_label,
        cohorts_label,
        # edges around the join node
        '"{}" -> customers:cohort'.format(join_node),
        'cohorts:cohort_name -> "{}"'.format(join_node),
        '"{}" -> customers:"{}"'.format(join_node, groupby_name),
        # edges around the groupby node
        'customers:"{}" -> "{}"'.format(groupby_name, groupby_node),
        'customers:age -> "{}"'.format(groupby_node),
        # edges around the primitive node
        '"{}" -> "{}"'.format(groupby_node, prim_node),
        '"{}" -> customers:"{}"'.format(prim_node, feat_name),
    ]
    for piece in expected_components:
        assert piece in graph

    expected_rows_by_table = {
        'cohorts': [cohorts_label, 'cohort_name'],
        'customers': [customers_label, 'cohort', 'age', groupby_name, feat_name],
    }
    for table_name, expected_rows in expected_rows_by_table.items():
        pattern = r"{} \[label=<\n<TABLE.*?</TABLE>>".format(table_name)
        tables = re.findall(pattern, graph, re.DOTALL)
        assert len(tables) == 1

        rows = re.findall(r"<TR.*?</TR>", tables[0], re.DOTALL)
        assert len(rows) == len(expected_rows)

        # Each rendered row must consume one still-unmatched expected label.
        for row in rows:
            hit = next((label for label in expected_rows if label in row), None)
            assert hit is not None
            expected_rows.remove(hit)
def test_direct_rename_multioutput(es):
    """Rename a DirectFeature built on an NMostCommon(n=2) aggregation.

    Unique name and name must differ from the original; the base feature's
    generated name and the dataframe name must be the same.
    """
    product_ids = es["log"].ww["product_id"]
    most_common = Feature(
        product_ids,
        parent_dataframe_name="customers",
        primitive=NMostCommon(n=2),
    )
    original = DirectFeature(most_common, "sessions")
    renamed = original.rename("session_test")

    assert original.unique_name() != renamed.unique_name()
    assert original.get_name() != renamed.get_name()

    original_base = original.base_features[0]
    renamed_base = renamed.base_features[0]
    assert original_base.generate_name() == renamed_base.generate_name()
    assert original.dataframe_name == renamed.dataframe_name
def test_direct_from_identity(es):
    """Compute a DirectFeature of sessions' device_type on log for instances 0 and 5.

    A dd.DataFrame result is computed, indexed by 'id' and sorted before
    the values are compared.
    """
    direct_device = DirectFeature(
        base_feature=es['sessions']['device_type'],
        child_entity=es['log'],
    )

    calculator = FeatureSetCalculator(
        es, feature_set=FeatureSet([direct_device]), time_last=None
    )
    result = calculator.run(np.array([0, 5]))
    if isinstance(result, dd.DataFrame):
        computed = result.compute()
        result = computed.set_index('id').sort_index()

    assert result[direct_device.get_name()].tolist() == [0, 1]
def test_direct_with_multiple_possible_paths(games_es):
    """A DirectFeature from teams to games needs an explicit relationship.

    Without one a RuntimeError is expected; with the home_team_id
    relationship the path name and feature name include that column.
    """
    expected_message = (
        "There are multiple relationships to the base dataframe. "
        "You must specify a relationship."
    )
    with pytest.raises(RuntimeError, match=expected_message):
        DirectFeature(IdentityFeature(games_es['teams'].ww['name']), 'games')

    # Does not raise if path specified.
    home_team_rel = next(
        rel
        for rel in games_es.get_forward_relationships('games')
        if rel._child_column_name == 'home_team_id'
    )
    team_name = IdentityFeature(games_es['teams'].ww['name'])
    direct = DirectFeature(team_name, 'games', relationship=home_team_rel)

    assert direct.relationship_path_name() == 'teams[home_team_id]'
    assert direct.get_name() == 'teams[home_team_id].name'
def test_serialization(es):
    """Check DirectFeature.get_arguments() and rebuilding via from_dictionary()."""
    rating = ft.IdentityFeature(es["products"].ww["rating"])
    direct = DirectFeature(rating, "log")

    log_to_products = next(
        rel
        for rel in es.get_forward_relationships("log")
        if rel.parent_dataframe.ww.name == "products"
    )
    expected_arguments = {
        "name": direct.get_name(),
        "base_feature": rating.unique_name(),
        "relationship": log_to_products.to_dictionary(),
    }
    assert expected_arguments == direct.get_arguments()

    # Rebuild from the same dictionary; the result must equal the original.
    dependencies = {rating.unique_name(): rating}
    assert direct == DirectFeature.from_dictionary(
        expected_arguments, es, dependencies, PrimitivesDeserializer()
    )
def test_direct_with_multiple_possible_paths(games_es):
    """A DirectFeature from teams to games needs an explicit relationship.

    Without one a RuntimeError is expected; with the home_team_id
    relationship the path name and feature name include that column.
    """
    expected_message = (
        "There are multiple relationships to the base dataframe. "
        "You must specify a relationship."
    )
    with pytest.raises(RuntimeError, match=expected_message):
        DirectFeature(IdentityFeature(games_es["teams"].ww["name"]), "games")

    # Does not raise if path specified.
    home_team_rel = next(
        rel
        for rel in games_es.get_forward_relationships("games")
        if rel._child_column_name == "home_team_id"
    )
    team_name = IdentityFeature(games_es["teams"].ww["name"])
    direct = DirectFeature(team_name, "games", relationship=home_team_rel)

    assert direct.relationship_path_name() == "teams[home_team_id]"
    assert direct.get_name() == "teams[home_team_id].name"
def test_direct_from_identity(es):
    """Compute a DirectFeature of sessions' device_type on log for instances 0 and 5.

    When the entity set's dataframe type is Spark the values are expected
    as strings; otherwise as integers.
    """
    device_type = Feature(es["sessions"].ww["device_type"])
    direct_device = DirectFeature(
        base_feature=device_type, child_dataframe_name="log"
    )

    calculator = FeatureSetCalculator(
        es, feature_set=FeatureSet([direct_device]), time_last=None
    )
    result = to_pandas(
        calculator.run(np.array([0, 5])), index="id", sort_index=True
    )
    values = result[direct_device.get_name()].tolist()

    is_spark = es.dataframe_type == Library.SPARK.value
    expected = ["0", "1"] if is_spark else [0, 1]
    assert values == expected
def test_direct_from_identity(es):
    """Compute a DirectFeature of sessions' device_type on log for instances 0 and 5.

    When the entity set's dataframe type is Koalas the values are expected
    as strings; otherwise as integers.
    """
    device_type = Feature(es['sessions'].ww['device_type'])
    direct_device = DirectFeature(
        base_feature=device_type, child_dataframe_name='log'
    )

    calculator = FeatureSetCalculator(
        es, feature_set=FeatureSet([direct_device]), time_last=None
    )
    result = to_pandas(
        calculator.run(np.array([0, 5])), index='id', sort_index=True
    )
    values = result[direct_device.get_name()].tolist()

    is_koalas = es.dataframe_type == Library.KOALAS.value
    expected = ['0', '1'] if is_koalas else [0, 1]
    assert values == expected
def test_make_dfeat_of_agg_feat_on_self(entityset, backend):
    """
    Entity graph exercised by this test:

        R       R = Regions, a parent of customers
        |
        C       C = Customers, the entity we're trying to predict on
        |
       etc.

    A DFeat from C to R is built on top of an agg_feat of R on C, and the
    value computed for instance 0 is expected to be 3.
    """
    count_per_region = ft.Feature(
        entityset['customers']['id'],
        parent_entity=entityset[u'régions'],
        primitive=Count,
    )
    direct_count = DirectFeature(
        count_per_region, child_entity=entityset['customers']
    )

    feature_backend = backend([direct_count])
    result = feature_backend.calculate_all_features(
        instance_ids=[0], time_last=None
    )

    assert result[direct_count.get_name()][0] == 3
def test_direct_with_single_possible_path(es):
    """Check the path name and feature name of a DirectFeature from customers to sessions."""
    customer_age = IdentityFeature(es["customers"].ww["age"])
    direct = DirectFeature(customer_age, "sessions")

    assert direct.relationship_path_name() == "customers"
    assert direct.get_name() == "customers.age"
def test_direct_with_single_possible_path(es):
    """Check the path name and feature name of a DirectFeature from customers to sessions."""
    customer_age = IdentityFeature(es['customers'].ww['age'])
    direct = DirectFeature(customer_age, 'sessions')

    assert direct.relationship_path_name() == 'customers'
    assert direct.get_name() == 'customers.age'