def test_max_hlevel(es):
    """Check which stacked features DFS emits for max_hlevel of -1, 0 and 1.

    Two reference features are built by hand: ``Last`` of the log ``value``
    aggregated to ``customers`` and to ``sessions``, each wrapped in a
    ``Feature`` on the ``log`` entity. The test asserts that:

    * with ``max_hlevel=-1`` both names are produced,
    * with ``max_hlevel=1`` only the one going through ``sessions`` is,
    * with ``max_hlevel=0`` neither is.
    """
    def _built_names(dfs):
        # Names of every feature the given DFS object builds.
        return [feat.get_name() for feat in dfs.build_features()]

    common = dict(
        target_entity_id='log',
        entityset=es,
        agg_primitives=[Count, Last],
        trans_primitives=[Hour],
        max_depth=-1,
    )

    # All three DFS objects are constructed before any features are built.
    dfs_neg_one = DeepFeatureSynthesis(max_hlevel=-1, **common)
    dfs_zero = DeepFeatureSynthesis(max_hlevel=0, **common)
    dfs_one = DeepFeatureSynthesis(max_hlevel=1, **common)

    names_neg_one = _built_names(dfs_neg_one)
    names_zero = _built_names(dfs_zero)
    names_one = _built_names(dfs_one)

    # Hand-built reference features whose names are looked up below.
    last_per_customer = Last(es['log']['value'], es['customers'])
    last_per_session = Last(es['log']['value'], es['sessions'])
    via_customers = Feature(last_per_customer, es['log']).get_name()
    via_sessions = Feature(last_per_session, es['log']).get_name()

    assert via_customers in names_neg_one
    assert via_sessions in names_neg_one

    assert via_customers not in names_one
    assert via_sessions in names_one

    assert via_customers not in names_zero
    assert via_sessions not in names_zero
def test_allowed_paths(es):
    """Check that ``allowed_paths`` filters out features on other paths.

    Without ``allowed_paths`` both ``Last`` of the sessions ``device_type``
    and ``Last`` of the log ``value`` (each aggregated to ``customers``) are
    expected among the built features. With
    ``allowed_paths=[['customers', 'sessions']]`` only the sessions-based
    feature is expected to remain.
    """
    def _built_names(dfs):
        # Names of every feature the given DFS object builds.
        return [feat.get_name() for feat in dfs.build_features()]

    common = dict(
        target_entity_id='customers',
        entityset=es,
        agg_primitives=[Last],
        trans_primitives=[],
        max_depth=2,
        seed_features=[],
    )

    # Baseline: no path restriction.
    free_names = _built_names(DeepFeatureSynthesis(**common))

    from_sessions = Last(es['sessions']['device_type'], es['customers'])
    from_log = Last(es['log']['value'], es['customers'])
    assert from_sessions.get_name() in free_names
    assert from_log.get_name() in free_names

    # Restricted: only the customers -> sessions path is allowed.
    restricted = DeepFeatureSynthesis(
        allowed_paths=[['customers', 'sessions']],
        **common
    )
    restricted_names = _built_names(restricted)
    assert from_sessions.get_name() in restricted_names
    assert from_log.get_name() not in restricted_names
# NOTE(review): a second function named ``test_seed_features`` with the same
# body is defined further down in this file. The later definition rebinds the
# name, so this copy is presumably never collected -- confirm and remove or
# rename one of them.
def test_seed_features(es):
    """Check that DFS returns seed features and features stacked on them.

    Two seeds are supplied: a boolean feature (``Count`` of log ids per
    session ``> 2``) and ``Hour`` of the log ``datetime``. The test expects
    the session-level seed itself, and ``Last`` of the log-level seed
    aggregated to ``sessions``, to be among the built feature names.
    """
    busy_session_seed = Count(es['log']["id"], es['sessions']) > 2
    log_hour_seed = Hour(es['log']['datetime'])
    last_hour_per_session = Last(log_hour_seed, es['sessions'])

    synthesizer = DeepFeatureSynthesis(
        target_entity_id='sessions',
        entityset=es,
        agg_primitives=[Last],
        trans_primitives=[],
        max_depth=2,
        seed_features=[busy_session_seed, log_hour_seed],
    )
    built_names = [feat.get_name() for feat in synthesizer.build_features()]

    assert busy_session_seed.get_name() in built_names
    assert last_hour_per_session.get_name() in built_names
# NOTE(review): an earlier function with this same name and an equivalent body
# exists above in this file; this definition replaces it when the module is
# imported. Consider dropping the duplicate.
def test_seed_features(es):
    """Verify seed features, and aggregations built on them, come out of DFS.

    Seeds passed to DFS:
      * a session-level boolean feature: ``Count`` of log ids ``> 2``
      * a log-level feature: ``Hour`` of the log ``datetime``

    Expected in the output: the session-level seed, and ``Last`` of the
    log-level seed aggregated to ``sessions``.
    """
    session_seed = Count(es['log']["id"], es['sessions']) > 2
    log_seed = Hour(es['log']['datetime'])
    expected_agg = Last(log_seed, es['sessions'])

    dfs_options = dict(
        target_entity_id='sessions',
        entityset=es,
        agg_primitives=[Last],
        trans_primitives=[],
        max_depth=2,
        seed_features=[session_seed, log_seed],
    )
    produced = DeepFeatureSynthesis(**dfs_options).build_features()
    produced_names = [feat.get_name() for feat in produced]

    assert session_seed.get_name() in produced_names
    assert expected_agg.get_name() in produced_names
def test_get_depth(es):
    """Check ``get_depth`` on a four-level feature chain, with and without ``stop_at``.

    The chain is: base log ``value`` feature -> ``Last`` to sessions ->
    ``Last`` to customers -> ``Feature`` on sessions -> ``Feature`` on log.
    """
    # The incoming fixture is immediately replaced by a newly built entity set.
    # NOTE(review): presumably to avoid sharing state with other tests --
    # confirm; otherwise the ``es`` parameter is unused.
    es = make_ecommerce_entityset()

    # Two separately constructed features over the same variable.
    base_a = Feature(es['log']['value'])
    base_b = Feature(es['log']['value'])

    per_session = Last(base_a, es['sessions'])
    per_customer = Last(per_session, es['customers'])
    back_on_sessions = Feature(per_customer, es['sessions'])
    back_on_log = Feature(back_on_sessions, es['log'])

    assert back_on_log.get_depth() == 4

    # Make sure this works if we pass in two of the same feature. This came
    # up when a user supplied duplicates in the seed_features of DFS.
    stop_cases = [
        ([base_a, base_b], 4),
        ([base_a, base_b, per_session], 3),
        # NOTE(review): same case as the previous entry; looks redundant --
        # confirm before removing.
        ([base_a, base_b, per_session], 3),
        ([base_a, base_b, per_customer], 2),
        ([base_a, base_b, back_on_sessions], 1),
        ([base_a, base_b, back_on_log], 0),
    ]
    for stop_features, expected_depth in stop_cases:
        assert back_on_log.get_depth(stop_at=stop_features) == expected_depth
# NOTE(review): another function with this exact name is defined later in this
# file (it uses ``Mode`` where this one uses ``Count``). The later definition
# rebinds the name, so this copy is presumably never collected -- confirm and
# remove or rename one of them.
def test_dfs_builds_on_seed_features_more_than_max_depth(es):
    """Check DFS stacks on seed features but still honours ``max_depth``.

    With ``max_depth=1`` the test expects the session-level seed and ``Last``
    of the log-level seed (aggregated to ``sessions``) to be built, and a
    feature stacked two further levels on top of that aggregation not to be.
    """
    session_seed = Count(es['log']["id"], es['sessions']) > 2
    log_seed = Hour(es['log']['datetime'])
    one_level_up = Last(log_seed, es['sessions'])

    # Depth of this feature is 2 relative to one_level_up, which is greater
    # than max_depth, so it shouldn't be built.
    too_deep = DirectFeature(Count(one_level_up, es['customers']),
                             es['sessions'])

    synthesizer = DeepFeatureSynthesis(
        target_entity_id='sessions',
        entityset=es,
        agg_primitives=[Last, Count],
        trans_primitives=[],
        max_depth=1,
        seed_features=[session_seed, log_seed],
    )
    built_names = [feat.get_name() for feat in synthesizer.build_features()]

    assert session_seed.get_name() in built_names
    assert one_level_up.get_name() in built_names
    assert too_deep.get_name() not in built_names
# NOTE(review): an earlier function with this same name exists above in this
# file (it uses ``Count`` where this one uses ``Mode``); this definition
# replaces it when the module is imported. Consider dropping the duplicate.
def test_dfs_builds_on_seed_features_more_than_max_depth(es):
    """Verify features stacked beyond ``max_depth`` on a seed are not built.

    Expected with ``max_depth=1``: the session-level seed and ``Last`` of the
    log-level seed (aggregated to ``sessions``) are present; a direct feature
    of ``Mode`` over that aggregation is absent.
    """
    session_seed = Count(es['log']["id"], es['sessions']) > 2
    log_seed = Hour(es['log']['datetime'])
    seed_agg = Last(log_seed, es['sessions'])

    # Depth of this feature is 2 relative to seed_agg, which is greater than
    # max_depth, so it shouldn't be built.
    # NOTE(review): Mode is not listed in agg_primitives below, so this
    # feature may be absent regardless of max_depth -- confirm the negative
    # assertion is actually exercising the depth limit.
    beyond_depth = DirectFeature(Mode(seed_agg, es['customers']),
                                 es['sessions'])

    dfs_options = dict(
        target_entity_id='sessions',
        entityset=es,
        agg_primitives=[Last, Count],
        trans_primitives=[],
        max_depth=1,
        seed_features=[session_seed, log_seed],
    )
    produced = DeepFeatureSynthesis(**dfs_options).build_features()
    produced_names = [feat.get_name() for feat in produced]

    assert session_seed.get_name() in produced_names
    assert seed_agg.get_name() in produced_names
    assert beyond_depth.get_name() not in produced_names
def test_return_type_inference_numeric_time_index(es_numeric):
    """``Last`` over the log ``datetime`` variable should report ``Numeric``.

    Uses the ``es_numeric`` fixture (presumably an entity set whose time
    index is numeric, per the test name).
    """
    log_time = es_numeric["log"]["datetime"]
    customers = es_numeric["customers"]
    latest = Last(log_time, customers)
    assert latest.variable_type == Numeric
def test_return_type_inference_datetime_time_index(es):
    """``Last`` over the log ``datetime`` variable should report ``Datetime``."""
    log_time = es["log"]["datetime"]
    customers = es["customers"]
    latest = Last(log_time, customers)
    assert latest.variable_type == Datetime