def test_makes_direct_features_along_multiple_paths(diamond_es):
    """Direct features should be built along every relationship path to the target."""
    synth = DeepFeatureSynthesis(target_entity_id='transactions',
                                 entityset=diamond_es,
                                 max_depth=3,
                                 agg_primitives=[],
                                 trans_primitives=[])
    feature_list = synth.build_features()
    # 'regions' is reachable through both 'customers' and 'stores'
    for path in ('customers.regions.name', 'stores.regions.name'):
        assert feature_with_name(feature_list, path)
def test_does_not_make_trans_of_single_direct_feature(es):
    """Transforms should be computed on the parent entity, then pulled in directly."""
    synth = DeepFeatureSynthesis(target_entity_id='sessions',
                                 entityset=es,
                                 agg_primitives=[],
                                 trans_primitives=['weekday'],
                                 max_depth=2)
    feature_list = synth.build_features()
    # the transform-of-direct-feature form must not be generated
    assert not feature_with_name(feature_list, 'WEEKDAY(customers.signup_date)')
    # the direct-feature-of-transform form is the expected one
    assert feature_with_name(feature_list, 'customers.WEEKDAY(signup_date)')
def test_makes_numtrue(es):
    """NumTrue aggregations should be generated for boolean columns."""
    if es.dataframe_type == Library.SPARK.value:
        pytest.xfail("Spark EntitySets do not support NumTrue primitive")
    synth = DeepFeatureSynthesis(
        target_dataframe_name="sessions",
        entityset=es,
        agg_primitives=[NumTrue],
        trans_primitives=[],
    )
    feature_list = synth.build_features()
    # both the stacked (via customers) and direct aggregations should exist
    for name in ("customers.NUM_TRUE(log.purchased)", "NUM_TRUE(log.purchased)"):
        assert feature_with_name(feature_list, name)
def test_does_not_make_agg_of_direct_of_target_entity(es):
    """No aggregations of direct features that originate from the target entity."""
    count_sessions = ft.Feature(es['sessions']["id"],
                                parent_entity=es['customers'],
                                primitive=Count)
    synth = DeepFeatureSynthesis(target_entity_id='customers',
                                 entityset=es,
                                 agg_primitives=[Last],
                                 trans_primitives=[],
                                 max_depth=2,
                                 seed_features=[count_sessions])
    feature_list = synth.build_features()
    # Aggregating a direct feature of the target back onto the target is
    # meaningless: e.g. customers.COUNT(sessions) is already defined on
    # the customers entity.
    for redundant in ('LAST(sessions.customers.COUNT(sessions))',
                      'LAST(sessions.customers.age)'):
        assert not feature_with_name(feature_list, redundant)
def test_makes_agg_features_with_where(es):
    """Aggregations with WHERE clauses should be built from interesting values."""
    es.add_interesting_values()
    synth = DeepFeatureSynthesis(target_entity_id='sessions',
                                 entityset=es,
                                 agg_primitives=[Count],
                                 where_primitives=[Count],
                                 trans_primitives=[])
    feature_list = synth.build_features()
    assert feature_with_name(feature_list, 'COUNT(log WHERE priority_level = 0)')
    # WHERE clauses should also be derived from direct features
    assert feature_with_name(feature_list,
                             'COUNT(log WHERE products.department = food)')
def test_primitive_options_groupbys(es):
    """primitive_options should control which variables each groupby primitive may use.

    Fix: the excluded-primitive list contained 'cum_max' twice; the duplicate
    element was redundant and has been removed.
    """
    options = {'cum_sum': {'include_groupby_variables': {'customers': [u'région_id']},
                           'ignore_groupby_variables': {'sessions': ['customer_id']}},
               'cum_mean': {'ignore_groupby_variables': {'customers': [u'région_id', 'id']}},
               'cum_count': {'include_entities': ['customers'],
                             'include_groupby_variables': {'customers': [u"région_id", "cohort"]}},
               'cum_min': {'ignore_entities': ['customers']},
               'cum_max': {'include_entities': ['cohorts']}}
    dfs_obj = DeepFeatureSynthesis(target_entity_id='customers',
                                   entityset=es,
                                   groupby_trans_primitives=['cum_sum', 'cum_count', 'cum_min',
                                                             'cum_max', 'cum_mean'],
                                   primitive_options=options)
    features = dfs_obj.build_features()
    assert feature_with_name(features, u'CUM_SUM(age) by région_id')
    for f in features:
        # cum_min ignores the target entity and cum_max is restricted to
        # 'cohorts', so neither should produce any features here.
        assert f.primitive.name not in ['cum_min', 'cum_max']
        if isinstance(f.primitive, CumMean):
            # cum_mean may not group by the explicitly ignored variables
            assert f.groupby.variable.id not in [u'région_id', 'id']
        if isinstance(f.primitive, CumCount):
            # cum_count may only group by the explicitly included variables
            assert f.groupby.variable.id in [u'région_id', 'cohort']
        if isinstance(f.primitive, CumSum):
            deps = f.get_dependencies()
            entities = [d.entity.id for d in deps]
            if 'customers' in entities:
                assert f.groupby.variable.id == u'région_id'
def test_dfeats_where(es):
    """Direct features of WHERE-clause aggregations should be generated."""
    es.add_interesting_values()
    synth = DeepFeatureSynthesis(target_entity_id='sessions',
                                 entityset=es,
                                 agg_primitives=[Count],
                                 trans_primitives=[])
    feature_list = synth.build_features()
    # direct feature of an agg feature carrying a where clause
    assert feature_with_name(feature_list,
                             'customers.COUNT(log WHERE priority_level = 0)')
    assert feature_with_name(feature_list,
                             'COUNT(log WHERE products.department = electronics)')
def test_make_groupby_features_with_id(es):
    """A groupby transform may group an id column by itself."""
    synth = DeepFeatureSynthesis(target_entity_id='sessions',
                                 entityset=es,
                                 agg_primitives=[],
                                 trans_primitives=[],
                                 groupby_trans_primitives=['cum_count'])
    feature_list = synth.build_features()
    assert feature_with_name(feature_list, "CUM_COUNT(customer_id) by customer_id")
def test_makes_dfeatures_of_agg_primitives(es):
    """Direct features should be built on top of aggregation features."""
    synth = DeepFeatureSynthesis(target_entity_id='sessions',
                                 entityset=es,
                                 agg_primitives=[Last],
                                 trans_primitives=[])
    feature_list = synth.build_features()
    assert feature_with_name(feature_list, 'customers.LAST(sessions.device_type)')
def test_initialized_agg_prim(es):
    """An already-instantiated aggregation primitive should be accepted."""
    three_most = NMostCommon(n=3)
    synth = DeepFeatureSynthesis(target_entity_id="sessions",
                                 entityset=es,
                                 agg_primitives=[three_most],
                                 trans_primitives=[])
    feature_list = synth.build_features()
    assert feature_with_name(feature_list, "N_MOST_COMMON(log.product_id)")
def test_makes_trans_feat(es):
    """A simple transform primitive should produce a feature on the target."""
    synth = DeepFeatureSynthesis(target_entity_id='log',
                                 entityset=es,
                                 agg_primitives=[],
                                 trans_primitives=[Hour])
    feature_list = synth.build_features()
    assert feature_with_name(feature_list, 'HOUR(datetime)')
def test_handles_time_since_previous_entity_groupby(es):
    """TimeSincePrevious should be grouped by the child entity's foreign key."""
    synth = DeepFeatureSynthesis(target_entity_id='log',
                                 entityset=es,
                                 agg_primitives=[],
                                 groupby_trans_primitives=[TimeSincePrevious])
    feature_list = synth.build_features()
    assert feature_with_name(feature_list,
                             'TIME_SINCE_PREVIOUS(datetime) by session_id')
def test_makes_agg_features(es):
    """A simple aggregation primitive should produce a feature on the target."""
    synth = DeepFeatureSynthesis(target_entity_id='sessions',
                                 entityset=es,
                                 agg_primitives=[Last],
                                 trans_primitives=[])
    feature_list = synth.build_features()
    assert feature_with_name(feature_list, 'LAST(log.value)')
def test_transform_no_stack_agg(es):
    """Transform primitives must not stack on multi-output aggregations."""
    feature_defs = ft.dfs(entityset=es,
                          target_entity="customers",
                          agg_primitives=[NMostCommon],
                          trans_primitives=[NotEqual],
                          max_depth=3,
                          features_only=True)
    forbidden = 'id != N_MOST_COMMON(sessions.device_type)'
    assert not feature_with_name(feature_defs, forbidden)
def test_make_groupby_features(es):
    """A groupby transform primitive should produce a 'by' feature."""
    synth = DeepFeatureSynthesis(target_entity_id='log',
                                 entityset=es,
                                 agg_primitives=[],
                                 trans_primitives=[],
                                 groupby_trans_primitives=['cum_sum'])
    feature_list = synth.build_features()
    assert feature_with_name(feature_list, "CUM_SUM(value) by session_id")
def test_make_groupby_features_with_agg(es):
    """Aggregations should stack on top of groupby transform features."""
    synth = DeepFeatureSynthesis(target_entity_id='cohorts',
                                 entityset=es,
                                 agg_primitives=['sum'],
                                 trans_primitives=[],
                                 groupby_trans_primitives=['cum_sum'])
    feature_list = synth.build_features()
    assert feature_with_name(feature_list,
                             u"SUM(customers.CUM_SUM(age) by région_id)")
def test_intialized_trans_prim(es):
    """An already-instantiated transform primitive should be accepted.

    NOTE(review): the function name has a typo ("intialized"); left as-is so
    the collected pytest id does not change.
    """
    is_in = IsIn(list_of_outputs=['coke zero'])
    synth = DeepFeatureSynthesis(target_entity_id='log',
                                 entityset=es,
                                 agg_primitives=[],
                                 trans_primitives=[is_in])
    feature_list = synth.build_features()
    assert feature_with_name(feature_list, "product_id.isin(['coke zero'])")
def test_makes_direct_of_agg_of_trans_on_target(es):
    """Direct features should stack on aggregations of transforms of the target."""
    synth = DeepFeatureSynthesis(target_entity_id='log',
                                 entityset=es,
                                 agg_primitives=['mean'],
                                 trans_primitives=[Absolute],
                                 max_depth=3)
    feature_list = synth.build_features()
    assert feature_with_name(feature_list, 'sessions.MEAN(log.ABSOLUTE(value))')
def test_make_groupby_features_with_diff_id(es):
    """Groupby transforms may group one id column by a different id column."""
    synth = DeepFeatureSynthesis(target_entity_id='customers',
                                 entityset=es,
                                 agg_primitives=[],
                                 trans_primitives=[],
                                 groupby_trans_primitives=['cum_count'])
    feature_list = synth.build_features()
    assert feature_with_name(feature_list, u"CUM_COUNT(cohort) by région_id")
def test_makes_trans_of_multiple_direct_features(diamond_es):
    """Binary transforms should combine direct features, except along one path."""
    synth = DeepFeatureSynthesis(target_entity_id='transactions',
                                 entityset=diamond_es,
                                 agg_primitives=['mean'],
                                 trans_primitives=[Equal],
                                 max_depth=4)
    feature_list = synth.build_features()
    # transform of a direct feature with a non-direct feature
    assert feature_with_name(feature_list,
                             'amount = stores.MEAN(transactions.amount)')
    # transform of direct features on different entities
    assert feature_with_name(feature_list,
                             'customers.MEAN(transactions.amount) = stores.square_ft')
    # transform of direct features on the same entity via different paths
    assert feature_with_name(feature_list,
                             'customers.regions.name = stores.regions.name')
    # no transform of direct features that share the same path, in either order
    assert not feature_with_name(feature_list,
                                 'stores.square_ft = stores.MEAN(transactions.amount)')
    assert not feature_with_name(feature_list,
                                 'stores.MEAN(transactions.amount) = stores.square_ft')
    # despite the confusing name, this is a direct feature of a transform
    assert feature_with_name(feature_list,
                             'stores.MEAN(transactions.amount) = square_ft')
def test_makes_count(es):
    # Checks that identity features, direct features, and Count aggregations
    # are all generated for the 'sessions' target.
    #
    # NOTE(review): this file defines test_makes_count twice; under pytest the
    # later definition (which uses target_dataframe_name) shadows this one, so
    # this version never runs. The two appear to target different library API
    # versions (target_entity_id vs target_dataframe_name) — confirm which is
    # intended and remove the other.
    dfs = DeepFeatureSynthesis(target_entity_id='sessions',
                               entityset=es,
                               agg_primitives=[Count],
                               trans_primitives=[])
    features = dfs.build_features()
    # identity features on the target entity
    assert feature_with_name(features, 'device_type')
    assert feature_with_name(features, 'customer_id')
    # direct features pulled in from parent entities
    assert feature_with_name(features, u'customers.région_id')
    assert feature_with_name(features, 'customers.age')
    # Count aggregations, both local and stacked through 'customers'
    assert feature_with_name(features, 'COUNT(log)')
    assert feature_with_name(features, 'customers.COUNT(sessions)')
    assert feature_with_name(features, u'customers.régions.language')
    assert feature_with_name(features, 'customers.COUNT(log)')
def test_makes_count(es):
    """Identity, direct, and Count aggregation features for 'sessions'.

    NOTE(review): this name is defined twice in the file; pytest only collects
    this later definition. Reconcile with the earlier target_entity_id version.
    """
    synth = DeepFeatureSynthesis(
        target_dataframe_name="sessions",
        entityset=es,
        agg_primitives=[Count],
        trans_primitives=[],
    )
    feature_list = synth.build_features()
    expected = [
        # identity features on the target dataframe
        "device_type",
        "customer_id",
        # direct features from parent dataframes
        "customers.région_id",
        "customers.age",
        # Count aggregations, local and stacked through 'customers'
        "COUNT(log)",
        "customers.COUNT(sessions)",
        "customers.régions.language",
        "customers.COUNT(log)",
    ]
    for name in expected:
        assert feature_with_name(feature_list, name)
def test_seed_multi_output_feature_stacking(es):
    """Each output slice of a multi-output seed feature should stack separately."""
    seed = ft.Feature(es['log']['product_id'],
                      parent_entity=es["sessions"],
                      primitive=NMostCommon(3))
    fm, feat = ft.dfs(entityset=es,
                      target_entity="customers",
                      seed_features=[seed],
                      agg_primitives=[NumUnique],
                      trans_primitives=[],
                      max_depth=4)
    for i in range(3):
        name = 'NUM_UNIQUE(sessions.N_MOST_COMMON(log.product_id)[%d])' % i
        assert feature_with_name(feat, name)
def test_makes_direct_features_through_multiple_relationships(games_es):
    """Every forward/backward path combination should yield a distinct feature."""
    synth = DeepFeatureSynthesis(target_entity_id='games',
                                 entityset=games_es,
                                 agg_primitives=['mean'],
                                 trans_primitives=[])
    feature_list = synth.build_features()
    sides = ['home', 'away']
    for forward in sides:
        for backward in sides:
            for score in sides:
                name = ('teams[%s_team_id].MEAN(games[%s_team_id].%s_team_score)'
                        % (forward, backward, score))
                assert feature_with_name(feature_list, name)
def test_make_transform_multiple_output_features(pd_es):
    # Verifies that a transform primitive with number_output_features > 1:
    #   1. produces columns matching the equivalent single-output primitives,
    #   2. allows aggregations and further transforms on each output slice.
    def test_time(x):
        # Split a datetime series into six numeric component series
        # (one per unit), giving the primitive six outputs.
        times = pd.Series(x)
        units = ["year", "month", "day", "hour", "minute", "second"]
        return [times.apply(lambda x: getattr(x, unit)) for unit in units]

    def gen_feat_names(self):
        # Custom feature names, one per output slice.
        subnames = ["Year", "Month", "Day", "Hour", "Minute", "Second"]
        return [
            "Now.%s(%s)" % (subname, self.base_features[0].get_name())
            for subname in subnames
        ]

    TestTime = make_trans_primitive(
        function=test_time,
        input_types=[ColumnSchema(logical_type=Datetime)],
        return_type=ColumnSchema(semantic_tags={'numeric'}),
        number_output_features=6,
        cls_attributes={"get_feature_names": gen_feat_names},
    )
    join_time_split = ft.Feature(pd_es["log"].ww["datetime"], primitive=TestTime)
    # Reference implementation: the six single-output datetime primitives.
    alt_features = [
        ft.Feature(pd_es["log"].ww["datetime"], primitive=Year),
        ft.Feature(pd_es["log"].ww["datetime"], primitive=Month),
        ft.Feature(pd_es["log"].ww["datetime"], primitive=Day),
        ft.Feature(pd_es["log"].ww["datetime"], primitive=Hour),
        ft.Feature(pd_es["log"].ww["datetime"], primitive=Minute),
        ft.Feature(pd_es["log"].ww["datetime"], primitive=Second)
    ]
    fm, fl = ft.dfs(entityset=pd_es,
                    target_dataframe_name="log",
                    agg_primitives=['sum'],
                    trans_primitives=[
                        TestTime, Year, Month, Day, Hour, Minute, Second, Diff
                    ],
                    max_depth=5)
    # Each multi-output slice must equal the corresponding single primitive.
    subnames = join_time_split.get_feature_names()
    altnames = [f.get_name() for f in alt_features]
    for col1, col2 in zip(subnames, altnames):
        assert (fm[col1] == fm[col2]).all()
    # Aggregation (SUM) and a further transform (DIFF) should apply per slice.
    for i in range(6):
        f = 'sessions.customers.SUM(log.TEST_TIME(datetime)[%d])' % i
        assert feature_with_name(fl, f)
        assert ('products.DIFF(SUM(log.TEST_TIME(datetime)[%d]))' % i) in fl
def test_groupby_multi_output_stacking(pd_es):
    """Each output of a multi-output groupby transform should aggregate separately.

    NOTE(review): this test name is defined three times in this file; pytest
    collects only the last definition. The duplicates appear to target
    different library API versions and should be reconciled.
    """
    class TestTime(TransformPrimitive):
        name = "test_time"
        input_types = [ColumnSchema(logical_type=Datetime)]
        return_type = ColumnSchema(semantic_tags={"numeric"})
        number_output_features = 6

    feature_list = dfs(
        entityset=pd_es,
        target_dataframe_name="sessions",
        agg_primitives=["sum"],
        groupby_trans_primitives=[TestTime],
        features_only=True,
        max_depth=4,
    )
    for i in range(6):
        local = "SUM(log.TEST_TIME(datetime)[%d] by product_id)" % i
        stacked = "customers.SUM(log.TEST_TIME(datetime)[%d] by session_id)" % i
        assert feature_with_name(feature_list, local)
        assert stacked in feature_list
def test_groupby_multi_output_stacking(pd_es):
    """Each output of a multi-output groupby transform should aggregate separately.

    NOTE(review): this test name is defined three times in this file; pytest
    collects only the last definition, so this one never runs — reconcile.
    """
    TestTime = make_trans_primitive(
        function=lambda x: x,
        name="test_time",
        input_types=[Datetime],
        return_type=Numeric,
        number_output_features=6,
    )
    feature_list = dfs(entityset=pd_es,
                       target_entity="sessions",
                       agg_primitives=['sum'],
                       groupby_trans_primitives=[TestTime],
                       features_only=True,
                       max_depth=4)
    for i in range(6):
        local = 'SUM(log.TEST_TIME(datetime)[%d] by product_id)' % i
        stacked = 'customers.SUM(log.TEST_TIME(datetime)[%d] by session_id)' % i
        assert feature_with_name(feature_list, local)
        assert stacked in feature_list
def test_groupby_multi_output_stacking(es):
    """Groupby transforms should stack on each output of a multi-output transform.

    NOTE(review): this test name is defined three times in this file; pytest
    collects only this last definition — reconcile the duplicates.
    """
    TestTime = make_trans_primitive(
        function=lambda x: x,
        name="test_time",
        input_types=[Datetime],
        return_type=Numeric,
        number_output_features=6,
    )
    feature_list = dfs(
        entityset=es,
        target_entity="sessions",
        agg_primitives=[],
        trans_primitives=[TestTime],
        groupby_trans_primitives=[CumSum],
        features_only=True,
        max_depth=4)
    for i in range(6):
        by_cohort = 'customers.CUM_SUM(TEST_TIME(upgrade_date)[%d]) by cohort' % i
        by_customer = ('customers.CUM_SUM(TEST_TIME(date_of_birth)[%d]) by customer_id'
                       % i)
        assert feature_with_name(feature_list, by_cohort)
        assert by_customer in feature_list
def test_transform_consistency():
    """Transform features should be generated in a consistent, single ordering."""
    # build a tiny entityset from scratch so the test controls column names
    frame = pd.DataFrame({'a': [14, 12, 10], 'b': [False, False, True],
                          'b1': [True, True, False], 'b12': [4, 5, 6],
                          'P': [10, 15, 12]})
    es = ft.EntitySet(id='test')
    es.entity_from_dataframe(entity_id='first', dataframe=frame,
                             index='index', make_index=True)
    feature_defs = ft.dfs(entityset=es,
                          target_entity='first',
                          trans_primitives=['and', 'add_numeric', 'or'],
                          features_only=True)
    # identity features for every input column
    for col in ('a', 'b', 'b1', 'b12', 'P'):
        assert feature_with_name(feature_defs, col)
    # commutative primitives should appear in exactly one argument order
    assert feature_with_name(feature_defs, 'AND(b, b1)')
    assert not feature_with_name(feature_defs, 'AND(b1, b)')
    for name in ('a + P', 'b12 + P', 'a + b12',
                 'OR(b, b1)', 'OR(AND(b, b1), b)', 'OR(AND(b, b1), b1)'):
        assert feature_with_name(feature_defs, name)