def test_make_dfeat_of_agg_feat_through_parent(entityset, backend):
    """
    Direct feature on customers of a store count aggregated onto regions.

    The graph looks like this:

        R       C = Customers, the entity we're trying to predict on
       / \\     R = Regions, a parent of customers
      S   C     S = Stores, a child of regions
          |
         etc.

    We're trying to calculate a DFeat from C to R on an agg_feat of R on S.
    """
    store_ids = IdentityFeature(entityset['stores']['id'])
    stores_per_region = Count(store_ids,
                              parent_entity=entityset[u'régions'])
    num_stores_feat = DirectFeature(stores_per_region,
                                    child_entity=entityset['customers'])

    calculator = backend([num_stores_feat])
    result = calculator.calculate_all_features(instance_ids=[0],
                                               time_last=None)

    # Value of the direct feature for the customer with instance id 0.
    observed = result[num_stores_feat.get_name()][0]
    assert observed == 3
def test_direct_rename(es):
    """Renaming a DirectFeature changes its hash and name but nothing else."""
    # Apart from the rename, this mirrors test_direct_from_identity.
    original = DirectFeature(base_feature=es['sessions']['device_type'],
                             child_entity=es['log'])
    renamed = original.rename("session_test")

    # The rename must be visible through both the hash and the name ...
    assert original.hash() != renamed.hash()
    assert original.get_name() != renamed.get_name()

    # ... while the base feature and the owning entity stay the same.
    original_base = original.base_features[0]
    renamed_base = renamed.base_features[0]
    assert original_base.generate_name() == renamed_base.generate_name()
    assert original.entity == renamed.entity
def test_make_dfeat(entityset, backend):
    """A DirectFeature exposes the customers' ``age`` variable on sessions."""
    age_on_sessions = DirectFeature(entityset['customers']['age'],
                                    child_entity=entityset['sessions'])

    calculator = backend([age_on_sessions])
    result = calculator.calculate_all_features(instance_ids=[0],
                                               time_last=None)

    # Value of the direct feature for the session with instance id 0.
    observed = result[age_on_sessions.get_name()][0]
    assert observed == 33
def test_direct_from_identity(es):
    """A DirectFeature built from a raw variable keeps that variable."""
    device = es['sessions']['device_type']
    direct = DirectFeature(base_feature=device, child_entity=es['log'])
    assert direct.variable == device

    calculator = PandasBackend(es, [direct])
    result = calculator.calculate_all_features(instance_ids=[0, 5],
                                               time_last=None)

    # One value per requested log instance, in request order.
    observed = result[direct.get_name()].tolist()
    assert observed == [0, 1]
def _build_forward_features(self, all_features, parent_entity, child_entity,
                            relationship, max_depth=0):
    """Create direct features on ``child_entity`` from ``parent_entity`` features.

    Candidate features are fetched with ``self._features_by_type`` (restricted
    to Numeric, Categorical and Ordinal variable types, and to ``max_depth``).
    Each surviving candidate is wrapped in a ``DirectFeature`` targeting
    ``child_entity`` and handed to ``self._handle_new_feature``.

    A candidate is skipped when:
      * ``self._feature_in_relationship_path([relationship], f)`` is truthy,
      * it is an ``AggregationPrimitive`` and it, or any of its deep
        dependencies, is an ``AggregationPrimitive`` with a ``where`` clause,
      * it is flagged as ``expanding``.

    Args:
        all_features: feature collection passed through to
            ``_features_by_type`` and ``_handle_new_feature``.
        parent_entity: entity whose features are used as base features.
        child_entity: entity the new direct features are defined on.
        relationship: relationship between the two entities, used for the
            relationship-path check.
        max_depth: depth limit forwarded to ``_features_by_type``. ``None``
            disables the negative-depth early exit.

    Returns:
        None. New features are registered through ``_handle_new_feature``;
        nothing is built when ``max_depth`` is negative.
    """
    if max_depth is not None and max_depth < 0:
        return

    features = self._features_by_type(all_features=all_features,
                                      entity=parent_entity,
                                      variable_type=[Numeric,
                                                     Categorical,
                                                     Ordinal],
                                      max_depth=max_depth)

    for f in features:
        if self._feature_in_relationship_path([relationship], f):
            continue

        # limits allowing direct features of agg_feats with where clauses.
        # This used to be a nested ``for`` loop whose ``continue`` only
        # advanced that inner loop, so nothing was ever filtered out; the
        # check now skips the candidate feature itself.
        if isinstance(f, AggregationPrimitive):
            deep_base_features = [f] + f.get_deep_dependencies()
            if any(isinstance(feat, AggregationPrimitive) and
                   feat.where is not None
                   for feat in deep_base_features):
                continue

        new_f = DirectFeature(f, child_entity)
        if f.expanding:
            continue

        self._handle_new_feature(all_features=all_features,
                                 new_feature=new_f)
def test_diff(es):
    """Diff of ``value`` grouped by session id and by a direct customer id."""
    value = IdentityFeature(es['log']['value'])
    customer_id_feat = DirectFeature(es['sessions']['customer_id'],
                                     child_entity=es['log'])
    diff_by_session = Diff(value, es['log']['session_id'])
    diff_by_customer = Diff(value, customer_id_feat)

    calculator = PandasBackend(es, [diff_by_session, diff_by_customer])
    result = calculator.calculate_all_features(instance_ids=range(15),
                                               time_last=None)

    session_diffs = result[diff_by_session.get_name()].values.tolist()
    customer_diffs = result[diff_by_customer.get_name()].values.tolist()

    expected_by_session = [
        np.nan, 5, 5, 5, 5, np.nan, 1, 1, 1, np.nan, np.nan, 5, np.nan, 7, 7
    ]
    expected_by_customer = [
        np.nan, 5, 5, 5, 5, -20, 1, 1, 1, -3, np.nan, 5, -5, 7, 7
    ]

    def _assert_same(actual, expected):
        # NaN never compares equal to itself, so it needs its own branch.
        if np.isnan(actual):
            assert np.isnan(expected)
        else:
            assert actual == expected

    # Walk the rows by position; the session diff is checked first.
    for row in range(len(session_diffs)):
        _assert_same(session_diffs[row], expected_by_session[row])
        _assert_same(customer_diffs[row], expected_by_customer[row])
def test_make_deep_agg_feat_of_dfeat_of_agg_feat(entityset, backend):
    """
    Mean over customers of a direct feature of a per-product log count.

    The graph looks like this (higher implies parent):

          C       C = Customers, the entity we're trying to predict on
          |       S = Sessions, a child of Customers
      P   S       L = Log, a child of both Sessions and Products
       \\ /       P = Products, a parent of Log which is not a
        L             descendent of customers

    We're trying to calculate a DFeat from L to P on an agg_feat of P on L,
    and then aggregate it with another agg_feat of C on L.
    """
    logs_per_product = Count(entityset['log']['id'],
                             parent_entity=entityset['products'])
    product_purchases = DirectFeature(logs_per_product,
                                      child_entity=entityset['log'])
    purchase_popularity = Mean(product_purchases,
                               parent_entity=entityset['customers'])

    calculator = backend([purchase_popularity])
    result = calculator.calculate_all_features(instance_ids=[0],
                                               time_last=None)

    # Value of the aggregated feature for the customer with instance id 0.
    observed = result[purchase_popularity.get_name()][0]
    assert observed == 38.0 / 10.0
def test_make_compare_feat(entityset, backend):
    """
    Compare a session's log count with its customer's mean log count.

    The feature under test is true for a session when its number of log
    rows is greater than the mean number of log rows per session of the
    customer, brought down onto sessions through a DirectFeature.
    """
    # NOTE: this assigns a class attribute, so the value stays set on Count
    # after this test has finished.
    Count.max_stack_depth = 2

    logs_per_session = Count(entityset['log']['id'],
                             parent_entity=entityset['sessions'])
    mean_logs_per_customer = Mean(logs_per_session,
                                  parent_entity=entityset['customers'])
    mean_on_sessions = DirectFeature(mean_logs_per_customer,
                                     child_entity=entityset['sessions'])
    above_mean = logs_per_session > mean_on_sessions

    calculator = backend([above_mean])
    result = calculator.calculate_all_features(instance_ids=[0, 1, 2],
                                               time_last=None)

    column = result[above_mean.get_name()]
    first, second, third = column[0:3]
    assert first
    assert second
    assert not third
def test_dfs_builds_on_seed_features_more_than_max_depth(es):
    """DFS stacks on seed features but does not go deeper than ``max_depth``."""
    seed_feature_sessions = Count(es['log']["id"], es['sessions']) > 2
    seed_feature_log = Hour(es['log']['datetime'])
    session_agg = Last(seed_feature_log, es['sessions'])

    # Depth of this feat is 2 relative to session_agg, which is greater
    # than max_depth, so DFS is expected not to build it.
    customer_mode = Mode(session_agg, es['customers'])
    session_agg_trans = DirectFeature(customer_mode, es['sessions'])

    synthesizer = DeepFeatureSynthesis(
        target_entity_id='sessions',
        entityset=es,
        agg_primitives=[Last, Count],
        trans_primitives=[],
        max_depth=1,
        seed_features=[seed_feature_sessions, seed_feature_log],
    )
    built = synthesizer.build_features()

    # Collect the generated names once and reuse them for every check.
    built_names = [feature.get_name() for feature in built]
    assert seed_feature_sessions.get_name() in built_names
    assert session_agg.get_name() in built_names
    assert session_agg_trans.get_name() not in built_names
def test_dfs_builds_on_seed_features_more_than_max_depth(es):
    """Features stacked too deep on top of seed features are not generated."""
    seed_feature_sessions = Count(es['log']["id"], es['sessions']) > 2
    seed_feature_log = Hour(es['log']['datetime'])
    session_agg = Last(seed_feature_log, es['sessions'])

    # Depth of this feat is 2 relative to session_agg, which is greater
    # than max_depth, so it shouldn't be built.
    session_agg_trans = DirectFeature(Mode(session_agg, es['customers']),
                                      es['sessions'])

    dfs_kwargs = dict(target_entity_id='sessions',
                      entityset=es,
                      agg_primitives=[Last, Count],
                      trans_primitives=[],
                      max_depth=1,
                      seed_features=[seed_feature_sessions, seed_feature_log])
    generated = DeepFeatureSynthesis(**dfs_kwargs).build_features()

    def _was_built(feature):
        # Membership is decided by feature name, as in the other DFS tests.
        return feature.get_name() in [g.get_name() for g in generated]

    assert _was_built(seed_feature_sessions)
    assert _was_built(session_agg)
    assert not _was_built(session_agg_trans)
def test_arithmetic_of_direct(es):
    """Arithmetic primitives work on two direct features of the log entity."""
    rating = es['products']['rating']
    log_rating = DirectFeature(rating, child_entity=es['log'])

    # Customer age reaches the log entity through two stacked direct features.
    customer_age = es['customers']['age']
    session_age = DirectFeature(customer_age, child_entity=es['sessions'])
    log_age = DirectFeature(session_age, child_entity=es['log'])

    # (primitive, expected values for the four requested log instances)
    to_test = [
        (Add, [38, 37, 37.5, 37.5]),
        (Subtract, [28, 29, 28.5, 28.5]),
        (Multiply, [165, 132, 148.5, 148.5]),
        (Divide, [6.6, 8.25, 22. / 3, 22. / 3]),
    ]
    features = [primitive(log_age, log_rating) for primitive, _ in to_test]

    calculator = PandasBackend(es, features)
    result = calculator.calculate_all_features(instance_ids=[0, 3, 5, 7],
                                               time_last=None)

    for feature, (_, expected) in zip(features, to_test):
        observed = result[feature.get_name()].values.tolist()
        assert observed == expected
def test_make_dfeat_of_agg_feat_on_self(entityset, backend):
    """
    Direct feature on customers of a customer count aggregated onto regions.

    The graph looks like this:

        R       R = Regions, a parent of customers
        |
        C       C = Customers, the entity we're trying to predict on
        |
       etc.

    We're trying to calculate a DFeat from C to R on an agg_feat of R on C.
    """
    customers_per_region = Count(entityset['customers']['id'],
                                 parent_entity=entityset[u'régions'])
    num_customers_feat = DirectFeature(customers_per_region,
                                       child_entity=entityset['customers'])

    calculator = backend([num_customers_feat])
    result = calculator.calculate_all_features(instance_ids=[0],
                                               time_last=None)

    # Value of the direct feature for the customer with instance id 0.
    observed = result[num_customers_feat.get_name()][0]
    assert observed == 3
def test_compare_of_direct(es):
    """Comparison primitives work between a direct feature and a scalar."""
    log_rating = DirectFeature(es['products']['rating'],
                               child_entity=es['log'])

    # (primitive, expected values for the four requested log instances)
    to_test = [
        (Equals, [False, False, False, False]),
        (NotEquals, [True, True, True, True]),
        (LessThan, [False, False, False, True]),
        (LessThanEqualTo, [False, False, False, True]),
        (GreaterThan, [True, True, True, False]),
        (GreaterThanEqualTo, [True, True, True, False]),
    ]
    features = [primitive(log_rating, 4.5) for primitive, _ in to_test]

    calculator = PandasBackend(es, features)
    result = calculator.calculate_all_features(instance_ids=[0, 1, 2, 3],
                                               time_last=None)

    for feature, (_, expected) in zip(features, to_test):
        observed = result[feature.get_name()].values.tolist()
        assert observed == expected