def test_diff_single_value(es):
    """Diff grouped by region has no non-null value when one store is requested."""
    diff = ft.Feature(
        es['stores']['num_square_feet'],
        groupby=es['stores'][u'région_id'],
        primitive=Diff,
    )
    calculator = FeatureSetCalculator(es, feature_set=FeatureSet([diff]))
    result = calculator.run([5])

    # One row is returned, and its Diff entry is missing.
    assert result.shape[0] == 1
    non_null_values = result[diff.get_name()].dropna()
    assert non_null_values.shape[0] == 0
def test_calls_progress_callback(es):
    """Progress updates reported to the callback during a run must sum to 1."""

    class MockProgressCallback:
        """Callable that accumulates every update it is given."""

        def __init__(self):
            self.total = 0

        def __call__(self, update):
            self.total += update

    # Build one feature of each type so every kind is exercised.
    identity = ft.Feature(es['customers']['age'])
    direct = ft.Feature(es['cohorts']['cohort_name'], es['customers'])
    agg = ft.Feature(es["sessions"]["id"], parent_entity=es['customers'], primitive=Count)
    # The TimeSinceLast aggregation is handled differently than simple features.
    agg_apply = ft.Feature(es["log"]["datetime"], parent_entity=es['customers'], primitive=TimeSinceLast)
    trans = ft.Feature(agg, primitive=Negate)
    trans_full = ft.Feature(agg, primitive=CumSum)
    groupby_trans = ft.Feature(agg, primitive=CumSum, groupby=es["customers"]["cohort"])

    # A reduced feature list is used when any entity is backed by a dask DataFrame.
    uses_dask = any(isinstance(entity.df, dd.DataFrame) for entity in es.entities)
    if not uses_dask:
        all_features = [identity, direct, agg, agg_apply, trans, trans_full, groupby_trans]
    else:
        all_features = [identity, direct, agg, trans]

    instance_ids = [0, 1, 2]

    # First pass without a cutoff, then again with a time_last that has no data.
    for time_last in (None, pd.Timestamp("1950")):
        feature_set = FeatureSet(all_features)
        calculator = FeatureSetCalculator(es, time_last=time_last, feature_set=feature_set)
        mock_progress_callback = MockProgressCallback()
        calculator.run(np.array(instance_ids), mock_progress_callback)
        assert np.isclose(mock_progress_callback.total, 1)
def test_percentile(es):
    """Percentile of log value should equal pandas' percentage rank of the column."""
    value = ft.Feature(es['log']['value'])
    percentile = ft.Feature(value, primitive=Percentile)
    calculator = FeatureSetCalculator(es, FeatureSet([percentile]))
    result = calculator.run(np.array(range(10, 17)))

    # Reference values: rank the whole column, then pick the requested rows.
    ranked = es['log'].df[value.get_name()].rank(pct=True)
    expected = ranked.loc[range(10, 17)]
    actual = result[percentile.get_name()]

    for want, got in zip(expected.values, actual.values):
        # Two nulls count as a match; otherwise values must be equal.
        assert (pd.isnull(want) and pd.isnull(got)) or want == got
def test_direct_squared(es):
    """A feature multiplied by itself should equal the square of the base feature.

    Columns are looked up by feature name rather than by integer position, so
    the check does not depend on the column order of the result or on
    positional fallback for integer keys on a label-indexed Series.
    """
    feature = IdentityFeature(es['log']['value'])
    squared = feature * feature
    feature_set = FeatureSet([feature, squared])
    calculator = FeatureSetCalculator(es, time_last=None, feature_set=feature_set)
    df = to_pandas(calculator.run(np.array([0, 1, 2])))

    base_name = feature.get_name()
    squared_name = squared.get_name()
    for _, row in df.iterrows():
        assert row[base_name] * row[base_name] == row[squared_name]
def test_direct_from_identity(es):
    """Direct feature of the session device type, computed on log rows 0 and 5."""
    device = es['sessions']['device_type']
    direct = DirectFeature(base_feature=device, child_entity=es['log'])
    calculator = FeatureSetCalculator(
        es,
        feature_set=FeatureSet([direct]),
        time_last=None,
    )

    raw = calculator.run(np.array([0, 5]))
    # Normalise to a pandas frame indexed and sorted by id before comparing.
    frame = to_pandas(raw, index='id', sort_index=True)

    device_types = frame[direct.get_name()].tolist()
    assert device_types == [0, 1]
def test_make_agg_feat_of_grandchild_entity(es):
    """Count of log ids aggregated up to customers is 10 for customer 0."""
    log_count = ft.Feature(
        es['log']['id'],
        parent_entity=es['customers'],
        primitive=Count,
    )
    calculator = FeatureSetCalculator(
        es,
        time_last=None,
        feature_set=FeatureSet([log_count]),
    )
    result = calculator.run(np.array([0]))

    observed = result[log_count.get_name()][0]
    assert observed == 10
def test_make_identity(es):
    """Identity feature on log datetime returns 2011-04-09 10:30:00 for id 0."""
    identity = IdentityFeature(es['log']['datetime'])
    calculator = FeatureSetCalculator(
        es,
        time_last=None,
        feature_set=FeatureSet([identity]),
    )
    result = calculator.run(np.array([0]))

    expected = datetime(2011, 4, 9, 10, 30, 0)
    observed = result[identity.get_name()][0]
    assert observed == expected
def test_diff_reordered(es):
    """Diff of the per-session value sum, requested for ids in the order [4, 2]."""
    session_sum = ft.Feature(
        es['log']['value'],
        parent_entity=es["sessions"],
        primitive=Sum,
    )
    diff = ft.Feature(session_sum, primitive=Diff)
    calculator = FeatureSetCalculator(es, feature_set=FeatureSet([diff]))

    result = calculator.run(np.array([4, 2]))
    diff_values = result[diff.get_name()]

    # Values are looked up by instance id, not by position in the request.
    assert diff_values[4] == 16
    assert diff_values[2] == -6
def test_make_agg_feat_of_identity_index_variable(es):
    """Count of log ids aggregated to sessions is 5 for session 0."""
    log_count = ft.Feature(
        es['log']['id'],
        parent_entity=es['sessions'],
        primitive=Count,
    )
    calculator = FeatureSetCalculator(
        es,
        time_last=None,
        feature_set=FeatureSet([log_count]),
    )
    result = calculator.run(np.array([0]))

    observed = result[log_count.get_name()][0]
    assert observed == 5
def check(feature):
    """Assert that instance 2 gets the same values in two overlapping runs.

    ``es`` is not a parameter; it is resolved from a surrounding scope that is
    not visible in this block.
    """
    calculator = FeatureSetCalculator(
        es,
        feature_set=FeatureSet([feature]),
        time_last=None,
    )
    first_run = calculator.run(np.array([0, 1, 2]))
    second_run = calculator.run(np.array([2, 4]))

    # Instance id 2 appears in both runs and must match across them.
    matches = second_run.loc[2] == first_run.loc[2]
    assert matches.all()
def test_make_dfeat(es):
    """Direct feature of customer age on sessions gives 33 for session 0."""
    direct_age = DirectFeature(es['customers']['age'], child_entity=es['sessions'])
    calculator = FeatureSetCalculator(
        es,
        time_last=None,
        feature_set=FeatureSet([direct_age]),
    )
    result = calculator.run(np.array([0]))

    observed = result[direct_age.get_name()][0]
    assert observed == 33
def test_make_trans_feat(es):
    """Hour transform of log datetime gives 10 for instance 0."""
    hour = ft.Feature(es['log']['datetime'], primitive=Hour)
    calculator = FeatureSetCalculator(es, feature_set=FeatureSet([hour]))

    result = calculator.run(np.array([0]))
    # Materialise the result when the calculator hands back a dask DataFrame.
    if isinstance(result, dd.DataFrame):
        result = result.compute()

    observed = result[hour.get_name()][0]
    assert observed == 10
def test_with_features_built_from_es_metadata(es):
    """Features defined against ``es.metadata`` can be calculated on ``es``."""
    metadata = es.metadata
    log_count = ft.Feature(
        metadata['log']['id'],
        parent_entity=metadata['customers'],
        primitive=Count,
    )
    # The feature is built from metadata, but the calculator runs on the real entityset.
    calculator = FeatureSetCalculator(
        es,
        time_last=None,
        feature_set=FeatureSet([log_count]),
    )
    result = calculator.run(np.array([0]))

    observed = result[log_count.get_name()][0]
    assert observed == 10
def test_make_identity(es):
    """Identity feature on the log datetime column returns 2011-04-09 10:30:00 for id 0."""
    identity = IdentityFeature(es["log"].ww["datetime"])
    calculator = FeatureSetCalculator(
        es,
        time_last=None,
        feature_set=FeatureSet([identity]),
    )
    result = to_pandas(calculator.run(np.array([0])))

    expected = datetime(2011, 4, 9, 10, 30, 0)
    observed = result[identity.get_name()][0]
    assert observed == expected
def test_direct_from_variable(es):
    """Direct feature built straight from a variable; expected to behave like
    test_direct_from_identity."""
    device = es['sessions']['device_type']
    direct = DirectFeature(base_feature=device, child_entity=es['log'])
    calculator = FeatureSetCalculator(
        es,
        feature_set=FeatureSet([direct]),
        time_last=None,
    )
    # Instance ids are passed as a plain list here.
    result = calculator.run([0, 5])

    device_types = result[direct.get_name()].tolist()
    assert device_types == [0, 1]
def test_agg_empty_child(es):
    """Count over an empty child frame should come back as 0."""
    customer_count_feat = ft.Feature(
        es['log']['id'],
        parent_entity=es['customers'],
        primitive=Count,
    )
    # The cutoff precedes any of the customer's events, so the child frame is empty.
    cutoff = datetime(2011, 4, 8)
    calculator = FeatureSetCalculator(
        es,
        time_last=cutoff,
        feature_set=FeatureSet([customer_count_feat]),
    )
    result = calculator.run(np.array([0]))

    first_count = result["COUNT(log)"].iloc[0]
    assert first_count == 0
def test_make_agg_feat_of_agg_feat(es):
    """Sum over customers of the per-session log count is 10 for customer 0."""
    session_log_count = ft.Feature(
        es['log']['id'],
        parent_entity=es['sessions'],
        primitive=Count,
    )
    # Stack a second aggregation on top of the first one.
    customer_total = ft.Feature(
        session_log_count,
        parent_entity=es['customers'],
        primitive=Sum,
    )
    calculator = FeatureSetCalculator(
        es,
        time_last=None,
        feature_set=FeatureSet([customer_total]),
    )
    result = calculator.run(np.array([0]))

    observed = result[customer_total.get_name()][0]
    assert observed == 10
def test_make_dfeat(es):
    """Direct feature of customer age on the sessions dataframe gives 33 for session 0."""
    age = ft.Feature(es['customers'].ww['age'])
    direct_age = DirectFeature(age, child_dataframe_name='sessions')
    calculator = FeatureSetCalculator(
        es,
        time_last=None,
        feature_set=FeatureSet([direct_age]),
    )
    result = to_pandas(calculator.run(np.array([0])))

    observed = result[direct_age.get_name()][0]
    assert observed == 33
def test_agg_percentile(es):
    """Sum of the Percentile of log value per session should match a pandas reference.

    The reference is built by ranking the log values with ``rank(pct=True)``
    and summing the ranks per ``session_id`` for sessions 0 and 1.
    """
    v = ft.Feature(es['log']['value'])
    p = ft.Feature(v, primitive=Percentile)
    agg = ft.Feature(p, parent_entity=es['sessions'], primitive=Sum)
    feature_set = FeatureSet([agg])
    calculator = FeatureSetCalculator(es, feature_set)
    df = calculator.run(np.array([0, 1]))

    # Take an explicit copy of the column selection: adding a column to a
    # selection of the entity's dataframe triggers SettingWithCopyWarning and
    # makes ownership of the data ambiguous.
    log_vals = es['log'].df[[v.get_name(), 'session_id']].copy()
    log_vals['percentile'] = log_vals[v.get_name()].rank(pct=True)
    true_p = log_vals.groupby('session_id')['percentile'].sum()[[0, 1]]

    for t, a in zip(true_p.values, df[agg.get_name()].values):
        # Two nulls count as a match; otherwise values must be equal.
        assert (pd.isnull(t) and pd.isnull(a)) or t == a
def test_direct_squared(es):
    """A feature multiplied by itself should equal the square of the base feature.

    Columns are looked up by feature name rather than by integer position, so
    the check does not depend on the column order of the result or on
    positional fallback for integer keys on a label-indexed Series.
    """
    feature = IdentityFeature(es['log']['value'])
    squared = feature * feature
    feature_set = FeatureSet([feature, squared])
    calculator = FeatureSetCalculator(es, time_last=None, feature_set=feature_set)
    df = calculator.run(np.array([0, 1, 2]))
    # Materialise the result when the calculator hands back a dask DataFrame.
    if isinstance(df, dd.DataFrame):
        df = df.compute()

    base_name = feature.get_name()
    squared_name = squared.get_name()
    for _, row in df.iterrows():
        assert row[base_name] * row[base_name] == row[squared_name]
def test_make_3_stacked_agg_feats(df):
    """
    Tests stacking 3 agg features.

    The test specifically uses non numeric indices to test how ancestor columns
    are handled as dataframes are merged together.
    """
    if isinstance(df, dd.DataFrame):
        # Reason string corrected: the method being referred to is normalize_dataframe.
        pytest.xfail("normalize_dataframe fails with dask DataFrame")

    es = ft.EntitySet()
    ltypes = {
        "e1": Categorical,
        "e2": Categorical,
        "e3": Categorical,
        "val": Double,
    }
    es.add_dataframe(
        dataframe=df,
        index="id",
        dataframe_name="e0",
        logical_types=ltypes,
    )

    # Split e0 into a chain of dataframes: e0 -> e1 -> e2 -> e3.
    es.normalize_dataframe(
        base_dataframe_name="e0",
        new_dataframe_name="e1",
        index="e1",
        additional_columns=["e2", "e3"],
    )
    es.normalize_dataframe(
        base_dataframe_name="e1",
        new_dataframe_name="e2",
        index="e2",
        additional_columns=["e3"],
    )
    es.normalize_dataframe(
        base_dataframe_name="e2",
        new_dataframe_name="e3",
        index="e3",
    )

    # Stack three Sum aggregations, one per level of the chain.
    sum_1 = ft.Feature(es["e0"].ww["val"], parent_dataframe_name="e1", primitive=Sum)
    sum_2 = ft.Feature(sum_1, parent_dataframe_name="e2", primitive=Sum)
    sum_3 = ft.Feature(sum_2, parent_dataframe_name="e3", primitive=Sum)

    feature_set = FeatureSet([sum_3])
    calculator = FeatureSetCalculator(es, time_last=None, feature_set=feature_set)
    df = calculator.run(np.array(["z"]))
    v = df[sum_3.get_name()][0]
    assert v == 5
def test_returns_order_of_instance_ids(pd_es):
    """The result index follows the order of the requested instance ids."""
    age = ft.Feature(pd_es['customers']['age'])
    calculator = FeatureSetCalculator(
        pd_es,
        time_last=None,
        feature_set=FeatureSet([age]),
    )
    instance_ids = [0, 1, 2]

    # Precondition: the ids stored in the entity differ from the requested list.
    stored_ids = list(pd_es['customers'].df['id'])
    assert stored_ids != instance_ids

    result = calculator.run(np.array(instance_ids))
    assert list(result.index) == instance_ids
def test_feature_trie_without_needs_full_entity(diamond_es):
    """Check the feature trie node values for features over the diamond entityset.

    Every node value asserted here is a 3-tuple whose first element is False
    and whose second element is an empty set; the third element is the set of
    unique feature names expected at that node.
    """
    es = diamond_es
    mean = ft.primitives.Mean

    country_name = ft.IdentityFeature(es['countries']['name'])
    direct_name = ft.DirectFeature(country_name, es['regions'])
    amount = ft.IdentityFeature(es['transactions']['amount'])

    # Two different routes from regions down to transactions.
    path_through_customers = backward_path(es, ['regions', 'customers', 'transactions'])
    through_customers = ft.AggregationFeature(
        amount,
        es['regions'],
        primitive=mean,
        relationship_path=path_through_customers,
    )
    path_through_stores = backward_path(es, ['regions', 'stores', 'transactions'])
    through_stores = ft.AggregationFeature(
        amount,
        es['regions'],
        primitive=mean,
        relationship_path=path_through_stores,
    )

    # Mean on customers, negated, then averaged again on regions.
    customers_to_transactions = backward_path(es, ['customers', 'transactions'])
    customers_mean = ft.AggregationFeature(
        amount,
        es['customers'],
        primitive=mean,
        relationship_path=customers_to_transactions,
    )
    negation = ft.TransformFeature(customers_mean, ft.primitives.Negate)
    regions_to_customers = backward_path(es, ['regions', 'customers'])
    mean_of_mean = ft.AggregationFeature(
        negation,
        es['regions'],
        primitive=mean,
        relationship_path=regions_to_customers,
    )

    features = [direct_name, through_customers, through_stores, mean_of_mean]
    trie = FeatureSet(features).feature_trie

    def expected(names):
        # Shape of every node value asserted below.
        return (False, set(), set(names))

    root_names = {f.unique_name() for f in features}
    assert trie.value == expected(root_names)

    direct_node = trie.get_node(direct_name.relationship_path)
    assert direct_node.value == expected({country_name.unique_name()})

    customers_node = trie.get_node(regions_to_customers)
    assert customers_node.value == expected(
        {negation.unique_name(), customers_mean.unique_name()}
    )

    regions_to_stores = backward_path(es, ['regions', 'stores'])
    stores_node = trie.get_node(regions_to_stores)
    assert stores_node.value == expected(set())

    amount_only = {amount.unique_name()}
    assert trie.get_node(path_through_customers).value == expected(amount_only)
    assert trie.get_node(path_through_stores).value == expected(amount_only)
def test_direct_percentile(es):
    """Direct feature of the customer age Percentile on sessions should match pandas.

    The reference is built by ranking customer ages with ``rank(pct=True)``
    and reading the value for customer 0 twice, once per requested session.
    """
    v = ft.Feature(es['customers']['age'])
    p = ft.Feature(v, primitive=Percentile)
    d = ft.Feature(p, es['sessions'])
    feature_set = FeatureSet([d])
    calculator = FeatureSetCalculator(es, feature_set)
    df = calculator.run(np.array([0, 1]))

    # Take an explicit copy of the column selection: adding a column to a
    # selection of the entity's dataframe triggers SettingWithCopyWarning and
    # makes ownership of the data ambiguous.
    cust_vals = es['customers'].df[[v.get_name()]].copy()
    cust_vals['percentile'] = cust_vals[v.get_name()].rank(pct=True)
    # NOTE(review): presumably sessions 0 and 1 both belong to customer 0 - confirm against the fixture.
    true_p = cust_vals['percentile'].loc[[0, 0]]

    for t, a in zip(true_p.values, df[d.get_name()].values):
        # Two nulls count as a match; otherwise values must be equal.
        assert (pd.isnull(t) and pd.isnull(a)) or t == a
def test_make_agg_feat_of_identity_column(es):
    """Sum of log value aggregated to sessions is 50 for session 0."""
    value_sum = ft.Feature(
        es['log'].ww['value'],
        parent_dataframe_name='sessions',
        primitive=Sum,
    )
    calculator = FeatureSetCalculator(
        es,
        time_last=None,
        feature_set=FeatureSet([value_sum]),
    )
    result = to_pandas(calculator.run(np.array([0])))

    observed = result[value_sum.get_name()][0]
    assert observed == 50
def test_full_entity_error_dask(dask_es):
    """Running CumSum over an aggregation on a dask entityset raises ValueError."""
    value_sum = ft.Feature(
        dask_es['log']['value'],
        parent_entity=dask_es['customers'],
        primitive=Sum,
    )
    cumulative = ft.Feature(value_sum, primitive=CumSum)
    calculator = FeatureSetCalculator(
        dask_es,
        time_last=None,
        feature_set=FeatureSet([cumulative]),
    )

    expected_message = "Cannot use primitives that require full entity with Dask"
    with pytest.raises(ValueError, match=expected_message):
        calculator.run(np.array([1]))
def test_make_agg_feat_of_grandchild_entity(es):
    """Count of log ids aggregated up to customers is 10 for customer 0.

    For a dask result the frame is computed, indexed by ``id`` and its index
    cast to int64 so that the label lookup ``[0]`` works.
    """
    agg_feat = ft.Feature(es['log']['id'], parent_entity=es['customers'], primitive=Count)
    feature_set = FeatureSet([agg_feat])
    calculator = FeatureSetCalculator(es, time_last=None, feature_set=feature_set)
    df = calculator.run(np.array([0]))
    if isinstance(df, dd.DataFrame):
        df = df.compute().set_index('id')
        # pd.Int64Index was deprecated in pandas 1.4 and removed in 2.0;
        # Index.astype gives the same int64 index on every pandas version.
        df.index = df.index.astype('int64')
    v = df[agg_feat.get_name()][0]
    assert v == 10
def test_make_agg_feat_of_identity_variable(es):
    """Sum of log value aggregated to sessions is 50 for session 0."""
    value_sum = ft.Feature(
        es['log']['value'],
        parent_entity=es['sessions'],
        primitive=Sum,
    )
    calculator = FeatureSetCalculator(
        es,
        time_last=None,
        feature_set=FeatureSet([value_sum]),
    )

    result = calculator.run(np.array([0]))
    # Materialise the result when the calculator hands back a dask DataFrame.
    if isinstance(result, dd.DataFrame):
        result = result.compute()

    observed = result[value_sum.get_name()][0]
    assert observed == 50
def test_full_entity_trans_of_agg(es):
    """CumSum over the per-customer sum of log value is 82 for customer 1."""
    value_sum = ft.Feature(
        es['log']['value'],
        parent_entity=es['customers'],
        primitive=Sum,
    )
    cumulative = ft.Feature(value_sum, primitive=CumSum)
    calculator = FeatureSetCalculator(
        es,
        time_last=None,
        feature_set=FeatureSet([cumulative]),
    )
    result = calculator.run(np.array([1]))

    # Looked up by instance id 1, the only id requested.
    observed = result[cumulative.get_name()][1]
    assert observed == 82
def test_two_kinds_of_dependents(es):
    """One filtered aggregation feeds both a Percentile and an Absolute transform.

    A second, stacked aggregation (sessions then customers) with the same
    filter is calculated in the same feature set.
    """
    value = ft.Feature(es['log']['value'])
    product = ft.Feature(es['log']['product_id'])

    customer_sum = ft.Feature(
        value,
        parent_entity=es['customers'],
        where=product == 'coke zero',
        primitive=Sum,
    )
    percentile = ft.Feature(customer_sum, primitive=Percentile)
    absolute = ft.Feature(customer_sum, primitive=Absolute)

    session_sum = ft.Feature(
        value,
        parent_entity=es['sessions'],
        where=product == 'coke zero',
        primitive=Sum,
    )
    stacked_sum = ft.Feature(session_sum, parent_entity=es['customers'], primitive=Sum)

    calculator = FeatureSetCalculator(
        es,
        FeatureSet([percentile, absolute, stacked_sum]),
    )
    result = calculator.run(np.array([0, 1]))

    percentile_values = result[percentile.get_name()].tolist()
    absolute_values = result[absolute.get_name()].tolist()
    assert percentile_values == [2. / 3, 1.0]
    assert absolute_values == [15, 26]