def test_make_identity(es):
    """Calculate an identity feature on the log ``datetime`` column.

    The value calculated for instance 0 must be 2011-04-09 10:30:00.
    """
    identity = IdentityFeature(es["log"].ww["datetime"])
    calculator = FeatureSetCalculator(
        es,
        time_last=None,
        feature_set=FeatureSet([identity]),
    )
    instance_ids = np.array([0])
    result = to_pandas(calculator.run(instance_ids))

    expected = datetime(2011, 4, 9, 10, 30, 0)
    assert result[identity.get_name()][0] == expected
def test_dependent_percentile(es):
    """Calculate a Percentile feature alongside a Percentile stacked on it.

    The first percentile feature is compared, for log ids 10 through 16,
    against ``rank(pct=True)`` computed directly on the log dataframe.
    A pair of nulls counts as a match.
    """
    value = ft.Feature(es['log']['value'])
    pct = ft.Feature(value, primitive=Percentile)
    pct_of_pct = ft.Feature(pct - 1, primitive=Percentile)

    calculator = FeatureSetCalculator(es, FeatureSet([pct, pct_of_pct]))
    result = calculator.run(np.array(range(10, 17)))

    ranked = es['log'].df[value.get_name()].rank(pct=True)
    expected = ranked.loc[range(10, 17)]
    actual = result[pct.get_name()]
    for want, got in zip(expected.values, actual.values):
        assert (pd.isnull(want) and pd.isnull(got)) or want == got
def test_direct_from_identity(es):
    """Bring the sessions ``device_type`` variable down onto the log entity.

    Log instances 0 and 5 are expected to yield device types 0 and 1.
    """
    direct = DirectFeature(
        base_feature=es['sessions']['device_type'],
        child_entity=es['log'],
    )
    calculator = FeatureSetCalculator(
        es,
        feature_set=FeatureSet([direct]),
        time_last=None,
    )
    result = calculator.run(np.array([0, 5]))

    device_types = result[direct.get_name()].tolist()
    assert device_types == [0, 1]
def test_make_dfeat(es):
    """Calculate a direct feature exposing the customer ``age`` on sessions.

    Session 0 is expected to carry an age of 33.
    """
    age_on_session = DirectFeature(
        es['customers']['age'],
        child_entity=es['sessions'],
    )
    calculator = FeatureSetCalculator(
        es,
        time_last=None,
        feature_set=FeatureSet([age_on_session]),
    )
    result = to_pandas(calculator.run(np.array([0])))

    age = result[age_on_session.get_name()][0]
    assert age == 33
def test_make_agg_feat_using_prev_n_events(es):
    """Calculate Min aggregations limited to the previous N log observations.

    Two features (last 1 and last 3 observations) are calculated for
    session 0 at two different ``time_last`` values and compared against
    fixed expected minimums.
    """
    def min_over_previous(n_observations):
        # Min of log values per session, restricted to the last N observations.
        return ft.Feature(
            es['log']['value'],
            parent_entity=es['sessions'],
            use_previous=Timedelta(n_observations, 'observations', entity=es['log']),
            primitive=Min,
        )

    min_last_1 = min_over_previous(1)
    min_last_3 = min_over_previous(3)

    assert min_last_1.get_name() != min_last_3.get_name(), \
        'Features should have different names based on use_previous'

    feature_set = FeatureSet([min_last_1, min_last_3])

    # (time_last, expected min over last 1, expected min over last 3)
    # time_last is included by default
    scenarios = [
        (datetime(2011, 4, 9, 10, 30, 6), 5, 0),
        (datetime(2011, 4, 9, 10, 30, 30), 20, 10),
    ]
    for cutoff, expected_1, expected_3 in scenarios:
        calculator = FeatureSetCalculator(es, time_last=cutoff, feature_set=feature_set)
        result = calculator.run(np.array([0]))

        actual_1 = result[min_last_1.get_name()][0]
        actual_3 = result[min_last_3.get_name()][0]
        assert actual_1 == expected_1
        assert actual_3 == expected_3
def test_make_dfeat(es):
    """Calculate a direct feature exposing the customer ``age`` on sessions.

    The base feature is built from the ``customers`` dataframe and the child
    is referenced by dataframe name. Session 0 is expected to carry age 33.
    """
    base_age = ft.Feature(es["customers"].ww["age"])
    age_on_session = DirectFeature(base_age, child_dataframe_name="sessions")

    calculator = FeatureSetCalculator(
        es,
        time_last=None,
        feature_set=FeatureSet([age_on_session]),
    )
    result = to_pandas(calculator.run(np.array([0])))

    age = result[age_on_session.get_name()][0]
    assert age == 33
def test_make_3_stacked_agg_feats(df):
    """
    Tests stacking 3 agg features.

    The test specifically uses non numeric indices to test how ancestor
    columns are handled as dataframes are merged together.

    ``df`` is the input dataframe; it is registered as dataframe ``e0`` and
    then normalized into the chain e0 -> e1 -> e2 -> e3. A Sum is stacked
    across each level and the value calculated for instance "z" of ``e3``
    must be 5.
    """
    if isinstance(df, dd.DataFrame):
        pytest.xfail("normalize_dataframe fails with dask DataFrame")

    es = ft.EntitySet()
    ltypes = {
        "e1": Categorical,
        "e2": Categorical,
        "e3": Categorical,
        "val": Double,
    }
    es.add_dataframe(dataframe=df, index="id", dataframe_name="e0", logical_types=ltypes)

    es.normalize_dataframe(
        base_dataframe_name="e0",
        new_dataframe_name="e1",
        index="e1",
        additional_columns=["e2", "e3"],
    )
    es.normalize_dataframe(
        base_dataframe_name="e1",
        new_dataframe_name="e2",
        index="e2",
        additional_columns=["e3"],
    )
    es.normalize_dataframe(base_dataframe_name="e2", new_dataframe_name="e3", index="e3")

    sum_1 = ft.Feature(es["e0"].ww["val"], parent_dataframe_name="e1", primitive=Sum)
    sum_2 = ft.Feature(sum_1, parent_dataframe_name="e2", primitive=Sum)
    sum_3 = ft.Feature(sum_2, parent_dataframe_name="e3", primitive=Sum)

    feature_set = FeatureSet([sum_3])
    calculator = FeatureSetCalculator(es, time_last=None, feature_set=feature_set)

    # Bind the result to its own name instead of rebinding the ``df`` argument.
    feature_matrix = calculator.run(np.array(["z"]))

    # Only one instance ("z") was requested, so take the first row by position.
    # Indexing with ``[0]`` on a non-integer index relies on the deprecated
    # positional fallback of ``Series.__getitem__``.
    v = feature_matrix[sum_3.get_name()].iloc[0]
    assert v == 5
def test_returns_order_of_instance_ids(pd_es):
    """Check that the result index follows the order of the requested ids.

    The requested ids are first confirmed to differ from the order of the
    ``id`` column in the customers dataframe, so a match on the output
    index is not a coincidence of the stored order.
    """
    age = ft.Feature(pd_es['customers']['age'])
    calculator = FeatureSetCalculator(
        pd_es,
        time_last=None,
        feature_set=FeatureSet([age]),
    )

    instance_ids = [0, 1, 2]
    stored_order = list(pd_es['customers'].df['id'])
    assert stored_order != instance_ids

    result = calculator.run(np.array(instance_ids))
    assert list(result.index) == instance_ids
def test_make_agg_feat_of_identity_index_variable(es):
    """Count log rows per session using the log index variable ``id``.

    Session 0 is expected to have a count of 5.
    """
    log_count = ft.Feature(
        es['log']['id'],
        parent_entity=es['sessions'],
        primitive=Count,
    )
    calculator = FeatureSetCalculator(
        es,
        time_last=None,
        feature_set=FeatureSet([log_count]),
    )
    result = calculator.run(np.array([0]))

    count = result[log_count.get_name()][0]
    assert count == 5
def test_make_agg_feat_of_agg_feat(es):
    """Stack a Sum over customers on top of a Count of log rows per session.

    Customer 0 is expected to have a summed count of 10.
    """
    logs_per_session = ft.Feature(
        es['log']['id'],
        parent_entity=es['sessions'],
        primitive=Count,
    )
    logs_per_customer = ft.Feature(
        logs_per_session,
        parent_entity=es['customers'],
        primitive=Sum,
    )

    calculator = FeatureSetCalculator(
        es,
        time_last=None,
        feature_set=FeatureSet([logs_per_customer]),
    )
    result = calculator.run(np.array([0]))

    total = result[logs_per_customer.get_name()][0]
    assert total == 10
def test_direct_from_variable(es):
    """Bring the sessions ``device_type`` variable down onto the log entity.

    The result is converted to pandas, indexed by ``id`` and sorted before
    checking that log instances 0 and 5 yield device types 0 and 1.
    """
    # should be same behavior as test_direct_from_identity
    direct = DirectFeature(
        base_feature=es['sessions']['device_type'],
        child_entity=es['log'],
    )
    calculator = FeatureSetCalculator(
        es,
        feature_set=FeatureSet([direct]),
        time_last=None,
    )
    raw_result = calculator.run(np.array([0, 5]))
    result = to_pandas(raw_result, index='id', sort_index=True)

    device_types = result[direct.get_name()].tolist()
    assert device_types == [0, 1]
def test_direct_squared(es):
    """Check that a feature multiplied by itself equals its squared value.

    The identity feature on the log ``value`` variable and its product with
    itself are calculated for log instances 0, 1 and 2. For every row, the
    first column squared must equal the second column.
    """
    feature = IdentityFeature(es['log']['value'])
    squared = feature * feature
    feature_set = FeatureSet([feature, squared])
    calculator = FeatureSetCalculator(es, time_last=None, feature_set=feature_set)
    df = calculator.run(np.array([0, 1, 2]))
    if isinstance(df, dd.DataFrame):
        df = df.compute()

    # Each row is a Series keyed by column name, so columns are read by
    # position with ``.iloc``. Plain ``row[0]`` relies on the deprecated
    # integer-key positional fallback of ``Series.__getitem__``.
    # Assumes the columns come back in feature_set order — as the original did.
    for _, row in df.iterrows():
        assert (row.iloc[0] * row.iloc[0]) == row.iloc[1]
def test_make_agg_feat_of_grandchild_entity(es):
    """Count log rows per customer, aggregating from ``log`` to ``customers``.

    Customer 0 is expected to have a count of 10.
    """
    logs_per_customer = ft.Feature(
        es['log']['id'],
        parent_entity=es['customers'],
        primitive=Count,
    )
    calculator = FeatureSetCalculator(
        es,
        time_last=None,
        feature_set=FeatureSet([logs_per_customer]),
    )
    result = calculator.run(np.array([0]))

    count = result[logs_per_customer.get_name()][0]
    assert count == 10
def test_agg_percentile(es):
    """Sum a Percentile feature of log values per session.

    The calculated sums for sessions 0 and 1 are compared with the same
    quantity computed directly on the log dataframe: ``rank(pct=True)`` of
    the value column, summed per ``session_id``. A pair of nulls counts as
    a match.
    """
    v = ft.Feature(es['log']['value'])
    p = ft.Feature(v, primitive=Percentile)
    agg = ft.Feature(p, parent_entity=es['sessions'], primitive=Sum)
    feature_set = FeatureSet([agg])
    calculator = FeatureSetCalculator(es, feature_set)
    df = calculator.run(np.array([0, 1]))

    # Work on an explicit copy: assigning a new column onto a column-subset
    # selection of the entity dataframe triggers SettingWithCopyWarning.
    log_vals = es['log'].df[[v.get_name(), 'session_id']].copy()
    log_vals['percentile'] = log_vals[v.get_name()].rank(pct=True)
    true_p = log_vals.groupby('session_id')['percentile'].sum()[[0, 1]]

    for t, a in zip(true_p.values, df[agg.get_name()].values):
        assert (pd.isnull(t) and pd.isnull(a)) or t == a
def test_direct_from_identity(es):
    """Bring the sessions ``device_type`` variable down onto the log entity.

    A dask result is computed, indexed by ``id`` and sorted first. Log
    instances 0 and 5 are expected to yield device types 0 and 1.
    """
    direct = DirectFeature(
        base_feature=es['sessions']['device_type'],
        child_entity=es['log'],
    )
    calculator = FeatureSetCalculator(
        es,
        feature_set=FeatureSet([direct]),
        time_last=None,
    )
    result = calculator.run(np.array([0, 5]))
    if isinstance(result, dd.DataFrame):
        computed = result.compute()
        result = computed.set_index('id').sort_index()

    device_types = result[direct.get_name()].tolist()
    assert device_types == [0, 1]
def calc_results(time_last, ids, precalculated_features=None, training_window=None):
    """Run a FeatureSetCalculator for ``ids`` and return what ``run`` returns.

    ``entityset`` and ``feature_set`` are not parameters; they are resolved
    from the surrounding scope.

    Args:
        time_last: Passed as the third positional argument to the calculator.
        ids: Instance ids handed to ``calculator.run``.
        precalculated_features: Forwarded to the calculator unchanged.
        training_window: Forwarded to the calculator unchanged.
    """
    return FeatureSetCalculator(
        entityset,
        feature_set,
        time_last,
        training_window=training_window,
        precalculated_features=precalculated_features,
    ).run(ids)
def test_full_entity_trans_of_agg(es):
    """Apply CumSum on top of a Sum of log values per customer.

    The value calculated for customer 1 is expected to be 82.
    """
    value_sum = ft.Feature(
        es['log']['value'],
        parent_entity=es['customers'],
        primitive=Sum,
    )
    running_total = ft.Feature(value_sum, primitive=CumSum)

    calculator = FeatureSetCalculator(
        es,
        time_last=None,
        feature_set=FeatureSet([running_total]),
    )
    result = calculator.run(np.array([1]))

    # The result is looked up by the requested instance id (1), not position.
    total = result[running_total.get_name()][1]
    assert total == 82
def test_make_agg_feat_using_prev_time(es):
    """Count log rows per session within a 10 second ``use_previous`` window.

    Session 0 is calculated at two ``time_last`` values; the expected counts
    are 2 at 10:30:10 and 1 at 10:30:30.
    """
    windowed_count = ft.Feature(
        es['log']['id'],
        parent_entity=es['sessions'],
        use_previous=Timedelta(10, 's'),
        primitive=Count,
    )
    feature_set = FeatureSet([windowed_count])

    # (time_last, expected count)
    scenarios = [
        (datetime(2011, 4, 9, 10, 30, 10), 2),
        (datetime(2011, 4, 9, 10, 30, 30), 1),
    ]
    for cutoff, expected in scenarios:
        calculator = FeatureSetCalculator(es, time_last=cutoff, feature_set=feature_set)
        result = calculator.run(np.array([0]))

        count = result[windowed_count.get_name()][0]
        assert count == expected
def test_make_agg_feat_of_identity_column(es):
    """Sum the log ``value`` column per session.

    Session 0 is expected to have a sum of 50.
    """
    value_sum = ft.Feature(
        es['log'].ww['value'],
        parent_dataframe_name='sessions',
        primitive=Sum,
    )
    calculator = FeatureSetCalculator(
        es,
        time_last=None,
        feature_set=FeatureSet([value_sum]),
    )
    result = to_pandas(calculator.run(np.array([0])))

    total = result[value_sum.get_name()][0]
    assert total == 50
def test_two_kinds_of_dependents(es):
    """Calculate one aggregation feeding both transform and stacked features.

    A filtered Sum per customer feeds a Percentile and an Absolute feature,
    while a separate filtered Sum per session is summed again per customer.
    Only the Percentile and Absolute outputs for customers 0 and 1 are
    checked.
    """
    value = ft.Feature(es['log']['value'])
    product = ft.Feature(es['log']['product_id'])

    customer_sum = ft.Feature(
        value,
        parent_entity=es['customers'],
        where=product == 'coke zero',
        primitive=Sum,
    )
    percentile = ft.Feature(customer_sum, primitive=Percentile)
    absolute = ft.Feature(customer_sum, primitive=Absolute)

    session_sum = ft.Feature(
        value,
        parent_entity=es['sessions'],
        where=product == 'coke zero',
        primitive=Sum,
    )
    stacked_sum = ft.Feature(session_sum, parent_entity=es['customers'], primitive=Sum)

    calculator = FeatureSetCalculator(es, FeatureSet([percentile, absolute, stacked_sum]))
    result = calculator.run(np.array([0, 1]))

    percentile_values = result[percentile.get_name()].tolist()
    assert percentile_values == [2. / 3, 1.0]

    absolute_values = result[absolute.get_name()].tolist()
    assert absolute_values == [15, 26]
def test_make_agg_feat_using_prev_n_events(es):
    """Calculate Min aggregations limited to the previous N log observations.

    The test is xfailed unless every entity is backed by a pandas DataFrame.
    Two features (last 1 and last 3 observations) are calculated for
    session 0 at two ``time_last`` values and compared against fixed
    expected minimums.
    """
    for entity in es.entities:
        if not isinstance(entity.df, pd.DataFrame):
            pytest.xfail('Distrubuted entitysets do not support use_previous')

    def min_over_previous(n_observations):
        # Min of log values per session, restricted to the last N observations.
        return ft.Feature(
            es['log']['value'],
            parent_entity=es['sessions'],
            use_previous=Timedelta(n_observations, 'observations'),
            primitive=Min,
        )

    min_last_1 = min_over_previous(1)
    min_last_3 = min_over_previous(3)

    assert min_last_1.get_name() != min_last_3.get_name(), \
        'Features should have different names based on use_previous'

    feature_set = FeatureSet([min_last_1, min_last_3])

    # (time_last, expected min over last 1, expected min over last 3)
    # time_last is included by default
    scenarios = [
        (datetime(2011, 4, 9, 10, 30, 6), 5, 0),
        (datetime(2011, 4, 9, 10, 30, 30), 20, 10),
    ]
    for cutoff, expected_1, expected_3 in scenarios:
        calculator = FeatureSetCalculator(es, time_last=cutoff, feature_set=feature_set)
        result = calculator.run(np.array([0]))

        actual_1 = result[min_last_1.get_name()][0]
        actual_3 = result[min_last_3.get_name()][0]
        assert actual_1 == expected_1
        assert actual_3 == expected_3
def test_make_identity(es):
    """Calculate an identity feature on the log ``datetime`` variable.

    A dask result is computed first. The value for instance 0 must be
    2011-04-09 10:30:00.
    """
    identity = IdentityFeature(es['log']['datetime'])
    calculator = FeatureSetCalculator(
        es,
        time_last=None,
        feature_set=FeatureSet([identity]),
    )
    result = calculator.run(np.array([0]))
    if isinstance(result, dd.DataFrame):
        result = result.compute()

    expected = datetime(2011, 4, 9, 10, 30, 0)
    assert result[identity.get_name()][0] == expected
def test_make_agg_feat_of_grandchild_entity(es):
    """Count log rows per customer, aggregating from ``log`` to ``customers``.

    A dask result is computed, indexed by ``id`` and given an int64 index so
    the lookup by integer label 0 works. Customer 0 is expected to have a
    count of 10.
    """
    agg_feat = ft.Feature(es['log']['id'], parent_entity=es['customers'], primitive=Count)
    feature_set = FeatureSet([agg_feat])
    calculator = FeatureSetCalculator(es, time_last=None, feature_set=feature_set)
    df = calculator.run(np.array([0]))
    if isinstance(df, dd.DataFrame):
        df = df.compute().set_index('id')
        # ``pd.Int64Index`` is deprecated and removed in pandas 2.0; casting
        # the existing index produces the same int64 labels on old and new
        # pandas alike.
        df.index = df.index.astype('int64')
    v = df[agg_feat.get_name()][0]
    assert v == 10
def test_full_entity_error_dask(dask_es):
    """Expect a ValueError when running a CumSum-over-Sum feature on ``dask_es``.

    The error message must match the "require full entity" text below.
    """
    value_sum = ft.Feature(
        dask_es['log']['value'],
        parent_entity=dask_es['customers'],
        primitive=Sum,
    )
    running_total = ft.Feature(value_sum, primitive=CumSum)

    calculator = FeatureSetCalculator(
        dask_es,
        time_last=None,
        feature_set=FeatureSet([running_total]),
    )

    expected_message = "Cannot use primitives that require full entity with Dask"
    with pytest.raises(ValueError, match=expected_message):
        calculator.run(np.array([1]))
def test_with_features_built_from_es_metadata(es):
    """Calculate a feature defined on ``es.metadata`` against ``es`` itself.

    A Count of log rows per customer is defined from the metadata entities;
    customer 0 is expected to have a count of 10.
    """
    metadata = es.metadata
    logs_per_customer = ft.Feature(
        metadata['log']['id'],
        parent_entity=metadata['customers'],
        primitive=Count,
    )

    calculator = FeatureSetCalculator(
        es,
        time_last=None,
        feature_set=FeatureSet([logs_per_customer]),
    )
    result = calculator.run(np.array([0]))

    count = result[logs_per_customer.get_name()][0]
    assert count == 10
def test_direct_percentile(es):
    """Bring a Percentile of customer ``age`` down onto sessions.

    The values calculated for sessions 0 and 1 are compared with
    ``rank(pct=True)`` of the age column computed directly on the customers
    dataframe, taken at label 0 for both sessions. A pair of nulls counts
    as a match.
    """
    v = ft.Feature(es['customers']['age'])
    p = ft.Feature(v, primitive=Percentile)
    d = ft.Feature(p, es['sessions'])
    feature_set = FeatureSet([d])
    calculator = FeatureSetCalculator(es, feature_set)
    df = calculator.run(np.array([0, 1]))

    # Work on an explicit copy: assigning a new column onto a column-subset
    # selection of the entity dataframe triggers SettingWithCopyWarning.
    cust_vals = es['customers'].df[[v.get_name()]].copy()
    cust_vals['percentile'] = cust_vals[v.get_name()].rank(pct=True)
    # Label 0 is selected twice, once for each of the two sessions.
    true_p = cust_vals['percentile'].loc[[0, 0]]

    for t, a in zip(true_p.values, df[d.get_name()].values):
        assert (pd.isnull(t) and pd.isnull(a)) or t == a
def test_agg_empty_child(es):
    """Count log rows per customer at a cutoff that precedes all their events.

    A dask result is computed first. The ``COUNT(log)`` column's first row
    must be 0.
    """
    logs_per_customer = ft.Feature(
        es['log']['id'],
        parent_entity=es['customers'],
        primitive=Count,
    )

    # time last before the customer had any events, so child frame is empty
    cutoff = datetime(2011, 4, 8)
    calculator = FeatureSetCalculator(
        es,
        time_last=cutoff,
        feature_set=FeatureSet([logs_per_customer]),
    )
    result = calculator.run(np.array([0]))
    if isinstance(result, dd.DataFrame):
        result = result.compute()

    assert result["COUNT(log)"].iloc[0] == 0
def test_make_agg_feat_of_grandchild_dataframe(es):
    """Count log rows per customer, aggregating from ``log`` to ``customers``.

    The result is converted to pandas indexed by ``id``; its first value is
    expected to be 10.
    """
    logs_per_customer = ft.Feature(
        es['log'].ww['id'],
        parent_dataframe_name='customers',
        primitive=Count,
    )
    calculator = FeatureSetCalculator(
        es,
        time_last=None,
        feature_set=FeatureSet([logs_per_customer]),
    )
    raw_result = calculator.run(np.array([0]))
    result = to_pandas(raw_result, index='id')

    count = result[logs_per_customer.get_name()].values[0]
    assert count == 10
def test_agg_empty_child(es):
    """Count log rows per customer at a cutoff that precedes all their events.

    The result is converted to pandas indexed by ``id``. The ``COUNT(log)``
    column's first row must be 0.
    """
    logs_per_customer = ft.Feature(
        es['log']['id'],
        parent_entity=es['customers'],
        primitive=Count,
    )

    # time last before the customer had any events, so child frame is empty
    cutoff = datetime(2011, 4, 8)
    calculator = FeatureSetCalculator(
        es,
        time_last=cutoff,
        feature_set=FeatureSet([logs_per_customer]),
    )
    raw_result = calculator.run(np.array([0]))
    result = to_pandas(raw_result, index='id')

    assert result["COUNT(log)"].iloc[0] == 0
def test_make_agg_feat_of_identity_variable(es):
    """Sum the log ``value`` variable per session.

    A dask result is computed first. Session 0 is expected to have a sum
    of 50.
    """
    value_sum = ft.Feature(
        es['log']['value'],
        parent_entity=es['sessions'],
        primitive=Sum,
    )
    calculator = FeatureSetCalculator(
        es,
        time_last=None,
        feature_set=FeatureSet([value_sum]),
    )
    result = calculator.run(np.array([0]))
    if isinstance(result, dd.DataFrame):
        result = result.compute()

    total = result[value_sum.get_name()][0]
    assert total == 50