def test_approximate_dfeat_of_need_all_values(entityset):
    es = entityset
    p = Percentile(es['log']['value'])
    agg_feat = Sum(p, es['sessions'])
    agg_feat2 = Sum(agg_feat, es['customers'])
    dfeat = DirectFeature(agg_feat2, es['sessions'])
    times = [datetime(2011, 4, 9, 10, 31, 19), datetime(2011, 4, 9, 11, 0, 0)]
    cutoff_time = pd.DataFrame({'time': times, 'instance_id': [0, 2]})
    feature_matrix = calculate_feature_matrix([dfeat, agg_feat],
                                              entityset,
                                              approximate=Timedelta(10, 's'),
                                              cutoff_time_in_index=True,
                                              cutoff_time=cutoff_time)
    log_df = es['log'].df
    instances = [0, 2]
    cutoffs = [pd.Timestamp('2011-04-09 10:31:19'),
               pd.Timestamp('2011-04-09 11:00:00')]
    approxes = [pd.Timestamp('2011-04-09 10:31:10'),
                pd.Timestamp('2011-04-09 11:00:00')]
    true_vals = []
    true_vals_approx = []
    for instance, cutoff, approx in zip(instances, cutoffs, approxes):
        # Exact value: percentiles ranked over all log rows before the
        # cutoff, summed within the target session
        log_data_cutoff = log_df[log_df['datetime'] < cutoff]
        log_data_cutoff['percentile'] = log_data_cutoff['value'].rank(pct=True)
        true_agg = log_data_cutoff.loc[log_data_cutoff['session_id'] == instance,
                                       'percentile'].fillna(0).sum()
        true_vals.append(round(true_agg, 3))
        # Approximated value: percentiles ranked over log rows before the
        # binned (approximate) cutoff, summed across the customer's sessions
        log_data_approx = log_df[log_df['datetime'] < approx]
        log_data_approx['percentile'] = log_data_approx['value'].rank(pct=True)
        true_agg_approx = log_data_approx.loc[log_data_approx['session_id'].isin([0, 1, 2]),
                                              'percentile'].fillna(0).sum()
        true_vals_approx.append(round(true_agg_approx, 3))

    lapprox = [round(x, 3) for x in feature_matrix[dfeat.get_name()].tolist()]
    test_list = [round(x, 3) for x in feature_matrix[agg_feat.get_name()].tolist()]
    assert lapprox == true_vals_approx
    assert test_list == true_vals

def test_copy_features_does_not_copy_entityset(es):
    agg = Sum(es['log']['value'], es['sessions'])
    agg_where = Sum(es['log']['value'], es['sessions'],
                    where=IdentityFeature(es['log']['value']) == 2)
    agg_use_previous = Sum(es['log']['value'], es['sessions'],
                           use_previous='4 days')
    agg_use_previous_where = Sum(es['log']['value'], es['sessions'],
                                 where=IdentityFeature(es['log']['value']) == 2,
                                 use_previous='4 days')
    features = [agg, agg_where, agg_use_previous, agg_use_previous_where]
    in_memory_size = asizeof(locals())
    copied = [f.copy() for f in features]
    new_in_memory_size = asizeof(locals())
    assert new_in_memory_size < 2 * in_memory_size
    for f, c in zip(features, copied):
        assert f.entityset
        assert c.entityset
        assert id(f.entityset) == id(c.entityset)
        if f.where:
            assert c.where
            assert id(f.where.entityset) == id(c.where.entityset)
        for bf, bf_c in zip(f.base_features, c.base_features):
            assert id(bf.entityset) == id(bf_c.entityset)
            if bf.where:
                assert bf_c.where
                assert id(bf.where.entityset) == id(bf_c.where.entityset)

def test_uses_full_entity_feat_of_approximate(entityset):
    es = entityset
    agg_feat = Sum(es['log']['value'], es['sessions'])
    agg_feat2 = Sum(agg_feat, es['customers'])
    agg_feat3 = Min(agg_feat, es['customers'])
    dfeat = DirectFeature(agg_feat2, es['sessions'])
    dfeat2 = DirectFeature(agg_feat3, es['sessions'])
    p = Percentile(dfeat)

    # only dfeat2 should be approximated
    # because Percentile needs all values
    feature_matrix_only_dfeat2 = calculate_feature_matrix(
        [dfeat2],
        entityset,
        instance_ids=[0, 2],
        approximate=Timedelta(10, 's'),
        cutoff_time_in_index=True,
        cutoff_time=[datetime(2011, 4, 9, 10, 31, 19),
                     datetime(2011, 4, 9, 11, 0, 0)])
    assert feature_matrix_only_dfeat2[dfeat2.get_name()].tolist() == [1, 0]

    feature_matrix_approx = calculate_feature_matrix(
        [p, dfeat, dfeat2, agg_feat],
        entityset,
        instance_ids=[0, 2],
        approximate=Timedelta(10, 's'),
        cutoff_time_in_index=True,
        cutoff_time=[datetime(2011, 4, 9, 10, 31, 19),
                     datetime(2011, 4, 9, 11, 0, 0)])
    assert feature_matrix_only_dfeat2[dfeat2.get_name()].tolist() == \
        feature_matrix_approx[dfeat2.get_name()].tolist()

    feature_matrix_small_approx = calculate_feature_matrix(
        [p, dfeat, dfeat2, agg_feat],
        entityset,
        instance_ids=[0, 2],
        approximate=Timedelta(10, 'ms'),
        cutoff_time_in_index=True,
        cutoff_time=[datetime(2011, 4, 9, 10, 31, 19),
                     datetime(2011, 4, 9, 11, 0, 0)])

    feature_matrix_no_approx = calculate_feature_matrix(
        [p, dfeat, dfeat2, agg_feat],
        entityset,
        instance_ids=[0, 2],
        cutoff_time_in_index=True,
        cutoff_time=[datetime(2011, 4, 9, 10, 31, 19),
                     datetime(2011, 4, 9, 11, 0, 0)])
    for f in [p, dfeat, agg_feat]:
        for fm1, fm2 in combinations([feature_matrix_approx,
                                      feature_matrix_small_approx,
                                      feature_matrix_no_approx], 2):
            assert fm1[f.get_name()].tolist() == fm2[f.get_name()].tolist()

def test_make_agg_feat_of_identity_variable(entityset, backend):
    agg_feat = Sum(entityset['log']['value'],
                   parent_entity=entityset['sessions'])
    pandas_backend = backend([agg_feat])
    df = pandas_backend.calculate_all_features(instance_ids=[0],
                                               time_last=None)
    v = df[agg_feat.get_name()][0]
    assert v == 50

def test_make_agg_feat_of_agg_feat(entityset, backend):
    log_count_feat = Count(entityset['log']['id'],
                           parent_entity=entityset['sessions'])
    customer_sum_feat = Sum(log_count_feat,
                            parent_entity=entityset['customers'])
    pandas_backend = backend([customer_sum_feat])
    df = pandas_backend.calculate_all_features(instance_ids=[0],
                                               time_last=None)
    v = df[customer_sum_feat.get_name()][0]
    assert v == 10

def test_agg_percentile(es):
    v = Feature(es['log']['value'])
    p = Percentile(v)
    agg = Sum(p, es['sessions'])
    pandas_backend = PandasBackend(es, [agg])
    df = pandas_backend.calculate_all_features([0, 1], None)

    log_vals = es['log'].df[[v.get_name(), 'session_id']]
    log_vals['percentile'] = log_vals[v.get_name()].rank(pct=True)
    true_p = log_vals.groupby('session_id')['percentile'].sum()[[0, 1]]
    for t, a in zip(true_p.values, df[agg.get_name()].values):
        assert (pd.isnull(t) and pd.isnull(a)) or t == a

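# A quick standalone illustration (an assumed aside, not part of the original
# suite) of the pandas call the percentile tests use to build their expected
# values: Series.rank(pct=True) returns each value's percentile rank in (0, 1].
def _demo_rank_pct():
    import pandas as pd

    s = pd.Series([10, 20, 30, 40])
    # Four distinct values rank to 1/4, 2/4, 3/4, 4/4
    assert s.rank(pct=True).tolist() == [0.25, 0.5, 0.75, 1.0]
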
def test_set_data_path(es):
    key = "primitive_data_folder"

    # Don't change orig_path
    orig_path = config.get(key)
    new_path = "/example/new/directory"
    filename = "test.csv"

    # Test that default path works
    sum_prim = Sum()
    assert sum_prim.get_filepath(filename) == os.path.join(orig_path, filename)

    # Test that new path works
    config.set({key: new_path})
    assert sum_prim.get_filepath(filename) == os.path.join(new_path, filename)

    # Test that new path with trailing / works
    new_path += "/"
    config.set({key: new_path})
    assert sum_prim.get_filepath(filename) == os.path.join(new_path, filename)

    # Test that the path is correct on newly defined feature
    sum_prim2 = Sum()
    assert sum_prim2.get_filepath(filename) == os.path.join(new_path, filename)

    # Ensure path was reset
    config.set({key: orig_path})
    assert config.get(key) == orig_path

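# A small standalone sketch (an assumed aside, not part of the original suite)
# of why the trailing-slash case above is safe on POSIX paths: os.path.join
# does not double the separator when the directory already ends with one.
def _demo_join_trailing_slash():
    import posixpath

    assert posixpath.join("/example/new/directory", "test.csv") == \
        "/example/new/directory/test.csv"
    assert posixpath.join("/example/new/directory/", "test.csv") == \
        "/example/new/directory/test.csv"
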
def test_get_dependencies(es):
    f = Feature(es['log']['value'])
    agg1 = Sum(f, es['sessions'])
    agg2 = Sum(agg1, es['customers'])
    d1 = Feature(agg2, es['sessions'])
    shallow = d1.get_dependencies(deep=False, ignored=None)
    deep = d1.get_dependencies(deep=True, ignored=None)
    ignored = set([agg1.hash()])
    deep_ignored = d1.get_dependencies(deep=True, ignored=ignored)
    assert [s.hash() for s in shallow] == [agg2.hash()]
    assert [d.hash() for d in deep] == [agg2.hash(), agg1.hash(), f.hash()]
    assert [d.hash() for d in deep_ignored] == [agg2.hash()]

def test_string_time_values_in_cutoff_time(entityset):
    times = ['2011-04-09 10:31:27', '2011-04-09 10:30:18']
    cutoff_time = pd.DataFrame({'time': times, 'instance_id': [0, 0]})
    agg_feature = Sum(entityset['log']['value'], entityset['customers'])

    with pytest.raises(TypeError):
        calculate_feature_matrix([agg_feature], entityset,
                                 cutoff_time=cutoff_time)

def test_to_dictionary_agg(es):
    primitive = Sum()
    actual = ft.Feature(
        es["customers"].ww["age"],
        primitive=primitive,
        parent_dataframe_name="cohorts",
    ).to_dictionary()

    expected = {
        "type": "AggregationFeature",
        "dependencies": ["customers: age"],
        "arguments": {
            "name": "SUM(customers.age)",
            "base_features": ["customers: age"],
            "relationship_path": [
                {
                    "parent_dataframe_name": "cohorts",
                    "child_dataframe_name": "customers",
                    "parent_column_name": "cohort",
                    "child_column_name": "cohort",
                }
            ],
            "primitive": primitive,
            "where": None,
            "use_previous": None,
        },
    }
    assert expected == actual

def test_to_dictionary_where(es):
    primitive = Sum()
    actual = ft.Feature(
        es["log"].ww["value"],
        parent_dataframe_name="sessions",
        where=ft.IdentityFeature(es["log"].ww["value"]) == 2,
        primitive=primitive,
    ).to_dictionary()

    expected = {
        "type": "AggregationFeature",
        "dependencies": ["log: value", "log: value = 2"],
        "arguments": {
            "name": "SUM(log.value WHERE value = 2)",
            "base_features": ["log: value"],
            "relationship_path": [
                {
                    "parent_dataframe_name": "sessions",
                    "child_dataframe_name": "log",
                    "parent_column_name": "id",
                    "child_column_name": "session_id",
                }
            ],
            "primitive": primitive,
            "where": "log: value = 2",
            "use_previous": None,
        },
    }
    assert expected == actual

def test_approximate_time_split_returns_the_same_result(entityset):
    es = entityset
    agg_feat = Count(es['log']['id'], es['sessions'])
    agg_feat2 = Sum(agg_feat, es['customers'])
    dfeat = DirectFeature(agg_feat2, es['sessions'])
    cutoff_df = pd.DataFrame({'time': [pd.Timestamp('2011-04-09 10:07:30'),
                                       pd.Timestamp('2011-04-09 10:07:40')],
                              'instance_id': [0, 0]})

    feature_matrix_at_once = calculate_feature_matrix([dfeat, agg_feat],
                                                      entityset,
                                                      approximate=Timedelta(10, 's'),
                                                      cutoff_time=cutoff_df)
    divided_matrices = []
    separate_cutoff = [cutoff_df.iloc[0:1], cutoff_df.iloc[1:]]
    # Make sure the indexes are different.
    # Note that this step is unnecessary; it is done to showcase the issue here.
    separate_cutoff[0].index = [0]
    separate_cutoff[1].index = [1]
    for ct in separate_cutoff:
        fm = calculate_feature_matrix([dfeat, agg_feat],
                                      entityset,
                                      approximate=Timedelta(10, 's'),
                                      cutoff_time=ct)
        divided_matrices.append(fm)
    feature_matrix_from_split = pd.concat(divided_matrices)
    assert feature_matrix_from_split.shape == feature_matrix_at_once.shape
    for i1, i2 in zip(feature_matrix_at_once.index,
                      feature_matrix_from_split.index):
        assert (pd.isnull(i1) and pd.isnull(i2)) or (i1 == i2)
    for c in feature_matrix_from_split:
        for i1, i2 in zip(feature_matrix_at_once[c],
                          feature_matrix_from_split[c]):
            assert (pd.isnull(i1) and pd.isnull(i2)) or (i1 == i2)

def test_cfm_no_cutoff_time_index(entityset):
    es = entityset
    agg_feat = Count(es['log']['id'], es['sessions'])
    agg_feat4 = Sum(agg_feat, es['customers'])
    dfeat = DirectFeature(agg_feat4, es['sessions'])
    feature_matrix = calculate_feature_matrix([dfeat, agg_feat],
                                              entityset,
                                              instance_ids=[0, 2],
                                              cutoff_time_in_index=False,
                                              approximate=Timedelta(12, 's'),
                                              cutoff_time=[datetime(2013, 4, 9, 10, 31, 19),
                                                           datetime(2013, 4, 9, 11, 0, 0)])
    assert feature_matrix.index.name == 'id'
    assert feature_matrix.index.values.tolist() == [0, 2]
    assert feature_matrix[dfeat.get_name()].tolist() == [10, 10]
    assert feature_matrix[agg_feat.get_name()].tolist() == [5, 1]

    feature_matrix_2 = calculate_feature_matrix([dfeat, agg_feat],
                                                entityset,
                                                instance_ids=[0, 2],
                                                cutoff_time_in_index=False,
                                                approximate=Timedelta(10, 's'),
                                                cutoff_time=[datetime(2011, 4, 9, 10, 31, 19),
                                                             datetime(2011, 4, 9, 11, 0, 0)])
    assert feature_matrix_2.index.name == 'id'
    assert feature_matrix_2.index.tolist() == [0, 2]
    assert feature_matrix_2[dfeat.get_name()].tolist() == [7, 10]
    assert feature_matrix_2[agg_feat.get_name()].tolist() == [5, 1]

def test_override_cmp(es):
    # P TODO: return
    count = Count(es['log']['value'], es['sessions'])
    _sum = Sum(es['log']['value'], es['sessions'])
    gt_lo = count > 1
    gt_other = count > _sum
    ge_lo = count >= 1
    ge_other = count >= _sum
    lt_hi = count < 10
    lt_other = count < _sum
    le_hi = count <= 10
    le_other = count <= _sum
    ne_lo = count != 1
    ne_other = count != _sum

    to_test = [[True, True, False],
               [False, False, True],
               [True, True, True],
               [False, False, True],
               [True, True, True],
               [True, True, False],
               [True, True, True],
               [True, True, False]]
    features = [gt_lo, gt_other, ge_lo, ge_other, lt_hi,
                lt_other, le_hi, le_other, ne_lo, ne_other]
    pandas_backend = PandasBackend(es, features)
    df = pandas_backend.calculate_all_features(instance_ids=[0, 1, 2],
                                               time_last=None)
    for i, test in enumerate(to_test):
        v = df[features[i].get_name()].values.tolist()
        assert v == test

def test_two_kinds_of_dependents(es):
    v = Feature(es['log']['value'])
    product = Feature(es['log']['product_id'])
    agg = Sum(v, es['customers'], where=product == 'coke zero')
    p = Percentile(agg)
    g = Absolute(agg)
    agg2 = Sum(v, es['sessions'], where=product == 'coke zero')
    # Adding this feature exercises line 218 in pandas_backend,
    # where we remove columns in result_frame that already exist
    # in the output entity_frames in preparation for pd.concat.
    # In a prior version, this failed because we changed the result_frame
    # variable itself, rather than making a new variable _result_frame.
    # When len(output_frames) > 1, the second iteration won't have
    # all the necessary columns because they were removed in the first.
    agg3 = Sum(agg2, es['customers'])
    pandas_backend = PandasBackend(es, [p, g, agg3])
    df = pandas_backend.calculate_all_features([0, 1], None)
    assert df[p.get_name()].tolist() == [0.5, 1.0]
    assert df[g.get_name()].tolist() == [15, 26]

def test_approximate_dfeat_of_dfeat_of_agg_on_target(entityset):
    es = entityset
    agg_feat = Count(es['log']['id'], es['sessions'])
    agg_feat2 = Sum(agg_feat, es['customers'])
    dfeat = DirectFeature(agg_feat2, es['log'])
    times = [datetime(2011, 4, 9, 10, 31, 19), datetime(2011, 4, 9, 11, 0, 0)]
    cutoff_time = pd.DataFrame({'time': times, 'instance_id': [0, 2]})
    feature_matrix = calculate_feature_matrix([dfeat],
                                              entityset,
                                              approximate=Timedelta(10, 's'),
                                              cutoff_time=cutoff_time)
    assert feature_matrix[dfeat.get_name()].tolist() == [7, 10]

def test_get_depth(es):
    log_id_feat = es['log']['id']
    customer_id_feat = es['customers']['id']
    count_logs = Count(log_id_feat, parent_entity=es['sessions'])
    sum_count_logs = Sum(count_logs, parent_entity=es['customers'])
    num_logs_greater_than_5 = sum_count_logs > 5
    count_customers = Count(customer_id_feat,
                            parent_entity=es[u'régions'],
                            where=num_logs_greater_than_5)
    num_customers_region = Feature(count_customers, es["customers"])

    depth = num_customers_region.get_depth()
    assert depth == 5

def test_agg_same_method_name(es):
    """
    Pandas relies on the function name when calculating aggregations.
    This means that if two primitives with the same function name are
    applied to the same column, pandas can't differentiate them. We have
    a workaround based on the name property, which we test here.
    """
    # test with normally defined functions
    def custom_primitive(x):
        return x.sum()

    Sum = make_agg_primitive(custom_primitive, input_types=[Numeric],
                             return_type=Numeric, name="sum")

    def custom_primitive(x):
        return x.max()

    Max = make_agg_primitive(custom_primitive, input_types=[Numeric],
                             return_type=Numeric, name="max")

    f_sum = Sum(es["log"]["value"], es["customers"])
    f_max = Max(es["log"]["value"], es["customers"])
    fm = ft.calculate_feature_matrix([f_sum, f_max], entityset=es)
    assert fm.columns.tolist() == [f_sum.get_name(), f_max.get_name()]

    # test with lambdas
    Sum = make_agg_primitive(lambda x: x.sum(), input_types=[Numeric],
                             return_type=Numeric, name="sum")
    Max = make_agg_primitive(lambda x: x.max(), input_types=[Numeric],
                             return_type=Numeric, name="max")
    f_sum = Sum(es["log"]["value"], es["customers"])
    f_max = Max(es["log"]["value"], es["customers"])
    fm = ft.calculate_feature_matrix([f_sum, f_max], entityset=es)
    assert fm.columns.tolist() == [f_sum.get_name(), f_max.get_name()]

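# A minimal, hypothetical sketch (not part of the original suite) of the
# pandas behavior the docstring above describes: aggregation results are
# keyed by a function's __name__, so two different callables that share a
# name collide when passed together to a groupby aggregation.
def _demo_pandas_name_collision():
    import pandas as pd

    df = pd.DataFrame({'g': [0, 0, 1], 'value': [1.0, 2.0, 3.0]})

    def agg_a(x):
        return x.sum()

    def agg_b(x):
        return x.max()

    # Force both callables to report the same name
    agg_b.__name__ = agg_a.__name__ = 'custom'
    try:
        df.groupby('g')['value'].agg([agg_a, agg_b])
    except Exception as err:
        # pandas raises a SpecificationError: function names must be unique
        print(type(err).__name__, err)
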
def test_approximate_multiple_instances_per_cutoff_time(entityset):
    es = entityset
    agg_feat = Count(es['log']['id'], es['sessions'])
    agg_feat2 = Sum(agg_feat, es['customers'])
    dfeat = DirectFeature(agg_feat2, es['sessions'])
    times = [datetime(2011, 4, 9, 10, 31, 19), datetime(2011, 4, 9, 11, 0, 0)]
    cutoff_time = pd.DataFrame({'time': times, 'instance_id': [0, 2]})
    feature_matrix = calculate_feature_matrix([dfeat, agg_feat],
                                              entityset,
                                              approximate=Timedelta(1, 'week'),
                                              cutoff_time=cutoff_time,
                                              chunk_size="cutoff time")
    assert feature_matrix.shape[0] == 2
    assert feature_matrix[dfeat.get_name()].dropna().shape[0] == 0
    assert feature_matrix[agg_feat.get_name()].tolist() == [5, 1]

def test_direct_agg_percentile(es):
    v = Feature(es['log']['value'])
    p = Percentile(v)
    agg = Sum(p, es['customers'])
    d = Feature(agg, es['sessions'])
    pandas_backend = PandasBackend(es, [d])
    df = pandas_backend.calculate_all_features([0, 1], None)

    log_vals = es['log'].df[[v.get_name(), 'session_id']]
    log_vals['percentile'] = log_vals[v.get_name()].rank(pct=True)
    log_vals['customer_id'] = [0] * 10 + [1] * 5 + [2] * 2
    true_p = log_vals.groupby('customer_id')['percentile'].sum().fillna(0)
    # sessions 0 and 1 both belong to customer 0
    true_p = true_p[[0, 0]]
    for t, a in zip(true_p.values, df[d.get_name()].values):
        assert (pd.isnull(t) and pd.isnull(a)) or round(t, 3) == round(a, 3)

def test_empty_path_approximate_partial(entityset):
    es = copy.deepcopy(entityset)
    es['sessions'].df['customer_id'] = [0, 0, np.nan, 1, 1, 2]
    agg_feat = Count(es['log']['id'], es['sessions'])
    agg_feat2 = Sum(agg_feat, es['customers'])
    dfeat = DirectFeature(agg_feat2, es['sessions'])
    times = [datetime(2011, 4, 9, 10, 31, 19), datetime(2011, 4, 9, 11, 0, 0)]
    cutoff_time = pd.DataFrame({'time': times, 'instance_id': [0, 2]})
    feature_matrix = calculate_feature_matrix([dfeat, agg_feat],
                                              es,
                                              approximate=Timedelta(10, 's'),
                                              cutoff_time=cutoff_time)
    vals1 = feature_matrix[dfeat.get_name()].tolist()
    assert vals1[0] == 7
    # session 2 has no customer, so the direct feature is NaN
    assert np.isnan(vals1[1])
    assert feature_matrix[agg_feat.get_name()].tolist() == [5, 1]

def test_approx_base_feature_is_also_first_class_feature(entityset):
    es = entityset
    log_to_products = DirectFeature(es['products']['rating'], es['log'])
    # This should still be computed properly
    agg_feat = Min(log_to_products, es['sessions'])
    customer_agg_feat = Sum(agg_feat, es['customers'])
    # This is to be approximated
    sess_to_cust = DirectFeature(customer_agg_feat, es['sessions'])
    times = [datetime(2011, 4, 9, 10, 31, 19), datetime(2011, 4, 9, 11, 0, 0)]
    cutoff_time = pd.DataFrame({'time': times, 'instance_id': [0, 2]})
    feature_matrix = calculate_feature_matrix([sess_to_cust, agg_feat],
                                              entityset,
                                              approximate=Timedelta(10, 's'),
                                              cutoff_time=cutoff_time)
    vals1 = feature_matrix[sess_to_cust.get_name()].tolist()
    assert vals1 == [8.5, 7]
    vals2 = feature_matrix[agg_feat.get_name()].tolist()
    assert vals2 == [4, 1.5]