def test_custom_primitive_time_as_arg(es):
    """Exercise a custom aggregation primitive whose function takes ``time``.

    With ``uses_calc_time=True`` the primitive is created, takes its name
    from the wrapped function, and is evaluated at a fixed cutoff time.
    Creating it with ``uses_calc_time=False`` must raise ``ValueError``.
    """
    def time_since_last(values, time):
        # Seconds between the first value and the supplied time.
        return (time - values.iloc[0]).total_seconds()

    TimeSinceLast = make_agg_primitive(time_since_last,
                                       [DatetimeTimeIndex],
                                       Numeric,
                                       uses_calc_time=True)
    assert TimeSinceLast.name == "time_since_last"

    feature = TimeSinceLast(es["log"]["datetime"], es["customers"])
    matrix = ft.calculate_feature_matrix([feature],
                                         entityset=es,
                                         instance_ids=[0, 1, 2],
                                         cutoff_time=datetime(2015, 6, 8))

    # note: must round to nearest second
    expected = [131376600, 131289600, 131287800]
    rounded = matrix[feature.get_name()].round().values
    assert all(rounded == expected)

    error_text = "'time' is a restricted keyword. Please use a different keyword."
    with pytest.raises(ValueError, match=error_text):
        make_agg_primitive(time_since_last,
                           [DatetimeTimeIndex],
                           Numeric,
                           uses_calc_time=False)
def test_time_since_last_custom(pd_es):
    """Exercise a custom time-aware aggregation primitive on ``pd_es``.

    The primitive is created with ``uses_calc_time=True`` and evaluated at a
    fixed cutoff time. Creating it with ``uses_calc_time=False`` must raise
    ``ValueError`` with the restricted-keyword message.
    """
    def time_since_last(values, time=None):
        # Seconds between the first value and the supplied time.
        return (time - values.iloc[0]).total_seconds()

    TimeSinceLast = make_agg_primitive(
        time_since_last,
        [ColumnSchema(logical_type=Datetime, semantic_tags={'time_index'})],
        ColumnSchema(semantic_tags={'numeric'}),
        name="time_since_last",
        uses_calc_time=True)

    feature = ft.Feature(pd_es["log"].ww["datetime"],
                         parent_dataframe_name="customers",
                         primitive=TimeSinceLast)
    matrix = ft.calculate_feature_matrix([feature],
                                         entityset=pd_es,
                                         instance_ids=[0, 1, 2],
                                         cutoff_time=datetime(2015, 6, 8))

    # note: must round to nearest second
    expected = [131376600, 131289600, 131287800]
    rounded = matrix[feature.get_name()].round().values
    assert all(rounded == expected)

    error_text = "'time' is a restricted keyword. Please use a different keyword."
    with pytest.raises(ValueError, match=error_text):
        make_agg_primitive(
            time_since_last,
            [ColumnSchema(logical_type=Datetime, semantic_tags={'time_index'})],
            ColumnSchema(semantic_tags={'numeric'}),
            uses_calc_time=False)
def test_custom_primitive_time_as_arg(es):
    """Exercise a custom aggregation primitive whose function takes ``time``.

    With ``uses_calc_time=True`` the primitive is created, takes its name
    from the wrapped function, and is evaluated through ``ft.Feature`` at a
    fixed cutoff time. Creating it with ``uses_calc_time=False`` must raise
    ``ValueError``.
    """
    def time_since_last(values, time):
        # Seconds between the first value and the supplied time.
        return (time - values.iloc[0]).total_seconds()

    TimeSinceLast = make_agg_primitive(time_since_last,
                                       [DatetimeTimeIndex],
                                       Numeric,
                                       uses_calc_time=True)
    assert TimeSinceLast.name == "time_since_last"

    feature = ft.Feature(es["log"]["datetime"],
                         parent_entity=es["customers"],
                         primitive=TimeSinceLast)
    matrix = ft.calculate_feature_matrix([feature],
                                         entityset=es,
                                         instance_ids=[0, 1, 2],
                                         cutoff_time=datetime(2015, 6, 8))

    # note: must round to nearest second
    expected = [131376600, 131289600, 131287800]
    rounded = matrix[feature.get_name()].round().values
    assert all(rounded == expected)

    error_text = "'time' is a restricted keyword. Please use a different keyword."
    with pytest.raises(ValueError, match=error_text):
        make_agg_primitive(time_since_last,
                           [DatetimeTimeIndex],
                           Numeric,
                           uses_calc_time=False)
def test_agg_same_method_name(es):
    """
    Pandas relies on the function name when calculating aggregations, so two
    primitives whose functions share a name and are applied to the same
    column cannot be told apart by pandas. The workaround is based on the
    primitive's ``name`` property, which is what this test checks.
    """
    # TODO: Update to work with Dask and Koalas
    if any(not isinstance(entity.df, pd.DataFrame) for entity in es.entities):
        pytest.xfail(
            "Cannot use primitives made with make_agg_primitives with Dask or Koalas EntitySets"
        )

    def check_both_columns(sum_primitive, max_primitive):
        # Build one feature per primitive on the same column and make sure
        # both come back as separate, correctly named columns.
        features = [
            ft.Feature(es["log"]["value"],
                       parent_entity=es["customers"],
                       primitive=primitive)
            for primitive in (sum_primitive, max_primitive)
        ]
        matrix = ft.calculate_feature_matrix(features, entityset=es)
        assert matrix.columns.tolist() == [f.get_name() for f in features]

    # test with normally defined functions
    def custom_primitive(x):
        return x.sum()

    Sum = make_agg_primitive(custom_primitive,
                             input_types=[Numeric],
                             return_type=Numeric,
                             name="sum")

    # Deliberately reuses the function name from above.
    def custom_primitive(x):
        return x.max()

    Max = make_agg_primitive(custom_primitive,
                             input_types=[Numeric],
                             return_type=Numeric,
                             name="max")
    check_both_columns(Sum, Max)

    # test with lambdas
    Sum = make_agg_primitive(lambda x: x.sum(),
                             input_types=[Numeric],
                             return_type=Numeric,
                             name="sum")
    Max = make_agg_primitive(lambda x: x.max(),
                             input_types=[Numeric],
                             return_type=Numeric,
                             name="max")
    check_both_columns(Sum, Max)
def test_agg_same_method_name(es):
    """
    Pandas relies on the function name when calculating aggregations, so two
    primitives whose functions share a name and are applied to the same
    column cannot be told apart by pandas. The workaround is based on the
    primitive's ``name`` property, which is what this test checks.
    """
    def check_both_columns(sum_primitive, max_primitive):
        # Build one feature per primitive on the same column and make sure
        # both come back as separate, correctly named columns.
        features = [
            ft.Feature(es["log"]["value"],
                       parent_entity=es["customers"],
                       primitive=primitive)
            for primitive in (sum_primitive, max_primitive)
        ]
        matrix = ft.calculate_feature_matrix(features, entityset=es)
        assert matrix.columns.tolist() == [f.get_name() for f in features]

    # test with normally defined functions
    def custom_primitive(x):
        return x.sum()

    Sum = make_agg_primitive(custom_primitive,
                             input_types=[Numeric],
                             return_type=Numeric,
                             name="sum")

    # Deliberately reuses the function name from above.
    def custom_primitive(x):
        return x.max()

    Max = make_agg_primitive(custom_primitive,
                             input_types=[Numeric],
                             return_type=Numeric,
                             name="max")
    check_both_columns(Sum, Max)

    # test with lambdas
    Sum = make_agg_primitive(lambda x: x.sum(),
                             input_types=[Numeric],
                             return_type=Numeric,
                             name="sum")
    Max = make_agg_primitive(lambda x: x.max(),
                             input_types=[Numeric],
                             return_type=Numeric,
                             name="max")
    check_both_columns(Sum, Max)
def test_make_three_most_common(pd_es):
    """Check a custom multi-output primitive returning the 3 most common values.

    The primitive is run through ``ft.dfs`` on the ``customers`` target for
    instance ids 0-2 and its three output columns are compared with the
    expected product ids.
    """
    def pd_top3(x):
        # Pad with NaN so the result always has exactly three entries.
        top = np.array(x.value_counts()[:3].index)
        missing = 3 - len(top)
        if missing > 0:
            top = np.append(top, np.full(missing, np.nan))
        return top

    NMostCommoner = make_agg_primitive(function=pd_top3,
                                       input_types=[Discrete],
                                       return_type=Discrete,
                                       number_output_features=3)

    fm, features = ft.dfs(entityset=pd_es,
                          target_entity="customers",
                          instance_ids=[0, 1, 2],
                          agg_primitives=[NMostCommoner],
                          trans_primitives=[])

    columns = ["PD_TOP3(log.product_id)[%s]" % i for i in range(3)]
    df = fm[columns]

    # coke zero and toothpaste have same number of occurrences
    # so just check that the top two match
    first_row = df.iloc[0].values
    assert set(first_row[:2]) == {'coke zero', 'toothpaste'}
    assert first_row[2] in ['car', 'brown bag']

    second_expected = pd.Series(
        ['coke zero', 'Haribo sugar-free gummy bears', np.nan])
    assert df.iloc[1].reset_index(drop=True).equals(second_expected)

    third_expected = pd.Series(['taco clock', np.nan, np.nan])
    assert df.iloc[2].reset_index(drop=True).equals(third_expected)
def test_override_multi_feature_names(pd_es):
    """Check that ``generate_names`` can be overridden via ``cls_attributes``.

    A three-output primitive is given a custom ``generate_names`` and run
    through ``ft.dfs`` on the ``products`` target; every name the custom
    generator produces must appear as a feature matrix column.
    """
    def gen_custom_names(primitive, base_feature_names, relationship_path_name,
                         parent_entity_id, where_str, use_prev_str):
        # The ``%s`` placeholder survives ``format`` and is filled with the
        # output index afterwards.
        template = 'Custom_%s({}.{})'.format(parent_entity_id,
                                             base_feature_names)
        count = primitive.number_output_features
        return [template % index for index in range(count)]

    def pd_top3(x):
        # Pad with NaN so the result always has exactly three entries.
        top = np.array(x.value_counts()[:3].index)
        missing = 3 - len(top)
        if missing > 0:
            top = np.append(top, np.full(missing, np.nan))
        return top

    num_features = 3
    NMostCommoner = make_agg_primitive(
        function=pd_top3,
        input_types=[Numeric],
        return_type=Discrete,
        number_output_features=num_features,
        cls_attributes={"generate_names": gen_custom_names})

    fm, features = ft.dfs(entityset=pd_es,
                          target_entity="products",
                          instance_ids=[0, 1, 2],
                          agg_primitives=[NMostCommoner],
                          trans_primitives=[])

    expected_names = [
        generated
        for base in (['value'], ['value_2'], ['value_many_nans'])
        for generated in gen_custom_names(NMostCommoner, base, None,
                                          'products', None, None)
    ]
    for expected in expected_names:
        assert expected in fm.columns
def test_pickle_features_with_custom_primitive(es):
    """Round-trip features that use a custom primitive through save/load.

    Builds features with DeepFeatureSynthesis (including a custom ``NewMean``
    primitive), saves and reloads them, and checks that every reloaded
    feature matches the original by hash and entityset. It also checks that
    the saved feature file is smaller than both the in-memory entityset and
    the pickled entityset.

    The temporary files are written next to this test module and are removed
    in a ``finally`` block, so a failing assertion does not leave them behind.
    """
    NewMean = make_agg_primitive(
        np.nanmean,
        name="NewMean",
        input_types=[Numeric],
        return_type=Numeric,
        description="Calculate means ignoring nan values")
    dfs_obj = DeepFeatureSynthesis(target_entity_id='sessions',
                                   entityset=es,
                                   agg_primitives=[Last, Mean, NewMean],
                                   trans_primitives=[],
                                   max_features=20)

    features_no_pickle = dfs_obj.build_features()
    assert any(isinstance(feat, NewMean) for feat in features_no_pickle)

    dir_path = os.path.dirname(os.path.realpath(__file__))
    filepath = os.path.join(dir_path, 'test_feature')
    es_filepath = os.path.join(dir_path, 'test_entityset')

    try:
        # pickle entityset
        save_obj_pickle(es, es_filepath)

        ft.save_features(features_no_pickle, filepath)
        features_pickle = ft.load_features(filepath)

        # zip() stops at the shorter input, so check the lengths explicitly
        # to catch features that went missing during the round trip.
        assert len(features_no_pickle) == len(features_pickle)
        for feat_1, feat_2 in zip(features_no_pickle, features_pickle):
            assert feat_1.hash() == feat_2.hash()
            assert feat_1.entityset == feat_2.entityset

        # file is smaller than entityset in memory
        assert os.path.getsize(filepath) < asizeof(es)

        # file is smaller than entityset pickled
        assert os.path.getsize(filepath) < os.path.getsize(es_filepath)
    finally:
        # Only remove what was actually created so a failure while saving is
        # not masked by a FileNotFoundError raised during cleanup.
        for path in (filepath, es_filepath):
            if os.path.exists(path):
                os.remove(path)
def test_custom_primitive_multiple_inputs(es):
    """Check a custom aggregation primitive that takes two input columns.

    The primitive is run through ``ft.dfs`` on the ``sessions`` target, first
    plainly and then, after adding interesting values, as a where primitive.
    """
    def mean_sunday(numeric, datetime):
        '''
        Finds the mean of non-null values of a feature that occurred on Sundays
        '''
        weekdays = pd.DatetimeIndex(datetime).weekday.values
        frame = pd.DataFrame({'numeric': numeric, 'time': weekdays})
        sundays = frame[frame['time'] == 6]
        return sundays['numeric'].mean()

    def same_value(left, right):
        # Two nulls count as equal; otherwise fall back to ``==``.
        return (pd.isnull(left) and pd.isnull(right)) or (left == right)

    MeanSunday = make_agg_primitive(function=mean_sunday,
                                    input_types=[Numeric, Datetime],
                                    return_type=Numeric)

    fm, features = ft.dfs(entityset=es,
                          target_entity="sessions",
                          agg_primitives=[MeanSunday],
                          trans_primitives=[])
    mean_sunday_value = pd.Series([None, None, None, 2.5, 7, None])
    plain_column = fm["MEAN_SUNDAY(log.value, datetime)"]
    for actual, expected in zip(plain_column, mean_sunday_value):
        assert same_value(actual, expected)

    es.add_interesting_values()
    mean_sunday_value_priority_0 = pd.Series([None, None, None, 2.5, 0, None])
    fm, features = ft.dfs(entityset=es,
                          target_entity="sessions",
                          agg_primitives=[MeanSunday],
                          trans_primitives=[],
                          where_primitives=[MeanSunday])
    where_feat = "MEAN_SUNDAY(log.value, datetime WHERE priority_level = 0)"
    for actual, expected in zip(fm[where_feat], mean_sunday_value_priority_0):
        assert same_value(actual, expected)
def test_count_null_and_make_agg_primitive(pd_es):
    """Check a custom ``count`` primitive that accepts a ``count_null`` kwarg.

    The primitive gets a custom ``generate_name`` through ``cls_attributes``
    and is instantiated with ``count_null=True``; the resulting feature is
    computed over ``pd_es`` and compared with the expected counts.
    """
    def count_func(values, count_null=False):
        if not len(values):
            return 0
        # Filling nulls first makes them part of the count.
        counted = values.fillna(0) if count_null else values
        return counted.count()

    def count_generate_name(self, base_feature_names, relationship_path_name,
                            parent_dataframe_name, where_str, use_prev_str):
        name_parts = (relationship_path_name, where_str, use_prev_str)
        return u"COUNT(%s%s%s)" % name_parts

    Count = make_agg_primitive(
        count_func,
        [[ColumnSchema(semantic_tags={'foreign_key'})], [ColumnSchema()]],
        ColumnSchema(semantic_tags={'numeric'}),
        name="count",
        stack_on_self=False,
        cls_attributes={"generate_name": count_generate_name})

    count_null = ft.Feature(pd_es['log'].ww['value'],
                            parent_dataframe_name='sessions',
                            primitive=Count(count_null=True))
    feature_matrix = ft.calculate_feature_matrix([count_null],
                                                 entityset=pd_es)

    values = [5, 4, 1, 2, 3, 2]
    column = feature_matrix[count_null.get_name()]
    assert (values == column).all()
def test_custom_primitive_multiple_inputs(es):
    """Check a custom aggregation primitive that takes two input columns.

    The primitive is run through ``ft.dfs`` on the ``sessions`` target, first
    plainly and then, after adding interesting values, as a where primitive.
    """
    def mean_sunday(numeric, datetime):
        '''
        Finds the mean of non-null values of a feature that occurred on Sundays
        '''
        weekday_numbers = pd.DatetimeIndex(datetime).weekday.values
        table = pd.DataFrame({'numeric': numeric, 'time': weekday_numbers})
        on_sunday = table['time'] == 6
        return table[on_sunday]['numeric'].mean()

    MeanSunday = make_agg_primitive(function=mean_sunday,
                                    input_types=[Numeric, Datetime],
                                    return_type=Numeric)

    fm, features = ft.dfs(entityset=es,
                          target_entity="sessions",
                          agg_primitives=[MeanSunday],
                          trans_primitives=[])
    mean_sunday_value = pd.Series([None, None, None, 2.5, 7, None])
    computed = fm["MEAN_SUNDAY(log.value, datetime)"]
    for got, want in zip(computed, mean_sunday_value):
        # Two nulls count as a match; otherwise the values must be equal.
        assert ((pd.isnull(got) and pd.isnull(want)) or (got == want))

    es.add_interesting_values()
    mean_sunday_value_priority_0 = pd.Series([None, None, None, 2.5, 0, None])
    fm, features = ft.dfs(entityset=es,
                          target_entity="sessions",
                          agg_primitives=[MeanSunday],
                          trans_primitives=[],
                          where_primitives=[MeanSunday])
    where_feat = "MEAN_SUNDAY(log.value, datetime WHERE priority_level = 0)"
    computed = fm[where_feat]
    for got, want in zip(computed, mean_sunday_value_priority_0):
        assert ((pd.isnull(got) and pd.isnull(want)) or (got == want))
def test_make_three_most_common(es):
    """Check a custom multi-output primitive returning the 3 most common values.

    The primitive is run through ``ft.dfs`` on the ``customers`` target and
    its three output columns are compared row by row with the expected
    product ids.
    """
    def pd_top3(x):
        # Pad with NaN so the result always has exactly three entries.
        top = np.array(x.value_counts()[:3].index)
        missing = 3 - len(top)
        if missing > 0:
            top = np.append(top, np.full(missing, np.nan))
        return top

    NMostCommoner = make_agg_primitive(function=pd_top3,
                                       input_types=[Discrete],
                                       return_type=Discrete,
                                       number_output_features=3)

    fm, features = ft.dfs(entityset=es,
                          target_entity="customers",
                          agg_primitives=[NMostCommoner],
                          trans_primitives=[])

    true_results = pd.DataFrame(
        [['coke zero', 'toothpaste', "car"],
         ['coke zero', 'Haribo sugar-free gummy bears', np.nan],
         ['taco clock', np.nan, np.nan]])
    columns = ["PD_TOP3(log.product_id)__%s" % i for i in range(3)]
    df = fm[columns]

    for row in range(len(df)):
        expected_row = true_results.iloc[row]
        actual_row = df.iloc[row]
        if row != 0:
            for want, got in zip(expected_row, actual_row):
                assert (pd.isnull(want) and pd.isnull(got)) or (want == got)
            continue
        # coke zero and toothpaste have same number of occurrences
        # so just check that the top two match
        assert set(expected_row.values[:2]) == set(actual_row.values[:2])
        assert actual_row.values[2] in ("brown bag", "car")
def test_count_null_and_make_agg_primitive(es):
    """Check a custom ``count`` primitive that accepts a ``count_null`` kwarg.

    The primitive gets a custom ``generate_name`` through ``cls_attributes``
    and is instantiated with ``count_null=True``; the resulting feature is
    computed over ``es`` and compared with the expected counts.
    """
    def count_func(values, count_null=False):
        if not len(values):
            return 0
        # Filling nulls first makes them part of the count.
        counted = values.fillna(0) if count_null else values
        return counted.count()

    def count_generate_name(self, base_feature_names, relationship_path_name,
                            parent_entity_id, where_str, use_prev_str):
        name_parts = (relationship_path_name, where_str, use_prev_str)
        return u"COUNT(%s%s%s)" % name_parts

    Count = make_agg_primitive(
        count_func,
        [[Index], [Variable]],
        Numeric,
        name="count",
        stack_on_self=False,
        cls_attributes={"generate_name": count_generate_name})

    count_null = ft.Feature(es['log']['value'],
                            parent_entity=es['sessions'],
                            primitive=Count(count_null=True))
    feature_matrix = ft.calculate_feature_matrix([count_null], entityset=es)

    values = [5, 4, 1, 2, 3, 2]
    column = feature_matrix[count_null.get_name()]
    assert (values == column).all()
def test_uses_calc_time():
    """Call a ``uses_calc_time`` primitive instance directly with ``time``.

    The primitive instance is invoked on a two-element datetime series with
    an explicit ``time`` keyword, and the result is compared with the
    expected number of seconds.
    """
    def time_since_last(values, time=None):
        # Seconds between the first value and the supplied time.
        return (time - values.iloc[0]).total_seconds()

    TimeSinceLast = make_agg_primitive(time_since_last,
                                       [DatetimeTimeIndex],
                                       Numeric,
                                       name="time_since_last",
                                       uses_calc_time=True)
    primitive = TimeSinceLast()

    datetimes = pd.Series([datetime(2015, 6, 7), datetime(2015, 6, 6)])
    result = primitive(datetimes, time=datetime(2015, 6, 8))
    answer = 86400.0
    assert answer == result
def test_agg_same_method_name(es):
    """
    Pandas relies on the function name when calculating aggregations, so two
    primitives whose functions share a name and are applied to the same
    column cannot be told apart by pandas. The workaround is based on the
    primitive's ``name`` property, which is what this test checks.
    """
    def check_both_columns(sum_primitive, max_primitive):
        # Build one feature per primitive on the same column and make sure
        # both come back as separate, correctly named columns.
        features = [
            primitive(es["log"]["value"], es["customers"])
            for primitive in (sum_primitive, max_primitive)
        ]
        matrix = ft.calculate_feature_matrix(features, entityset=es)
        assert matrix.columns.tolist() == [f.get_name() for f in features]

    # test with normally defined functions
    def custom_primitive(x):
        return x.sum()

    Sum = make_agg_primitive(custom_primitive,
                             input_types=[Numeric],
                             return_type=Numeric,
                             name="sum")

    # Deliberately reuses the function name from above.
    def custom_primitive(x):
        return x.max()

    Max = make_agg_primitive(custom_primitive,
                             input_types=[Numeric],
                             return_type=Numeric,
                             name="max")
    check_both_columns(Sum, Max)

    # test with lambdas
    Sum = make_agg_primitive(lambda x: x.sum(),
                             input_types=[Numeric],
                             return_type=Numeric,
                             name="sum")
    Max = make_agg_primitive(lambda x: x.max(),
                             input_types=[Numeric],
                             return_type=Numeric,
                             name="max")
    check_both_columns(Sum, Max)
def test_custom_primitive_default_kwargs(es):
    """Check that keyword arguments given to a custom primitive are stored.

    Two instances of the same custom primitive are created with different
    values of ``n``; each resulting feature must keep its own base feature
    and expose its own ``n`` through ``primitive.kwargs``.
    """
    def sum_n_times(numeric, n=1):
        # ``np.float`` was only an alias for the builtin ``float`` and has
        # been removed from NumPy, so the builtin is used directly.
        return np.nan_to_num(numeric).sum(dtype=float) * n

    SumNTimes = make_agg_primitive(function=sum_n_times,
                                   input_types=[Numeric],
                                   return_type=Numeric)

    sum_n_1_n = 1
    sum_n_1_base_f = ft.Feature(es['log']['value'])
    sum_n_1 = ft.Feature([sum_n_1_base_f],
                         parent_entity=es['sessions'],
                         primitive=SumNTimes(n=sum_n_1_n))

    sum_n_2_n = 2
    sum_n_2_base_f = ft.Feature(es['log']['value_2'])
    sum_n_2 = ft.Feature([sum_n_2_base_f],
                         parent_entity=es['sessions'],
                         primitive=SumNTimes(n=sum_n_2_n))

    assert sum_n_1_base_f == sum_n_1.base_features[0]
    assert sum_n_1_n == sum_n_1.primitive.kwargs['n']
    assert sum_n_2_base_f == sum_n_2.base_features[0]
    assert sum_n_2_n == sum_n_2.primitive.kwargs['n']
def test_custom_primitive_default_kwargs(es):
    """Check that keyword arguments given to a custom primitive are stored.

    Two features are created from the same custom primitive with different
    values of ``n``; each must keep its own base feature and expose its own
    ``n`` through ``kwargs``.
    """
    def sum_n_times(numeric, n=1):
        # ``np.float`` was only an alias for the builtin ``float`` and has
        # been removed from NumPy, so the builtin is used directly.
        return np.nan_to_num(numeric).sum(dtype=float) * n

    SumNTimes = make_agg_primitive(function=sum_n_times,
                                   input_types=[Numeric],
                                   return_type=Numeric)

    sum_n_1_n = 1
    sum_n_1_base_f = Feature(es['log']['value'])
    sum_n_1 = SumNTimes([sum_n_1_base_f], es['sessions'], n=sum_n_1_n)

    sum_n_2_n = 2
    sum_n_2_base_f = Feature(es['log']['value_2'])
    sum_n_2 = SumNTimes([sum_n_2_base_f], es['sessions'], n=sum_n_2_n)

    assert sum_n_1_base_f == sum_n_1.base_features[0]
    assert sum_n_1_n == sum_n_1.kwargs['n']
    assert sum_n_2_base_f == sum_n_2.base_features[0]
    assert sum_n_2_n == sum_n_2.kwargs['n']
def test_custom_primitive_default_kwargs(es):
    """Check that keyword arguments given to a custom primitive are stored.

    Two instances of the same custom primitive are created with different
    values of ``n``; each resulting feature must keep its own base feature
    and expose its own ``n`` through ``primitive.kwargs``.
    """
    def sum_n_times(numeric, n=1):
        # ``np.float`` was only an alias for the builtin ``float`` and has
        # been removed from NumPy, so the builtin is used directly.
        return np.nan_to_num(numeric).sum(dtype=float) * n

    SumNTimes = make_agg_primitive(
        function=sum_n_times,
        input_types=[ColumnSchema(semantic_tags={'numeric'})],
        return_type=ColumnSchema(semantic_tags={'numeric'}))

    sum_n_1_n = 1
    sum_n_1_base_f = ft.Feature(es['log'].ww['value'])
    sum_n_1 = ft.Feature([sum_n_1_base_f],
                         parent_dataframe_name='sessions',
                         primitive=SumNTimes(n=sum_n_1_n))

    sum_n_2_n = 2
    sum_n_2_base_f = ft.Feature(es['log'].ww['value_2'])
    sum_n_2 = ft.Feature([sum_n_2_base_f],
                         parent_dataframe_name='sessions',
                         primitive=SumNTimes(n=sum_n_2_n))

    assert sum_n_1_base_f == sum_n_1.base_features[0]
    assert sum_n_1_n == sum_n_1.primitive.kwargs['n']
    assert sum_n_2_base_f == sum_n_2.base_features[0]
    assert sum_n_2_n == sum_n_2.primitive.kwargs['n']
def test_count_null_and_make_agg_primitive(es):
    """Check a custom ``count`` primitive that accepts a ``count_null`` kwarg.

    The primitive gets a custom ``generate_name`` through ``cls_attributes``
    and is called with ``count_null=True``; the resulting feature is computed
    over ``es`` and compared with the expected counts.
    """
    def count_func(values, count_null=False):
        if not len(values):
            return 0
        # Filling nulls first makes them part of the count.
        counted = values.fillna(0) if count_null else values
        return counted.count()

    def count_generate_name(self):
        where_clause = self._where_str()
        use_prev_clause = self._use_prev_str()
        name_parts = (self.child_entity.id, where_clause, use_prev_clause)
        return u"COUNT(%s%s%s)" % name_parts

    Count = make_agg_primitive(
        count_func,
        [[Index], [Variable]],
        Numeric,
        name="count",
        stack_on_self=False,
        cls_attributes={"generate_name": count_generate_name})

    count_null = Count(es['log']['value'], es['sessions'], count_null=True)
    feature_matrix = ft.calculate_feature_matrix([count_null], entityset=es)

    values = [5, 4, 1, 2, 3, 2]
    column = feature_matrix[count_null.get_name()]
    assert (values == column).all()
def test_agg_same_method_name(es):
    """
    Pandas relies on the function name when calculating aggregations, so two
    primitives whose functions share a name and are applied to the same
    column cannot be told apart by pandas. The workaround is based on the
    primitive's ``name`` property, which is what this test checks.
    """
    # TODO: Update to work with Dask and Koalas
    if es.dataframe_type != Library.PANDAS.value:
        pytest.xfail(
            "Cannot use primitives made with make_agg_primitives with Dask or Koalas EntitySets"
        )

    def numeric_schema():
        # A fresh schema per use, as each primitive gets its own instances.
        return ColumnSchema(semantic_tags={'numeric'})

    def check_both_columns(sum_primitive, max_primitive):
        # Build one feature per primitive on the same column and make sure
        # both come back as separate, correctly named columns.
        features = [
            ft.Feature(es["log"].ww["value"],
                       parent_dataframe_name="customers",
                       primitive=primitive)
            for primitive in (sum_primitive, max_primitive)
        ]
        matrix = ft.calculate_feature_matrix(features, entityset=es)
        assert matrix.columns.tolist() == [f.get_name() for f in features]

    # test with normally defined functions
    def custom_primitive(x):
        return x.sum()

    Sum = make_agg_primitive(custom_primitive,
                             input_types=[numeric_schema()],
                             return_type=numeric_schema(),
                             name="sum")

    # Deliberately reuses the function name from above.
    def custom_primitive(x):
        return x.max()

    Max = make_agg_primitive(custom_primitive,
                             input_types=[numeric_schema()],
                             return_type=numeric_schema(),
                             name="max")
    check_both_columns(Sum, Max)

    # test with lambdas
    Sum = make_agg_primitive(lambda x: x.sum(),
                             input_types=[numeric_schema()],
                             return_type=numeric_schema(),
                             name="sum")
    Max = make_agg_primitive(lambda x: x.max(),
                             input_types=[numeric_schema()],
                             return_type=numeric_schema(),
                             name="max")
    check_both_columns(Sum, Max)
from woodwork.column_schema import ColumnSchema

from featuretools.primitives.base import make_agg_primitive

# Aggregation primitive named "CustomSum" that applies the builtin ``sum``
# to a numeric-tagged column and returns a numeric-tagged result.
CustomSum = make_agg_primitive(
    function=lambda x: sum(x),
    input_types=[ColumnSchema(semantic_tags={'numeric'})],
    return_type=ColumnSchema(semantic_tags={'numeric'}),
    name="CustomSum",
)
from featuretools.primitives.base import make_agg_primitive
from featuretools.variable_types import Numeric

# Aggregation primitive named "CustomMean" that divides the sum of its
# Numeric input by its length and returns a Numeric result.
CustomMean = make_agg_primitive(
    function=lambda x: sum(x) / len(x),
    input_types=[Numeric],
    return_type=Numeric,
    name="CustomMean",
)
from featuretools.primitives.base import make_agg_primitive
from featuretools.variable_types import Numeric

# Aggregation primitive named "CustomMax" that applies the builtin ``max``
# to a Numeric input and returns a Numeric result.
CustomMax = make_agg_primitive(
    function=lambda x: max(x),
    input_types=[Numeric],
    return_type=Numeric,
    name="CustomMax",
)
from featuretools.primitives.base import make_agg_primitive
from featuretools.variable_types import Numeric

# Aggregation primitive named "CustomSum" that applies the builtin ``sum``
# to a Numeric input and returns a Numeric result.
CustomSum = make_agg_primitive(
    function=lambda x: sum(x),
    input_types=[Numeric],
    return_type=Numeric,
    name="CustomSum",
)