def test_arithmetic_of_identity(es):
    logs = es['log']

    to_test = [(AddNumeric, [0., 7., 14., 21.]),
               (SubtractNumeric, [0, 3, 6, 9]),
               (MultiplyNumeric, [0, 10, 40, 90]),
               (DivideNumeric, [np.nan, 2.5, 2.5, 2.5])]

    features = []
    for test in to_test:
        features.append(
            ft.Feature([logs['value'], logs['value_2']], primitive=test[0]))

    df = ft.calculate_feature_matrix(entityset=es,
                                     features=features,
                                     instance_ids=[0, 1, 2, 3])

    for i, test in enumerate(to_test[:-1]):
        v = df[features[i].get_name()].values.tolist()
        assert v == test[1]

    i, test = 3, to_test[-1]
    v = df[features[i].get_name()].values.tolist()
    assert (np.isnan(v[0]))
    assert v[1:] == test[1][1:]
def test_make_agg_feat_using_prev_time(es):
    agg_feat = ft.Feature(es['log']['id'],
                          parent_entity=es['sessions'],
                          use_previous=Timedelta(10, 's'),
                          primitive=Count)

    feature_set = FeatureSet([agg_feat])
    calculator = FeatureSetCalculator(es,
                                      time_last=datetime(2011, 4, 9, 10, 30, 10),
                                      feature_set=feature_set)
    df = calculator.run([0])

    v = df[agg_feat.get_name()][0]
    assert (v == 2)

    calculator = FeatureSetCalculator(es,
                                      time_last=datetime(2011, 4, 9, 10, 30, 30),
                                      feature_set=feature_set)
    df = calculator.run([0])

    v = df[agg_feat.get_name()][0]
    assert (v == 1)
def test_make_dfeat_of_agg_feat_on_self(es, backend):
    """
    The graph looks like this:

        R       R = Regions, a parent of customers
        |
        C       C = Customers, the entity we're trying to predict on
        |
       etc.

    We're trying to calculate a DFeat from C to R on an agg_feat of R on C.
    """
    customer_count_feat = ft.Feature(es['customers']['id'],
                                     parent_entity=es[u'régions'],
                                     primitive=Count)

    num_customers_feat = DirectFeature(customer_count_feat,
                                       child_entity=es['customers'])

    pandas_backend = backend([num_customers_feat])
    df = pandas_backend.calculate_all_features(instance_ids=[0],
                                               time_last=None)
    v = df[num_customers_feat.get_name()][0]
    assert (v == 3)
def test_get_filepath(es):
    class Mod4(TransformPrimitive):
        '''Return base feature modulo 4'''
        name = "mod4"
        input_types = [Numeric]
        return_type = Numeric

        def get_function(self):
            filepath = self.get_filepath("featuretools_unit_test_example.csv")
            reference = pd.read_csv(filepath, header=None, squeeze=True)

            def map_to_word(x):
                def _map(x):
                    if pd.isnull(x):
                        return x
                    return reference[int(x) % 4]
                return pd.Series(x).apply(_map)
            return map_to_word

    feat = ft.Feature(es['log']['value'], primitive=Mod4)
    df = ft.calculate_feature_matrix(features=[feat],
                                     entityset=es,
                                     instance_ids=range(17))

    assert pd.isnull(df["MOD4(value)"][15])
    assert df["MOD4(value)"][0] == 0
    assert df["MOD4(value)"][14] == 2

    fm, fl = ft.dfs(entityset=es,
                    target_entity="log",
                    agg_primitives=[],
                    trans_primitives=[Mod4])

    assert fm["MOD4(value)"][0] == 0
    assert fm["MOD4(value)"][14] == 2
    assert pd.isnull(fm["MOD4(value)"][15])
def test_make_agg_feat_using_prev_time(es):
    agg_feat = ft.Feature(es['log'].ww['id'],
                          parent_dataframe_name='sessions',
                          use_previous=Timedelta(10, 's'),
                          primitive=Count)

    feature_set = FeatureSet([agg_feat])
    calculator = FeatureSetCalculator(es,
                                      time_last=datetime(2011, 4, 9, 10, 30, 10),
                                      feature_set=feature_set)
    df = to_pandas(calculator.run(np.array([0])))

    v = df[agg_feat.get_name()][0]
    assert (v == 2)

    calculator = FeatureSetCalculator(es,
                                      time_last=datetime(2011, 4, 9, 10, 30, 30),
                                      feature_set=feature_set)
    df = to_pandas(calculator.run(np.array([0])))

    v = df[agg_feat.get_name()][0]
    assert (v == 1)
def test_one_hot_encoding():
    feature_matrix, features, f1, f2, f3, f4, es, ids = create_feature_matrix()
    feature_matrix['countrycode'][0] = np.nan
    enc = Encoder(method='one_hot')
    fm_encoded = enc.fit_transform(feature_matrix, features)

    encoder = OneHotEnc(value='coke zero')
    encoded = encoder(['car', 'toothpaste', 'coke zero', 'coke zero'])
    encoded_results = [0, 0, 1, 1]
    assert (encoded == encoded_results).all()

    encoder = OneHotEnc(value=np.nan)
    encoded = encoder(['car', 'toothpaste', 'coke zero', 'coke zero', np.nan])
    encoded_results = [0, 0, 0, 0, 1]
    assert (encoded == encoded_results).all()

    f1_1 = ft.Feature([f1], primitive=OneHotEnc('coke zero'))
    f1_2 = ft.Feature([f1], primitive=OneHotEnc('car'))
    f1_3 = ft.Feature([f1], primitive=OneHotEnc('toothpaste'))
    f4_1 = ft.Feature([f4], primitive=OneHotEnc('US'))
    f4_2 = ft.Feature([f4], primitive=OneHotEnc('AL'))
    f4_3 = ft.Feature([f4], primitive=OneHotEnc(np.nan))
    features_encoded = [f1_1, f1_2, f1_3, f2, f3, f4_1, f4_2, f4_3]

    assert len(features_encoded) == len(enc.get_features())
    for i in range(len(features_encoded)):
        assert features_encoded[i].unique_name() == enc.get_features()[i].unique_name()

    features_encoded = enc.get_features()
    feature_matrix = ft.calculate_feature_matrix(features_encoded, es,
                                                 instance_ids=[6, 7])
    data = {'product_id = coke zero': [0, 0],
            'product_id = car': [0, 0],
            'product_id = toothpaste': [1, 1],
            'purchased': [True, True],
            'value': [1.0, 2.0],
            'countrycode = US': [0, 0],
            'countrycode = AL': [1, 1],
            'countrycode = nan': [0, 0]}
    fm_encoded = pd.DataFrame(data, index=[6, 7])
    assert feature_matrix.eq(fm_encoded).all().all()
def test_haversine_with_nan(pd_es):
    # Check some `nan` values
    df = pd_es['log']
    df['latlong'][0] = np.nan
    df['latlong'][1] = (10, np.nan)
    pd_es.replace_dataframe(dataframe_name='log', df=df)
    log_latlong_feat = ft.Feature(pd_es['log'].ww['latlong'])
    log_latlong_feat2 = ft.Feature(pd_es['log'].ww['latlong2'])
    haversine = ft.Feature([log_latlong_feat, log_latlong_feat2],
                           primitive=Haversine)
    features = [haversine]
    df = ft.calculate_feature_matrix(entityset=pd_es, features=features)
    values = df[haversine.get_name()].values
    real = [np.nan, np.nan, 1045.32190304, 1554.56176802, 2047.3294327,
            0, 138.16578931, 276.20524822, 413.99185444, 0,
            0, 525.318462, 0, 741.57941183, 1467.52760175,
            np.nan, np.nan]

    assert np.allclose(values, real, atol=0.0001, equal_nan=True)

    # Check all `nan` values
    df = pd_es['log']
    df['latlong2'] = np.nan
    pd_es.replace_dataframe(dataframe_name='log', df=df)
    log_latlong_feat = ft.Feature(pd_es['log'].ww['latlong'])
    log_latlong_feat2 = ft.Feature(pd_es['log'].ww['latlong2'])
    haversine = ft.Feature([log_latlong_feat, log_latlong_feat2],
                           primitive=Haversine)
    features = [haversine]
    df = ft.calculate_feature_matrix(entityset=pd_es, features=features)
    values = df[haversine.get_name()].values
    real = [np.nan] * pd_es['log'].shape[0]

    assert np.allclose(values, real, atol=0.0001, equal_nan=True)
def test_serialized_renamed_features(es):
    def serialize_name_unchanged(original):
        new_name = "MyFeature"
        original_names = original.get_feature_names()
        renamed = original.rename(new_name)
        new_names = (
            [new_name]
            if len(original_names) == 1
            else [new_name + "[{}]".format(i) for i in range(len(original_names))]
        )
        check_names(renamed, new_name, new_names)

        serializer = FeaturesSerializer([renamed])
        serialized = serializer.to_dict()

        deserializer = FeaturesDeserializer(serialized)
        deserialized = deserializer.to_list()[0]
        check_names(deserialized, new_name, new_names)

    identity_original = ft.IdentityFeature(es["log"].ww["value"])
    assert identity_original.get_name() == "value"

    value = ft.IdentityFeature(es["log"].ww["value"])
    primitive = ft.primitives.Max()
    agg_original = ft.AggregationFeature(value, "customers", primitive)
    assert agg_original.get_name() == "MAX(log.value)"

    direct_original = ft.DirectFeature(
        ft.IdentityFeature(es["customers"].ww["age"]), "sessions"
    )
    assert direct_original.get_name() == "customers.age"

    primitive = ft.primitives.MultiplyNumericScalar(value=2)
    transform_original = ft.TransformFeature(value, primitive)
    assert transform_original.get_name() == "value * 2"

    zipcode = ft.IdentityFeature(es["log"].ww["zipcode"])
    primitive = CumSum()
    groupby_original = ft.feature_base.GroupByTransformFeature(
        value, primitive, zipcode
    )
    assert groupby_original.get_name() == "CUM_SUM(value) by zipcode"

    multioutput_original = ft.Feature(
        es["log"].ww["product_id"],
        parent_dataframe_name="customers",
        primitive=NMostCommon(n=2),
    )
    assert multioutput_original.get_name() == "N_MOST_COMMON(log.product_id, n=2)"

    featureslice_original = ft.feature_base.FeatureOutputSlice(multioutput_original, 0)
    assert featureslice_original.get_name() == "N_MOST_COMMON(log.product_id, n=2)[0]"

    feature_type_list = [
        identity_original,
        agg_original,
        direct_original,
        transform_original,
        groupby_original,
        multioutput_original,
        featureslice_original,
    ]

    for feature_type in feature_type_list:
        serialize_name_unchanged(feature_type)
def test_cum_sum_numpy_group_on_nan(pd_es):
    class CumSumNumpy(TransformPrimitive):
        """Returns the cumulative sum after grouping"""

        name = "cum_sum"
        input_types = [ColumnSchema(semantic_tags={"numeric"})]
        return_type = ColumnSchema(semantic_tags={"numeric"})
        uses_full_dataframe = True

        def get_function(self):
            def cum_sum(values):
                return values.cumsum().values

            return cum_sum

    log_value_feat = ft.IdentityFeature(pd_es["log"].ww["value"])
    pd_es["log"]["product_id"] = (
        ["coke zero"] * 3
        + ["car"] * 2
        + ["toothpaste"] * 3
        + ["brown bag"] * 2
        + ["shoes"]
        + [np.nan] * 4
        + ["coke_zero"] * 2
    )
    pd_es["log"]["value"][16] = 10
    cum_sum = ft.Feature(
        log_value_feat,
        groupby=ft.IdentityFeature(pd_es["log"].ww["product_id"]),
        primitive=CumSumNumpy,
    )
    assert cum_sum.get_name() == "CUM_SUM(value) by product_id"
    features = [cum_sum]
    df = ft.calculate_feature_matrix(
        entityset=pd_es, features=features, instance_ids=range(17)
    )
    cvalues = df[cum_sum.get_name()].values
    assert len(cvalues) == 17
    cum_sum_values = [
        0, 5, 15, 15, 35,
        0, 1, 3, 3, 3,
        0,
        np.nan, np.nan, np.nan, np.nan, np.nan,
        10,
    ]
    assert len(cvalues) == len(cum_sum_values)
    for i, v in enumerate(cum_sum_values):
        if np.isnan(v):
            assert np.isnan(cvalues[i])
        else:
            assert v == cvalues[i]
def test_tranform_stack_agg(es):
    topn = ft.Feature(es['log']['product_id'],
                      parent_entity=es['customers'],
                      primitive=NMostCommon(n=3))
    with pytest.raises(AssertionError):
        ft.Feature(topn, primitive=Percentile)
def test_scalar_overrides(es):
    value = ft.Feature(es["log"].ww["value"])

    feats = [
        AddNumericScalar,
        SubtractNumericScalar,
        MultiplyNumericScalar,
        DivideNumericScalar,
        ModuloNumericScalar,
        GreaterThanScalar,
        LessThanScalar,
        EqualScalar,
        NotEqualScalar,
        GreaterThanEqualToScalar,
        LessThanEqualToScalar,
    ]
    overrides = [
        value + 2,
        value - 2,
        value * 2,
        value / 2,
        value % 2,
        value > 2,
        value < 2,
        value == 2,
        value != 2,
        value >= 2,
        value <= 2,
    ]

    for feat in feats:
        f = ft.Feature(value, primitive=feat(2))
        o = overrides.pop(0)
        assert o.unique_name() == f.unique_name()

    value2 = ft.Feature(es["log"].ww["value_2"])

    # For the reflected forms, Python dispatches e.g. `2 < value2` to
    # `value2.__gt__(2)`, so the comparisons map to the flipped scalar primitives.
    reverse_feats = [
        AddNumericScalar,
        ScalarSubtractNumericFeature,
        MultiplyNumericScalar,
        DivideByFeature,
        ModuloByFeature,
        GreaterThanScalar,
        LessThanScalar,
        EqualScalar,
        NotEqualScalar,
        GreaterThanEqualToScalar,
        LessThanEqualToScalar,
    ]
    reverse_overrides = [
        2 + value2,
        2 - value2,
        2 * value2,
        2 / value2,
        2 % value2,
        2 < value2,
        2 > value2,
        2 == value2,
        2 != value2,
        2 <= value2,
        2 >= value2,
    ]
    for feat in reverse_feats:
        f = ft.Feature(value2, primitive=feat(2))
        o = reverse_overrides.pop(0)
        assert o.unique_name() == f.unique_name()
def test_empty_child_dataframe(parent_child):
    parent_df, child_df = parent_child
    if not isinstance(parent_df, pd.DataFrame):
        parent_vtypes = {'id': variable_types.Index}
        child_vtypes = {
            'id': variable_types.Index,
            'parent_id': variable_types.Numeric,
            'time_index': variable_types.Datetime,
            'value': variable_types.Numeric,
            'cat': variable_types.Categorical
        }
    else:
        parent_vtypes = None
        child_vtypes = None

    es = ft.EntitySet(id="blah")
    es.entity_from_dataframe(entity_id="parent",
                             dataframe=parent_df,
                             index="id",
                             variable_types=parent_vtypes)
    es.entity_from_dataframe(entity_id="child",
                             dataframe=child_df,
                             index="id",
                             time_index="time_index",
                             variable_types=child_vtypes)
    es.add_relationship(
        ft.Relationship(es["parent"]["id"], es["child"]["parent_id"]))

    # create regular agg
    count = ft.Feature(es["child"]['id'],
                       parent_entity=es["parent"],
                       primitive=Count)

    # create agg feature that requires multiple arguments
    trend = ft.Feature([es["child"]['value'], es["child"]['time_index']],
                       parent_entity=es["parent"],
                       primitive=Trend)

    # create multi-output agg feature
    n_most_common = ft.Feature(es["child"]['cat'],
                               parent_entity=es["parent"],
                               primitive=NMostCommon)

    # create aggs with where
    where = ft.Feature(es["child"]["value"]) == 1
    count_where = ft.Feature(es["child"]['id'],
                             parent_entity=es["parent"],
                             where=where,
                             primitive=Count)
    trend_where = ft.Feature([es["child"]['value'], es["child"]['time_index']],
                             parent_entity=es["parent"],
                             where=where,
                             primitive=Trend)
    n_most_common_where = ft.Feature(es["child"]['cat'],
                                     parent_entity=es["parent"],
                                     where=where,
                                     primitive=NMostCommon)

    if isinstance(parent_df, pd.DataFrame):
        features = [count, count_where, trend, trend_where,
                    n_most_common, n_most_common_where]
        names = [count.get_name(),
                 count_where.get_name(),
                 trend.get_name(),
                 trend_where.get_name(),
                 *n_most_common.get_feature_names(),
                 *n_most_common_where.get_feature_names()]
        values = [0, 0, np.nan, np.nan,
                  *np.full(n_most_common.number_output_features, np.nan),
                  *np.full(n_most_common_where.number_output_features, np.nan)]
    else:
        features = [count, count_where]
        names = [count.get_name(), count_where.get_name()]
        values = [0, 0]

    # cutoff time before all rows
    fm = ft.calculate_feature_matrix(entityset=es,
                                     features=features,
                                     cutoff_time=pd.Timestamp("12/31/2017"))
    fm = to_pandas(fm)
    assert_array_equal(fm[names], [values])

    # cutoff time after all rows, but where clause filters all rows
    if isinstance(parent_df, pd.DataFrame):
        features = [count_where, trend_where, n_most_common_where]
        names = [count_where.get_name(),
                 trend_where.get_name(),
                 *n_most_common_where.get_feature_names()]
        values = [0, np.nan,
                  *np.full(n_most_common_where.number_output_features, np.nan)]
    else:
        features = [count_where]
        names = [count_where.get_name()]
        values = [0]

    fm2 = ft.calculate_feature_matrix(entityset=es,
                                      features=features,
                                      cutoff_time=pd.Timestamp("1/4/2018"))
    fm2 = to_pandas(fm2)
    assert_array_equal(fm2[names], [values])
def test_multi_output_base_error_agg(es):
    three_common = NMostCommon(3)
    tc = ft.Feature(es['log'].ww['product_id'],
                    parent_dataframe_name="sessions",
                    primitive=three_common)
    error_text = "Cannot stack on whole multi-output feature."
    with pytest.raises(ValueError, match=error_text):
        ft.Feature(tc, parent_dataframe_name='customers', primitive=NumUnique)
# print(es['transactions'].variables)

# ------------------ do deep feature synthesis (dfs) --------------------------
# feature_matrix, feature_defs = ft.dfs(entityset=es, target_entity='products')
feature_matrix, feature_defs = ft.dfs(
    entityset=es,
    target_entity='products',
    agg_primitives=['count'],    # aggregation functions applied across entities
    trans_primitives=['month'],  # transform functions applied to the target_entity
    max_depth=1)
print(feature_matrix.columns.tolist())
print(feature_matrix.head())
print(feature_defs)

print('-------seed feature (user-defined feature)---')
expansive_purchase = ft.Feature(es['transactions']['amount']) > 100
feature_matrix, feature_defs = ft.dfs(entityset=es,
                                      target_entity='products',
                                      agg_primitives=['percent_true'],
                                      seed_features=[expansive_purchase])
print(feature_matrix.columns.tolist())
print(feature_matrix.head())
print(feature_defs)

print('---------where primitives-------')
es['transactions']['date_of_birth'].interesting_values = [
    '1986-08-18', '1986-08-19'
]
# 'where_primitives' specifies which of the agg_primitives may be applied
# with where clauses built from the interesting values
feature_matrix, feature_defs = ft.dfs(
    entityset=es,
    target_entity='products',
def test_empty_child_dataframe(parent_child):
    parent_df, child_df = parent_child
    child_ltypes = {
        'parent_id': Integer,
        'time_index': Datetime,
        'value': Double,
        'cat': Categorical
    }

    es = ft.EntitySet(id="blah")
    es.add_dataframe(dataframe_name="parent", dataframe=parent_df, index="id")
    es.add_dataframe(dataframe_name="child",
                     dataframe=child_df,
                     index="id",
                     time_index="time_index",
                     logical_types=child_ltypes)
    es.add_relationship("parent", "id", "child", "parent_id")

    # create regular agg
    count = ft.Feature(es["child"].ww["id"],
                       parent_dataframe_name="parent",
                       primitive=Count)

    # create agg feature that requires multiple arguments
    trend = ft.Feature([ft.Feature(es["child"].ww["value"]),
                        ft.Feature(es["child"].ww['time_index'])],
                       parent_dataframe_name="parent",
                       primitive=Trend)

    # create multi-output agg feature
    n_most_common = ft.Feature(es["child"].ww["cat"],
                               parent_dataframe_name="parent",
                               primitive=NMostCommon)

    # create aggs with where
    where = ft.Feature(es["child"].ww["value"]) == 1
    count_where = ft.Feature(es["child"].ww["id"],
                             parent_dataframe_name="parent",
                             where=where,
                             primitive=Count)
    trend_where = ft.Feature([ft.Feature(es["child"].ww["value"]),
                              ft.Feature(es["child"].ww["time_index"])],
                             parent_dataframe_name="parent",
                             where=where,
                             primitive=Trend)
    n_most_common_where = ft.Feature(es["child"].ww["cat"],
                                     parent_dataframe_name="parent",
                                     where=where,
                                     primitive=NMostCommon)

    if isinstance(parent_df, pd.DataFrame):
        features = [count, count_where, trend, trend_where,
                    n_most_common, n_most_common_where]
        data = {
            count.get_name(): pd.Series([0], dtype="Int64"),
            count_where.get_name(): pd.Series([0], dtype="Int64"),
            trend.get_name(): pd.Series([np.nan], dtype="float"),
            trend_where.get_name(): pd.Series([np.nan], dtype="float")
        }
        for name in n_most_common.get_feature_names():
            data[name] = pd.Series([np.nan], dtype="category")
        for name in n_most_common_where.get_feature_names():
            data[name] = pd.Series([np.nan], dtype="category")
    else:
        features = [count, count_where]
        data = {
            count.get_name(): pd.Series([0], dtype="Int64"),
            count_where.get_name(): pd.Series([0], dtype="Int64")
        }
    answer = pd.DataFrame(data)

    # cutoff time before all rows
    fm = ft.calculate_feature_matrix(entityset=es,
                                     features=features,
                                     cutoff_time=pd.Timestamp("12/31/2017"))
    fm = to_pandas(fm)

    for column in data.keys():
        pd.testing.assert_series_equal(fm[column], answer[column],
                                       check_names=False, check_index=False)

    # cutoff time after all rows, but where clause filters all rows
    if isinstance(parent_df, pd.DataFrame):
        features = [count_where, trend_where, n_most_common_where]
        data = {
            count_where.get_name(): pd.Series([0], dtype="Int64"),
            trend_where.get_name(): pd.Series([np.nan], dtype="float")
        }
        for name in n_most_common_where.get_feature_names():
            data[name] = pd.Series([np.nan], dtype="category")
    else:
        features = [count_where]
        data = {count_where.get_name(): pd.Series([0], dtype="Int64")}
    answer = pd.DataFrame(data)

    fm2 = ft.calculate_feature_matrix(entityset=es,
                                      features=features,
                                      cutoff_time=pd.Timestamp("1/4/2018"))
    fm2 = to_pandas(fm2)

    # compare the second feature matrix (not the first) against its expected values
    for column in data.keys():
        pd.testing.assert_series_equal(fm2[column], answer[column],
                                       check_names=False, check_index=False)
def test_return_type_inference_datetime_time_index(es):
    last = ft.Feature(es["log"].ww["datetime"],
                      parent_dataframe_name="customers",
                      primitive=Last)
    assert isinstance(last.column_schema.logical_type, Datetime)
def test_return_type_inference_index(es):
    last = ft.Feature(es["log"].ww["id"],
                      parent_dataframe_name="customers",
                      primitive=Last)
    assert "index" not in last.column_schema.semantic_tags
    assert isinstance(last.column_schema.logical_type, Integer)
def test_return_type_inference_direct_feature(es):
    mode = ft.Feature(es["log"].ww["priority_level"],
                      parent_dataframe_name="customers",
                      primitive=Mode)
    mode_session = ft.Feature(mode, "sessions")
    assert mode_session.column_schema == IdentityFeature(es["log"].ww["priority_level"]).column_schema
def test_return_type_inference(es):
    mode = ft.Feature(es["log"].ww["priority_level"],
                      parent_dataframe_name="customers",
                      primitive=Mode)
    assert mode.column_schema == IdentityFeature(es["log"].ww["priority_level"]).column_schema
def test_rename(es):
    feat = ft.Feature(es['log'].ww['id'],
                      parent_dataframe_name='sessions',
                      primitive=Count)
    new_name = 'session_test'
    new_names = ['session_test']
    check_rename(feat, new_name, new_names)
def test_arithmetic_two_vals_fails(es):
    error_text = "Not a feature"
    with pytest.raises(Exception, match=error_text):
        ft.Feature([2, 2], primitive=AddNumeric)
def test_feature_takes_timedelta_string(es):
    feature = ft.Feature(es['log']['id'],
                         parent_entity=es['customers'],
                         use_previous="1 day",
                         primitive=Count)
    assert feature.use_previous == Timedelta(1, 'd')
def test_squared(es):
    feature = ft.Feature(es['log']['value'])
    squared = feature * feature
    assert len(squared.base_features) == 2
    assert squared.base_features[0].unique_name() == squared.base_features[1].unique_name()
def test_return_type_inference_numeric_time_index(int_es):
    last = ft.Feature(int_es["log"].ww["datetime"],
                      parent_dataframe_name="customers",
                      primitive=Last)
    assert "numeric" in last.column_schema.semantic_tags
def test_handles_primitive_function_name_uniqueness(entityset):
    class SumTimesN(AggregationPrimitive):
        name = "sum_times_n"
        input_types = [Numeric]
        return_type = Numeric

        def __init__(self, n):
            self.n = n

        def get_function(self):
            def my_function(values):
                return values.sum() * self.n

            return my_function

        def generate_name(self, base_feature_names, child_entity_id,
                          parent_entity_id, where_str, use_prev_str):
            base_features_str = ", ".join(base_feature_names)
            return u"%s(%s.%s%s%s, n=%s)" % (self.name.upper(),
                                             child_entity_id,
                                             base_features_str,
                                             where_str,
                                             use_prev_str,
                                             self.n)

    # works as expected
    f1 = ft.Feature(entityset["log"]["value"],
                    parent_entity=entityset["customers"],
                    primitive=SumTimesN(n=1))
    fm = ft.calculate_feature_matrix(features=[f1], entityset=entityset)
    value_sum = pd.Series([56, 26, 0])
    assert all(fm[f1.get_name()].sort_index() == value_sum)

    # works as expected
    f2 = ft.Feature(entityset["log"]["value"],
                    parent_entity=entityset["customers"],
                    primitive=SumTimesN(n=2))
    fm = ft.calculate_feature_matrix(features=[f2], entityset=entityset)
    double_value_sum = pd.Series([112, 52, 0])
    assert all(fm[f2.get_name()].sort_index() == double_value_sum)

    # same primitive, same variable, different args
    fm = ft.calculate_feature_matrix(features=[f1, f2], entityset=entityset)
    assert all(fm[f1.get_name()].sort_index() == value_sum)
    assert all(fm[f2.get_name()].sort_index() == double_value_sum)

    # different primitives, same function returned by get_function,
    # different base features
    f3 = ft.Feature(entityset["log"]["value"],
                    parent_entity=entityset["customers"],
                    primitive=Sum)
    f4 = ft.Feature(entityset["log"]["purchased"],
                    parent_entity=entityset["customers"],
                    primitive=NumTrue)
    fm = ft.calculate_feature_matrix(features=[f3, f4], entityset=entityset)
    purchased_sum = pd.Series([10, 1, 1])
    assert all(fm[f3.get_name()].sort_index() == value_sum)
    assert all(fm[f4.get_name()].sort_index() == purchased_sum)

    # different primitives, same function returned by get_function,
    # same base feature
    class Sum1(AggregationPrimitive):
        """Sums elements of a numeric or boolean feature."""
        name = "sum1"
        input_types = [Numeric]
        return_type = Numeric
        stack_on_self = False
        stack_on_exclude = [Count]
        default_value = 0

        def get_function(self):
            return np.sum

    class Sum2(AggregationPrimitive):
        """Sums elements of a numeric or boolean feature."""
        name = "sum2"
        input_types = [Numeric]
        return_type = Numeric
        stack_on_self = False
        stack_on_exclude = [Count]
        default_value = 0

        def get_function(self):
            return np.sum

    class Sum3(AggregationPrimitive):
        """Sums elements of a numeric or boolean feature."""
        name = "sum3"
        input_types = [Numeric]
        return_type = Numeric
        stack_on_self = False
        stack_on_exclude = [Count]
        default_value = 0

        def get_function(self):
            return np.sum

    f5 = ft.Feature(entityset["log"]["value"],
                    parent_entity=entityset["customers"],
                    primitive=Sum1)
    f6 = ft.Feature(entityset["log"]["value"],
                    parent_entity=entityset["customers"],
                    primitive=Sum2)
    f7 = ft.Feature(entityset["log"]["value"],
                    parent_entity=entityset["customers"],
                    primitive=Sum3)
    fm = ft.calculate_feature_matrix(features=[f5, f6, f7], entityset=entityset)
    assert all(fm[f5.get_name()].sort_index() == value_sum)
    assert all(fm[f6.get_name()].sort_index() == value_sum)
    assert all(fm[f7.get_name()].sort_index() == value_sum)
def test_return_type_inference(es):
    mode = ft.Feature(es["log"]["priority_level"],
                      parent_entity=es["customers"],
                      primitive=Mode)
    assert mode.variable_type == es["log"]["priority_level"].__class__
def test_empty_child_dataframe():
    parent_df = pd.DataFrame({"id": [1]})
    child_df = pd.DataFrame({"id": [1, 2, 3],
                             "parent_id": [1, 1, 1],
                             "time_index": pd.date_range(start='1/1/2018', periods=3),
                             "value": [10, 5, 2]})

    es = ft.EntitySet(id="blah")
    es.entity_from_dataframe(entity_id="parent", dataframe=parent_df, index="id")
    es.entity_from_dataframe(entity_id="child",
                             dataframe=child_df,
                             index="id",
                             time_index="time_index")
    es.add_relationship(
        ft.Relationship(es["parent"]["id"], es["child"]["parent_id"]))

    # create regular agg
    count = ft.Feature(es["child"]['id'],
                       parent_entity=es["parent"],
                       primitive=Count)

    # create agg feature that requires multiple arguments
    trend = ft.Feature([es["child"]['value'], es["child"]['time_index']],
                       parent_entity=es["parent"],
                       primitive=Trend)

    # create aggs with where
    where = ft.Feature(es["child"]["value"]) == 1
    count_where = ft.Feature(es["child"]['id'],
                             parent_entity=es["parent"],
                             where=where,
                             primitive=Count)
    trend_where = ft.Feature([es["child"]['value'], es["child"]['time_index']],
                             parent_entity=es["parent"],
                             where=where,
                             primitive=Trend)

    # cutoff time before all rows
    fm = ft.calculate_feature_matrix(
        entityset=es,
        features=[count, count_where, trend, trend_where],
        cutoff_time=pd.Timestamp("12/31/2017"))
    names = [count.get_name(), count_where.get_name(),
             trend.get_name(), trend_where.get_name()]
    assert_array_equal(fm[names], [[0, 0, np.nan, np.nan]])

    # cutoff time after all rows, but where clause filters all rows
    fm2 = ft.calculate_feature_matrix(entityset=es,
                                      features=[count_where, trend_where],
                                      cutoff_time=pd.Timestamp("1/4/2018"))
    names = [count_where.get_name(), trend_where.get_name()]
    assert_array_equal(fm2[names], [[0, np.nan]])
def test_return_type_inference_direct_feature(es):
    mode = ft.Feature(es["log"]["priority_level"],
                      parent_entity=es["customers"],
                      primitive=Mode)
    mode_session = ft.Feature(mode, es["sessions"])
    assert mode_session.variable_type == es["log"]["priority_level"].__class__
def test_handles_primitive_function_name_uniqueness(es):
    if not all(isinstance(entity.df, pd.DataFrame) for entity in es.entities):
        pytest.xfail(
            "Fails with Dask and Koalas due to conflicting aggregation primitive names"
        )

    class SumTimesN(AggregationPrimitive):
        name = "sum_times_n"
        input_types = [Numeric]
        return_type = Numeric

        def __init__(self, n):
            self.n = n

        def get_function(self, agg_type='pandas'):
            def my_function(values):
                return values.sum() * self.n

            return my_function

    # works as expected
    f1 = ft.Feature(es["log"]["value"],
                    parent_entity=es["customers"],
                    primitive=SumTimesN(n=1))
    fm = ft.calculate_feature_matrix(features=[f1], entityset=es)
    value_sum = pd.Series([56, 26, 0])
    assert all(fm[f1.get_name()].sort_index() == value_sum)

    # works as expected
    f2 = ft.Feature(es["log"]["value"],
                    parent_entity=es["customers"],
                    primitive=SumTimesN(n=2))
    fm = ft.calculate_feature_matrix(features=[f2], entityset=es)
    double_value_sum = pd.Series([112, 52, 0])
    assert all(fm[f2.get_name()].sort_index() == double_value_sum)

    # same primitive, same variable, different args
    fm = ft.calculate_feature_matrix(features=[f1, f2], entityset=es)
    assert all(fm[f1.get_name()].sort_index() == value_sum)
    assert all(fm[f2.get_name()].sort_index() == double_value_sum)

    # different primitives, same function returned by get_function,
    # different base features
    f3 = ft.Feature(es["log"]["value"],
                    parent_entity=es["customers"],
                    primitive=Sum)
    f4 = ft.Feature(es["log"]["purchased"],
                    parent_entity=es["customers"],
                    primitive=NumTrue)
    fm = ft.calculate_feature_matrix(features=[f3, f4], entityset=es)
    purchased_sum = pd.Series([10, 1, 1])
    assert all(fm[f3.get_name()].sort_index() == value_sum)
    assert all(fm[f4.get_name()].sort_index() == purchased_sum)

    # different primitives, same function returned by get_function,
    # same base feature
    class Sum1(AggregationPrimitive):
        """Sums elements of a numeric or boolean feature."""
        name = "sum1"
        input_types = [Numeric]
        return_type = Numeric
        stack_on_self = False
        stack_on_exclude = [Count]
        default_value = 0

        def get_function(self, agg_type='pandas'):
            return np.sum

    class Sum2(AggregationPrimitive):
        """Sums elements of a numeric or boolean feature."""
        name = "sum2"
        input_types = [Numeric]
        return_type = Numeric
        stack_on_self = False
        stack_on_exclude = [Count]
        default_value = 0

        def get_function(self, agg_type='pandas'):
            return np.sum

    class Sum3(AggregationPrimitive):
        """Sums elements of a numeric or boolean feature."""
        name = "sum3"
        input_types = [Numeric]
        return_type = Numeric
        stack_on_self = False
        stack_on_exclude = [Count]
        default_value = 0

        def get_function(self, agg_type='pandas'):
            return np.sum

    f5 = ft.Feature(es["log"]["value"],
                    parent_entity=es["customers"],
                    primitive=Sum1)
    f6 = ft.Feature(es["log"]["value"],
                    parent_entity=es["customers"],
                    primitive=Sum2)
    f7 = ft.Feature(es["log"]["value"],
                    parent_entity=es["customers"],
                    primitive=Sum3)
    fm = ft.calculate_feature_matrix(features=[f5, f6, f7], entityset=es)
    assert all(fm[f5.get_name()].sort_index() == value_sum)
    assert all(fm[f6.get_name()].sort_index() == value_sum)
    assert all(fm[f7.get_name()].sort_index() == value_sum)
def test_return_type_inference_datetime_time_index(es):
    last = ft.Feature(es["log"]["datetime"],
                      parent_entity=es["customers"],
                      primitive=Last)
    assert last.variable_type == Datetime