def test_make_transform_sets_kwargs_correctly(es):
    def pd_is_in(array, list_of_outputs=None):
        if list_of_outputs is None:
            list_of_outputs = []
        return pd.Series(array).isin(list_of_outputs)

    def isin_generate_name(self, base_feature_names):
        return "%s.isin(%s)" % (
            base_feature_names[0],
            str(self.kwargs["list_of_outputs"]),
        )

    IsIn = make_trans_primitive(
        pd_is_in,
        [ColumnSchema()],
        ColumnSchema(logical_type=Boolean),
        name="is_in",
        description="For each value of the base feature, checks whether it is "
        "in a list that is provided.",
        cls_attributes={"generate_name": isin_generate_name},
    )

    isin_1_list = ["toothpaste", "coke_zero"]
    isin_1_base_f = Feature(es["log"].ww["product_id"])
    isin_1 = Feature(isin_1_base_f, primitive=IsIn(list_of_outputs=isin_1_list))
    isin_2_list = ["coke_zero"]
    isin_2_base_f = Feature(es["log"].ww["session_id"])
    isin_2 = Feature(isin_2_base_f, primitive=IsIn(list_of_outputs=isin_2_list))

    assert isin_1_base_f == isin_1.base_features[0]
    assert isin_1_list == isin_1.primitive.kwargs["list_of_outputs"]
    assert isin_2_base_f == isin_2.base_features[0]
    assert isin_2_list == isin_2.primitive.kwargs["list_of_outputs"]

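# Illustration only (not part of the test suite): the function wrapped by
# make_trans_primitive above is plain pandas membership testing, so its
# behavior can be sanity-checked directly without any Featuretools machinery.
def example_pd_is_in_behavior():
    import pandas as pd

    result = pd.Series(["toothpaste", "brush"]).isin(["toothpaste", "coke_zero"])
    assert result.tolist() == [True, False]
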
def test_feature_takes_timedelta_string(es):
    feature = Feature(
        Feature(es["log"].ww["id"]),
        parent_dataframe_name="customers",
        use_previous="1 day",
        primitive=Count,
    )
    assert feature.use_previous == Timedelta(1, "d")

def test_encode_features_topn(pd_es):
    topn = Feature(
        Feature(pd_es["log"].ww["product_id"]),
        parent_dataframe_name="customers",
        primitive=NMostCommon(n=3),
    )
    features, feature_defs = dfs(
        entityset=pd_es,
        instance_ids=[0, 1, 2],
        target_dataframe_name="customers",
        agg_primitives=[NMostCommon(n=3)],
    )
    features_enc, feature_defs_enc = encode_features(
        features, feature_defs, include_unknown=True
    )
    assert topn.unique_name() in [feat.unique_name() for feat in feature_defs_enc]
    for name in topn.get_feature_names():
        assert name in features_enc.columns
        assert features_enc.columns.tolist().count(name) == 1

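# Conceptual sketch (hypothetical data, plain pandas; not the library's actual
# implementation): encoding a categorical feature matrix column is roughly
# analogous to one-hot encoding it with pd.get_dummies.
def example_encoding_analogy():
    import pandas as pd

    fm = pd.DataFrame({"product_id": ["a", "b", "a"]})
    encoded = pd.get_dummies(fm, columns=["product_id"])
    assert list(encoded.columns) == ["product_id_a", "product_id_b"]
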
def test_direct_rename_multioutput(es): n_common = Feature( es["log"].ww["product_id"], parent_dataframe_name="customers", primitive=NMostCommon(n=2), ) feat = DirectFeature(n_common, "sessions") copy_feat = feat.rename("session_test") assert feat.unique_name() != copy_feat.unique_name() assert feat.get_name() != copy_feat.get_name() assert (feat.base_features[0].generate_name() == copy_feat.base_features[0].generate_name()) assert feat.dataframe_name == copy_feat.dataframe_name
def test_direct_from_identity(es): device = Feature(es["sessions"].ww["device_type"]) d = DirectFeature(base_feature=device, child_dataframe_name="log") feature_set = FeatureSet([d]) calculator = FeatureSetCalculator(es, feature_set=feature_set, time_last=None) df = calculator.run(np.array([0, 5])) df = to_pandas(df, index="id", sort_index=True) v = df[d.get_name()].tolist() if es.dataframe_type == Library.SPARK.value: expected = ["0", "1"] else: expected = [0, 1] assert v == expected
def test_direct_of_multi_output_transform_feat(es):
    # TODO: Update to work with Dask and Spark
    if es.dataframe_type != Library.PANDAS.value:
        pytest.xfail("Custom primitive is not compatible with Dask or Spark")

    class TestTime(TransformPrimitive):
        name = "test_time"
        input_types = [ColumnSchema(logical_type=Datetime)]
        return_type = ColumnSchema(semantic_tags={"numeric"})
        number_output_features = 6

        def get_function(self):
            def test_f(x):
                times = pd.Series(x)
                units = ["year", "month", "day", "hour", "minute", "second"]
                return [times.apply(lambda x: getattr(x, unit)) for unit in units]

            return test_f

    base_feature = IdentityFeature(es["customers"].ww["signup_date"])
    join_time_split = Feature(base_feature, primitive=TestTime)
    alt_features = [
        Feature(base_feature, primitive=Year),
        Feature(base_feature, primitive=Month),
        Feature(base_feature, primitive=Day),
        Feature(base_feature, primitive=Hour),
        Feature(base_feature, primitive=Minute),
        Feature(base_feature, primitive=Second),
    ]
    fm, fl = dfs(
        entityset=es,
        target_dataframe_name="sessions",
        trans_primitives=[TestTime, Year, Month, Day, Hour, Minute, Second],
    )

    # Get the column names for the multi-output feature and the single-output features
    subnames = DirectFeature(join_time_split, "sessions").get_feature_names()
    altnames = [DirectFeature(f, "sessions").get_name() for f in alt_features]
    # Check that the values are equal between the two
    for col1, col2 in zip(subnames, altnames):
        assert (fm[col1] == fm[col2]).all()

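# Illustration only (plain pandas): what TestTime's inner function computes for
# a single timestamp, one output series per datetime unit.
def example_test_time_outputs():
    import pandas as pd

    times = pd.Series([pd.Timestamp("2011-04-09 10:31:27")])
    units = ["year", "month", "day", "hour", "minute", "second"]
    outputs = [times.apply(lambda t: getattr(t, unit)) for unit in units]
    assert [o.iloc[0] for o in outputs] == [2011, 4, 9, 10, 31, 27]
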