def test_make_transform_restricts_time_arg():
    """Check that a ``time`` argument is only accepted with ``uses_calc_time``.

    The first primitive is created with ``uses_calc_time=True`` outside of any
    ``pytest.raises`` block, so it must not raise.  The second omits the flag
    and must raise ``ValueError`` carrying the restricted-keyword message.
    """
    import re  # local import: only needed to escape the expected message

    make_trans_primitive(lambda time: time, [Datetime],
                         Numeric,
                         name="AllowedPrimitive",
                         description="This primitive should be accepted",
                         uses_calc_time=True)

    error_text = "'time' is a restricted keyword.  Please use a different keyword."
    # ``match`` is applied as a regular expression; escape the message so its
    # "." characters only match literal periods instead of any character.
    with pytest.raises(ValueError, match=re.escape(error_text)):
        make_trans_primitive(lambda time: time, [Datetime],
                             Numeric,
                             name="BadPrimitive",
                             description="This primitive should erorr")
def test_make_transform_sets_kwargs_correctly(es):
    """Verify keyword arguments given to a custom primitive are kept on it.

    Two features are built from the same ``is_in`` primitive class using
    different ``list_of_outputs`` values; each one is then compared against
    its own base feature and its own keyword value.
    """
    def pd_is_in(array, list_of_outputs=None):
        # No list supplied means there is nothing to match against.
        wanted = [] if list_of_outputs is None else list_of_outputs
        return pd.Series(array).isin(wanted)

    def isin_generate_name(self, base_feature_names):
        first_base = base_feature_names[0]
        outputs = str(self.kwargs['list_of_outputs'])
        return u"%s.isin(%s)" % (first_base, outputs)

    IsIn = make_trans_primitive(
        pd_is_in,
        [Variable],
        Boolean,
        name="is_in",
        description="For each value of the base feature, checks whether it is "
        "in a list that is provided.",
        cls_attributes={"generate_name": isin_generate_name},
    )

    # (column of the ``log`` entity, list handed to the primitive)
    scenarios = (
        ('product_id', ["toothpaste", "coke_zero"]),
        ('session_id', ["coke_zero"]),
    )

    built = []
    for column, outputs in scenarios:
        base = ft.Feature(es['log'][column])
        derived = ft.Feature(base, primitive=IsIn(list_of_outputs=outputs))
        built.append((base, outputs, derived))

    for base, outputs, derived in built:
        assert base == derived.base_features[0]
        assert outputs == derived.primitive.kwargs['list_of_outputs']
def test_make_transform_sets_kwargs_correctly(es):
    """Verify that each primitive instance keeps its own keyword arguments.

    The generated ``IsIn`` class is instantiated twice, directly on a base
    feature, with different ``list_of_outputs`` values.  Each instance is then
    compared against its own base feature and keyword value.
    """
    def pd_is_in(array, list_of_outputs=None):
        # Fall back to an empty list when nothing was provided.
        wanted = list_of_outputs if list_of_outputs is not None else []
        return pd.Series(array).isin(wanted)

    def isin_generate_name(self):
        base_name = self.base_features[0].get_name()
        outputs = str(self.kwargs['list_of_outputs'])
        return u"%s.isin(%s)" % (base_name, outputs)

    primitive_options = dict(
        name="is_in",
        description="For each value of the base feature, checks whether it is "
        "in a list that is provided.",
        cls_attributes={"generate_name": isin_generate_name},
    )
    IsIn = make_trans_primitive(pd_is_in, [Variable], Boolean,
                                **primitive_options)

    # (column of the ``log`` entity, list handed to the primitive)
    scenarios = (
        ('product_id', ["toothpaste", "coke_zero"]),
        ('session_id', ["coke_zero"]),
    )

    built = []
    for column, outputs in scenarios:
        base = Feature(es['log'][column])
        built.append((base, outputs, IsIn(base, list_of_outputs=outputs)))

    for base, outputs, derived in built:
        assert base == derived.base_features[0]
        assert outputs == derived.kwargs['list_of_outputs']
# Example no. 4
# 0
def test_override_multi_feature_names(es):
    """Check that a custom ``generate_names`` decides the output column names.

    A three-output primitive is registered with ``gen_custom_names`` as its
    ``generate_names`` attribute; the names that function yields for ``age``
    must all appear as columns of the resulting feature matrix.
    """
    def gen_custom_names(primitive, base_feature_names):
        templates = ('Above18(%s)', 'Above21(%s)', 'Above65(%s)')
        return [template % base_feature_names for template in templates]

    def is_greater(x):
        # One comparison per output feature, in the same order as the names.
        return tuple(x > threshold for threshold in (18, 21, 65))

    IsGreater = make_trans_primitive(
        function=is_greater,
        input_types=[Numeric],
        return_type=Numeric,
        number_output_features=3,
        cls_attributes={"generate_names": gen_custom_names},
    )

    dfs_options = {
        "entityset": es,
        "target_entity": "customers",
        "instance_ids": [0, 1, 2],
        "agg_primitives": [],
        "trans_primitives": [IsGreater],
    }
    fm, _ = ft.dfs(**dfs_options)

    missing = [
        name for name in gen_custom_names(IsGreater, ['age'])
        if name not in fm.columns
    ]
    assert not missing
def test_make_transform_restricts_time_arg():
    """Check that a ``time`` argument is only accepted with ``uses_calc_time``.

    Creating the primitive with ``uses_calc_time=True`` happens outside of any
    ``pytest.raises`` block and therefore must succeed.  Creating it without
    the flag must raise ``ValueError`` with the restricted-keyword message.
    """
    import re  # local import: only needed to escape the expected message

    make_trans_primitive(
        lambda time: time,
        [Datetime],
        Numeric,
        name="AllowedPrimitive",
        description="This primitive should be accepted",
        uses_calc_time=True)

    error_text = "'time' is a restricted keyword.  Please use a different keyword."
    # ``match`` is interpreted as a regular expression; escaping keeps the "."
    # characters in the message from matching arbitrary characters.
    with pytest.raises(ValueError, match=re.escape(error_text)):
        make_trans_primitive(
            lambda time: time,
            [Datetime],
            Numeric,
            name="BadPrimitive",
            description="This primitive should erorr")
def test_isin_feat_custom(es):
    """Compare a custom ``is_in`` primitive and ``Feature.isin`` to known values.

    Each feature is calculated on its own for instance ids 0 through 7 and the
    resulting column is compared to a hard-coded list of booleans.  A dask
    feature matrix is computed before its values are read.
    """
    def pd_is_in(array, list_of_outputs=None):
        wanted = [] if list_of_outputs is None else list_of_outputs
        # dask series are used as they are; anything else is wrapped in pandas.
        series = array if isinstance(array, dd.Series) else pd.Series(array)
        return series.isin(wanted)

    def isin_generate_name(self, base_feature_names):
        first_base = base_feature_names[0]
        outputs = str(self.kwargs['list_of_outputs'])
        return u"%s.isin(%s)" % (first_base, outputs)

    IsIn = make_trans_primitive(
        pd_is_in,
        [Variable],
        Boolean,
        name="is_in",
        description="For each value of the base feature, checks whether it is "
        "in a list that is provided.",
        cls_attributes={"generate_name": isin_generate_name},
    )

    def computed_values(feature):
        # Calculate a single feature and return its column as a plain list.
        matrix = ft.calculate_feature_matrix(entityset=es,
                                             features=[feature],
                                             instance_ids=range(8))
        if isinstance(matrix, dd.DataFrame):
            matrix = matrix.compute()
        return matrix[feature.get_name()].values.tolist()

    products = ["toothpaste", "coke zero"]
    product_hits = [True, True, True, False, False, True, True, True]

    custom = ft.Feature(es['log']['product_id'],
                        primitive=IsIn(list_of_outputs=products))
    assert product_hits == computed_values(custom)

    builtin = ft.Feature(es['log']['product_id']).isin(products)
    assert product_hits == computed_values(builtin)

    numeric = ft.Feature(es['log']['value']).isin([5, 10])
    value_hits = [False, True, True, False, False, False, False, False]
    assert value_hits == computed_values(numeric)
def test_isin_feat_custom(es):
    """Compare a custom ``is_in`` primitive and ``Feature.isin`` to known values.

    Every feature is calculated separately for instance ids 0 through 7,
    passed through ``to_pandas`` (indexed by ``id`` and sorted), and its
    column is compared to a hard-coded list of booleans.
    """
    def pd_is_in(array, list_of_outputs=None):
        wanted = [] if list_of_outputs is None else list_of_outputs
        return array.isin(wanted)

    def isin_generate_name(self, base_feature_names):
        first_base = base_feature_names[0]
        outputs = str(self.kwargs['list_of_outputs'])
        return u"%s.isin(%s)" % (first_base, outputs)

    IsIn = make_trans_primitive(
        pd_is_in,
        [ColumnSchema()],
        ColumnSchema(logical_type=Boolean),
        name="is_in",
        description="For each value of the base feature, checks whether it is "
        "in a list that is provided.",
        cls_attributes={"generate_name": isin_generate_name},
    )

    def computed_values(feature):
        # Calculate a single feature and return its column as a plain list.
        matrix = ft.calculate_feature_matrix(entityset=es,
                                             features=[feature],
                                             instance_ids=range(8))
        frame = to_pandas(matrix, index='id', sort_index=True)
        return frame[feature.get_name()].tolist()

    products = ["toothpaste", "coke zero"]
    product_hits = [True, True, True, False, False, True, True, True]

    custom = ft.Feature(es['log'].ww['product_id'],
                        primitive=IsIn(list_of_outputs=products))
    assert product_hits == computed_values(custom)

    builtin = ft.Feature(es['log'].ww['product_id']).isin(products)
    assert product_hits == computed_values(builtin)

    numeric = ft.Feature(es['log'].ww['value']).isin([5, 10])
    value_hits = [False, True, True, False, False, False, False, False]
    assert value_hits == computed_values(numeric)
def test_make_transform_multiple_output_features(pd_es):
    """Check a six-output custom primitive against the single-output ones.

    ``TestTime`` splits a datetime into year/month/day/hour/minute/second.
    Each of its output columns must equal the column produced by the matching
    built-in primitive, and features named after its outputs are expected in
    the feature list returned by DFS.
    """
    def test_time(x):
        times = pd.Series(x)
        pieces = []
        for unit in ("year", "month", "day", "hour", "minute", "second"):
            # Bind ``unit`` as a default so each lambda reads its own attribute.
            pieces.append(
                times.apply(lambda stamp, unit=unit: getattr(stamp, unit)))
        return pieces

    def gen_feat_names(self):
        base_name = self.base_features[0].get_name()
        names = []
        for subname in ("Year", "Month", "Day", "Hour", "Minute", "Second"):
            names.append("Now.%s(%s)" % (subname, base_name))
        return names

    TestTime = make_trans_primitive(
        function=test_time,
        input_types=[ColumnSchema(logical_type=Datetime)],
        return_type=ColumnSchema(semantic_tags={'numeric'}),
        number_output_features=6,
        cls_attributes={"get_feature_names": gen_feat_names},
    )

    single_output_primitives = [Year, Month, Day, Hour, Minute, Second]

    join_time_split = ft.Feature(pd_es["log"].ww["datetime"],
                                 primitive=TestTime)
    alt_features = [
        ft.Feature(pd_es["log"].ww["datetime"], primitive=primitive)
        for primitive in single_output_primitives
    ]

    fm, fl = ft.dfs(
        entityset=pd_es,
        target_dataframe_name="log",
        agg_primitives=['sum'],
        trans_primitives=[TestTime] + single_output_primitives + [Diff],
        max_depth=5,
    )

    # Compare every multi-output column with its single-output counterpart.
    paired = zip(join_time_split.get_feature_names(), alt_features)
    for multi_column, single_feature in paired:
        assert (fm[multi_column] == fm[single_feature.get_name()]).all()

    for i in range(6):
        stacked_sum = 'sessions.customers.SUM(log.TEST_TIME(datetime)[%d])' % i
        assert feature_with_name(fl, stacked_sum)
        # NOTE(review): this tests a plain string against the feature list
        # with ``in``, unlike the feature_with_name lookup above -- confirm it
        # checks what is intended.
        stacked_diff = 'products.DIFF(SUM(log.TEST_TIME(datetime)[%d]))' % i
        assert stacked_diff in fl
def test_groupby_with_multioutput_primitive(pd_es):
    """Check a three-output groupby primitive against CumSum/CumMax/CumMin.

    DFS is run twice on ``customers``: once with the multi-output primitive
    alongside the single-output ones, and once with only the single-output
    ones.  The multi-output columns must equal the single-output columns, and
    the single-output columns must be the same in both runs.
    """
    def multi_cum_sum(x):
        return x.cumsum(), x.cummax(), x.cummin()

    def assert_pairwise_equal(left, right):
        # Element-by-element comparison of two equally ordered sequences.
        for a, b in zip(left, right):
            assert a == b

    numeric = {'numeric'}
    MultiCumSum = make_trans_primitive(
        function=multi_cum_sum,
        input_types=[ColumnSchema(semantic_tags=numeric)],
        return_type=ColumnSchema(semantic_tags=numeric),
        number_output_features=3,
    )

    shared_options = dict(entityset=pd_es,
                          target_dataframe_name='customers',
                          trans_primitives=[],
                          agg_primitives=[])

    fm, _ = dfs(groupby_trans_primitives=[MultiCumSum, CumSum, CumMax, CumMin],
                **shared_options)

    # Second, independent DFS call so it can be checked that the multi-output
    # code does not alter any single-output values.
    fm2, _ = dfs(groupby_trans_primitives=[CumSum, CumMax, CumMin],
                 **shared_options)

    answer_cols = [
        ['CUM_SUM(age) by cohort', 'CUM_SUM(age) by région_id'],
        ['CUM_MAX(age) by cohort', 'CUM_MAX(age) by région_id'],
        ['CUM_MIN(age) by cohort', 'CUM_MIN(age) by région_id']
    ]

    for i, (cohort_answer, region_answer) in enumerate(answer_cols):
        # Multi-output columns must match the single-output answers.
        multi_cohort = 'MULTI_CUM_SUM(age)[%d] by cohort' % i
        assert multi_cohort in fm.columns
        assert_pairwise_equal(fm[multi_cohort].values,
                              fm[cohort_answer].values)

        multi_region = 'MULTI_CUM_SUM(age)[%d] by région_id' % i
        assert multi_region in fm.columns
        assert_pairwise_equal(fm[multi_region].values,
                              fm[region_answer].values)

        # Single-output results must be the same with and without the
        # multi-output primitive.
        assert_pairwise_equal(fm[cohort_answer], fm2[cohort_answer])
        assert_pairwise_equal(fm[region_answer], fm2[region_answer])
def test_make_transform_multiple_output_features(es):
    """Check a six-output custom primitive against the single-output ones.

    Each output column of ``TestTime`` must equal the column of the matching
    built-in datetime primitive, and no feature returned by DFS may use the
    multi-output feature as one of its base features.
    """
    def test_f(x):
        times = pd.Series(x)
        pieces = []
        for unit in ("year", "month", "day", "hour", "minute", "second"):
            # Bind ``unit`` as a default so each lambda reads its own attribute.
            pieces.append(
                times.apply(lambda stamp, unit=unit: getattr(stamp, unit)))
        return pieces

    def gen_feat_names(self):
        base_name = self.base_features[0].get_name()
        names = []
        for subname in ("Year", "Month", "Day", "Hour", "Minute", "Second"):
            names.append("Now.%s(%s)" % (subname, base_name))
        return names

    TestTime = make_trans_primitive(
        function=test_f,
        input_types=[Datetime],
        return_type=Numeric,
        number_output_features=6,
        cls_attributes={"get_feature_names": gen_feat_names},
    )

    single_output_primitives = [Year, Month, Day, Hour, Minute, Second]

    join_time_split = ft.Feature(es["log"]["datetime"], primitive=TestTime)
    alt_features = [
        ft.Feature(es["log"]["datetime"], primitive=primitive)
        for primitive in single_output_primitives
    ]

    fm, fl = ft.dfs(entityset=es,
                    target_entity="log",
                    trans_primitives=[TestTime] + single_output_primitives)

    # Compare every multi-output column with its single-output counterpart.
    paired = zip(join_time_split.get_feature_names(), alt_features)
    for multi_column, single_feature in paired:
        assert (fm[multi_column] == fm[single_feature.get_name()]).all()

    # Nothing may be stacked on top of the new primitive.
    for feature in fl:
        assert all(base.hash() != join_time_split.hash()
                   for base in feature.base_features)
def test_isin_feat_custom(es):
    """Compare a custom ``is_in`` primitive and ``Feature.isin`` to known values.

    Each feature is calculated through its own ``PandasBackend`` for instance
    ids 0 through 7 and its column is compared to a hard-coded boolean list.
    """
    def pd_is_in(array, list_of_outputs=None):
        wanted = [] if list_of_outputs is None else list_of_outputs
        return pd.Series(array).isin(wanted)

    def isin_generate_name(self):
        base_name = self.base_features[0].get_name()
        outputs = str(self.kwargs['list_of_outputs'])
        return u"%s.isin(%s)" % (base_name, outputs)

    IsIn = make_trans_primitive(
        pd_is_in,
        [Variable],
        Boolean,
        name="is_in",
        description="For each value of the base feature, checks whether it is "
        "in a list that is provided.",
        cls_attributes={"generate_name": isin_generate_name},
    )

    def computed_values(feature):
        # Calculate a single feature and return its column as a plain list.
        backend = PandasBackend(es, [feature])
        frame = backend.calculate_all_features(range(8), None)
        return frame[feature.get_name()].values.tolist()

    products = ["toothpaste", "coke zero"]
    product_hits = [True, True, True, False, False, True, True, True]

    custom = IsIn(es['log']['product_id'], list_of_outputs=products)
    assert product_hits == computed_values(custom)

    builtin = Feature(es['log']['product_id']).isin(products)
    assert product_hits == computed_values(builtin)

    numeric = Feature(es['log']['value']).isin([5, 10])
    value_hits = [False, True, True, False, False, False, False, False]
    assert value_hits == computed_values(numeric)
def test_isin_feat_custom(es):
    """Check ``is_in`` style features against hard-coded expected columns.

    A custom primitive built with ``make_trans_primitive`` and the ``isin``
    method of ``Feature`` are each evaluated with a fresh ``PandasBackend``
    for instance ids 0 through 7.
    """
    def pd_is_in(array, list_of_outputs=None):
        if list_of_outputs is None:
            return pd.Series(array).isin([])
        return pd.Series(array).isin(list_of_outputs)

    def isin_generate_name(self):
        name_parts = (self.base_features[0].get_name(),
                      str(self.kwargs['list_of_outputs']))
        return u"%s.isin(%s)" % name_parts

    primitive_options = dict(
        name="is_in",
        description="For each value of the base feature, checks whether it is "
        "in a list that is provided.",
        cls_attributes={"generate_name": isin_generate_name},
    )
    IsIn = make_trans_primitive(pd_is_in, [Variable], Boolean,
                                **primitive_options)

    def assert_column(feature, expected):
        # Evaluate one feature on its own and compare its column to expected.
        backend = PandasBackend(es, [feature])
        frame = backend.calculate_all_features(range(8), None)
        assert expected == frame[feature.get_name()].values.tolist()

    assert_column(
        IsIn(es['log']['product_id'],
             list_of_outputs=["toothpaste", "coke zero"]),
        [True, True, True, False, False, True, True, True])

    assert_column(
        Feature(es['log']['product_id']).isin(["toothpaste", "coke zero"]),
        [True, True, True, False, False, True, True, True])

    assert_column(
        Feature(es['log']['value']).isin([5, 10]),
        [False, True, True, False, False, False, False, False])
def test_groupby_with_multioutput_primitive_custom_names(pd_es):
    """Check custom output names on a three-output groupby primitive.

    The primitive's ``generate_names`` is replaced by ``gen_custom_names``;
    the resulting ``CUSTOM_* by ...`` columns must exist in the feature matrix
    and equal the matching CumSum/CumMax/CumMin columns.
    """
    def gen_custom_names(primitive, base_feature_names):
        return ["CUSTOM_SUM", "CUSTOM_MAX", "CUSTOM_MIN"]

    def multi_cum_sum(x):
        return x.cumsum(), x.cummax(), x.cummin()

    numeric = {'numeric'}
    MultiCumSum = make_trans_primitive(
        function=multi_cum_sum,
        input_types=[ColumnSchema(semantic_tags=numeric)],
        return_type=ColumnSchema(semantic_tags=numeric),
        number_output_features=3,
        cls_attributes={"generate_names": gen_custom_names},
    )

    fm, _ = dfs(
        entityset=pd_es,
        target_dataframe_name='customers',
        trans_primitives=[],
        agg_primitives=[],
        groupby_trans_primitives=[MultiCumSum, CumSum, CumMax, CumMin],
    )

    answer_cols = [
        ['CUM_SUM(age) by cohort', 'CUM_SUM(age) by région_id'],
        ['CUM_MAX(age) by cohort', 'CUM_MAX(age) by région_id'],
        ['CUM_MIN(age) by cohort', 'CUM_MIN(age) by région_id']
    ]

    expected_names = [
        ['CUSTOM_SUM by cohort', 'CUSTOM_SUM by région_id'],
        ['CUSTOM_MAX by cohort', 'CUSTOM_MAX by région_id'],
        ['CUSTOM_MIN by cohort', 'CUSTOM_MIN by région_id']
    ]

    for custom_pair, answer_pair in zip(expected_names, answer_cols):
        # Index 0 is the "by cohort" column, index 1 the "by région_id" one.
        for custom_name, answer_name in zip(custom_pair, answer_pair):
            assert custom_name in fm.columns
            custom_values = fm[custom_name].values
            answer_values = fm[answer_name].values
            for x, y in zip(custom_values, answer_values):
                assert x == y
# Example no. 14
# 0
def test_groupby_multi_output_stacking(pd_es):
    """Check that SUM features are built on each output of ``test_time``.

    ``test_time`` is declared with six outputs and used as a groupby
    transform primitive; for each output index a SUM feature grouped by
    ``product_id`` is looked up by name in the features-only DFS result.
    """
    TestTime = make_trans_primitive(
        function=lambda x: x,
        name="test_time",
        input_types=[Datetime],
        return_type=Numeric,
        number_output_features=6,
    )

    dfs_options = {
        "entityset": pd_es,
        "target_entity": "sessions",
        "agg_primitives": ['sum'],
        "groupby_trans_primitives": [TestTime],
        "features_only": True,
        "max_depth": 4,
    }
    fl = dfs(**dfs_options)

    for i in range(6):
        by_product = 'SUM(log.TEST_TIME(datetime)[%d] by product_id)' % i
        assert feature_with_name(fl, by_product)
        # NOTE(review): this tests a plain string against the feature list
        # with ``in``, unlike the feature_with_name lookup above -- confirm it
        # checks what is intended.
        by_session = (
            'customers.SUM(log.TEST_TIME(datetime)[%d] by session_id)' % i)
        assert by_session in fl
# Example no. 15
# 0
def test_groupby_multi_output_stacking(es):
    """Check that CUM_SUM groupby features are built on ``test_time`` outputs.

    ``test_time`` is declared with six outputs and used as a regular transform
    primitive while ``CumSum`` is the groupby primitive; for each output index
    a stacked CUM_SUM feature is looked up by name in the features-only DFS
    result.
    """
    TestTime = make_trans_primitive(
        function=lambda x: x,
        name="test_time",
        input_types=[Datetime],
        return_type=Numeric,
        number_output_features=6,
    )

    dfs_options = {
        "entityset": es,
        "target_entity": "sessions",
        "agg_primitives": [],
        "trans_primitives": [TestTime],
        "groupby_trans_primitives": [CumSum],
        "features_only": True,
        "max_depth": 4,
    }
    fl = dfs(**dfs_options)

    for i in range(6):
        by_cohort = (
            'customers.CUM_SUM(TEST_TIME(upgrade_date)[%d]) by cohort' % i)
        assert feature_with_name(fl, by_cohort)
        # NOTE(review): this tests a plain string against the feature list
        # with ``in``, unlike the feature_with_name lookup above -- confirm it
        # checks what is intended.
        by_customer = (
            'customers.CUM_SUM(TEST_TIME(date_of_birth)[%d]) by customer_id'
            % i)
        assert by_customer in fl