Exemple #1
0
def test_make_transform_sets_kwargs_correctly(es):
    """Check that ``make_trans_primitive`` primitives keep per-instance kwargs.

    Two ``IsIn`` features are built with different ``list_of_outputs``
    values. The test asserts that each feature reports its own base feature
    and that its primitive's ``kwargs`` holds the list it was created with.
    """

    def pd_is_in(array, list_of_outputs=None):
        # With no list supplied, membership is tested against an empty list.
        candidates = [] if list_of_outputs is None else list_of_outputs
        return pd.Series(array).isin(candidates)

    def isin_generate_name(self, base_feature_names):
        # "<first base feature name>.isin(<str() of the configured list>)"
        first_base = base_feature_names[0]
        outputs_repr = str(self.kwargs["list_of_outputs"])
        return u"%s.isin(%s)" % (first_base, outputs_repr)

    input_types = [ColumnSchema()]
    return_type = ColumnSchema(logical_type=Boolean)
    IsIn = make_trans_primitive(
        pd_is_in,
        input_types,
        return_type,
        name="is_in",
        description="For each value of the base feature, checks whether it is "
        "in a list that is provided.",
        cls_attributes={"generate_name": isin_generate_name},
    )

    # (column of the "log" dataframe, list handed to the primitive)
    cases = [
        ("product_id", ["toothpaste", "coke_zero"]),
        ("session_id", ["coke_zero"]),
    ]

    built = []
    for column, outputs in cases:
        base = Feature(es["log"].ww[column])
        derived = Feature(base, primitive=IsIn(list_of_outputs=outputs))
        built.append((base, outputs, derived))

    for base, outputs, derived in built:
        assert base == derived.base_features[0]
        assert outputs == derived.primitive.kwargs["list_of_outputs"]
def test_feature_takes_timedelta_string(es):
    """Check that ``use_previous`` given as a string ends up as a ``Timedelta``.

    A ``Count`` feature over the ``log`` ``id`` column is built with
    ``use_previous="1 day"``; its ``use_previous`` attribute must compare
    equal to ``Timedelta(1, "d")``.
    """
    log_id = Feature(es["log"].ww["id"])
    counted = Feature(log_id,
                      parent_dataframe_name="customers",
                      use_previous="1 day",
                      primitive=Count)
    one_day = Timedelta(1, "d")
    assert counted.use_previous == one_day
def test_encode_features_topn(entityset):
    """Check that an ``NMostCommon`` feature survives ``encode_features``.

    Uses the entity-based arguments (``parent_entity`` / ``target_entity``).
    After running ``dfs`` for customers 0, 1 and 2 and encoding the result
    with ``include_unknown=True``, the hand-built top-n feature's hash must be
    among the encoded definitions, and each of its output names must be a
    column of the encoded matrix.
    """
    product_id = entityset['log']['product_id']
    customers = entityset['customers']
    topn = Feature(product_id,
                   parent_entity=customers,
                   primitive=NMostCommon(n=3))

    features, feature_defs = dfs(
        entityset=entityset,
        instance_ids=[0, 1, 2],
        target_entity="customers",
        agg_primitives=[NMostCommon(n=3)],
    )
    encoded = encode_features(features, feature_defs, include_unknown=True)
    features_enc, feature_defs_enc = encoded

    topn_hash = topn.hash()
    encoded_hashes = [feat.hash() for feat in feature_defs_enc]
    assert topn_hash in encoded_hashes

    for name in topn.get_feature_names():
        assert name in features_enc.columns
def test_encode_features_topn(pd_es):
    """Check that an ``NMostCommon`` feature survives ``encode_features``.

    After running ``dfs`` for customers 0, 1 and 2 and encoding the result
    with ``include_unknown=True``, the hand-built top-n feature's unique name
    must be among the encoded definitions, and each of its output names must
    appear exactly once in the encoded matrix's columns.
    """
    product_id = Feature(pd_es["log"].ww["product_id"])
    topn = Feature(
        product_id,
        parent_dataframe_name="customers",
        primitive=NMostCommon(n=3),
    )

    features, feature_defs = dfs(
        entityset=pd_es,
        instance_ids=[0, 1, 2],
        target_dataframe_name="customers",
        agg_primitives=[NMostCommon(n=3)],
    )
    encoded = encode_features(features, feature_defs, include_unknown=True)
    features_enc, feature_defs_enc = encoded

    topn_name = topn.unique_name()
    encoded_names = [feat.unique_name() for feat in feature_defs_enc]
    assert topn_name in encoded_names

    for name in topn.get_feature_names():
        assert name in features_enc.columns
        # Guard against the same output column being emitted twice.
        occurrences = features_enc.columns.tolist().count(name)
        assert occurrences == 1
Exemple #5
0
def test_direct_of_multi_output_transform_feat(es):
    """Compare a six-output transform with six single-output primitives.

    ``TestTime`` returns year, month, day, hour, minute and second of a
    datetime column in one call. After ``dfs`` on ``sessions``, each column
    of the direct feature built on ``TestTime`` must equal the column of the
    direct feature built on the corresponding single-output primitive.
    """

    class TestTime(TransformPrimitive):
        name = "test_time"
        input_types = [Datetime]
        return_type = Numeric
        number_output_features = 6

        def get_function(self):
            def test_f(x):
                times = pd.Series(x)
                parts = []
                for unit in ("year", "month", "day", "hour", "minute", "second"):
                    # The lambda runs inside this iteration, so `unit` is current.
                    parts.append(times.apply(lambda stamp: getattr(stamp, unit)))
                return parts
            return test_f

    single_output_primitives = (Year, Month, Day, Hour, Minute, Second)

    signup_date = es["customers"]["signup_date"]
    join_time_split = Feature(signup_date, primitive=TestTime)
    alt_features = [
        Feature(signup_date, primitive=primitive)
        for primitive in single_output_primitives
    ]

    fm, _ = dfs(
        entityset=es,
        target_entity="sessions",
        trans_primitives=[TestTime, *single_output_primitives])

    # Column names for the multi-output feature and for the single-output ones
    sessions = es["sessions"]
    subnames = DirectFeature(join_time_split, sessions).get_feature_names()
    altnames = [DirectFeature(alt, sessions).get_name() for alt in alt_features]

    # Each multi-output column must match its single-output counterpart
    for multi_col, single_col in zip(subnames, altnames):
        assert (fm[multi_col] == fm[single_col]).all()
def test_direct_rename_multioutput(es):
    """Check ``rename`` on a direct feature over a multi-output aggregation.

    The renamed copy must differ from the original in both ``unique_name()``
    and ``get_name()``, while the first base feature's generated name and the
    ``dataframe_name`` must stay the same.
    """
    n_common = Feature(
        es["log"].ww["product_id"],
        parent_dataframe_name="customers",
        primitive=NMostCommon(n=2),
    )
    original = DirectFeature(n_common, "sessions")
    renamed = original.rename("session_test")

    # What rename is expected to change
    assert original.unique_name() != renamed.unique_name()
    assert original.get_name() != renamed.get_name()

    # What rename is expected to leave alone
    original_base_name = original.base_features[0].generate_name()
    renamed_base_name = renamed.base_features[0].generate_name()
    assert original_base_name == renamed_base_name
    assert original.dataframe_name == renamed.dataframe_name
Exemple #7
0
def test_direct_rename_multioutput(es):
    """Check that renaming a direct feature only affects the feature's name.

    A ``DirectFeature`` on ``sessions`` wrapping an ``NMostCommon(n=2)``
    feature is renamed to ``"session_test"``. Its ``unique_name()`` and
    ``get_name()`` must change; the generated name of its first base feature
    and its ``dataframe_name`` must not.
    """
    product_id = es["log"].ww["product_id"]
    n_common = Feature(product_id,
                       parent_dataframe_name="customers",
                       primitive=NMostCommon(n=2))
    feat = DirectFeature(n_common, "sessions")
    copy_feat = feat.rename("session_test")

    assert feat.unique_name() != copy_feat.unique_name()
    assert feat.get_name() != copy_feat.get_name()

    base_name_before = feat.base_features[0].generate_name()
    base_name_after = copy_feat.base_features[0].generate_name()
    assert base_name_before == base_name_after

    assert feat.dataframe_name == copy_feat.dataframe_name
def test_direct_from_identity(es):
    """Calculate a direct feature on ``log`` taken from ``sessions``.

    The ``device_type`` feature of ``sessions`` is exposed on ``log`` through
    a ``DirectFeature`` and calculated for instance ids 0 and 5. The values
    are expected to be ``[0, 1]``, or the strings ``['0', '1']`` when the
    entityset's dataframe type is Koalas.
    """
    device = Feature(es["sessions"].ww["device_type"])
    direct = DirectFeature(base_feature=device, child_dataframe_name="log")

    calculator = FeatureSetCalculator(
        es,
        feature_set=FeatureSet([direct]),
        time_last=None,
    )
    instance_ids = np.array([0, 5])
    result = to_pandas(calculator.run(instance_ids), index="id", sort_index=True)
    values = result[direct.get_name()].tolist()

    is_koalas = es.dataframe_type == Library.KOALAS.value
    expected = ["0", "1"] if is_koalas else [0, 1]
    assert values == expected
Exemple #9
0
def test_direct_from_identity(es):
    """Calculate a direct feature on ``log`` taken from ``sessions``.

    The ``device_type`` feature of ``sessions`` is exposed on ``log`` through
    a ``DirectFeature`` and calculated for instance ids 0 and 5. The values
    are expected to be ``[0, 1]``, or the strings ``["0", "1"]`` when the
    entityset's dataframe type is Spark.
    """
    device_type = Feature(es["sessions"].ww["device_type"])
    direct = DirectFeature(base_feature=device_type,
                           child_dataframe_name="log")

    calculator = FeatureSetCalculator(
        es, feature_set=FeatureSet([direct]), time_last=None
    )
    raw = calculator.run(np.array([0, 5]))
    frame = to_pandas(raw, index="id", sort_index=True)
    values = frame[direct.get_name()].tolist()

    expected = [0, 1]
    if es.dataframe_type == Library.SPARK.value:
        expected = ["0", "1"]
    assert values == expected
Exemple #10
0
def test_direct_of_multi_output_transform_feat(es):
    """Compare a six-output transform with six single-output primitives.

    ``TestTime`` returns year, month, day, hour, minute and second of a
    datetime column in one call. After ``dfs`` on ``sessions``, each column
    of the direct feature built on ``TestTime`` must equal the column of the
    direct feature built on the corresponding single-output primitive.
    The test is marked xfail for any dataframe type other than pandas.
    """
    # TODO: Update to work with Dask and Spark
    if es.dataframe_type != Library.PANDAS.value:
        pytest.xfail("Custom primitive is not compatible with Dask or Spark")

    class TestTime(TransformPrimitive):
        name = "test_time"
        input_types = [ColumnSchema(logical_type=Datetime)]
        return_type = ColumnSchema(semantic_tags={"numeric"})
        number_output_features = 6

        def get_function(self):
            def test_f(x):
                times = pd.Series(x)
                parts = []
                for unit in ("year", "month", "day", "hour", "minute", "second"):
                    # The lambda runs inside this iteration, so `unit` is current.
                    parts.append(times.apply(lambda stamp: getattr(stamp, unit)))
                return parts

            return test_f

    single_output_primitives = (Year, Month, Day, Hour, Minute, Second)

    base_feature = IdentityFeature(es["customers"].ww["signup_date"])
    join_time_split = Feature(base_feature, primitive=TestTime)
    alt_features = [
        Feature(base_feature, primitive=primitive)
        for primitive in single_output_primitives
    ]

    fm, _ = dfs(
        entityset=es,
        target_dataframe_name="sessions",
        trans_primitives=[TestTime, *single_output_primitives],
    )

    # Column names for the multi-output feature and for the single-output ones
    subnames = DirectFeature(join_time_split, "sessions").get_feature_names()
    altnames = [DirectFeature(alt, "sessions").get_name() for alt in alt_features]

    # Each multi-output column must match its single-output counterpart
    for multi_col, single_col in zip(subnames, altnames):
        assert (fm[multi_col] == fm[single_col]).all()
def test_feature_takes_timedelta_string(es):
    """Check that a string ``use_previous`` is exposed as a ``Timedelta``.

    Builds a ``Count`` feature on ``customers`` over the ``log`` ``id``
    column with ``use_previous="1 day"`` and asserts that the resulting
    attribute equals ``Timedelta(1, 'd')``.
    """
    base = Feature(es["log"].ww["id"])
    feature_kwargs = {
        "parent_dataframe_name": "customers",
        "use_previous": "1 day",
        "primitive": Count,
    }
    feature = Feature(base, **feature_kwargs)
    assert feature.use_previous == Timedelta(1, "d")