Esempio n. 1
0
def test_arithmetic_of_identity(es):
    """Binary arithmetic primitives over two identity features produce the
    expected element-wise results (division by zero yields NaN)."""
    logs = es['log']

    cases = [(AddNumeric, [0., 7., 14., 21.]),
             (SubtractNumeric, [0, 3, 6, 9]),
             (MultiplyNumeric, [0, 10, 40, 90]),
             (DivideNumeric, [np.nan, 2.5, 2.5, 2.5])]

    features = [ft.Feature([logs['value'], logs['value_2']], primitive=prim)
                for prim, _ in cases]

    df = ft.calculate_feature_matrix(entityset=es,
                                     features=features,
                                     instance_ids=[0, 1, 2, 3])

    # All but the division case can be compared exactly.
    for idx, (_, expected) in enumerate(cases[:-1]):
        assert df[features[idx].get_name()].values.tolist() == expected

    # 0 / 0 produces NaN in the first row, so check it separately.
    division = df[features[3].get_name()].values.tolist()
    assert np.isnan(division[0])
    assert division[1:] == cases[-1][1][1:]
def test_make_agg_feat_using_prev_time(es):
    """Count of log ids inside a trailing 10-second window depends on the
    cutoff time used for the calculation."""
    agg_feat = ft.Feature(es['log']['id'],
                          parent_entity=es['sessions'],
                          use_previous=Timedelta(10, 's'),
                          primitive=Count)

    feature_set = FeatureSet([agg_feat])

    def count_at(cutoff):
        # Build a fresh calculator for each cutoff and read the value
        # computed for instance 0.
        calc = FeatureSetCalculator(es,
                                    time_last=cutoff,
                                    feature_set=feature_set)
        return calc.run([0])[agg_feat.get_name()][0]

    # Two events fall inside the window ending at 10:30:10 ...
    assert count_at(datetime(2011, 4, 9, 10, 30, 10)) == 2
    # ... but only one inside the window ending at 10:30:30.
    assert count_at(datetime(2011, 4, 9, 10, 30, 30)) == 1
Esempio n. 3
0
def test_make_dfeat_of_agg_feat_on_self(es, backend):
    """
    The graph looks like this:

        R       R = Regions, a parent of customers
        |
        C       C = Customers, the entity we're trying to predict on
        |
       etc.

    We're trying to calculate a DFeat from C to R on an agg_feat of R on C.
    """
    customers_per_region = ft.Feature(es['customers']['id'],
                                      parent_entity=es[u'régions'],
                                      primitive=Count)
    num_customers = DirectFeature(customers_per_region,
                                  child_entity=es['customers'])

    df = backend([num_customers]).calculate_all_features(instance_ids=[0],
                                                         time_last=None)
    # Customer 0's region contains three customers in the test entityset.
    assert df[num_customers.get_name()][0] == 3
def test_get_filepath(es):
    """A custom primitive can load a bundled data file via get_filepath and
    use it both in direct calculation and inside dfs."""
    class Mod4(TransformPrimitive):
        '''Return base feature modulo 4'''
        name = "mod4"
        input_types = [Numeric]
        return_type = Numeric

        def get_function(self):
            filepath = self.get_filepath("featuretools_unit_test_example.csv")
            # FIX: ``squeeze=True`` was removed from read_csv in pandas 2.0;
            # DataFrame.squeeze("columns") is the documented replacement and
            # also works on older pandas versions.
            reference = pd.read_csv(filepath, header=None).squeeze("columns")

            def map_to_word(x):
                def _map(x):
                    # Propagate missing values unchanged.
                    if pd.isnull(x):
                        return x
                    return reference[int(x) % 4]
                return pd.Series(x).apply(_map)
            return map_to_word

    feat = ft.Feature(es['log']['value'], primitive=Mod4)
    df = ft.calculate_feature_matrix(features=[feat],
                                     entityset=es,
                                     instance_ids=range(17))

    # Row 15 holds a NaN value; rows 0 and 14 map to known words.
    assert pd.isnull(df["MOD4(value)"][15])
    assert df["MOD4(value)"][0] == 0
    assert df["MOD4(value)"][14] == 2

    # The same primitive must work when discovered through dfs.
    fm, fl = ft.dfs(entityset=es,
                    target_entity="log",
                    agg_primitives=[],
                    trans_primitives=[Mod4])

    assert fm["MOD4(value)"][0] == 0
    assert fm["MOD4(value)"][14] == 2
    assert pd.isnull(fm["MOD4(value)"][15])
def test_make_agg_feat_using_prev_time(es):
    """Count of log ids inside a trailing 10-second window, evaluated at two
    different cutoff times via the Woodwork-based API."""
    agg_feat = ft.Feature(es['log'].ww['id'],
                          parent_dataframe_name='sessions',
                          use_previous=Timedelta(10, 's'),
                          primitive=Count)

    feature_set = FeatureSet([agg_feat])

    def window_count(cutoff):
        # One calculator per cutoff; convert to pandas for indexing and read
        # the value computed for instance 0.
        calc = FeatureSetCalculator(es,
                                    time_last=cutoff,
                                    feature_set=feature_set)
        frame = to_pandas(calc.run(np.array([0])))
        return frame[agg_feat.get_name()][0]

    assert window_count(datetime(2011, 4, 9, 10, 30, 10)) == 2
    assert window_count(datetime(2011, 4, 9, 10, 30, 30)) == 1
def test_one_hot_encoding():
    """One-hot encoder round trip: primitive behavior, the ordered feature
    list produced by fitting, and the contents of the encoded matrix."""
    feature_matrix, features, f1, f2, f3, f4, es, ids = create_feature_matrix()

    # Inject a missing country so the encoder must emit a NaN category.
    # NOTE(review): chained assignment — assumes this mutates the frame in
    # place; verify under newer pandas (copy-on-write would break it).
    feature_matrix['countrycode'][0] = np.nan
    enc = Encoder(method='one_hot')
    fm_encoded = enc.fit_transform(feature_matrix, features)

    # Calling the primitive directly: 1 where the value matches 'coke zero'.
    encoder = OneHotEnc(value='coke zero')
    encoded = encoder(['car', 'toothpaste', 'coke zero', 'coke zero'])
    encoded_results = [0, 0, 1, 1]
    assert (encoded == encoded_results).all()

    # NaN is treated as its own matchable category.
    encoder = OneHotEnc(value=np.nan)
    encoded = encoder(['car', 'toothpaste', 'coke zero', 'coke zero', np.nan])
    encoded_results = [0, 0, 0, 0, 1]
    assert (encoded == encoded_results).all()

    # The fitted encoder must produce exactly these features, in this order.
    f1_1 = ft.Feature([f1], primitive=OneHotEnc('coke zero'))
    f1_2 = ft.Feature([f1], primitive=OneHotEnc('car'))
    f1_3 = ft.Feature([f1], primitive=OneHotEnc('toothpaste'))

    f4_1 = ft.Feature([f4], primitive=OneHotEnc('US'))
    f4_2 = ft.Feature([f4], primitive=OneHotEnc('AL'))
    f4_3 = ft.Feature([f4], primitive=OneHotEnc(np.nan))
    features_encoded = [f1_1, f1_2, f1_3, f2, f3, f4_1, f4_2, f4_3]
    assert len(features_encoded) == len(enc.get_features())
    for i in range(len(features_encoded)):
        assert features_encoded[i].unique_name() == enc.get_features()[i].unique_name()

    # Recalculating from the encoder's features reproduces the expected matrix
    # for instances 6 and 7.
    features_encoded = enc.get_features()
    feature_matrix = ft.calculate_feature_matrix(features_encoded, es, instance_ids=[6, 7])
    data = {'product_id = coke zero': [0, 0],
            'product_id = car': [0, 0],
            'product_id = toothpaste': [1, 1],
            'purchased': [True, True],
            'value': [1.0, 2.0],
            'countrycode = US': [0, 0],
            'countrycode = AL': [1, 1],
            'countrycode = nan': [0, 0]}
    fm_encoded = pd.DataFrame(data, index=[6, 7])
    assert feature_matrix.eq(fm_encoded).all().all()
def test_haversine_with_nan(pd_es):
    """Haversine distance propagates NaN for rows where either latlong tuple
    is fully or partially missing."""
    # Check some `nan` values
    df = pd_es['log']
    # NOTE(review): chained assignment on the dataframe pulled from the
    # entityset; the mutated frame is written back via replace_dataframe.
    df['latlong'][0] = np.nan
    df['latlong'][1] = (10, np.nan)
    pd_es.replace_dataframe(dataframe_name='log', df=df)
    log_latlong_feat = ft.Feature(pd_es['log'].ww['latlong'])
    log_latlong_feat2 = ft.Feature(pd_es['log'].ww['latlong2'])
    haversine = ft.Feature([log_latlong_feat, log_latlong_feat2],
                           primitive=Haversine)
    features = [haversine]

    df = ft.calculate_feature_matrix(entityset=pd_es, features=features)
    values = df[haversine.get_name()].values
    # Rows 0/1 (injected NaNs) and 15/16 come out as NaN; the rest are
    # known distances in miles for the fixture coordinates.
    real = [
        np.nan, np.nan, 1045.32190304, 1554.56176802, 2047.3294327, 0,
        138.16578931, 276.20524822, 413.99185444, 0, 0, 525.318462, 0,
        741.57941183, 1467.52760175, np.nan, np.nan
    ]

    assert np.allclose(values, real, atol=0.0001, equal_nan=True)

    # Check all `nan` values
    df = pd_es['log']
    df['latlong2'] = np.nan
    pd_es.replace_dataframe(dataframe_name='log', df=df)
    log_latlong_feat = ft.Feature(pd_es['log'].ww['latlong'])
    log_latlong_feat2 = ft.Feature(pd_es['log'].ww['latlong2'])
    haversine = ft.Feature([log_latlong_feat, log_latlong_feat2],
                           primitive=Haversine)
    features = [haversine]

    df = ft.calculate_feature_matrix(entityset=pd_es, features=features)
    values = df[haversine.get_name()].values
    # With one input entirely NaN, every output row must be NaN.
    real = [np.nan] * pd_es['log'].shape[0]

    assert np.allclose(values, real, atol=0.0001, equal_nan=True)
Esempio n. 8
0
def test_serialized_renamed_features(es):
    """Renamed features keep their new name(s) through a serialize /
    deserialize round trip, for every feature type."""
    def assert_rename_survives_round_trip(original):
        renamed_to = "MyFeature"
        base_names = original.get_feature_names()
        if len(base_names) == 1:
            expected = [renamed_to]
        else:
            # Multi-output features get an index suffix per output.
            expected = ["{}[{}]".format(renamed_to, i)
                        for i in range(len(base_names))]

        renamed = original.rename(renamed_to)
        check_names(renamed, renamed_to, expected)

        serialized = FeaturesSerializer([renamed]).to_dict()
        round_tripped = FeaturesDeserializer(serialized).to_list()[0]
        check_names(round_tripped, renamed_to, expected)

    identity_original = ft.IdentityFeature(es["log"].ww["value"])
    assert identity_original.get_name() == "value"

    value = ft.IdentityFeature(es["log"].ww["value"])

    agg_original = ft.AggregationFeature(value, "customers", ft.primitives.Max())
    assert agg_original.get_name() == "MAX(log.value)"

    direct_original = ft.DirectFeature(
        ft.IdentityFeature(es["customers"].ww["age"]), "sessions"
    )
    assert direct_original.get_name() == "customers.age"

    transform_original = ft.TransformFeature(
        value, ft.primitives.MultiplyNumericScalar(value=2)
    )
    assert transform_original.get_name() == "value * 2"

    zipcode = ft.IdentityFeature(es["log"].ww["zipcode"])
    groupby_original = ft.feature_base.GroupByTransformFeature(
        value, CumSum(), zipcode
    )
    assert groupby_original.get_name() == "CUM_SUM(value) by zipcode"

    multioutput_original = ft.Feature(
        es["log"].ww["product_id"],
        parent_dataframe_name="customers",
        primitive=NMostCommon(n=2),
    )
    assert multioutput_original.get_name() == "N_MOST_COMMON(log.product_id, n=2)"

    featureslice_original = ft.feature_base.FeatureOutputSlice(multioutput_original, 0)
    assert featureslice_original.get_name() == "N_MOST_COMMON(log.product_id, n=2)[0]"

    # One representative of every feature class goes through the round trip.
    for feature in (identity_original, agg_original, direct_original,
                    transform_original, groupby_original,
                    multioutput_original, featureslice_original):
        assert_rename_survives_round_trip(feature)
def test_cum_sum_numpy_group_on_nan(pd_es):
    """Grouped cumulative sum leaves NaN for rows whose groupby key is NaN."""
    class CumSumNumpy(TransformPrimitive):
        """Returns the cumulative sum after grouping"""

        name = "cum_sum"
        input_types = [ColumnSchema(semantic_tags={"numeric"})]
        return_type = ColumnSchema(semantic_tags={"numeric"})
        # Needs the whole dataframe so grouping can span all rows.
        uses_full_dataframe = True

        def get_function(self):
            def cum_sum(values):
                return values.cumsum().values

            return cum_sum

    log_value_feat = ft.IdentityFeature(pd_es["log"].ww["value"])
    # Overwrite the grouping column: four rows get a NaN key and the final
    # two get 'coke_zero' (underscore — deliberately distinct from
    # 'coke zero' so they form their own group).
    pd_es["log"]["product_id"] = (
        ["coke zero"] * 3
        + ["car"] * 2
        + ["toothpaste"] * 3
        + ["brown bag"] * 2
        + ["shoes"]
        + [np.nan] * 4
        + ["coke_zero"] * 2
    )
    # NOTE(review): chained assignment into the entityset's dataframe —
    # assumes it mutates in place; verify under newer pandas.
    pd_es["log"]["value"][16] = 10
    cum_sum = ft.Feature(
        log_value_feat,
        groupby=ft.IdentityFeature(pd_es["log"].ww["product_id"]),
        primitive=CumSumNumpy,
    )
    assert cum_sum.get_name() == "CUM_SUM(value) by product_id"
    features = [cum_sum]
    df = ft.calculate_feature_matrix(
        entityset=pd_es, features=features, instance_ids=range(17)
    )
    cvalues = df[cum_sum.get_name()].values
    assert len(cvalues) == 17
    # Rows 11-15 have a NaN group key, so their cumulative sum is NaN;
    # row 16 starts the 'coke_zero' group with the injected value 10.
    cum_sum_values = [
        0,
        5,
        15,
        15,
        35,
        0,
        1,
        3,
        3,
        3,
        0,
        np.nan,
        np.nan,
        np.nan,
        np.nan,
        np.nan,
        10,
    ]

    assert len(cvalues) == len(cum_sum_values)
    for i, v in enumerate(cum_sum_values):
        if np.isnan(v):
            assert np.isnan(cvalues[i])
        else:
            assert v == cvalues[i]
def test_tranform_stack_agg(es):
    """Stacking a transform primitive on a multi-output aggregation fails."""
    top_products = ft.Feature(es['log']['product_id'],
                              parent_entity=es['customers'],
                              primitive=NMostCommon(n=3))
    with pytest.raises(AssertionError):
        ft.Feature(top_products, primitive=Percentile)
Esempio n. 11
0
def test_scalar_overrides(es):
    """Operator overloads on a feature build the same primitives as explicit
    scalar-primitive construction, for both operand orders."""
    value = ft.Feature(es["log"].ww["value"])

    # feature-op-scalar: each override must match the primitive built
    # directly with the scalar argument 2.
    forward_cases = [
        (AddNumericScalar, value + 2),
        (SubtractNumericScalar, value - 2),
        (MultiplyNumericScalar, value * 2),
        (DivideNumericScalar, value / 2),
        (ModuloNumericScalar, value % 2),
        (GreaterThanScalar, value > 2),
        (LessThanScalar, value < 2),
        (EqualScalar, value == 2),
        (NotEqualScalar, value != 2),
        (GreaterThanEqualToScalar, value >= 2),
        (LessThanEqualToScalar, value <= 2),
    ]
    for primitive, override in forward_cases:
        built = ft.Feature(value, primitive=primitive(2))
        assert override.unique_name() == built.unique_name()

    value2 = ft.Feature(es["log"].ww["value_2"])

    # scalar-op-feature: comparisons flip (2 < f is f > 2), and the
    # non-commutative arithmetic ops map to their "reflected" primitives.
    reverse_cases = [
        (AddNumericScalar, 2 + value2),
        (ScalarSubtractNumericFeature, 2 - value2),
        (MultiplyNumericScalar, 2 * value2),
        (DivideByFeature, 2 / value2),
        (ModuloByFeature, 2 % value2),
        (GreaterThanScalar, 2 < value2),
        (LessThanScalar, 2 > value2),
        (EqualScalar, 2 == value2),
        (NotEqualScalar, 2 != value2),
        (GreaterThanEqualToScalar, 2 <= value2),
        (LessThanEqualToScalar, 2 >= value2),
    ]
    for primitive, override in reverse_cases:
        built = ft.Feature(value2, primitive=primitive(2))
        assert override.unique_name() == built.unique_name()
Esempio n. 12
0
def test_empty_child_dataframe(parent_child):
    """Aggregations over a child entity with no rows in the calculation
    window fall back to their defaults: 0 for Count, NaN for Trend and
    NMostCommon. Non-pandas entitysets only support the Count features."""
    parent_df, child_df = parent_child
    # Non-pandas frames need explicit variable types; pandas infers them.
    if not isinstance(parent_df, pd.DataFrame):
        parent_vtypes = {'id': variable_types.Index}
        child_vtypes = {
            'id': variable_types.Index,
            'parent_id': variable_types.Numeric,
            'time_index': variable_types.Datetime,
            'value': variable_types.Numeric,
            'cat': variable_types.Categorical
        }
    else:
        parent_vtypes = None
        child_vtypes = None
    es = ft.EntitySet(id="blah")
    es.entity_from_dataframe(entity_id="parent",
                             dataframe=parent_df,
                             index="id",
                             variable_types=parent_vtypes)
    es.entity_from_dataframe(entity_id="child",
                             dataframe=child_df,
                             index="id",
                             time_index="time_index",
                             variable_types=child_vtypes)
    es.add_relationship(
        ft.Relationship(es["parent"]["id"], es["child"]["parent_id"]))

    # create regular agg
    count = ft.Feature(es["child"]['id'],
                       parent_entity=es["parent"],
                       primitive=Count)

    # create agg feature that requires multiple arguments
    trend = ft.Feature([es["child"]['value'], es["child"]['time_index']],
                       parent_entity=es["parent"],
                       primitive=Trend)

    # create multi-output agg feature
    n_most_common = ft.Feature(es["child"]['cat'],
                               parent_entity=es["parent"],
                               primitive=NMostCommon)

    # create aggs with where
    where = ft.Feature(es["child"]["value"]) == 1
    count_where = ft.Feature(es["child"]['id'],
                             parent_entity=es["parent"],
                             where=where,
                             primitive=Count)
    trend_where = ft.Feature([es["child"]['value'], es["child"]['time_index']],
                             parent_entity=es["parent"],
                             where=where,
                             primitive=Trend)
    n_most_common_where = ft.Feature(es["child"]['cat'],
                                     parent_entity=es["parent"],
                                     where=where,
                                     primitive=NMostCommon)

    # Trend and NMostCommon are pandas-only, so the expected feature list
    # (and expected values) shrink for other backends.
    if isinstance(parent_df, pd.DataFrame):
        features = [
            count, count_where, trend, trend_where, n_most_common,
            n_most_common_where
        ]
        names = [
            count.get_name(),
            count_where.get_name(),
            trend.get_name(),
            trend_where.get_name(), *n_most_common.get_feature_names(),
            *n_most_common_where.get_feature_names()
        ]
        values = [
            0, 0, np.nan, np.nan,
            *np.full(n_most_common.number_output_features, np.nan),
            *np.full(n_most_common_where.number_output_features, np.nan)
        ]
    else:
        features = [count, count_where]
        names = [count.get_name(), count_where.get_name()]
        values = [0, 0]

    # cutoff time before all rows
    fm = ft.calculate_feature_matrix(entityset=es,
                                     features=features,
                                     cutoff_time=pd.Timestamp("12/31/2017"))
    fm = to_pandas(fm)

    assert_array_equal(fm[names], [values])

    # cutoff time after all rows, but where clause filters all rows
    if isinstance(parent_df, pd.DataFrame):
        features = [count_where, trend_where, n_most_common_where]
        names = [
            count_where.get_name(),
            trend_where.get_name(), *n_most_common_where.get_feature_names()
        ]
        values = [
            0, np.nan,
            *np.full(n_most_common_where.number_output_features, np.nan)
        ]
    else:
        features = [count_where]
        names = [count_where.get_name()]
        values = [0]

    fm2 = ft.calculate_feature_matrix(entityset=es,
                                      features=features,
                                      cutoff_time=pd.Timestamp("1/4/2018"))
    fm2 = to_pandas(fm2)

    assert_array_equal(fm2[names], [values])
Esempio n. 13
0
def test_multi_output_base_error_agg(es):
    """Aggregating directly on a whole multi-output feature raises."""
    top_three = ft.Feature(es['log'].ww['product_id'],
                           parent_dataframe_name="sessions",
                           primitive=NMostCommon(3))
    error_text = "Cannot stack on whole multi-output feature."
    with pytest.raises(ValueError, match=error_text):
        ft.Feature(top_three, parent_dataframe_name='customers', primitive=NumUnique)
# print(es['transactions'].variables)

#------------------do deep feature synthesis(dfs)--------------------------
# feature_matrix, feature_defs = ft.dfs(entityset=es, target_entity='products')
# Restrict dfs to a single aggregation and a single transform primitive,
# one relationship hop deep, so the output stays small and inspectable.
feature_matrix, feature_defs = ft.dfs(
    entityset=es,
    target_entity='products',
    agg_primitives=['count'],  # aggregation primitives applied across entities
    trans_primitives=['month'],  # transform primitives applied within the target entity
    max_depth=1)
print(feature_matrix.columns.tolist())
print(feature_matrix.head())
print(feature_defs)

print('-------seed feature( used defined feature) ---')
# A hand-built boolean feature supplied to dfs as a seed; percent_true
# then aggregates it per product.
expansive_purchase = ft.Feature(es['transactions']['amount']) > 100
feature_matrix, feature_defs = ft.dfs(entityset=es,
                                      target_entity='products',
                                      agg_primitives=['percent_true'],
                                      seed_features=[expansive_purchase])
print(feature_matrix.columns.tolist())
print(feature_matrix.head())
print(feature_defs)

print('---------where primitives-------')
# Marking interesting values enables conditional ("where") aggregations
# for this column in subsequent dfs calls.
es['transactions']['date_of_birth'].interesting_values = [
    '1986-08-18', '1986-08-19'
]  # 'where_primitives' selects which agg_primitives get the where clauses
feature_matrix, feature_defs = ft.dfs(
    entityset=es,
    target_entity='products',
def test_empty_child_dataframe(parent_child):
    """Aggregations over a child dataframe with no rows in the calculation
    window return their default values (0 for Count, NaN for Trend and
    NMostCommon).

    Two cases are exercised: a cutoff time before every child row, and a
    cutoff time after all rows where the ``where`` clause filters them all.
    """
    parent_df, child_df = parent_child
    child_ltypes = {
        'parent_id': Integer,
        'time_index': Datetime,
        'value': Double,
        'cat': Categorical
    }

    es = ft.EntitySet(id="blah")
    es.add_dataframe(dataframe_name="parent", dataframe=parent_df, index="id")
    es.add_dataframe(dataframe_name="child",
                     dataframe=child_df,
                     index="id",
                     time_index="time_index",
                     logical_types=child_ltypes)
    es.add_relationship("parent", "id", "child", "parent_id")

    # create regular agg
    count = ft.Feature(es["child"].ww["id"],
                       parent_dataframe_name="parent",
                       primitive=Count)

    # create agg feature that requires multiple arguments
    trend = ft.Feature([
        ft.Feature(es["child"].ww["value"]),
        ft.Feature(es["child"].ww['time_index'])
    ],
                       parent_dataframe_name="parent",
                       primitive=Trend)

    # create multi-output agg feature
    n_most_common = ft.Feature(es["child"].ww["cat"],
                               parent_dataframe_name="parent",
                               primitive=NMostCommon)

    # create aggs with where
    where = ft.Feature(es["child"].ww["value"]) == 1
    count_where = ft.Feature(es["child"].ww["id"],
                             parent_dataframe_name="parent",
                             where=where,
                             primitive=Count)
    trend_where = ft.Feature([
        ft.Feature(es["child"].ww["value"]),
        ft.Feature(es["child"].ww["time_index"])
    ],
                             parent_dataframe_name="parent",
                             where=where,
                             primitive=Trend)
    n_most_common_where = ft.Feature(es["child"].ww["cat"],
                                     parent_dataframe_name="parent",
                                     where=where,
                                     primitive=NMostCommon)

    # Trend and NMostCommon are pandas-only, so other backends check the
    # reduced feature list.
    if isinstance(parent_df, pd.DataFrame):
        features = [
            count, count_where, trend, trend_where, n_most_common,
            n_most_common_where
        ]
        data = {
            count.get_name(): pd.Series([0], dtype="Int64"),
            count_where.get_name(): pd.Series([0], dtype="Int64"),
            trend.get_name(): pd.Series([np.nan], dtype="float"),
            trend_where.get_name(): pd.Series([np.nan], dtype="float")
        }
        for name in n_most_common.get_feature_names():
            data[name] = pd.Series([np.nan], dtype="category")
        for name in n_most_common_where.get_feature_names():
            data[name] = pd.Series([np.nan], dtype="category")
    else:
        features = [count, count_where]
        data = {
            count.get_name(): pd.Series([0], dtype="Int64"),
            count_where.get_name(): pd.Series([0], dtype="Int64")
        }

    answer = pd.DataFrame(data)

    # cutoff time before all rows
    fm = ft.calculate_feature_matrix(entityset=es,
                                     features=features,
                                     cutoff_time=pd.Timestamp("12/31/2017"))
    fm = to_pandas(fm)

    for column in data.keys():
        pd.testing.assert_series_equal(fm[column],
                                       answer[column],
                                       check_names=False,
                                       check_index=False)

    # cutoff time after all rows, but where clause filters all rows
    if isinstance(parent_df, pd.DataFrame):
        features = [count_where, trend_where, n_most_common_where]
        data = {
            count_where.get_name(): pd.Series([0], dtype="Int64"),
            trend_where.get_name(): pd.Series([np.nan], dtype="float")
        }
        for name in n_most_common_where.get_feature_names():
            data[name] = pd.Series([np.nan], dtype="category")
    else:
        features = [count_where]
        data = {count_where.get_name(): pd.Series([0], dtype="Int64")}
    answer = pd.DataFrame(data)

    fm2 = ft.calculate_feature_matrix(entityset=es,
                                      features=features,
                                      cutoff_time=pd.Timestamp("1/4/2018"))
    fm2 = to_pandas(fm2)

    # BUG FIX: the original asserted against ``fm`` (the first matrix) here,
    # so the second calculation was never actually verified.
    for column in data.keys():
        pd.testing.assert_series_equal(fm2[column],
                                       answer[column],
                                       check_names=False,
                                       check_index=False)
Esempio n. 16
0
def test_return_type_inference_datetime_time_index(es):
    """Last over a datetime column keeps the Datetime logical type."""
    feat = ft.Feature(es["log"].ww["datetime"],
                      parent_dataframe_name="customers",
                      primitive=Last)
    assert isinstance(feat.column_schema.logical_type, Datetime)
Esempio n. 17
0
def test_return_type_inference_index(es):
    """Last over an index column drops the 'index' tag but keeps Integer."""
    feat = ft.Feature(es["log"].ww["id"],
                      parent_dataframe_name="customers",
                      primitive=Last)
    schema = feat.column_schema
    assert "index" not in schema.semantic_tags
    assert isinstance(schema.logical_type, Integer)
Esempio n. 18
0
def test_return_type_inference_direct_feature(es):
    """A direct feature of a Mode aggregation keeps the base column schema."""
    mode_by_customer = ft.Feature(es["log"].ww["priority_level"],
                                  parent_dataframe_name="customers",
                                  primitive=Mode)
    direct = ft.Feature(mode_by_customer, "sessions")
    expected = IdentityFeature(es["log"].ww["priority_level"]).column_schema
    assert direct.column_schema == expected
Esempio n. 19
0
def test_return_type_inference(es):
    """Mode's return schema matches the schema of its input column."""
    mode_feat = ft.Feature(es["log"].ww["priority_level"],
                           parent_dataframe_name="customers",
                           primitive=Mode)
    expected = IdentityFeature(es["log"].ww["priority_level"]).column_schema
    assert mode_feat.column_schema == expected
Esempio n. 20
0
def test_rename(es):
    """Renaming a single-output aggregation feature updates its name."""
    count_feat = ft.Feature(es['log'].ww['id'],
                            parent_dataframe_name='sessions',
                            primitive=Count)
    check_rename(count_feat, 'session_test', ['session_test'])
def test_arithmetic_two_vals_fails(es):
    """Raw scalars are rejected as inputs to a binary transform primitive."""
    with pytest.raises(Exception, match="Not a feature"):
        ft.Feature([2, 2], primitive=AddNumeric)
Esempio n. 22
0
def test_feature_takes_timedelta_string(es):
    """A use_previous string like "1 day" is parsed into a Timedelta."""
    count_feat = ft.Feature(es['log']['id'],
                            parent_entity=es['customers'],
                            use_previous="1 day",
                            primitive=Count)
    assert count_feat.use_previous == Timedelta(1, 'd')
def test_squared(es):
    """Multiplying a feature by itself yields two identical base features."""
    value = ft.Feature(es['log']['value'])
    squared = value * value
    base = squared.base_features
    assert len(base) == 2
    assert base[0].unique_name() == base[1].unique_name()
Esempio n. 24
0
def test_return_type_inference_numeric_time_index(int_es):
    """With an integer time index, Last over it stays tagged 'numeric'."""
    feat = ft.Feature(int_es["log"].ww["datetime"],
                      parent_dataframe_name="customers",
                      primitive=Last)
    assert "numeric" in feat.column_schema.semantic_tags
Esempio n. 25
0
def test_handles_primitive_function_name_uniqueness(entityset):
    """Features built from primitives that share a name, a base column, or
    even the very same underlying function must be computed independently
    and produce distinct, correct columns in the feature matrix."""
    class SumTimesN(AggregationPrimitive):
        name = "sum_times_n"
        input_types = [Numeric]
        return_type = Numeric

        def __init__(self, n):
            self.n = n

        def get_function(self):
            def my_function(values):
                return values.sum() * self.n

            return my_function

        def generate_name(self, base_feature_names, child_entity_id,
                          parent_entity_id, where_str, use_prev_str):
            # Include `n` in the name so differently-parameterized instances
            # generate distinct feature names.
            base_features_str = ", ".join(base_feature_names)
            return u"%s(%s.%s%s%s, n=%s)" % (self.name.upper(),
                                             child_entity_id,
                                             base_features_str,
                                             where_str, use_prev_str, self.n)

    # works as expected
    f1 = ft.Feature(entityset["log"]["value"],
                    parent_entity=entityset["customers"],
                    primitive=SumTimesN(n=1))
    fm = ft.calculate_feature_matrix(features=[f1], entityset=entityset)
    value_sum = pd.Series([56, 26, 0])
    assert all(fm[f1.get_name()].sort_index() == value_sum)

    # works as expected
    f2 = ft.Feature(entityset["log"]["value"],
                    parent_entity=entityset["customers"],
                    primitive=SumTimesN(n=2))
    fm = ft.calculate_feature_matrix(features=[f2], entityset=entityset)
    double_value_sum = pd.Series([112, 52, 0])
    assert all(fm[f2.get_name()].sort_index() == double_value_sum)

    # same primitive, same variable, different args
    fm = ft.calculate_feature_matrix(features=[f1, f2], entityset=entityset)
    assert all(fm[f1.get_name()].sort_index() == value_sum)
    assert all(fm[f2.get_name()].sort_index() == double_value_sum)

    # different primitives, same function returned by get_function,
    # different base features
    f3 = ft.Feature(entityset["log"]["value"],
                    parent_entity=entityset["customers"],
                    primitive=Sum)
    f4 = ft.Feature(entityset["log"]["purchased"],
                    parent_entity=entityset["customers"],
                    primitive=NumTrue)
    fm = ft.calculate_feature_matrix(features=[f3, f4], entityset=entityset)
    purchased_sum = pd.Series([10, 1, 1])
    assert all(fm[f3.get_name()].sort_index() == value_sum)
    # FIX: removed a stray trailing "\" line continuation that glued this
    # statement to the blank line below it.
    assert all(fm[f4.get_name()].sort_index() == purchased_sum)

    # different primitives, same function returned by get_function,
    # same base feature
    class Sum1(AggregationPrimitive):
        """Sums elements of a numeric or boolean feature."""
        name = "sum1"
        input_types = [Numeric]
        return_type = Numeric
        stack_on_self = False
        stack_on_exclude = [Count]
        default_value = 0

        def get_function(self):
            return np.sum

    class Sum2(AggregationPrimitive):
        """Sums elements of a numeric or boolean feature."""
        name = "sum2"
        input_types = [Numeric]
        return_type = Numeric
        stack_on_self = False
        stack_on_exclude = [Count]
        default_value = 0

        def get_function(self):
            return np.sum

    class Sum3(AggregationPrimitive):
        """Sums elements of a numeric or boolean feature."""
        name = "sum3"
        input_types = [Numeric]
        return_type = Numeric
        stack_on_self = False
        stack_on_exclude = [Count]
        default_value = 0

        def get_function(self):
            return np.sum

    f5 = ft.Feature(entityset["log"]["value"],
                    parent_entity=entityset["customers"],
                    primitive=Sum1)
    f6 = ft.Feature(entityset["log"]["value"],
                    parent_entity=entityset["customers"],
                    primitive=Sum2)
    f7 = ft.Feature(entityset["log"]["value"],
                    parent_entity=entityset["customers"],
                    primitive=Sum3)
    fm = ft.calculate_feature_matrix(features=[f5, f6, f7], entityset=entityset)
    # All three distinctly-named primitives wrap np.sum, so their values agree.
    assert all(fm[f5.get_name()].sort_index() == value_sum)
    assert all(fm[f6.get_name()].sort_index() == value_sum)
    assert all(fm[f7.get_name()].sort_index() == value_sum)
def test_return_type_inference(es):
    """Mode returns the same variable type as its input column."""
    priority = es["log"]["priority_level"]
    mode_feat = ft.Feature(priority,
                           parent_entity=es["customers"],
                           primitive=Mode)
    assert mode_feat.variable_type == priority.__class__
def test_empty_child_dataframe():
    """Aggregations with no child rows in range return their defaults.

    Whether the cutoff time precedes every child row, or a where clause
    filters all of them out, Count should fall back to 0 and the
    multi-argument Trend primitive to NaN.
    """
    parent_df = pd.DataFrame({"id": [1]})
    child_df = pd.DataFrame({
        "id": [1, 2, 3],
        "parent_id": [1, 1, 1],
        "time_index": pd.date_range(start='1/1/2018', periods=3),
        "value": [10, 5, 2],
    })

    es = ft.EntitySet(id="blah")
    es.entity_from_dataframe(entity_id="parent", dataframe=parent_df,
                             index="id")
    es.entity_from_dataframe(entity_id="child", dataframe=child_df,
                             index="id", time_index="time_index")
    es.add_relationship(
        ft.Relationship(es["parent"]["id"], es["child"]["parent_id"]))

    # A single-input aggregation and a multi-input aggregation...
    count = ft.Feature(es["child"]["id"], parent_entity=es["parent"],
                       primitive=Count)
    trend = ft.Feature([es["child"]["value"], es["child"]["time_index"]],
                       parent_entity=es["parent"], primitive=Trend)

    # ...plus where-filtered variants of each.
    value_is_one = ft.Feature(es["child"]["value"]) == 1
    count_where = ft.Feature(es["child"]["id"], parent_entity=es["parent"],
                             where=value_is_one, primitive=Count)
    trend_where = ft.Feature([es["child"]["value"], es["child"]["time_index"]],
                             parent_entity=es["parent"], where=value_is_one,
                             primitive=Trend)

    # Cutoff time before any child rows exist.
    matrix = ft.calculate_feature_matrix(
        entityset=es,
        features=[count, count_where, trend, trend_where],
        cutoff_time=pd.Timestamp("12/31/2017"))
    columns = [f.get_name() for f in (count, count_where, trend, trend_where)]
    assert_array_equal(matrix[columns], [[0, 0, np.nan, np.nan]])

    # Cutoff time after all rows, but the where clause filters every row.
    matrix = ft.calculate_feature_matrix(
        entityset=es,
        features=[count_where, trend_where],
        cutoff_time=pd.Timestamp("1/4/2018"))
    columns = [f.get_name() for f in (count_where, trend_where)]
    assert_array_equal(matrix[columns], [[0, np.nan]])
def test_return_type_inference_direct_feature(es):
    """A direct feature built on an aggregation keeps the base variable's type."""
    base_variable = es["log"]["priority_level"]
    mode_feature = ft.Feature(base_variable,
                              parent_entity=es["customers"],
                              primitive=Mode)
    direct_feature = ft.Feature(mode_feature, es["sessions"])
    assert direct_feature.variable_type == type(base_variable)
# Example no. 29 (listing artifact from the source site; vote count "0")
def test_handles_primitive_function_name_uniqueness(es):
    """Features from distinct primitives sharing an underlying function —
    or one primitive reused with different constructor args — must produce
    distinct, correctly-valued feature-matrix columns.
    """
    if not all(isinstance(entity.df, pd.DataFrame) for entity in es.entities):
        pytest.xfail(
            "Fails with Dask and Koalas due conflicting aggregation primitive names"
        )

    class SumTimesN(AggregationPrimitive):
        # Parameterized primitive: the same class with different `n` values
        # must still yield uniquely-named features.
        name = "sum_times_n"
        input_types = [Numeric]
        return_type = Numeric

        def __init__(self, n):
            self.n = n

        def get_function(self, agg_type='pandas'):
            def my_function(values):
                return values.sum() * self.n

            return my_function

    # works as expected
    f1 = ft.Feature(es["log"]["value"],
                    parent_entity=es["customers"],
                    primitive=SumTimesN(n=1))
    fm = ft.calculate_feature_matrix(features=[f1], entityset=es)

    value_sum = pd.Series([56, 26, 0])
    assert all(fm[f1.get_name()].sort_index() == value_sum)

    # works as expected
    f2 = ft.Feature(es["log"]["value"],
                    parent_entity=es["customers"],
                    primitive=SumTimesN(n=2))
    fm = ft.calculate_feature_matrix(features=[f2], entityset=es)

    double_value_sum = pd.Series([112, 52, 0])
    assert all(fm[f2.get_name()].sort_index() == double_value_sum)

    # same primitive, same variable, different args
    fm = ft.calculate_feature_matrix(features=[f1, f2], entityset=es)

    assert all(fm[f1.get_name()].sort_index() == value_sum)
    assert all(fm[f2.get_name()].sort_index() == double_value_sum)

    # different primitives, same function returned by get_function,
    # different base features
    f3 = ft.Feature(es["log"]["value"],
                    parent_entity=es["customers"],
                    primitive=Sum)
    f4 = ft.Feature(es["log"]["purchased"],
                    parent_entity=es["customers"],
                    primitive=NumTrue)
    fm = ft.calculate_feature_matrix(features=[f3, f4], entityset=es)

    purchased_sum = pd.Series([10, 1, 1])
    assert all(fm[f3.get_name()].sort_index() == value_sum)
    # FIX: a stray trailing backslash previously line-continued this assert
    # into the following blank line; removed (fragile — editing the next
    # line would have produced a syntax error).
    assert all(fm[f4.get_name()].sort_index() == purchased_sum)

    # different primitives, same function returned by get_function,
    # same base feature: each Sum* class wraps np.sum but carries a unique
    # `name`, so the three resulting features must get distinct columns.
    class Sum1(AggregationPrimitive):
        """Sums elements of a numeric or boolean feature."""
        name = "sum1"
        input_types = [Numeric]
        return_type = Numeric
        stack_on_self = False
        stack_on_exclude = [Count]
        default_value = 0

        def get_function(self, agg_type='pandas'):
            # Identical callable to Sum2/Sum3 — uniqueness must come from
            # the primitive name, not the function object.
            return np.sum

    class Sum2(AggregationPrimitive):
        """Sums elements of a numeric or boolean feature."""
        name = "sum2"
        input_types = [Numeric]
        return_type = Numeric
        stack_on_self = False
        stack_on_exclude = [Count]
        default_value = 0

        def get_function(self, agg_type='pandas'):
            return np.sum

    class Sum3(AggregationPrimitive):
        """Sums elements of a numeric or boolean feature."""
        name = "sum3"
        input_types = [Numeric]
        return_type = Numeric
        stack_on_self = False
        stack_on_exclude = [Count]
        default_value = 0

        def get_function(self, agg_type='pandas'):
            return np.sum

    f5 = ft.Feature(es["log"]["value"],
                    parent_entity=es["customers"],
                    primitive=Sum1)
    f6 = ft.Feature(es["log"]["value"],
                    parent_entity=es["customers"],
                    primitive=Sum2)
    f7 = ft.Feature(es["log"]["value"],
                    parent_entity=es["customers"],
                    primitive=Sum3)
    fm = ft.calculate_feature_matrix(features=[f5, f6, f7], entityset=es)
    assert all(fm[f5.get_name()].sort_index() == value_sum)
    assert all(fm[f6.get_name()].sort_index() == value_sum)
    assert all(fm[f7.get_name()].sort_index() == value_sum)
def test_return_type_inference_datetime_time_index(es):
    """Last over a datetime variable should report Datetime as its type."""
    datetime_variable = es["log"]["datetime"]
    last_feature = ft.Feature(datetime_variable,
                              parent_entity=es["customers"],
                              primitive=Last)
    assert last_feature.variable_type == Datetime