Esempio n. 1
0
def test_make_identity(es):
    """Identity feature on ``log.datetime`` yields the stored timestamp for instance 0."""
    identity = IdentityFeature(es["log"].ww["datetime"])

    calculator = FeatureSetCalculator(
        es,
        time_last=None,
        feature_set=FeatureSet([identity]),
    )
    matrix = to_pandas(calculator.run(np.array([0])))

    expected = datetime(2011, 4, 9, 10, 30, 0)
    assert matrix[identity.get_name()][0] == expected
Esempio n. 2
0
def test_dependent_percentile(es):
    """Percentile of ``log.value`` matches pandas ``rank(pct=True)``.

    A second percentile feature built on top of the first is calculated in the
    same feature set; only the first one's values are compared.
    """
    value = ft.Feature(es['log']['value'])
    pct = ft.Feature(value, primitive=Percentile)
    pct_of_pct = ft.Feature(pct - 1, primitive=Percentile)

    calculator = FeatureSetCalculator(es, FeatureSet([pct, pct_of_pct]))
    matrix = calculator.run(np.array(range(10, 17)))

    expected = es['log'].df[value.get_name()].rank(pct=True).loc[range(10, 17)]
    actual = matrix[pct.get_name()].values
    for want, got in zip(expected.values, actual):
        # NaN never compares equal, so a pair of nulls is accepted explicitly
        assert (pd.isnull(want) and pd.isnull(got)) or want == got
Esempio n. 3
0
def test_direct_from_identity(es):
    """Direct feature carries ``sessions.device_type`` down to log rows 0 and 5."""
    direct = DirectFeature(base_feature=es['sessions']['device_type'],
                           child_entity=es['log'])

    calculator = FeatureSetCalculator(es,
                                      feature_set=FeatureSet([direct]),
                                      time_last=None)
    matrix = calculator.run(np.array([0, 5]))

    assert matrix[direct.get_name()].tolist() == [0, 1]
def test_make_dfeat(es):
    """Direct feature of ``customers.age`` on sessions is expected to be 33 for session 0."""
    age_on_session = DirectFeature(es['customers']['age'],
                                   child_entity=es['sessions'])

    calculator = FeatureSetCalculator(es,
                                      time_last=None,
                                      feature_set=FeatureSet([age_on_session]))
    matrix = to_pandas(calculator.run(np.array([0])))

    assert matrix[age_on_session.get_name()][0] == 33
def test_make_agg_feat_using_prev_n_events(es):
    """MIN(log.value) per session over the previous 1 and 3 observations.

    The two features must get distinct names, and their values for session 0
    are checked at two different cutoff times.
    """
    def min_over_last(n_events):
        # Same aggregation each time; only the observation window differs.
        return ft.Feature(es['log']['value'],
                          parent_entity=es['sessions'],
                          use_previous=Timedelta(n_events,
                                                 'observations',
                                                 entity=es['log']),
                          primitive=Min)

    agg_feat_1 = min_over_last(1)
    agg_feat_2 = min_over_last(3)

    assert agg_feat_1.get_name() != agg_feat_2.get_name(), \
        'Features should have different names based on use_previous'

    feature_set = FeatureSet([agg_feat_1, agg_feat_2])

    # (cutoff time, expected value over 1 observation, expected over 3)
    # time_last is included by default
    cases = [
        (datetime(2011, 4, 9, 10, 30, 6), 5, 0),
        (datetime(2011, 4, 9, 10, 30, 30), 20, 10),
    ]
    for cutoff, expected_1, expected_2 in cases:
        calculator = FeatureSetCalculator(es,
                                          time_last=cutoff,
                                          feature_set=feature_set)
        matrix = calculator.run(np.array([0]))

        v1 = matrix[agg_feat_1.get_name()][0]
        v2 = matrix[agg_feat_2.get_name()][0]
        assert v1 == expected_1
        assert v2 == expected_2
Esempio n. 6
0
def test_make_dfeat(es):
    """Direct feature of ``customers.age`` on ``sessions`` is expected to be 33 for session 0."""
    customer_age = ft.Feature(es["customers"].ww["age"])
    age_on_session = DirectFeature(customer_age,
                                   child_dataframe_name="sessions")

    calculator = FeatureSetCalculator(
        es,
        time_last=None,
        feature_set=FeatureSet([age_on_session]),
    )
    matrix = to_pandas(calculator.run(np.array([0])))

    assert matrix[age_on_session.get_name()][0] == 33
Esempio n. 7
0
def test_make_3_stacked_agg_feats(df):
    """
    Tests stacking 3 agg features.

    The test specifically uses non numeric indices to test how ancestor columns are handled
    as dataframes are merged together

    The input frame is added as ``e0`` and normalized into ``e1``, ``e2`` and
    ``e3``; SUM(val) is then stacked through each level and the value for
    instance ``"z"`` of ``e3`` is expected to be 5.
    """
    if isinstance(df, dd.DataFrame):
        # The reason names the method that is actually called below.
        pytest.xfail("normalize_dataframe fails with dask DataFrame")
    es = ft.EntitySet()
    ltypes = {
        "e1": Categorical,
        "e2": Categorical,
        "e3": Categorical,
        "val": Double,
    }
    es.add_dataframe(dataframe=df,
                     index="id",
                     dataframe_name="e0",
                     logical_types=ltypes)

    es.normalize_dataframe(
        base_dataframe_name="e0",
        new_dataframe_name="e1",
        index="e1",
        additional_columns=["e2", "e3"],
    )

    es.normalize_dataframe(
        base_dataframe_name="e1",
        new_dataframe_name="e2",
        index="e2",
        additional_columns=["e3"],
    )

    es.normalize_dataframe(base_dataframe_name="e2",
                           new_dataframe_name="e3",
                           index="e3")

    sum_1 = ft.Feature(es["e0"].ww["val"],
                       parent_dataframe_name="e1",
                       primitive=Sum)
    sum_2 = ft.Feature(sum_1, parent_dataframe_name="e2", primitive=Sum)
    sum_3 = ft.Feature(sum_2, parent_dataframe_name="e3", primitive=Sum)

    feature_set = FeatureSet([sum_3])
    calculator = FeatureSetCalculator(es,
                                      time_last=None,
                                      feature_set=feature_set)
    # Separate name so the ``df`` fixture argument is not shadowed by the result.
    feature_matrix = calculator.run(np.array(["z"]))
    v = feature_matrix[sum_3.get_name()][0]
    assert v == 5
Esempio n. 8
0
def test_returns_order_of_instance_ids(pd_es):
    """Result rows follow the order of the requested instance ids.

    The requested order is first checked to differ from the order of ``id``
    in the customers dataframe, so the final assertion is meaningful.
    """
    instance_ids = [0, 1, 2]
    age = ft.Feature(pd_es['customers']['age'])
    calculator = FeatureSetCalculator(pd_es,
                                      time_last=None,
                                      feature_set=FeatureSet([age]))

    stored_order = list(pd_es['customers'].df['id'])
    assert stored_order != instance_ids

    matrix = calculator.run(np.array(instance_ids))

    assert list(matrix.index) == instance_ids
def test_make_agg_feat_of_identity_index_variable(es):
    """COUNT over the ``log`` index variable per session is expected to be 5 for session 0."""
    log_count = ft.Feature(es['log']['id'],
                           parent_entity=es['sessions'],
                           primitive=Count)

    calculator = FeatureSetCalculator(es,
                                      time_last=None,
                                      feature_set=FeatureSet([log_count]))
    matrix = calculator.run(np.array([0]))

    assert matrix[log_count.get_name()][0] == 5
def test_make_agg_feat_of_agg_feat(es):
    """SUM per customer of the per-session log COUNT is expected to be 10 for customer 0."""
    log_count_feat = ft.Feature(es['log']['id'],
                                parent_entity=es['sessions'],
                                primitive=Count)
    customer_sum_feat = ft.Feature(log_count_feat,
                                   parent_entity=es['customers'],
                                   primitive=Sum)

    calculator = FeatureSetCalculator(es,
                                      time_last=None,
                                      feature_set=FeatureSet([customer_sum_feat]))
    matrix = calculator.run(np.array([0]))

    assert matrix[customer_sum_feat.get_name()][0] == 10
def test_direct_from_variable(es):
    """Direct feature built straight from the ``sessions.device_type`` variable.

    Should be same behavior as test_direct_from_identity.
    """
    direct = DirectFeature(base_feature=es['sessions']['device_type'],
                           child_entity=es['log'])

    calculator = FeatureSetCalculator(es,
                                      feature_set=FeatureSet([direct]),
                                      time_last=None)
    matrix = to_pandas(calculator.run(np.array([0, 5])),
                       index='id',
                       sort_index=True)

    assert matrix[direct.get_name()].tolist() == [0, 1]
def test_direct_squared(es):
    """``feature * feature`` on ``log.value`` equals the square of the identity feature.

    Columns are looked up by feature name instead of by position, so the check
    does not depend on column order or on positional ``Series`` indexing with
    integer keys (deprecated in recent pandas).
    """
    feature = IdentityFeature(es['log']['value'])
    squared = feature * feature
    feature_set = FeatureSet([feature, squared])
    calculator = FeatureSetCalculator(es,
                                      time_last=None,
                                      feature_set=feature_set)
    df = calculator.run(np.array([0, 1, 2]))
    if isinstance(df, dd.DataFrame):
        # Materialize the Dask result so it can be iterated like a pandas frame.
        df = df.compute()
    base_values = df[feature.get_name()]
    squared_values = df[squared.get_name()]
    for base, square in zip(base_values, squared_values):
        assert (base * base) == square
def test_make_agg_feat_of_grandchild_entity(es):
    """COUNT of ``log`` rows aggregated directly onto customers is expected to be 10 for customer 0."""
    log_count = ft.Feature(es['log']['id'],
                           parent_entity=es['customers'],
                           primitive=Count)

    calculator = FeatureSetCalculator(es,
                                      time_last=None,
                                      feature_set=FeatureSet([log_count]))
    matrix = calculator.run(np.array([0]))

    assert matrix[log_count.get_name()][0] == 10
Esempio n. 14
0
def test_agg_percentile(es):
    """SUM per session of the percentile of ``log.value`` matches a pandas reference.

    The reference is computed with ``rank(pct=True)`` followed by a groupby-sum
    on ``session_id`` and compared for sessions 0 and 1.
    """
    v = ft.Feature(es['log']['value'])
    p = ft.Feature(v, primitive=Percentile)
    agg = ft.Feature(p, parent_entity=es['sessions'], primitive=Sum)
    feature_set = FeatureSet([agg])
    calculator = FeatureSetCalculator(es, feature_set)
    df = calculator.run(np.array([0, 1]))
    log_vals = es['log'].df[[v.get_name(), 'session_id']]
    # ``assign`` builds a new frame, so no column is written onto a selection of
    # the entity's dataframe (which triggers pandas' SettingWithCopyWarning).
    log_vals = log_vals.assign(
        percentile=log_vals[v.get_name()].rank(pct=True))
    true_p = log_vals.groupby('session_id')['percentile'].sum()[[0, 1]]
    for t, a in zip(true_p.values, df[agg.get_name()].values):
        # NaN never compares equal, so a pair of nulls is accepted explicitly
        assert (pd.isnull(t) and pd.isnull(a)) or t == a
def test_direct_from_identity(es):
    """Direct feature carries ``sessions.device_type`` down to log rows 0 and 5.

    A Dask result is computed, indexed by ``id`` and sorted before comparing.
    """
    direct = DirectFeature(base_feature=es['sessions']['device_type'],
                           child_entity=es['log'])

    calculator = FeatureSetCalculator(es,
                                      feature_set=FeatureSet([direct]),
                                      time_last=None)
    matrix = calculator.run(np.array([0, 5]))
    if isinstance(matrix, dd.DataFrame):
        matrix = matrix.compute().set_index('id').sort_index()

    assert matrix[direct.get_name()].tolist() == [0, 1]
        def calc_results(time_last,
                         ids,
                         precalculated_features=None,
                         training_window=None):
            """Run a FeatureSetCalculator for ``ids`` at ``time_last`` and return its result.

            ``entityset`` and ``feature_set`` are taken from the enclosing scope;
            the two optional arguments are forwarded to the calculator unchanged.
            """
            return FeatureSetCalculator(
                entityset,
                feature_set,
                time_last,
                training_window=training_window,
                precalculated_features=precalculated_features,
            ).run(ids)
def test_full_entity_trans_of_agg(es):
    """CumSum over the per-customer SUM(log.value) is expected to be 82 for customer 1."""
    value_sum = ft.Feature(es['log']['value'],
                           parent_entity=es['customers'],
                           primitive=Sum)
    running_total = ft.Feature(value_sum, primitive=CumSum)

    calculator = FeatureSetCalculator(es,
                                      time_last=None,
                                      feature_set=FeatureSet([running_total]))
    matrix = calculator.run(np.array([1]))

    assert matrix[running_total.get_name()][1] == 82
def test_make_agg_feat_using_prev_time(es):
    """COUNT of log rows per session within the previous 10 seconds.

    Session 0 is evaluated at two cutoff times, with expected counts 2 and 1.
    """
    recent_count = ft.Feature(es['log']['id'],
                              parent_entity=es['sessions'],
                              use_previous=Timedelta(10, 's'),
                              primitive=Count)
    feature_set = FeatureSet([recent_count])

    # (cutoff time, expected count)
    cases = [
        (datetime(2011, 4, 9, 10, 30, 10), 2),
        (datetime(2011, 4, 9, 10, 30, 30), 1),
    ]
    for cutoff, expected in cases:
        calculator = FeatureSetCalculator(es,
                                          time_last=cutoff,
                                          feature_set=feature_set)
        matrix = calculator.run(np.array([0]))

        assert matrix[recent_count.get_name()][0] == expected
def test_make_agg_feat_of_identity_column(es):
    """SUM(log.value) per session is expected to be 50 for session 0."""
    value_sum = ft.Feature(es['log'].ww['value'],
                           parent_dataframe_name='sessions',
                           primitive=Sum)

    calculator = FeatureSetCalculator(es,
                                      time_last=None,
                                      feature_set=FeatureSet([value_sum]))
    matrix = to_pandas(calculator.run(np.array([0])))

    assert matrix[value_sum.get_name()][0] == 50
def test_two_kinds_of_dependents(es):
    """One filtered aggregation feeds both a Percentile and an Absolute feature.

    A second, stacked aggregation with the same ``where`` clause is calculated
    in the same feature set; only the two transform outputs are asserted.
    """
    value = ft.Feature(es['log']['value'])
    product = ft.Feature(es['log']['product_id'])

    customer_sum = ft.Feature(value,
                              parent_entity=es['customers'],
                              where=product == 'coke zero',
                              primitive=Sum)
    pct = ft.Feature(customer_sum, primitive=Percentile)
    absolute = ft.Feature(customer_sum, primitive=Absolute)

    session_sum = ft.Feature(value,
                             parent_entity=es['sessions'],
                             where=product == 'coke zero',
                             primitive=Sum)
    stacked_sum = ft.Feature(session_sum,
                             parent_entity=es['customers'],
                             primitive=Sum)

    calculator = FeatureSetCalculator(es,
                                      FeatureSet([pct, absolute, stacked_sum]))
    matrix = calculator.run(np.array([0, 1]))

    assert matrix[pct.get_name()].tolist() == [2. / 3, 1.0]
    assert matrix[absolute.get_name()].tolist() == [15, 26]
Esempio n. 21
0
def test_make_agg_feat_using_prev_n_events(es):
    """MIN(log.value) per session over the previous 1 and 3 observations.

    The test is marked xfail unless every entity is backed by a pandas
    DataFrame. The two features must get distinct names, and their values for
    session 0 are checked at two different cutoff times.
    """
    if not all(isinstance(entity.df, pd.DataFrame) for entity in es.entities):
        pytest.xfail('Distributed entitysets do not support use_previous')
    agg_feat_1 = ft.Feature(es['log']['value'],
                            parent_entity=es['sessions'],
                            use_previous=Timedelta(1, 'observations'),
                            primitive=Min)

    agg_feat_2 = ft.Feature(es['log']['value'],
                            parent_entity=es['sessions'],
                            use_previous=Timedelta(3, 'observations'),
                            primitive=Min)

    assert agg_feat_1.get_name() != agg_feat_2.get_name(), \
        'Features should have different names based on use_previous'

    feature_set = FeatureSet([agg_feat_1, agg_feat_2])

    # (cutoff time, expected value over 1 observation, expected over 3)
    # time_last is included by default
    cases = [
        (datetime(2011, 4, 9, 10, 30, 6), 5, 0),
        (datetime(2011, 4, 9, 10, 30, 30), 20, 10),
    ]
    for time_last, expected_1, expected_2 in cases:
        calculator = FeatureSetCalculator(es,
                                          time_last=time_last,
                                          feature_set=feature_set)
        df = calculator.run(np.array([0]))

        v1 = df[agg_feat_1.get_name()][0]
        v2 = df[agg_feat_2.get_name()][0]
        assert v1 == expected_1
        assert v2 == expected_2
def test_make_identity(es):
    """Identity feature on ``log.datetime`` yields the stored timestamp for instance 0.

    A Dask result is computed into pandas before the value is read.
    """
    identity = IdentityFeature(es['log']['datetime'])

    calculator = FeatureSetCalculator(es,
                                      time_last=None,
                                      feature_set=FeatureSet([identity]))
    matrix = calculator.run(np.array([0]))
    if isinstance(matrix, dd.DataFrame):
        matrix = matrix.compute()

    expected = datetime(2011, 4, 9, 10, 30, 0)
    assert matrix[identity.get_name()][0] == expected
def test_make_agg_feat_of_grandchild_entity(es):
    """COUNT of ``log`` rows aggregated directly onto customers is expected to be 10 for customer 0.

    A Dask result is computed, indexed by ``id`` and given an int64 index so
    the label lookup ``[0]`` below works.
    """
    agg_feat = ft.Feature(es['log']['id'], parent_entity=es['customers'], primitive=Count)

    feature_set = FeatureSet([agg_feat])
    calculator = FeatureSetCalculator(es,
                                      time_last=None,
                                      feature_set=feature_set)
    df = calculator.run(np.array([0]))
    if isinstance(df, dd.DataFrame):
        df = df.compute().set_index('id')
        # ``pd.Int64Index`` was removed in pandas 2.0; casting the existing
        # index to int64 works on both older and newer pandas.
        df.index = df.index.astype('int64')
    v = df[agg_feat.get_name()][0]
    assert (v == 10)
def test_full_entity_error_dask(dask_es):
    """Running CumSum over an aggregation on a Dask entityset must raise ValueError."""
    value_sum = ft.Feature(dask_es['log']['value'],
                           parent_entity=dask_es['customers'],
                           primitive=Sum)
    running_total = ft.Feature(value_sum, primitive=CumSum)

    calculator = FeatureSetCalculator(dask_es,
                                      time_last=None,
                                      feature_set=FeatureSet([running_total]))

    expected_message = "Cannot use primitives that require full entity with Dask"
    with pytest.raises(ValueError, match=expected_message):
        calculator.run(np.array([1]))
def test_with_features_built_from_es_metadata(es):
    """A feature defined against ``es.metadata`` can be calculated on ``es`` itself.

    COUNT of log rows per customer is expected to be 10 for customer 0.
    """
    meta = es.metadata
    log_count = ft.Feature(meta['log']['id'],
                           parent_entity=meta['customers'],
                           primitive=Count)

    calculator = FeatureSetCalculator(es,
                                      time_last=None,
                                      feature_set=FeatureSet([log_count]))
    matrix = calculator.run(np.array([0]))

    assert matrix[log_count.get_name()][0] == 10
Esempio n. 26
0
def test_direct_percentile(es):
    """Percentile of ``customers.age`` carried onto sessions matches a pandas reference.

    The reference is computed with ``rank(pct=True)`` on the customers
    dataframe; sessions 0 and 1 are both compared against the row labelled 0.
    """
    v = ft.Feature(es['customers']['age'])
    p = ft.Feature(v, primitive=Percentile)
    d = ft.Feature(p, es['sessions'])
    feature_set = FeatureSet([d])
    calculator = FeatureSetCalculator(es, feature_set)
    df = calculator.run(np.array([0, 1]))

    cust_vals = es['customers'].df[[v.get_name()]]
    # ``assign`` builds a new frame, so no column is written onto a selection of
    # the entity's dataframe (which triggers pandas' SettingWithCopyWarning).
    cust_vals = cust_vals.assign(
        percentile=cust_vals[v.get_name()].rank(pct=True))
    true_p = cust_vals['percentile'].loc[[0, 0]]
    for t, a in zip(true_p.values, df[d.get_name()].values):
        # NaN never compares equal, so a pair of nulls is accepted explicitly
        assert (pd.isnull(t) and pd.isnull(a)) or t == a
def test_agg_empty_child(es):
    """COUNT(log) for customer 0 is 0 when the cutoff precedes all of its events."""
    log_count = ft.Feature(es['log']['id'],
                           parent_entity=es['customers'],
                           primitive=Count)

    # time last before the customer had any events, so child frame is empty
    calculator = FeatureSetCalculator(es,
                                      time_last=datetime(2011, 4, 8),
                                      feature_set=FeatureSet([log_count]))
    matrix = calculator.run(np.array([0]))
    if isinstance(matrix, dd.DataFrame):
        matrix = matrix.compute()

    first_count = matrix["COUNT(log)"].iloc[0]
    assert first_count == 0
def test_make_agg_feat_of_grandchild_dataframe(es):
    """COUNT of ``log`` rows aggregated directly onto customers is expected to be 10 for customer 0."""
    log_count = ft.Feature(es['log'].ww['id'],
                           parent_dataframe_name='customers',
                           primitive=Count)

    calculator = FeatureSetCalculator(es,
                                      time_last=None,
                                      feature_set=FeatureSet([log_count]))
    matrix = to_pandas(calculator.run(np.array([0])), index='id')

    first_value = matrix[log_count.get_name()].values[0]
    assert first_value == 10
Esempio n. 29
0
def test_agg_empty_child(es):
    """COUNT(log) for customer 0 is 0 when the cutoff precedes all of its events."""
    log_count = ft.Feature(es['log']['id'],
                           parent_entity=es['customers'],
                           primitive=Count)

    # time last before the customer had any events, so child frame is empty
    calculator = FeatureSetCalculator(es,
                                      time_last=datetime(2011, 4, 8),
                                      feature_set=FeatureSet([log_count]))
    raw_result = calculator.run(np.array([0]))
    matrix = to_pandas(raw_result, index='id')

    first_count = matrix["COUNT(log)"].iloc[0]
    assert first_count == 0
def test_make_agg_feat_of_identity_variable(es):
    """SUM(log.value) per session is expected to be 50 for session 0.

    A Dask result is computed into pandas before the value is read.
    """
    value_sum = ft.Feature(es['log']['value'],
                           parent_entity=es['sessions'],
                           primitive=Sum)

    calculator = FeatureSetCalculator(es,
                                      time_last=None,
                                      feature_set=FeatureSet([value_sum]))
    matrix = calculator.run(np.array([0]))
    if isinstance(matrix, dd.DataFrame):
        matrix = matrix.compute()

    assert matrix[value_sum.get_name()][0] == 50