def test_trend(es):
    trend = ft.Feature([es['log']['value'], es['log']['datetime']],
                       parent_entity=es['customers'],
                       primitive=Trend)
    feature_set = FeatureSet([trend])

    calculator = FeatureSetCalculator(es,
                                      time_last=None,
                                      feature_set=feature_set)
    df = calculator.run([0, 1, 2])

    true_results = [-0.812730, 4.870378, np.nan]

    np.testing.assert_almost_equal(df[trend.get_name()].values.tolist(),
                                   true_results,
                                   decimal=5)
def test_make_agg_feat_of_agg_feat(es):
    log_count_feat = ft.Feature(es['log']['id'],
                                parent_entity=es['sessions'],
                                primitive=Count)

    customer_sum_feat = ft.Feature(log_count_feat,
                                   parent_entity=es['customers'],
                                   primitive=Sum)

    feature_set = FeatureSet([customer_sum_feat])
    calculator = FeatureSetCalculator(es,
                                      time_last=None,
                                      feature_set=feature_set)
    df = calculator.run([0])
    v = df[customer_sum_feat.get_name()][0]
    assert (v == 10)
def test_make_agg_feat_where_count(es):
    agg_feat = ft.Feature(es['log']['id'],
                          parent_entity=es['sessions'],
                          where=IdentityFeature(
                              es['log']['product_id']) == 'coke zero',
                          primitive=Count)

    feature_set = FeatureSet([agg_feat])
    calculator = FeatureSetCalculator(es,
                                      time_last=None,
                                      feature_set=feature_set)
    df = calculator.run(np.array([0]))
    if isinstance(df, dd.DataFrame):
        df = df.compute()

    v = df[agg_feat.get_name()][0]
    assert (v == 3)
def test_make_3_stacked_agg_feats(df):
    """
    Tests stacking 3 agg features.

    The test specifically uses non-numeric indices to test how ancestor variables are handled
    as dataframes are merged together

    """
    if isinstance(df, dd.DataFrame):
        pytest.xfail('normalize_entity fails with dask DataFrame')
    es = ft.EntitySet()
    vtypes = {
        'id': variable_types.Index,
        'e1': variable_types.Categorical,
        'e2': variable_types.Categorical,
        'e3': variable_types.Categorical,
        'val': variable_types.Numeric
    }
    es.entity_from_dataframe(dataframe=df,
                             index="id",
                             entity_id="e0",
                             variable_types=vtypes)

    es.normalize_entity(base_entity_id="e0",
                        new_entity_id="e1",
                        index="e1",
                        additional_variables=["e2", "e3"])

    es.normalize_entity(base_entity_id="e1",
                        new_entity_id="e2",
                        index="e2",
                        additional_variables=["e3"])

    es.normalize_entity(base_entity_id="e2", new_entity_id="e3", index="e3")

    sum_1 = ft.Feature(es["e0"]["val"], parent_entity=es["e1"], primitive=Sum)
    sum_2 = ft.Feature(sum_1, parent_entity=es["e2"], primitive=Sum)
    sum_3 = ft.Feature(sum_2, parent_entity=es["e3"], primitive=Sum)

    feature_set = FeatureSet([sum_3])
    calculator = FeatureSetCalculator(es,
                                      time_last=None,
                                      feature_set=feature_set)
    df = calculator.run(np.array(["z"]))
    v = df[sum_3.get_name()][0]
    assert (v == 5)
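# A minimal pandas sketch (not part of the test, illustrative names) of what the
# three stacked Sum features compute, assuming the five-row sample dataframe used
# in the second variant of this test further below: sum val per e1, sum those
# sums per e2, then per e3.
import pandas as pd

stacked_df = pd.DataFrame({
    "id": ["a", "b", "c", "d", "e"],
    "e1": ["h", "h", "i", "i", "j"],
    "e2": ["x", "x", "y", "y", "x"],
    "e3": ["z", "z", "z", "z", "z"],
    "val": [1, 1, 1, 1, 1],
})

# Each e1 value maps to a single e2 and e3 (which is what normalize_entity
# guarantees), so the parent keys can be carried along with each group.
per_e1 = stacked_df.groupby(["e3", "e2", "e1"])["val"].sum()  # h -> 2, i -> 2, j -> 1
per_e2 = per_e1.groupby(level=["e3", "e2"]).sum()             # x -> 3, y -> 2
per_e3 = per_e2.groupby(level="e3").sum()                     # z -> 5
assert per_e3["z"] == 5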
def test_precalculated_features(pd_es):
    error_msg = 'This primitive should never be used because the features are precalculated'

    class ErrorPrim(AggregationPrimitive):
        """A primitive whose function raises an error."""
        name = "error_prim"
        input_types = [Numeric]
        return_type = Numeric

        def get_function(self):
            def error(s):
                raise RuntimeError(error_msg)
            return error

    value = ft.Feature(pd_es['log']['value'])
    agg = ft.Feature(value,
                     parent_entity=pd_es['sessions'],
                     primitive=ErrorPrim)
    agg2 = ft.Feature(agg,
                      parent_entity=pd_es['customers'],
                      primitive=ErrorPrim)
    direct = ft.Feature(agg2, entity=pd_es['sessions'])

    # Set up a FeatureSet which knows which features are precalculated.
    precalculated_feature_trie = Trie(default=set, path_constructor=RelationshipPath)
    precalculated_feature_trie.get_node(direct.relationship_path).value.add(agg2.unique_name())
    feature_set = FeatureSet([direct], approximate_feature_trie=precalculated_feature_trie)

    # Fake precalculated data.
    values = [0, 1, 2]
    parent_fm = pd.DataFrame({agg2.get_name(): values})
    precalculated_fm_trie = Trie(path_constructor=RelationshipPath)
    precalculated_fm_trie.get_node(direct.relationship_path).value = parent_fm

    calculator = FeatureSetCalculator(pd_es,
                                      feature_set=feature_set,
                                      precalculated_features=precalculated_fm_trie)

    instance_ids = [0, 2, 3, 5]
    fm = calculator.run(np.array(instance_ids))

    assert list(fm[direct.get_name()]) == [values[0], values[0], values[1], values[2]]

    # Calculating without precalculated features should error.
    with pytest.raises(RuntimeError, match=error_msg):
        FeatureSetCalculator(pd_es, feature_set=FeatureSet([direct])).run(instance_ids)
def test_make_3_stacked_agg_feats():
    """
    Tests stacking 3 agg features.

    The test specifically uses non-numeric indices to test how ancestor variables are handled
    as dataframes are merged together

    """
    df = pd.DataFrame({
        "id": ["a", "b", "c", "d", "e"],
        "e1": ["h", "h", "i", "i", "j"],
        "e2": ["x", "x", "y", "y", "x"],
        "e3": ["z", "z", "z", "z", "z"],
        "val": [1, 1, 1, 1, 1]
    })

    es = ft.EntitySet()
    es.entity_from_dataframe(dataframe=df,
                             index="id",
                             entity_id="e0")

    es.normalize_entity(base_entity_id="e0",
                        new_entity_id="e1",
                        index="e1",
                        additional_variables=["e2", "e3"])

    es.normalize_entity(base_entity_id="e1",
                        new_entity_id="e2",
                        index="e2",
                        additional_variables=["e3"])

    es.normalize_entity(base_entity_id="e2",
                        new_entity_id="e3",
                        index="e3")

    sum_1 = ft.Feature(es["e0"]["val"], parent_entity=es["e1"], primitive=Sum)
    sum_2 = ft.Feature(sum_1, parent_entity=es["e2"], primitive=Sum)
    sum_3 = ft.Feature(sum_2, parent_entity=es["e3"], primitive=Sum)

    feature_set = FeatureSet([sum_3])
    calculator = FeatureSetCalculator(es,
                                      time_last=None,
                                      feature_set=feature_set)
    df = calculator.run(np.array(["z"]))
    v = df[sum_3.get_name()][0]
    assert (v == 5)
def test_direct_from_column(es):
    # should have the same behavior as test_direct_from_identity
    device = Feature(es["sessions"].ww["device_type"])
    d = DirectFeature(base_feature=device, child_dataframe_name="log")

    feature_set = FeatureSet([d])
    calculator = FeatureSetCalculator(es,
                                      feature_set=feature_set,
                                      time_last=None)
    df = calculator.run(np.array([0, 5]))
    df = to_pandas(df, index="id", sort_index=True)
    v = df[d.get_name()].tolist()
    if es.dataframe_type == Library.SPARK.value:
        expected = ["0", "1"]
    else:
        expected = [0, 1]
    assert v == expected
def test_make_agg_feat_of_agg_feat(es):
    log_count_feat = ft.Feature(es['log'].ww['id'],
                                parent_dataframe_name='sessions',
                                primitive=Count)

    customer_sum_feat = ft.Feature(log_count_feat,
                                   parent_dataframe_name='customers',
                                   primitive=Sum)

    feature_set = FeatureSet([customer_sum_feat])
    calculator = FeatureSetCalculator(es,
                                      time_last=None,
                                      feature_set=feature_set)
    df = calculator.run(np.array([0]))
    df = to_pandas(df, index='id')
    v = df[customer_sum_feat.get_name()].values[0]
    assert (v == 10)
def test_direct_from_column(es):
    # should have the same behavior as test_direct_from_identity
    device = Feature(es['sessions'].ww['device_type'])
    d = DirectFeature(base_feature=device, child_dataframe_name='log')

    feature_set = FeatureSet([d])
    calculator = FeatureSetCalculator(es,
                                      feature_set=feature_set,
                                      time_last=None)
    df = calculator.run(np.array([0, 5]))
    df = to_pandas(df, index='id', sort_index=True)
    v = df[d.get_name()].tolist()
    if es.dataframe_type == Library.KOALAS.value:
        expected = ['0', '1']
    else:
        expected = [0, 1]
    assert v == expected
def test_calls_progress_callback(es):
    # call with all feature types. make sure progress callback calls sum to 1
    identity = ft.Feature(es['customers'].ww['age'])
    direct = ft.Feature(es['cohorts'].ww['cohort_name'], 'customers')
    agg = ft.Feature(es['sessions'].ww['id'],
                     parent_dataframe_name='customers',
                     primitive=Count)
    agg_apply = ft.Feature(
        es['log'].ww['datetime'],
        parent_dataframe_name='customers',
        primitive=TimeSinceLast
    )  # this feature is handled differently than simple features
    trans = ft.Feature(agg, primitive=Negate)
    trans_full = ft.Feature(agg, primitive=CumSum)
    groupby_trans = ft.Feature(agg,
                               primitive=CumSum,
                               groupby=ft.Feature(
                                   es['customers'].ww['cohort']))

    if es.dataframe_type != Library.PANDAS.value:
        all_features = [identity, direct, agg, trans]
    else:
        all_features = [
            identity, direct, agg, agg_apply, trans, trans_full, groupby_trans
        ]

    feature_set = FeatureSet(all_features)
    calculator = FeatureSetCalculator(es,
                                      time_last=None,
                                      feature_set=feature_set)

    class MockProgressCallback:
        def __init__(self):
            self.total = 0

        def __call__(self, update):
            self.total += update

    mock_progress_callback = MockProgressCallback()

    instance_ids = [0, 1, 2]
    calculator.run(np.array(instance_ids), mock_progress_callback)

    assert np.isclose(mock_progress_callback.total, 1)

    # testing again with a time_last with no data
    feature_set = FeatureSet(all_features)
    calculator = FeatureSetCalculator(es,
                                      time_last=pd.Timestamp("1950"),
                                      feature_set=feature_set)

    mock_progress_callback = MockProgressCallback()
    calculator.run(np.array(instance_ids), mock_progress_callback)

    assert np.isclose(mock_progress_callback.total, 1)
        def calc_results(time_last, ids, precalculated_features=None, training_window=None, include_cutoff_time=True):
            update_progress_callback = None

            if progress_bar is not None:
                def update_progress_callback(done):
                    previous_progress = progress_bar.n
                    progress_bar.update(done * group.shape[0])
                    if progress_callback is not None:
                        update, progress_percent, time_elapsed = update_progress_callback_parameters(progress_bar, previous_progress)
                        progress_callback(update, progress_percent, time_elapsed)

            calculator = FeatureSetCalculator(entityset,
                                              feature_set,
                                              time_last,
                                              training_window=training_window,
                                              precalculated_features=precalculated_features)
            matrix = calculator.run(ids, progress_callback=update_progress_callback, include_cutoff_time=include_cutoff_time)
            return matrix
def test_deep_agg_feat_chain(es):
    """
    Agg feat of agg feat:
        region.Mean(customer.Count(Log))
    """
    customer_count_feat = ft.Feature(es['log']['id'], parent_entity=es['customers'], primitive=Count)

    region_avg_feat = ft.Feature(customer_count_feat, parent_entity=es[u'régions'], primitive=Mean)

    feature_set = FeatureSet([region_avg_feat])
    calculator = FeatureSetCalculator(es,
                                      time_last=None,
                                      feature_set=feature_set)
    df = calculator.run(np.array(['United States']))
    if isinstance(df, dd.DataFrame):
        df = df.compute().set_index('id')
    v = df[region_avg_feat.get_name()][0]
    assert (v == 17 / 3.)
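# A minimal pandas sketch (toy tables, hypothetical column names) of the chain
# region.Mean(customer.Count(Log)) described in the docstring above. In the mock
# entityset the log rows are reached through sessions; here that path is
# collapsed into a single customer_id column for brevity.
import pandas as pd

logs = pd.DataFrame({"customer_id": [0, 0, 0, 1, 1, 2]})
customers = pd.DataFrame({"id": [0, 1, 2], "region": ["US", "US", "US"]})

count_per_customer = logs.groupby("customer_id").size()
mean_per_region = (
    customers.set_index("id")
             .assign(log_count=count_per_customer)
             .groupby("region")["log_count"]
             .mean()
)
print(mean_per_region["US"])  # 2.0 for this toy data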
def test_calls_progress_callback(es):
    # call with all feature types. make sure progress callback calls sum to 1
    identity = ft.Feature(es['customers']['age'])
    direct = ft.Feature(es['cohorts']['cohort_name'], es['customers'])
    agg = ft.Feature(es["sessions"]["id"],
                     parent_entity=es['customers'],
                     primitive=Count)
    agg_apply = ft.Feature(
        es["log"]["datetime"],
        parent_entity=es['customers'],
        primitive=TimeSinceLast
    )  # this feature is handled differently than simple features
    trans = ft.Feature(agg, primitive=Negate)
    trans_full = ft.Feature(agg, primitive=CumSum)
    groupby_trans = ft.Feature(agg,
                               primitive=CumSum,
                               groupby=es["customers"]["cohort"])

    if not all(isinstance(entity.df, pd.DataFrame) for entity in es.entities):
        all_features = [identity, direct, agg, trans]
    else:
        all_features = [
            identity, direct, agg, agg_apply, trans, trans_full, groupby_trans
        ]

    feature_set = FeatureSet(all_features)
    calculator = FeatureSetCalculator(es,
                                      time_last=None,
                                      feature_set=feature_set)

    class MockProgressCallback:
        def __init__(self):
            self.total = 0

        def __call__(self, update):
            self.total += update

    mock_progress_callback = MockProgressCallback()

    instance_ids = [0, 1, 2]
    calculator.run(np.array(instance_ids), mock_progress_callback)

    assert np.isclose(mock_progress_callback.total, 1)

    # testing again with a time_last with no data
    feature_set = FeatureSet(all_features)
    calculator = FeatureSetCalculator(es,
                                      time_last=pd.Timestamp("1950"),
                                      feature_set=feature_set)

    mock_progress_callback = MockProgressCallback()
    calculator.run(np.array(instance_ids), mock_progress_callback)

    assert np.isclose(mock_progress_callback.total, 1)
def test_two_kinds_of_dependents(es):
    v = ft.Feature(es['log']['value'])
    product = ft.Feature(es['log']['product_id'])
    agg = ft.Feature(v,
                     parent_entity=es['customers'],
                     where=product == 'coke zero',
                     primitive=Sum)
    p = ft.Feature(agg, primitive=Percentile)
    g = ft.Feature(agg, primitive=Absolute)
    agg2 = ft.Feature(v,
                      parent_entity=es['sessions'],
                      where=product == 'coke zero',
                      primitive=Sum)
    agg3 = ft.Feature(agg2, parent_entity=es['customers'], primitive=Sum)
    feature_set = FeatureSet([p, g, agg3])
    calculator = FeatureSetCalculator(es, feature_set)
    df = calculator.run(np.array([0, 1]))
    assert df[p.get_name()].tolist() == [2. / 3, 1.0]
    assert df[g.get_name()].tolist() == [15, 26]
def test_trend(pd_es):
    trend = ft.Feature([
        ft.Feature(pd_es['log'].ww['value']),
        ft.Feature(pd_es['log'].ww['datetime'])
    ],
                       parent_dataframe_name='customers',
                       primitive=Trend)
    feature_set = FeatureSet([trend])

    calculator = FeatureSetCalculator(pd_es,
                                      time_last=None,
                                      feature_set=feature_set)
    df = calculator.run(np.array([0, 1, 2]))

    true_results = [-0.812730, 4.870378, np.nan]

    np.testing.assert_almost_equal(df[trend.get_name()].tolist(),
                                   true_results,
                                   decimal=5)
def test_make_agg_feat_of_agg_feat(es):
    log_count_feat = ft.Feature(es['log']['id'],
                                parent_entity=es['sessions'],
                                primitive=Count)

    customer_sum_feat = ft.Feature(log_count_feat,
                                   parent_entity=es['customers'],
                                   primitive=Sum)

    feature_set = FeatureSet([customer_sum_feat])
    calculator = FeatureSetCalculator(es,
                                      time_last=None,
                                      feature_set=feature_set)
    df = calculator.run(np.array([0]))
    if isinstance(df, dd.DataFrame):
        df = df.compute().set_index('id')
        df.index = pd.Int64Index(df.index)
    v = df[customer_sum_feat.get_name()][0]
    assert (v == 10)
def test_diamond_entityset(diamond_es):
    es = diamond_es

    amount = ft.IdentityFeature(es['transactions']['amount'])
    path = backward_path(es, ['regions', 'customers', 'transactions'])
    through_customers = ft.AggregationFeature(amount, es['regions'],
                                              primitive=ft.primitives.Sum,
                                              relationship_path=path)
    path = backward_path(es, ['regions', 'stores', 'transactions'])
    through_stores = ft.AggregationFeature(amount, es['regions'],
                                           primitive=ft.primitives.Sum,
                                           relationship_path=path)

    feature_set = FeatureSet([through_customers, through_stores])
    calculator = FeatureSetCalculator(es,
                                      time_last=datetime(2011, 4, 8),
                                      feature_set=feature_set)
    df = calculator.run(np.array([0, 1, 2]))
    assert (df['SUM(stores.transactions.amount)'] == [94, 261, 128]).all()
    assert (df['SUM(customers.transactions.amount)'] == [72, 411, 0]).all()
def test_arithmetic_of_transform(es):
    diff1 = ft.Feature([es['log']['value']], primitive=Diff)
    diff2 = ft.Feature([es['log']['value_2']], primitive=Diff)

    to_test = [(AddNumeric, [np.nan, 7., -7., 10.]),
               (SubtractNumeric, [np.nan, 3., -3., 4.]),
               (MultiplyNumeric, [np.nan, 10., 10., 21.]),
               (DivideNumeric, [np.nan, 2.5, 2.5, 2.3333333333333335])]

    features = []
    for test in to_test:
        features.append(ft.Feature([diff1, diff2], primitive=test[0]()))

    feature_set = FeatureSet(features)
    calculator = FeatureSetCalculator(es, feature_set=feature_set)
    df = calculator.run(np.array([0, 2, 12, 13]))
    for i, test in enumerate(to_test):
        v = df[features[i].get_name()].values.tolist()
        assert np.isnan(v.pop(0))
        assert np.isnan(test[1].pop(0))
        assert v == test[1]
        def calc_results(time_last,
                         ids,
                         precalculated_features=None,
                         training_window=None):

            progress_callback = None

            if progress_bar is not None:

                def progress_callback(done):
                    progress_bar.update(done * group.shape[0])

            calculator = FeatureSetCalculator(
                entityset,
                feature_set,
                time_last,
                training_window=training_window,
                precalculated_features=precalculated_features)

            matrix = calculator.run(ids, progress_callback=progress_callback)
            return matrix
def test_make_agg_feat_using_prev_time(es):
    agg_feat = ft.Feature(es['log']['id'],
                          parent_entity=es['sessions'],
                          use_previous=Timedelta(10, 's'),
                          primitive=Count)

    feature_set = FeatureSet([agg_feat])
    calculator = FeatureSetCalculator(es,
                                      time_last=datetime(2011, 4, 9, 10, 30, 10),
                                      feature_set=feature_set)
    df = calculator.run(np.array([0]))
    if isinstance(df, dd.DataFrame):
        df = df.compute()

    v = df[agg_feat.get_name()][0]
    assert (v == 2)

    calculator = FeatureSetCalculator(es,
                                      time_last=datetime(2011, 4, 9, 10, 30, 30),
                                      feature_set=feature_set)
    df = calculator.run(np.array([0]))
    if isinstance(df, dd.DataFrame):
        df = df.compute()

    v = df[agg_feat.get_name()][0]
    assert (v == 1)
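# A rough pandas sketch (toy timestamps, hypothetical column names) of the
# use_previous window above: only log rows whose datetime falls within the ten
# seconds ending at the cutoff are counted. Treating the window as
# (cutoff - 10s, cutoff] is an assumption made for this sketch.
import pandas as pd
from datetime import datetime, timedelta

log = pd.DataFrame({
    "session_id": [0, 0, 0, 0],
    "datetime": pd.to_datetime([
        "2011-04-09 10:30:01", "2011-04-09 10:30:06",
        "2011-04-09 10:30:12", "2011-04-09 10:30:25",
    ]),
})

cutoff = datetime(2011, 4, 9, 10, 30, 10)
window_start = cutoff - timedelta(seconds=10)
in_window = log[(log["datetime"] > window_start) & (log["datetime"] <= cutoff)]
print(in_window.groupby("session_id").size())  # 2 rows fall in the window here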
def test_make_agg_feat_using_prev_time(es):
    agg_feat = ft.Feature(
        es["log"].ww["id"],
        parent_dataframe_name="sessions",
        use_previous=Timedelta(10, "s"),
        primitive=Count,
    )

    feature_set = FeatureSet([agg_feat])
    calculator = FeatureSetCalculator(es,
                                      time_last=datetime(
                                          2011, 4, 9, 10, 30, 10),
                                      feature_set=feature_set)
    df = to_pandas(calculator.run(np.array([0])))

    v = df[agg_feat.get_name()][0]
    assert v == 2

    calculator = FeatureSetCalculator(es,
                                      time_last=datetime(
                                          2011, 4, 9, 10, 30, 30),
                                      feature_set=feature_set)
    df = to_pandas(calculator.run(np.array([0])))

    v = df[agg_feat.get_name()][0]
    assert v == 1
def test_deep_agg_feat_chain(es):
    """
    Agg feat of agg feat:
        region.Mean(customer.Count(Log))
    """
    customer_count_feat = ft.Feature(es['log'].ww['id'],
                                     parent_dataframe_name='customers',
                                     primitive=Count)

    region_avg_feat = ft.Feature(customer_count_feat,
                                 parent_dataframe_name=u'régions',
                                 primitive=Mean)

    feature_set = FeatureSet([region_avg_feat])
    calculator = FeatureSetCalculator(es,
                                      time_last=None,
                                      feature_set=feature_set)
    df = calculator.run(np.array(['United States']))
    df = to_pandas(df, index='id')

    v = df[region_avg_feat.get_name()][0]
    assert (v == 17 / 3.)
def test_make_agg_feat_multiple_dtypes(es):
    compare_prod = IdentityFeature(es['log']['product_id']) == 'coke zero'

    agg_feat = ft.Feature(es['log']['id'],
                          parent_entity=es['sessions'],
                          where=compare_prod,
                          primitive=Count)

    agg_feat2 = ft.Feature(es['log']['product_id'],
                           parent_entity=es['sessions'],
                           where=compare_prod,
                           primitive=Mode)

    feature_set = FeatureSet([agg_feat, agg_feat2])
    calculator = FeatureSetCalculator(es,
                                      time_last=None,
                                      feature_set=feature_set)
    df = calculator.run(np.array([0]))

    v = df[agg_feat.get_name()][0]
    v2 = df[agg_feat2.get_name()][0]
    assert (v == 3)
    assert (v2 == 'coke zero')
def test_make_dfeat_of_agg_feat_on_self(es):
    """
    The graph looks like this:

        R       R = Regions, a parent of customers
        |
        C       C = Customers, the entity we're trying to predict on
        |
       etc.

    We're trying to calculate a DFeat from C to R on an agg_feat of R on C.
    """
    customer_count_feat = ft.Feature(es['customers']['id'], parent_entity=es[u'régions'], primitive=Count)

    num_customers_feat = DirectFeature(customer_count_feat, child_entity=es['customers'])

    feature_set = FeatureSet([num_customers_feat])
    calculator = FeatureSetCalculator(es,
                                      time_last=None,
                                      feature_set=feature_set)
    df = calculator.run(np.array([0]))
    v = df[num_customers_feat.get_name()][0]
    assert (v == 3)
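# A minimal pandas sketch (toy data, illustrative names) of the direct feature
# above: count customers per region, then hand each customer the count for its
# own region.
import pandas as pd

customers = pd.DataFrame({"id": [0, 1, 2], "region_id": ["US", "US", "US"]})

count_per_region = customers.groupby("region_id")["id"].count()
customers["region_customer_count"] = customers["region_id"].map(count_per_region)
print(customers.loc[0, "region_customer_count"])  # 3, as in the assert above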
def test_make_agg_feat_where_count_feat(es):
    """
    Feature we're creating is:
    Number of sessions for each customer where the
    number of logs in the session is greater than 1
    """
    log_count_feat = ft.Feature(es['log']['id'], parent_entity=es['sessions'], primitive=Count)

    feat = ft.Feature(es['sessions']['id'],
                      parent_entity=es['customers'],
                      where=log_count_feat > 1,
                      primitive=Count)

    feature_set = FeatureSet([feat])
    calculator = FeatureSetCalculator(es,
                                      time_last=None,
                                      feature_set=feature_set)
    df = calculator.run(np.array([0, 1]))
    name = feat.get_name()
    instances = df[name]
    v0, v1 = instances[0:2]
    assert (v0 == 2)
    assert (v1 == 2)
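# A minimal pandas sketch (toy data) of the where clause above: first count log
# rows per session, then count only those sessions whose log count exceeds 1.
import pandas as pd

log = pd.DataFrame({"session_id": [0, 0, 1, 2, 2, 2]})
sessions = pd.DataFrame({"id": [0, 1, 2], "customer_id": [0, 0, 0]})

log_count = log.groupby("session_id").size()
sessions["log_count"] = sessions["id"].map(log_count)
qualifying = sessions[sessions["log_count"] > 1]
print(qualifying.groupby("customer_id")["id"].count())  # customer 0 -> 2 here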
def test_topn(es):
    topn = ft.Feature(es['log']['product_id'],
                      parent_entity=es['customers'],
                      primitive=NMostCommon(n=2))
    feature_set = FeatureSet([topn])

    calculator = FeatureSetCalculator(es,
                                      time_last=None,
                                      feature_set=feature_set)
    df = calculator.run([0, 1, 2])

    true_results = pd.DataFrame(
        [['toothpaste', 'coke zero'],
         ['coke zero', 'Haribo sugar-free gummy bears'],
         ['taco clock', np.nan]])
    assert all(name in df.columns for name in topn.get_feature_names())
    for i in range(df.shape[0]):
        if i == 0:
            # coke zero and toothpaste have the same number of occurrences
            assert set(true_results.loc[i].values) == set(df.loc[i].values)
        else:
            for i1, i2 in zip(true_results.loc[i], df.iloc[i]):
                assert (pd.isnull(i1) and pd.isnull(i2)) or (i1 == i2)
def test_arithmetic_of_transform(es):
    if not all(isinstance(e.df, pd.DataFrame) for e in es.entities):
        pytest.xfail("Test uses Diff which is not supported in Dask or Koalas")
    diff1 = ft.Feature([es['log']['value']], primitive=Diff)
    diff2 = ft.Feature([es['log']['value_2']], primitive=Diff)

    to_test = [(AddNumeric, [np.nan, 7., -7., 10.]),
               (SubtractNumeric, [np.nan, 3., -3., 4.]),
               (MultiplyNumeric, [np.nan, 10., 10., 21.]),
               (DivideNumeric, [np.nan, 2.5, 2.5, 2.3333333333333335])]

    features = []
    for test in to_test:
        features.append(ft.Feature([diff1, diff2], primitive=test[0]()))

    feature_set = FeatureSet(features)
    calculator = FeatureSetCalculator(es, feature_set=feature_set)
    df = calculator.run(np.array([0, 2, 12, 13]))
    for i, test in enumerate(to_test):
        v = df[features[i].get_name()].values.tolist()
        assert np.isnan(v.pop(0))
        assert np.isnan(test[1].pop(0))
        assert v == test[1]
def test_diamond_entityset(diamond_es):
    es = diamond_es

    amount = ft.IdentityFeature(es["transactions"].ww["amount"])
    path = backward_path(es, ["regions", "customers", "transactions"])
    through_customers = ft.AggregationFeature(amount,
                                              "regions",
                                              primitive=ft.primitives.Sum,
                                              relationship_path=path)
    path = backward_path(es, ["regions", "stores", "transactions"])
    through_stores = ft.AggregationFeature(amount,
                                           "regions",
                                           primitive=ft.primitives.Sum,
                                           relationship_path=path)

    feature_set = FeatureSet([through_customers, through_stores])
    calculator = FeatureSetCalculator(es,
                                      time_last=datetime(2011, 4, 8),
                                      feature_set=feature_set)
    df = calculator.run(np.array([0, 1, 2]))
    df = to_pandas(df, index="id", sort_index=True)

    assert (df["SUM(stores.transactions.amount)"] == [94, 261, 128]).all()
    assert (df["SUM(customers.transactions.amount)"] == [72, 411, 0]).all()
def test_make_agg_feat_where_count_or_device_type_feat(es):
    """
    Feature we're creating is:
    Number of sessions for each customer where the number of logs
    in the session is greater than 1, or the session's device type is 1
    """
    log_count_feat = ft.Feature(es['log']['id'], parent_entity=es['sessions'], primitive=Count)

    compare_count = log_count_feat > 1
    compare_device_type = IdentityFeature(es['sessions']['device_type']) == 1
    or_feat = compare_count.OR(compare_device_type)
    feat = ft.Feature(es['sessions']['id'],
                      parent_entity=es['customers'],
                      where=or_feat,
                      primitive=Count)

    feature_set = FeatureSet([feat])
    calculator = FeatureSetCalculator(es,
                                      time_last=None,
                                      feature_set=feature_set)
    df = calculator.run(np.array([0]))
    name = feat.get_name()
    instances = df[name]
    assert (instances[0] == 3)
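# A minimal pandas sketch (toy data) of combining the two conditions with OR
# before the final count: a session qualifies if its log count exceeds 1 or its
# device_type equals 1.
import pandas as pd

sessions = pd.DataFrame({
    "id": [0, 1, 2],
    "customer_id": [0, 0, 0],
    "device_type": [0, 1, 0],
    "log_count": [2, 1, 3],
})

mask = (sessions["log_count"] > 1) | (sessions["device_type"] == 1)
print(sessions[mask].groupby("customer_id")["id"].count())  # customer 0 -> 3 here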
def test_make_agg_feat_using_prev_n_events(es):
    if es.dataframe_type != Library.PANDAS.value:
        pytest.xfail("Distrubuted entitysets do not support use_previous")
    agg_feat_1 = ft.Feature(
        es["log"].ww["value"],
        parent_dataframe_name="sessions",
        use_previous=Timedelta(1, "observations"),
        primitive=Min,
    )

    agg_feat_2 = ft.Feature(
        es["log"].ww["value"],
        parent_dataframe_name="sessions",
        use_previous=Timedelta(3, "observations"),
        primitive=Min,
    )

    assert (agg_feat_1.get_name() != agg_feat_2.get_name()
            ), "Features should have different names based on use_previous"

    feature_set = FeatureSet([agg_feat_1, agg_feat_2])
    calculator = FeatureSetCalculator(es,
                                      time_last=datetime(
                                          2011, 4, 9, 10, 30, 6),
                                      feature_set=feature_set)
    df = calculator.run(np.array([0]))

    # time_last is included by default
    v1 = df[agg_feat_1.get_name()][0]
    v2 = df[agg_feat_2.get_name()][0]
    assert v1 == 5
    assert v2 == 0

    calculator = FeatureSetCalculator(es,
                                      time_last=datetime(
                                          2011, 4, 9, 10, 30, 30),
                                      feature_set=feature_set)
    df = calculator.run(np.array([0]))

    v1 = df[agg_feat_1.get_name()][0]
    v2 = df[agg_feat_2.get_name()][0]
    assert v1 == 20
    assert v2 == 10
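# A rough pandas sketch (toy values) of use_previous in "observations": the Min
# is taken over only the last N log rows per session in time order as of the
# cutoff, rather than over a fixed-length time window.
import pandas as pd

log = pd.DataFrame({
    "session_id": [0, 0, 0, 0],
    "value": [0, 5, 20, 10],  # assumed to already be in time order
})

last_1 = log.groupby("session_id")["value"].apply(lambda s: s.tail(1).min())
last_3 = log.groupby("session_id")["value"].apply(lambda s: s.tail(3).min())
print(last_1[0], last_3[0])  # 10 and 5 for this toy ordering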