def test_make_agg_feat_multiple_dtypes(es):
    """Compute Count and Mode aggregations that share one where clause.

    Both features aggregate the ``log`` dataframe into ``sessions`` and are
    filtered by ``product_id == 'coke zero'``; the values for instance 0 are
    checked. Marked xfail when the EntitySet is not pandas-backed.
    """
    if es.dataframe_type != Library.PANDAS.value:
        pytest.xfail(
            'Currently no Dask or Koalas compatible agg prims that use multiple dtypes'
        )

    is_coke_zero = IdentityFeature(es['log'].ww['product_id']) == 'coke zero'
    common = dict(parent_dataframe_name='sessions', where=is_coke_zero)

    count_feat = ft.Feature(es['log'].ww['id'], primitive=Count, **common)
    mode_feat = ft.Feature(es['log'].ww['product_id'], primitive=Mode, **common)

    calculator = FeatureSetCalculator(
        es,
        time_last=None,
        feature_set=FeatureSet([count_feat, mode_feat]),
    )
    result = calculator.run(np.array([0]))

    count_value = result[count_feat.get_name()][0]
    mode_value = result[mode_feat.get_name()][0]
    assert count_value == 3
    assert mode_value == 'coke zero'
def test_make_agg_feat_where_count_feat(es):
    """
    Feature under test:
    number of sessions for each customer, counting only the
    sessions whose own log count is greater than 1.
    """
    logs_per_session = ft.Feature(
        es['log'].ww['id'],
        parent_dataframe_name='sessions',
        primitive=Count,
    )
    feat = ft.Feature(
        es['sessions'].ww['id'],
        parent_dataframe_name='customers',
        where=logs_per_session > 1,
        primitive=Count,
    )

    calculator = FeatureSetCalculator(
        es,
        time_last=None,
        feature_set=FeatureSet([feat]),
    )
    result = to_pandas(calculator.run(np.array([0, 1])),
                       index='id',
                       sort_index=True)

    # Unpack the values for the first two customers.
    first, second = result[feat.get_name()][0:2]
    assert first == 2
    assert second == 2
# Example no. 3
# 0
def test_feature_trie_with_needs_full_entity(diamond_es):
    """Check feature trie node values when CumSum transforms are involved.

    Two backward paths from ``regions`` to ``transactions`` are used (through
    ``customers`` and through ``stores``). Each trie node value is compared
    against an expected ``(bool, set, set)`` tuple of feature unique names.
    """
    es = diamond_es
    regions = es['regions']
    mean = ft.primitives.Mean
    cum_sum = ft.primitives.CumSum
    amount = ft.IdentityFeature(es['transactions']['amount'])

    # Mean over the customers path, then a transform on top of it.
    via_customers = backward_path(
        es, ['regions', 'customers', 'transactions'])
    agg = ft.AggregationFeature(amount,
                                regions,
                                primitive=mean,
                                relationship_path=via_customers)
    trans_of_agg = ft.TransformFeature(agg, cum_sum)

    # Transform first, then Mean over the stores path.
    via_stores = backward_path(es, ['regions', 'stores', 'transactions'])
    trans = ft.TransformFeature(amount, cum_sum)
    agg_of_trans = ft.AggregationFeature(trans,
                                         regions,
                                         primitive=mean,
                                         relationship_path=via_stores)

    trie = FeatureSet([agg, trans_of_agg, agg_of_trans]).feature_trie

    expected_root = (
        True,
        {agg.unique_name(), trans_of_agg.unique_name()},
        {agg_of_trans.unique_name()},
    )
    assert trie.value == expected_root

    expected_customers_leaf = (True, {amount.unique_name()}, set())
    assert trie.get_node(via_customers).value == expected_customers_leaf
    assert trie.get_node(via_customers[:1]).value == (True, set(), set())

    expected_stores_leaf = (
        True,
        {amount.unique_name(), trans.unique_name()},
        set(),
    )
    assert trie.get_node(via_stores).value == expected_stores_leaf
    assert trie.get_node(via_stores[:1]).value == (False, set(), set())
# Example no. 4
# 0
def test_diff(es):
    """Check the Diff primitive grouped by session id and by customer id.

    ``diff1`` groups the log ``value`` by the log's ``session_id``; ``diff2``
    groups it by the sessions' ``customer_id`` brought onto the log. Both are
    computed for instance ids 0..14 and compared with the expected values,
    where NaN marks positions with no expected difference.
    """
    value = ft.Feature(es['log']['value'])
    customer_id_feat = ft.Feature(es['sessions']['customer_id'],
                                  entity=es['log'])
    diff1 = ft.Feature(value, groupby=es['log']['session_id'], primitive=Diff)
    diff2 = ft.Feature(value, groupby=customer_id_feat, primitive=Diff)

    feature_set = FeatureSet([diff1, diff2])
    calculator = FeatureSetCalculator(es, feature_set=feature_set)
    df = calculator.run(np.array(range(15)))

    val1 = df[diff1.get_name()].values.tolist()
    val2 = df[diff2.get_name()].values.tolist()
    correct_vals1 = [
        np.nan, 5, 5, 5, 5, np.nan, 1, 1, 1, np.nan, np.nan, 5, np.nan, 7, 7
    ]
    correct_vals2 = [np.nan, 5, 5, 5, 5, -20, 1, 1, 1, -3, np.nan, 5, -5, 7, 7]

    # assert_array_equal treats NaNs in matching positions as equal and fails
    # on a shape mismatch, so a truncated result can no longer pass vacuously
    # the way it could with a loop driven by the length of the result.
    np.testing.assert_array_equal(val1, correct_vals1)
    np.testing.assert_array_equal(val2, correct_vals2)
def test_make_deep_agg_feat_of_dfeat_of_agg_feat(es):
    """
    Relationship graph (higher means parent):

          C     C = Customers, the dataframe we're trying to predict on
          |     S = Sessions, a child of Customers
      P   S     L = Log, a child of both Sessions and Products
       \\ /     P = Products, a parent of Log which is not a descendant of customers
        L

    A Count of log rows is aggregated onto P, brought back down to L as a
    direct feature, and then averaged up to C.
    """
    logs_per_product = ft.Feature(
        es['log'].ww['id'],
        parent_dataframe_name='products',
        primitive=Count,
    )
    product_purchases = DirectFeature(logs_per_product,
                                      child_dataframe_name='log')
    purchase_popularity = ft.Feature(
        product_purchases,
        parent_dataframe_name='customers',
        primitive=Mean,
    )

    calculator = FeatureSetCalculator(
        es,
        time_last=None,
        feature_set=FeatureSet([purchase_popularity]),
    )
    result = to_pandas(calculator.run(np.array([0])), index='id')

    popularity = result[purchase_popularity.get_name()].values[0]
    assert popularity == 38.0 / 10.0
def test_two_relationships_to_single_dataframe(games_es):
    """Aggregate over two distinct relationships joining the same dataframes.

    ``games`` relates to ``teams`` twice (home and away). A Mean is built
    along each relationship and pulled back onto ``games`` as a direct
    feature through the matching relationship.
    """
    es = games_es
    home_team, away_team = es.relationships
    mean = ft.primitives.Mean

    home_score = ft.Feature(es['games'].ww['home_team_score'])
    mean_at_home = ft.AggregationFeature(
        home_score,
        'teams',
        relationship_path=RelationshipPath([(False, home_team)]),
        primitive=mean,
    )

    away_score = ft.Feature(es['games'].ww['away_team_score'])
    mean_at_away = ft.AggregationFeature(
        away_score,
        'teams',
        relationship_path=RelationshipPath([(False, away_team)]),
        primitive=mean,
    )

    home_team_mean = ft.DirectFeature(mean_at_home,
                                      'games',
                                      relationship=home_team)
    away_team_mean = ft.DirectFeature(mean_at_away,
                                      'games',
                                      relationship=away_team)

    calculator = FeatureSetCalculator(
        es,
        time_last=datetime(2011, 8, 28),
        feature_set=FeatureSet([home_team_mean, away_team_mean]),
    )
    result = to_pandas(calculator.run(np.array(range(3))),
                       index='id',
                       sort_index=True)

    assert (result[home_team_mean.get_name()] == [1.5, 1.5, 2.5]).all()
    assert (result[away_team_mean.get_name()] == [1, 0.5, 2]).all()
def test_make_deep_agg_feat_of_dfeat_of_agg_feat(es):
    """
    The graph looks like this (higher implies parent):

          C     C = Customers, the entity we're trying to predict on
          |     S = Sessions, a child of Customers
      P   S     L = Log, a child of both Sessions and Products
       \\ /     P = Products, a parent of Log which is not a descendant of customers
        L

    We're trying to calculate a DFeat from L to P on an agg_feat of P on L, and
    then aggregate it with another agg_feat of C on L.
    """
    log_count_feat = ft.Feature(es['log']['id'],
                                parent_entity=es['products'],
                                primitive=Count)

    product_purchases_feat = DirectFeature(log_count_feat,
                                           child_entity=es['log'])

    purchase_popularity = ft.Feature(product_purchases_feat,
                                     parent_entity=es['customers'],
                                     primitive=Mean)

    feature_set = FeatureSet([purchase_popularity])
    calculator = FeatureSetCalculator(es,
                                      time_last=None,
                                      feature_set=feature_set)
    df = calculator.run(np.array([0]))
    if isinstance(df, dd.DataFrame):
        # Materialize the Dask result and index it by instance id so the
        # label lookup below behaves as it does for a pandas result.
        df = df.compute().set_index('id')
        # pd.Int64Index was deprecated in pandas 1.4 and removed in 2.0;
        # Index.astype gives the same int64-typed index on every version.
        df.index = df.index.astype('int64')
    v = df[purchase_popularity.get_name()][0]
    assert (v == 38.0 / 10.0)
def test_make_dfeat_of_agg_feat_on_self(es):
    """
    Relationship graph:

        R       R = Regions, a parent of customers
        |
        C       C = Customers, the dataframe we're trying to predict on
        |
       etc.

    A Count of customers is aggregated onto R and then read back on C as a
    direct feature.
    """
    customers_per_region = ft.Feature(
        es['customers'].ww['id'],
        parent_dataframe_name=u'régions',
        primitive=Count,
    )
    num_customers_feat = DirectFeature(customers_per_region,
                                       child_dataframe_name='customers')

    calculator = FeatureSetCalculator(
        es,
        time_last=None,
        feature_set=FeatureSet([num_customers_feat]),
    )
    result = to_pandas(calculator.run(np.array([0])), index='id')

    num_customers = result[num_customers_feat.get_name()].values[0]
    assert num_customers == 3
def test_make_agg_feat_where_count_or_device_type_feat(es):
    """
    Feature we're creating is:
    Number of sessions for each customer where the
    number of logs in the session is greater than 1
    OR the session's device_type equals 1
    """
    log_count_feat = ft.Feature(es['log']['id'],
                                parent_entity=es['sessions'],
                                primitive=Count)

    compare_count = log_count_feat > 1
    compare_device_type = IdentityFeature(es['sessions']['device_type']) == 1
    or_feat = compare_count.OR(compare_device_type)
    feat = ft.Feature(es['sessions']['id'],
                      parent_entity=es['customers'],
                      where=or_feat,
                      primitive=Count)

    feature_set = FeatureSet([feat])
    calculator = FeatureSetCalculator(es,
                                      time_last=None,
                                      feature_set=feature_set)
    df = calculator.run(np.array([0]))
    if isinstance(df, dd.DataFrame):
        # Materialize the Dask result and index it by instance id so the
        # label lookup below behaves as it does for a pandas result.
        df = df.compute().set_index('id')
        # pd.Int64Index was deprecated in pandas 1.4 and removed in 2.0;
        # Index.astype gives the same int64-typed index on every version.
        df.index = df.index.astype('int64')
    name = feat.get_name()
    instances = df[name]
    assert (instances[0] == 3)
def test_make_dfeat_of_agg_feat_through_parent(es):
    """
    The graph looks like this:

        R       C = Customers, the entity we're trying to predict on
       / \\     R = Regions, a parent of customers
      S   C     S = Stores, a child of regions
          |
         etc.

    We're trying to calculate a DFeat from C to R on an agg_feat of R on S.
    """
    store_id_feat = IdentityFeature(es['stores']['id'])

    store_count_feat = ft.Feature(store_id_feat,
                                  parent_entity=es[u'régions'],
                                  primitive=Count)

    num_stores_feat = DirectFeature(store_count_feat,
                                    child_entity=es['customers'])

    feature_set = FeatureSet([num_stores_feat])
    calculator = FeatureSetCalculator(es,
                                      time_last=None,
                                      feature_set=feature_set)
    df = calculator.run(np.array([0]))
    if isinstance(df, dd.DataFrame):
        # Materialize the Dask result and index it by instance id so the
        # label lookup below behaves as it does for a pandas result.
        df = df.compute().set_index('id')
        # pd.Int64Index was deprecated in pandas 1.4 and removed in 2.0;
        # Index.astype gives the same int64-typed index on every version.
        df.index = df.index.astype('int64')
    v = df[num_stores_feat.get_name()][0]
    assert (v == 3)
def test_make_compare_feat(es):
    """
    Feature under test:
    for each session, whether its log count is greater than the
    mean per-session log count of the session's customer.
    """
    logs_per_session = ft.Feature(es['log']['id'],
                                  parent_entity=es['sessions'],
                                  primitive=Count)
    customer_mean = ft.Feature(logs_per_session,
                               parent_entity=es['customers'],
                               primitive=Mean)
    # Bring the customer-level mean back down onto each session.
    session_mean = DirectFeature(customer_mean, child_entity=es['sessions'])

    feat = logs_per_session > session_mean

    calculator = FeatureSetCalculator(es,
                                      time_last=None,
                                      feature_set=FeatureSet([feat]))
    result = calculator.run(np.array([0, 1, 2]))
    if isinstance(result, dd.DataFrame):
        result = result.compute()

    first, second, third = result[feat.get_name()][0:3]
    assert first
    assert second
    assert not third
def test_make_agg_feat_where_count_feat(es):
    """
    Feature under test:
    number of sessions for each customer, counting only the
    sessions whose own log count is greater than 1.
    """
    logs_per_session = ft.Feature(es['log']['id'],
                                  parent_entity=es['sessions'],
                                  primitive=Count)
    feat = ft.Feature(es['sessions']['id'],
                      parent_entity=es['customers'],
                      where=logs_per_session > 1,
                      primitive=Count)

    calculator = FeatureSetCalculator(es,
                                      time_last=None,
                                      feature_set=FeatureSet([feat]))
    result = calculator.run(np.array([0, 1]))
    if isinstance(result, dd.DataFrame):
        result = result.compute()

    # Unpack the values for the first two customers.
    first, second = result[feat.get_name()][0:2]
    assert first == 2
    assert second == 2
def test_make_agg_feat_multiple_dtypes(es):
    """Compute Count and Mode aggregations that share one where clause.

    Both features aggregate ``log`` into ``sessions`` and are filtered by
    ``product_id == 'coke zero'``. Marked xfail when any entity is backed by
    a Dask dataframe.
    """
    uses_dask = any(
        isinstance(entity.df, dd.DataFrame) for entity in es.entities)
    if uses_dask:
        pytest.xfail(
            'Currently no dask compatible agg prims that use multiple dtypes')

    is_coke_zero = IdentityFeature(es['log']['product_id']) == 'coke zero'
    common = dict(parent_entity=es['sessions'], where=is_coke_zero)

    count_feat = ft.Feature(es['log']['id'], primitive=Count, **common)
    mode_feat = ft.Feature(es['log']['product_id'], primitive=Mode, **common)

    calculator = FeatureSetCalculator(
        es,
        time_last=None,
        feature_set=FeatureSet([count_feat, mode_feat]),
    )
    result = calculator.run(np.array([0]))
    if isinstance(result, dd.DataFrame):
        result = result.compute()

    count_value = result[count_feat.get_name()][0]
    mode_value = result[mode_feat.get_name()][0]
    assert count_value == 3
    assert mode_value == 'coke zero'
def test_make_agg_feat_using_prev_time(es):
    """Count log rows per session within a 10-second ``use_previous`` window.

    The same feature set is evaluated for instance 0 at two different
    ``time_last`` values and the resulting counts are compared with 2 and 1.
    """
    agg_feat = ft.Feature(es['log']['id'],
                          parent_entity=es['sessions'],
                          use_previous=Timedelta(10, 's'),
                          primitive=Count)
    feature_set = FeatureSet([agg_feat])

    def count_as_of(cutoff):
        # Run the calculator with the given time_last and read instance 0.
        calculator = FeatureSetCalculator(es,
                                          time_last=cutoff,
                                          feature_set=feature_set)
        result = calculator.run(np.array([0]))
        if isinstance(result, dd.DataFrame):
            result = result.compute()
        return result[agg_feat.get_name()][0]

    assert count_as_of(datetime(2011, 4, 9, 10, 30, 10)) == 2
    assert count_as_of(datetime(2011, 4, 9, 10, 30, 30)) == 1
def test_two_relationships_to_single_entity(games_es):
    """Aggregate over two distinct relationships joining the same entities.

    ``games`` relates to ``teams`` twice (home and away). A Mean is built
    along each relationship and pulled back onto ``games`` as a direct
    feature through the matching relationship.
    """
    es = games_es
    home_team, away_team = es.relationships
    games, teams = es['games'], es['teams']
    mean = ft.primitives.Mean

    mean_at_home = ft.AggregationFeature(
        games['home_team_score'],
        teams,
        relationship_path=RelationshipPath([(False, home_team)]),
        primitive=mean,
    )
    mean_at_away = ft.AggregationFeature(
        games['away_team_score'],
        teams,
        relationship_path=RelationshipPath([(False, away_team)]),
        primitive=mean,
    )

    home_team_mean = ft.DirectFeature(mean_at_home,
                                      games,
                                      relationship=home_team)
    away_team_mean = ft.DirectFeature(mean_at_away,
                                      games,
                                      relationship=away_team)

    calculator = FeatureSetCalculator(
        es,
        time_last=datetime(2011, 8, 28),
        feature_set=FeatureSet([home_team_mean, away_team_mean]),
    )
    result = calculator.run(np.array(range(3)))
    if isinstance(result, dd.DataFrame):
        result = result.compute()

    assert (result[home_team_mean.get_name()] == [1.5, 1.5, 2.5]).all()
    assert (result[away_team_mean.get_name()] == [1, 0.5, 2]).all()
def test_make_agg_feat_where_count_or_device_type_feat(es):
    """
    Feature under test:
    number of sessions for each customer, counting only the
    sessions whose log count is greater than 1 OR whose
    device_type equals 1.
    """
    logs_per_session = ft.Feature(
        es['log'].ww['id'],
        parent_dataframe_name='sessions',
        primitive=Count,
    )

    many_logs = logs_per_session > 1
    device_is_one = IdentityFeature(es['sessions'].ww['device_type']) == 1
    either = many_logs.OR(device_is_one)

    feat = ft.Feature(
        es['sessions'].ww['id'],
        parent_dataframe_name='customers',
        where=either,
        primitive=Count,
    )

    calculator = FeatureSetCalculator(
        es,
        time_last=None,
        feature_set=FeatureSet([feat]),
    )
    result = to_pandas(calculator.run(np.array([0])),
                       index='id',
                       int_index=True)

    assert result[feat.get_name()].values[0] == 3
# Example no. 17
# 0
def test_make_agg_feat_using_prev_time(es):
    """Count log rows per session within a 10-second ``use_previous`` window.

    The same feature set is evaluated for instance 0 at two different
    ``time_last`` values and the resulting counts are compared with 2 and 1.
    """
    agg_feat = ft.Feature(es["log"].ww["id"],
                          parent_dataframe_name="sessions",
                          use_previous=Timedelta(10, "s"),
                          primitive=Count)
    feature_set = FeatureSet([agg_feat])

    def count_as_of(cutoff):
        # Run the calculator with the given time_last and read instance 0.
        calculator = FeatureSetCalculator(es,
                                          time_last=cutoff,
                                          feature_set=feature_set)
        result = to_pandas(calculator.run(np.array([0])))
        return result[agg_feat.get_name()][0]

    assert count_as_of(datetime(2011, 4, 9, 10, 30, 10)) == 2
    assert count_as_of(datetime(2011, 4, 9, 10, 30, 30)) == 1
def test_make_dfeat_of_agg_feat_through_parent(es):
    """
    Relationship graph:

        R       C = Customers, the dataframe we're trying to predict on
       / \\     R = Regions, a parent of customers
      S   C     S = Stores, a child of regions
          |
         etc.

    A Count of stores is aggregated onto R and then read on C as a direct
    feature.
    """
    stores_per_region = ft.Feature(
        IdentityFeature(es['stores'].ww['id']),
        parent_dataframe_name=u'régions',
        primitive=Count,
    )
    num_stores_feat = DirectFeature(stores_per_region,
                                    child_dataframe_name='customers')

    calculator = FeatureSetCalculator(
        es,
        time_last=None,
        feature_set=FeatureSet([num_stores_feat]),
    )
    result = to_pandas(calculator.run(np.array([0])), index='id')

    num_stores = result[num_stores_feat.get_name()].values[0]
    assert num_stores == 3
# Example no. 19
# 0
def test_make_agg_feat_multiple_dtypes(es):
    """Compute Count and Mode aggregations that share one where clause.

    Both features aggregate the ``log`` dataframe into ``sessions`` and are
    filtered by ``product_id == "coke zero"``; the values for instance 0 are
    checked. Marked xfail when the EntitySet is not pandas-backed.
    """
    if es.dataframe_type != Library.PANDAS.value:
        pytest.xfail(
            "Currently no Dask or Spark compatible agg prims that use multiple dtypes"
        )

    is_coke_zero = IdentityFeature(es["log"].ww["product_id"]) == "coke zero"
    common = dict(parent_dataframe_name="sessions", where=is_coke_zero)

    count_feat = ft.Feature(es["log"].ww["id"], primitive=Count, **common)
    mode_feat = ft.Feature(es["log"].ww["product_id"], primitive=Mode, **common)

    calculator = FeatureSetCalculator(
        es,
        time_last=None,
        feature_set=FeatureSet([count_feat, mode_feat]),
    )
    result = calculator.run(np.array([0]))

    count_value = result[count_feat.get_name()][0]
    mode_value = result[mode_feat.get_name()][0]
    assert count_value == 3
    assert mode_value == "coke zero"
def test_topn(pd_es):
    """Check the multi-output NMostCommon(n=2) aggregation per customer.

    The two most common ``product_id`` values in ``log`` are computed for
    customers 0, 1 and 2 and compared row by row with the expected table.
    """
    topn = ft.Feature(pd_es['log'].ww['product_id'],
                      parent_dataframe_name='customers',
                      primitive=NMostCommon(n=2))
    feature_set = FeatureSet([topn])

    calculator = FeatureSetCalculator(pd_es,
                                      time_last=None,
                                      feature_set=feature_set)
    df = calculator.run(np.array([0, 1, 2]))
    true_results = pd.DataFrame(
        [['toothpaste', 'coke zero'],
         ['coke zero', 'Haribo sugar-free gummy bears'],
         ['taco clock', np.nan]])
    # Asserting the bare list was always true (a non-empty list is truthy);
    # all() makes the check fail when an output column is actually missing.
    assert all(name in df.columns for name in topn.get_feature_names())

    for i in range(df.shape[0]):
        true = true_results.loc[i]
        actual = df.loc[i]
        if i == 0:
            # coke zero and toothpaste have the same number of occurrences,
            # so their relative order is not fixed
            assert set(true.values) == set(actual.values)
        else:
            for i1, i2 in zip(true, actual):
                assert (pd.isnull(i1) and pd.isnull(i2)) or (i1 == i2)
# Example no. 21
# 0
def test_make_agg_feat_where_count_and_device_type_feat(es):
    """
    Feature under test:
    number of sessions for each customer, counting only the
    sessions whose log count equals 1 AND whose device_type
    equals 1.
    """
    logs_per_session = ft.Feature(
        es["log"].ww["id"],
        parent_dataframe_name="sessions",
        primitive=Count,
    )

    single_log = logs_per_session == 1
    device_is_one = IdentityFeature(es["sessions"].ww["device_type"]) == 1
    both = ft.Feature([single_log, device_is_one], primitive=And)

    feat = ft.Feature(es["sessions"].ww["id"],
                      parent_dataframe_name="customers",
                      where=both,
                      primitive=Count)

    calculator = FeatureSetCalculator(
        es,
        time_last=None,
        feature_set=FeatureSet([feat]),
    )
    result = to_pandas(calculator.run(np.array([0])), index="id")

    assert result[feat.get_name()].values[0] == 1
# Example no. 22
# 0
def test_feature_trie_ignores_approximate_features(es):
    """Features listed in the approximate trie must be left out of the trie.

    ``agg_of_agg`` is registered as approximated at the direct feature's
    relationship path; the sub-trie at that path is then expected to hold no
    features in the third slot of any node value.
    """
    value = ft.IdentityFeature(es['log']['value'], )
    agg = ft.AggregationFeature(value,
                                es['sessions'],
                                primitive=ft.primitives.Mean)
    agg_of_agg = ft.AggregationFeature(agg,
                                       es['customers'],
                                       primitive=ft.primitives.Sum)
    direct = ft.DirectFeature(agg_of_agg, es['sessions'])

    approximated = Trie(default=list, path_constructor=RelationshipPath)
    approximated.get_node(direct.relationship_path).value = [agg_of_agg]

    feature_set = FeatureSet([direct, agg],
                             approximate_feature_trie=approximated)
    trie = feature_set.feature_trie

    # Since agg_of_agg is ignored, neither it nor its dependencies should
    # appear anywhere below the direct feature's path.
    for _path, (_, _, node_features) in trie.get_node(
            direct.relationship_path):
        assert not node_features

    expected_root = (False, set(), {direct.unique_name(), agg.unique_name()})
    assert trie.value == expected_root

    expected_agg_node = (False, set(), {value.unique_name()})
    assert trie.get_node(agg.relationship_path).value == expected_agg_node
def test_make_agg_feat_where_count_and_device_type_feat(es):
    """
    Feature under test:
    number of sessions for each customer, counting only the
    sessions whose log count equals 1 AND whose device_type
    equals 1.
    """
    logs_per_session = ft.Feature(es['log']['id'],
                                  parent_entity=es['sessions'],
                                  primitive=Count)

    single_log = logs_per_session == 1
    device_is_one = IdentityFeature(es['sessions']['device_type']) == 1
    both = ft.Feature([single_log, device_is_one], primitive=And)

    feat = ft.Feature(es['sessions']['id'],
                      parent_entity=es['customers'],
                      where=both,
                      primitive=Count)

    calculator = FeatureSetCalculator(es,
                                      time_last=None,
                                      feature_set=FeatureSet([feat]))
    result = calculator.run(np.array([0]))

    assert result[feat.get_name()][0] == 1
# Example no. 24
# 0
def test_feature_trie_with_needs_full_entity_direct(es):
    """Check trie node values when a CumSum transform sits on a direct feature.

    The chain is value -> Mean on sessions -> Sum on customers -> direct on
    sessions -> CumSum. Each visited node value is compared against an
    expected ``(bool, set, set)`` tuple of feature unique names.
    """
    value = ft.IdentityFeature(es['log']['value'], )
    agg = ft.AggregationFeature(value,
                                es['sessions'],
                                primitive=ft.primitives.Mean)
    agg_of_agg = ft.AggregationFeature(agg,
                                       es['customers'],
                                       primitive=ft.primitives.Sum)
    direct = ft.DirectFeature(agg_of_agg, es['sessions'])
    trans = ft.TransformFeature(direct, ft.primitives.CumSum)

    trie = FeatureSet([trans, agg]).feature_trie

    expected_root = (
        True,
        {direct.unique_name(), trans.unique_name()},
        {agg.unique_name()},
    )
    assert trie.value == expected_root

    expected_agg_node = (False, set(), {value.unique_name()})
    assert trie.get_node(agg.relationship_path).value == expected_agg_node

    # Walk sessions -> customers -> sessions -> log along the direct chain.
    parent_node = trie.get_node(direct.relationship_path)
    assert parent_node.value == (True, {agg_of_agg.unique_name()}, set())

    child_via_parent = parent_node.get_node(agg_of_agg.relationship_path)
    assert child_via_parent.value == (True, {agg.unique_name()}, set())

    leaf = child_via_parent.get_node(agg.relationship_path)
    assert leaf.value == (True, {value.unique_name()}, set())
def test_make_compare_feat(es):
    """
    Feature under test:
    for each session, whether its log count is greater than the
    mean per-session log count of the session's customer.
    """
    logs_per_session = ft.Feature(
        es['log'].ww['id'],
        parent_dataframe_name='sessions',
        primitive=Count,
    )
    customer_mean = ft.Feature(
        logs_per_session,
        parent_dataframe_name='customers',
        primitive=Mean,
    )
    # Bring the customer-level mean back down onto each session.
    session_mean = DirectFeature(customer_mean,
                                 child_dataframe_name='sessions')

    feat = logs_per_session > session_mean

    calculator = FeatureSetCalculator(
        es,
        time_last=None,
        feature_set=FeatureSet([feat]),
    )
    result = to_pandas(calculator.run(np.array([0, 1, 2])),
                       index='id',
                       sort_index=True)

    first, second, third = result[feat.get_name()][0:3]
    assert first
    assert second
    assert not third
def test_make_dfeat_of_agg_feat_on_self(es):
    """
    The graph looks like this:

        R       R = Regions, a parent of customers
        |
        C       C = Customers, the entity we're trying to predict on
        |
       etc.

    We're trying to calculate a DFeat from C to R on an agg_feat of R on C.
    """
    customer_count_feat = ft.Feature(es['customers']['id'],
                                     parent_entity=es[u'régions'],
                                     primitive=Count)

    num_customers_feat = DirectFeature(customer_count_feat,
                                       child_entity=es['customers'])

    feature_set = FeatureSet([num_customers_feat])
    calculator = FeatureSetCalculator(es,
                                      time_last=None,
                                      feature_set=feature_set)
    # Pass the instance ids as an ndarray, like every sibling test does,
    # instead of a plain list.
    df = calculator.run(np.array([0]))
    v = df[num_customers_feat.get_name()][0]
    assert (v == 3)
def test_calls_progress_callback(es):
    """Progress updates passed to the callback during a run must sum to 1.

    One feature of each kind (identity, direct, aggregation, aggregation with
    TimeSinceLast, transform, groupby transform) is computed, first with no
    cutoff and then with a ``time_last`` of 1950 for which there is no data.
    """
    identity = ft.Feature(es['customers']['age'])
    direct = ft.Feature(es['cohorts']['cohort_name'], es['customers'])
    agg = ft.Feature(es["sessions"]["id"],
                     parent_entity=es['customers'],
                     primitive=Count)
    # this feature is handled differently than simple features
    agg_apply = ft.Feature(es["log"]["datetime"],
                           parent_entity=es['customers'],
                           primitive=TimeSinceLast)
    trans = ft.Feature(agg, primitive=CumSum)
    groupby_trans = ft.Feature(agg,
                               primitive=CumSum,
                               groupby=es["customers"]["cohort"])
    all_features = [identity, direct, agg, agg_apply, trans, groupby_trans]

    class ProgressAccumulator:
        """Callable that sums every update it is called with."""

        def __init__(self):
            self.total = 0

        def __call__(self, update):
            self.total += update

    instance_ids = [0, 1, 2]

    def total_progress(cutoff):
        # Build a fresh feature set and calculator for each run.
        calculator = FeatureSetCalculator(es,
                                          time_last=cutoff,
                                          feature_set=FeatureSet(all_features))
        tracker = ProgressAccumulator()
        calculator.run(np.array(instance_ids), tracker)
        return tracker.total

    assert np.isclose(total_progress(None), 1)

    # testing again with a time_last with no data
    assert np.isclose(total_progress(pd.Timestamp("1950")), 1)
# Example no. 28
# 0
def test_make_trans_feat(es):
    """Apply the Hour transform to the log datetime and check instance 0."""
    hour_feat = ft.Feature(es['log']['datetime'], primitive=Hour)

    calculator = FeatureSetCalculator(es,
                                      feature_set=FeatureSet([hour_feat]))
    result = calculator.run(np.array([0]))

    hour = result[hour_feat.get_name()][0]
    assert hour == 10
# Example no. 29
# 0
    def check(feature):
        """Run one calculator on two id sets and compare the shared row.

        ``es`` is taken from the enclosing scope.
        """
        calculator = FeatureSetCalculator(es,
                                          feature_set=FeatureSet([feature]),
                                          time_last=None)
        first_run = calculator.run(np.array([0, 1, 2]))
        second_run = calculator.run(np.array([2, 4]))

        # instance id 2 is in both runs, so its row must agree between them
        assert (second_run.loc[2] == first_run.loc[2]).all()
# Example no. 30
# 0
def test_percentile_with_cutoff(es):
    """Compute Percentile of the log value for instance 2 under a cutoff time.

    The calculator is given the feature set and the cutoff positionally, and
    the single resulting percentile is expected to be 1.0.
    """
    log_value = ft.Feature(es['log']['value'])
    percentile = ft.Feature(log_value, primitive=Percentile)

    cutoff = pd.Timestamp('2011/04/09 10:30:13')
    calculator = FeatureSetCalculator(es, FeatureSet([percentile]), cutoff)
    result = calculator.run(np.array([2]))

    assert result[percentile.get_name()].tolist()[0] == 1.0