Ejemplo n.º 1
0
def test_diff_single_value(es):
    """Diff grouped by region on a single store instance produces only nulls.

    Runs the calculator for instance id 5 alone and checks that exactly one
    row comes back and that the Diff column has no non-null entry.
    """
    square_feet = es['stores']['num_square_feet']
    region = es['stores'][u'région_id']
    diff = ft.Feature(square_feet, groupby=region, primitive=Diff)

    calculator = FeatureSetCalculator(es, feature_set=FeatureSet([diff]))
    result = calculator.run([5])

    assert result.shape[0] == 1
    non_null = result[diff.get_name()].dropna()
    assert non_null.shape[0] == 0
def test_calls_progress_callback(es):
    """Progress callback increments must sum to 1 over a calculator run.

    Builds one feature of every kind (identity, direct, aggregation,
    transform, full-entity transform, groupby transform). When any entity is
    backed by a Dask dataframe only the identity, direct, aggregation and
    plain transform features are used. The check is done twice: with no
    ``time_last`` and with a ``time_last`` of 1950.
    """
    customers = es['customers']
    identity = ft.Feature(customers['age'])
    direct = ft.Feature(es['cohorts']['cohort_name'], customers)
    agg = ft.Feature(es["sessions"]["id"], parent_entity=customers, primitive=Count)
    # this feature is handled differently than simple features
    agg_apply = ft.Feature(es["log"]["datetime"], parent_entity=customers, primitive=TimeSinceLast)
    trans = ft.Feature(agg, primitive=Negate)
    trans_full = ft.Feature(agg, primitive=CumSum)
    groupby_trans = ft.Feature(agg, primitive=CumSum, groupby=customers["cohort"])

    uses_dask = any(isinstance(entity.df, dd.DataFrame) for entity in es.entities)
    all_features = [identity, direct, agg, trans]
    if not uses_dask:
        all_features = [identity, direct, agg, agg_apply, trans, trans_full, groupby_trans]

    class ProgressRecorder:
        """Callable that adds up every progress increment it receives."""

        def __init__(self):
            self.total = 0

        def __call__(self, update):
            self.total += update

    instance_ids = [0, 1, 2]

    def reported_progress(time_last):
        # Fresh feature set, calculator and recorder for every run.
        calculator = FeatureSetCalculator(es,
                                          time_last=time_last,
                                          feature_set=FeatureSet(all_features))
        recorder = ProgressRecorder()
        calculator.run(np.array(instance_ids), recorder)
        return recorder.total

    assert np.isclose(reported_progress(None), 1)

    # testing again with a time_last with no data
    assert np.isclose(reported_progress(pd.Timestamp("1950")), 1)
Ejemplo n.º 3
0
def test_percentile(es):
    """Percentile of the log value matches pandas' percentage rank.

    The expected values are the ``rank(pct=True)`` of the whole log value
    column, restricted to instance ids 10 through 16; nulls must line up with
    nulls.
    """
    value = ft.Feature(es['log']['value'])
    percentile = ft.Feature(value, primitive=Percentile)
    calculator = FeatureSetCalculator(es, FeatureSet([percentile]))
    computed = calculator.run(np.array(range(10, 17)))

    ranked = es['log'].df[value.get_name()].rank(pct=True)
    expected = ranked.loc[range(10, 17)]
    actual = computed[percentile.get_name()]
    for want, got in zip(expected.values, actual.values):
        both_null = pd.isnull(want) and pd.isnull(got)
        assert both_null or want == got
Ejemplo n.º 4
0
def test_direct_squared(es):
    """A feature multiplied by itself equals the square of the base value.

    Computes the log ``value`` identity feature and ``value * value`` for
    instance ids 0, 1 and 2 and compares them row by row.
    """
    feature = IdentityFeature(es['log']['value'])
    squared = feature * feature
    feature_set = FeatureSet([feature, squared])
    calculator = FeatureSetCalculator(es,
                                      time_last=None,
                                      feature_set=feature_set)
    df = to_pandas(calculator.run(np.array([0, 1, 2])))

    # Columns are selected by feature name: integer keys on a string-labelled
    # row rely on pandas' deprecated positional fallback and on column order.
    base_values = df[feature.get_name()]
    squared_values = df[squared.get_name()]
    for base, product in zip(base_values, squared_values):
        assert (base * base) == product
def test_direct_from_identity(es):
    """Direct feature of the session device type on the log entity.

    For log instances 0 and 5 (sorted by ``id``) the expected values are
    ``[0, 1]``.
    """
    direct = DirectFeature(base_feature=es['sessions']['device_type'],
                           child_entity=es['log'])

    calculator = FeatureSetCalculator(es,
                                      time_last=None,
                                      feature_set=FeatureSet([direct]))
    raw = calculator.run(np.array([0, 5]))
    result = to_pandas(raw, index='id', sort_index=True)

    assert result[direct.get_name()].tolist() == [0, 1]
def test_make_agg_feat_of_grandchild_entity(es):
    """Count of log ids aggregated onto customers is 10 for customer 0."""
    log_count = ft.Feature(es['log']['id'],
                           parent_entity=es['customers'],
                           primitive=Count)

    calculator = FeatureSetCalculator(es,
                                      feature_set=FeatureSet([log_count]),
                                      time_last=None)
    result = calculator.run(np.array([0]))

    count_for_customer = result[log_count.get_name()][0]
    assert count_for_customer == 10
def test_make_identity(es):
    """Identity feature on the log datetime yields 2011-04-09 10:30:00 for id 0."""
    identity = IdentityFeature(es['log']['datetime'])

    calculator = FeatureSetCalculator(es,
                                      feature_set=FeatureSet([identity]),
                                      time_last=None)
    result = calculator.run(np.array([0]))

    expected = datetime(2011, 4, 9, 10, 30, 0)
    assert result[identity.get_name()][0] == expected
Ejemplo n.º 8
0
def test_diff_reordered(es):
    """Diff of a per-session Sum when instance ids are requested out of order.

    Sessions are requested as ``[4, 2]``; the expected Diff values are 16 for
    session 4 and -6 for session 2.
    """
    session_sum = ft.Feature(es['log']['value'],
                             parent_entity=es["sessions"],
                             primitive=Sum)
    diff = ft.Feature(session_sum, primitive=Diff)

    calculator = FeatureSetCalculator(es, feature_set=FeatureSet([diff]))
    result = calculator.run(np.array([4, 2]))

    diff_values = result[diff.get_name()]
    assert diff_values[4] == 16
    assert diff_values[2] == -6
def test_make_agg_feat_of_identity_index_variable(es):
    """Count of log ids aggregated onto sessions is 5 for session 0."""
    log_count = ft.Feature(es['log']['id'],
                           parent_entity=es['sessions'],
                           primitive=Count)

    calculator = FeatureSetCalculator(es,
                                      feature_set=FeatureSet([log_count]),
                                      time_last=None)
    result = calculator.run(np.array([0]))

    count_for_session = result[log_count.get_name()][0]
    assert count_for_session == 5
Ejemplo n.º 10
0
    def check(feature):
        """Assert that instance 2 gets identical values from two different runs.

        The same calculator is run on ids ``[0, 1, 2]`` and then on ``[2, 4]``;
        the row for id 2 must match across both results.
        """
        calculator = FeatureSetCalculator(es,
                                          time_last=None,
                                          feature_set=FeatureSet([feature]))
        first_run = calculator.run(np.array([0, 1, 2]))
        second_run = calculator.run(np.array([2, 4]))

        # check that the value for instance id 2 matches
        row_matches = second_run.loc[2] == first_run.loc[2]
        assert row_matches.all()
def test_make_dfeat(es):
    """Direct feature of customer age on sessions is 33 for session 0."""
    direct_age = DirectFeature(es['customers']['age'],
                               child_entity=es['sessions'])

    calculator = FeatureSetCalculator(es,
                                      feature_set=FeatureSet([direct_age]),
                                      time_last=None)
    result = calculator.run(np.array([0]))

    age = result[direct_age.get_name()][0]
    assert age == 33
Ejemplo n.º 12
0
def test_make_trans_feat(es):
    """Hour transform of the log datetime is 10 for log instance 0."""
    hour = ft.Feature(es['log']['datetime'], primitive=Hour)

    calculator = FeatureSetCalculator(es, feature_set=FeatureSet([hour]))
    result = calculator.run(np.array([0]))
    # A Dask result is computed before it is indexed.
    if isinstance(result, dd.DataFrame):
        result = result.compute()

    assert result[hour.get_name()][0] == 10
def test_with_features_built_from_es_metadata(es):
    """Features defined on ``es.metadata`` can be calculated against ``es``.

    A Count of log ids per customer is built from the metadata entityset and
    must evaluate to 10 for customer 0.
    """
    meta = es.metadata
    log_count = ft.Feature(meta['log']['id'],
                           parent_entity=meta['customers'],
                           primitive=Count)

    calculator = FeatureSetCalculator(es,
                                      feature_set=FeatureSet([log_count]),
                                      time_last=None)
    result = calculator.run(np.array([0]))

    count_for_customer = result[log_count.get_name()][0]
    assert count_for_customer == 10
Ejemplo n.º 14
0
def test_make_identity(es):
    """Identity feature on the log datetime yields 2011-04-09 10:30:00 for id 0."""
    identity = IdentityFeature(es["log"].ww["datetime"])

    calculator = FeatureSetCalculator(es,
                                      feature_set=FeatureSet([identity]),
                                      time_last=None)
    result = to_pandas(calculator.run(np.array([0])))

    expected = datetime(2011, 4, 9, 10, 30, 0)
    assert result[identity.get_name()][0] == expected
Ejemplo n.º 15
0
def test_direct_from_variable(es):
    """Direct feature built straight from a variable, for log ids 0 and 5.

    Expected values are ``[0, 1]``.
    """
    # should be same behavior as test_direct_from_identity
    direct = DirectFeature(base_feature=es['sessions']['device_type'],
                           child_entity=es['log'])

    calculator = FeatureSetCalculator(es,
                                      time_last=None,
                                      feature_set=FeatureSet([direct]))
    result = calculator.run([0, 5])

    assert result[direct.get_name()].tolist() == [0, 1]
def test_agg_empty_child(es):
    """Count over the log is 0 for customer 0 with a ``time_last`` of 2011-04-08."""
    log_count = ft.Feature(es['log']['id'],
                           parent_entity=es['customers'],
                           primitive=Count)

    # time last before the customer had any events, so child frame is empty
    cutoff = datetime(2011, 4, 8)
    calculator = FeatureSetCalculator(es,
                                      feature_set=FeatureSet([log_count]),
                                      time_last=cutoff)
    result = calculator.run(np.array([0]))

    first_count = result["COUNT(log)"].iloc[0]
    assert first_count == 0
def test_make_agg_feat_of_agg_feat(es):
    """Sum over customers of a per-session log Count is 10 for customer 0."""
    sessions_log_count = ft.Feature(es['log']['id'],
                                    parent_entity=es['sessions'],
                                    primitive=Count)
    customer_total = ft.Feature(sessions_log_count,
                                parent_entity=es['customers'],
                                primitive=Sum)

    calculator = FeatureSetCalculator(es,
                                      feature_set=FeatureSet([customer_total]),
                                      time_last=None)
    result = calculator.run(np.array([0]))

    total = result[customer_total.get_name()][0]
    assert total == 10
def test_make_dfeat(es):
    """Direct feature of customer age on sessions is 33 for session 0."""
    age = ft.Feature(es['customers'].ww['age'])
    direct_age = DirectFeature(age, child_dataframe_name='sessions')

    calculator = FeatureSetCalculator(es,
                                      feature_set=FeatureSet([direct_age]),
                                      time_last=None)
    result = to_pandas(calculator.run(np.array([0])))

    value = result[direct_age.get_name()][0]
    assert value == 33
Ejemplo n.º 19
0
def test_agg_percentile(es):
    """Sum per session of the log value Percentile matches a pandas computation.

    The expected values are obtained by ranking the whole log value column
    with ``rank(pct=True)``, summing the ranks per ``session_id`` and taking
    sessions 0 and 1; nulls must line up with nulls.
    """
    v = ft.Feature(es['log']['value'])
    p = ft.Feature(v, primitive=Percentile)
    agg = ft.Feature(p, parent_entity=es['sessions'], primitive=Sum)
    feature_set = FeatureSet([agg])
    calculator = FeatureSetCalculator(es, feature_set)
    df = calculator.run(np.array([0, 1]))

    # Work on an explicit copy: adding a column to a column-subset of the
    # entity's dataframe triggers pandas' SettingWithCopyWarning.
    log_vals = es['log'].df[[v.get_name(), 'session_id']].copy()
    log_vals['percentile'] = log_vals[v.get_name()].rank(pct=True)
    per_session = log_vals.groupby('session_id')['percentile'].sum()
    true_p = per_session.loc[[0, 1]]
    for t, a in zip(true_p.values, df[agg.get_name()].values):
        assert (pd.isnull(t) and pd.isnull(a)) or t == a
def test_direct_squared(es):
    """A feature multiplied by itself equals the square of the base value.

    Computes the log ``value`` identity feature and ``value * value`` for
    instance ids 0, 1 and 2 and compares them row by row. A Dask result is
    computed before it is compared.
    """
    feature = IdentityFeature(es['log']['value'])
    squared = feature * feature
    feature_set = FeatureSet([feature, squared])
    calculator = FeatureSetCalculator(es,
                                      time_last=None,
                                      feature_set=feature_set)
    df = calculator.run(np.array([0, 1, 2]))
    if isinstance(df, dd.DataFrame):
        df = df.compute()

    # Columns are selected by feature name: integer keys on a string-labelled
    # row rely on pandas' deprecated positional fallback and on column order.
    base_values = df[feature.get_name()]
    squared_values = df[squared.get_name()]
    for base, product in zip(base_values, squared_values):
        assert (base * base) == product
Ejemplo n.º 21
0
def test_make_3_stacked_agg_feats(df):
    """
    Tests stacking 3 agg features.

    The test specifically uses non numeric indices to test how ancestor columns are handled
    as dataframes are merged together

    The input frame is added as ``e0`` and normalized into ``e1``, ``e2`` and
    ``e3``; a Sum of ``val`` is stacked once per level and the value for
    instance ``"z"`` of ``e3`` must be 5. Dask input is marked as an expected
    failure.
    """
    if isinstance(df, dd.DataFrame):
        pytest.xfail("normalize_dataframe fails with dask DataFrame")
    es = ft.EntitySet()
    ltypes = {
        "e1": Categorical,
        "e2": Categorical,
        "e3": Categorical,
        "val": Double,
    }
    es.add_dataframe(dataframe=df,
                     index="id",
                     dataframe_name="e0",
                     logical_types=ltypes)

    es.normalize_dataframe(
        base_dataframe_name="e0",
        new_dataframe_name="e1",
        index="e1",
        additional_columns=["e2", "e3"],
    )

    es.normalize_dataframe(
        base_dataframe_name="e1",
        new_dataframe_name="e2",
        index="e2",
        additional_columns=["e3"],
    )

    es.normalize_dataframe(base_dataframe_name="e2",
                           new_dataframe_name="e3",
                           index="e3")

    sum_1 = ft.Feature(es["e0"].ww["val"],
                       parent_dataframe_name="e1",
                       primitive=Sum)
    sum_2 = ft.Feature(sum_1, parent_dataframe_name="e2", primitive=Sum)
    sum_3 = ft.Feature(sum_2, parent_dataframe_name="e3", primitive=Sum)

    feature_set = FeatureSet([sum_3])
    calculator = FeatureSetCalculator(es,
                                      time_last=None,
                                      feature_set=feature_set)
    # Keep the result under its own name instead of overwriting the ``df``
    # argument.
    feature_matrix = calculator.run(np.array(["z"]))
    # Only "z" was requested, so take the first row by position; a bare
    # ``[0]`` on a non-integer index is a deprecated positional lookup.
    v = feature_matrix[sum_3.get_name()].iloc[0]
    assert v == 5
Ejemplo n.º 22
0
def test_returns_order_of_instance_ids(pd_es):
    """The result index follows the order of the requested instance ids.

    First confirms that the customers dataframe does not already list its ids
    as ``[0, 1, 2]``, then checks that the calculator output is indexed in
    exactly that order.
    """
    age = ft.Feature(pd_es['customers']['age'])
    calculator = FeatureSetCalculator(pd_es,
                                      feature_set=FeatureSet([age]),
                                      time_last=None)

    instance_ids = [0, 1, 2]
    stored_order = list(pd_es['customers'].df['id'])
    assert stored_order != instance_ids

    result = calculator.run(np.array(instance_ids))

    assert list(result.index) == instance_ids
Ejemplo n.º 23
0
def test_feature_trie_without_needs_full_entity(diamond_es):
    """Check the feature trie for features that reach ``regions`` by several paths.

    Builds a direct feature plus three Mean aggregations on ``regions``
    (through customers, through stores, and a mean of a negated per-customer
    mean) and asserts the value stored at the trie root and at each
    relationship path. Every node is expected to hold ``False`` and an empty
    set in its first two slots.
    """
    es = diamond_es
    mean = ft.primitives.Mean

    def mean_on(base, parent_name, path):
        # Mean aggregation of ``base`` onto ``parent_name`` along an explicit path.
        return ft.AggregationFeature(base,
                                     es[parent_name],
                                     primitive=mean,
                                     relationship_path=path)

    def expected_node(names):
        # Node value with the given feature names in its third slot.
        return (False, set(), set(names))

    country_name = ft.IdentityFeature(es['countries']['name'])
    direct_name = ft.DirectFeature(country_name, es['regions'])
    amount = ft.IdentityFeature(es['transactions']['amount'])

    path_through_customers = backward_path(
        es, ['regions', 'customers', 'transactions'])
    through_customers = mean_on(amount, 'regions', path_through_customers)

    path_through_stores = backward_path(
        es, ['regions', 'stores', 'transactions'])
    through_stores = mean_on(amount, 'regions', path_through_stores)

    customers_to_transactions = backward_path(
        es, ['customers', 'transactions'])
    customers_mean = mean_on(amount, 'customers', customers_to_transactions)

    negation = ft.TransformFeature(customers_mean, ft.primitives.Negate)
    regions_to_customers = backward_path(es, ['regions', 'customers'])
    mean_of_mean = mean_on(negation, 'regions', regions_to_customers)

    features = [direct_name, through_customers, through_stores, mean_of_mean]
    trie = FeatureSet(features).feature_trie

    assert trie.value == expected_node(f.unique_name() for f in features)

    direct_node = trie.get_node(direct_name.relationship_path)
    assert direct_node.value == expected_node([country_name.unique_name()])

    customers_node = trie.get_node(regions_to_customers)
    assert customers_node.value == expected_node(
        [negation.unique_name(), customers_mean.unique_name()])

    regions_to_stores = backward_path(es, ['regions', 'stores'])
    assert trie.get_node(regions_to_stores).value == expected_node([])

    for transactions_path in (path_through_customers, path_through_stores):
        node = trie.get_node(transactions_path)
        assert node.value == expected_node([amount.unique_name()])
Ejemplo n.º 24
0
def test_direct_percentile(es):
    """Direct feature of the customer age Percentile on sessions 0 and 1.

    The expected values are the ``rank(pct=True)`` of the customer age
    column, looked up at customer 0 for both sessions; nulls must line up
    with nulls.
    """
    v = ft.Feature(es['customers']['age'])
    p = ft.Feature(v, primitive=Percentile)
    d = ft.Feature(p, es['sessions'])
    feature_set = FeatureSet([d])
    calculator = FeatureSetCalculator(es, feature_set)
    df = calculator.run(np.array([0, 1]))

    # Work on an explicit copy: adding a column to a column-subset of the
    # entity's dataframe triggers pandas' SettingWithCopyWarning.
    cust_vals = es['customers'].df[[v.get_name()]].copy()
    cust_vals['percentile'] = cust_vals[v.get_name()].rank(pct=True)
    true_p = cust_vals['percentile'].loc[[0, 0]]
    for t, a in zip(true_p.values, df[d.get_name()].values):
        assert (pd.isnull(t) and pd.isnull(a)) or t == a
def test_make_agg_feat_of_identity_column(es):
    """Sum of the log value aggregated onto sessions is 50 for session 0."""
    value_sum = ft.Feature(es['log'].ww['value'],
                           parent_dataframe_name='sessions',
                           primitive=Sum)

    calculator = FeatureSetCalculator(es,
                                      feature_set=FeatureSet([value_sum]),
                                      time_last=None)
    result = to_pandas(calculator.run(np.array([0])))

    total = result[value_sum.get_name()][0]
    assert total == 50
def test_full_entity_error_dask(dask_es):
    """CumSum over an aggregation raises ValueError on a Dask entityset.

    The error message must match the "require full entity" text.
    """
    value_sum = ft.Feature(dask_es['log']['value'],
                           parent_entity=dask_es['customers'],
                           primitive=Sum)
    cumulative = ft.Feature(value_sum, primitive=CumSum)

    calculator = FeatureSetCalculator(dask_es,
                                      feature_set=FeatureSet([cumulative]),
                                      time_last=None)

    expected_message = "Cannot use primitives that require full entity with Dask"
    with pytest.raises(ValueError, match=expected_message):
        calculator.run(np.array([1]))
def test_make_agg_feat_of_grandchild_entity(es):
    """Count of log ids aggregated onto customers is 10 for customer 0.

    A Dask result is computed and re-indexed by ``id`` with an int64 index
    before the value is looked up.
    """
    agg_feat = ft.Feature(es['log']['id'], parent_entity=es['customers'], primitive=Count)

    feature_set = FeatureSet([agg_feat])
    calculator = FeatureSetCalculator(es,
                                      time_last=None,
                                      feature_set=feature_set)
    df = calculator.run(np.array([0]))
    if isinstance(df, dd.DataFrame):
        df = df.compute().set_index('id')
        # pd.Int64Index no longer exists in pandas 2.0+; pd.Index with an
        # explicit dtype gives the same int64 index on old and new versions.
        df.index = pd.Index(df.index, dtype='int64')
    v = df[agg_feat.get_name()][0]
    assert (v == 10)
def test_make_agg_feat_of_identity_variable(es):
    """Sum of the log value aggregated onto sessions is 50 for session 0."""
    value_sum = ft.Feature(es['log']['value'],
                           parent_entity=es['sessions'],
                           primitive=Sum)

    calculator = FeatureSetCalculator(es,
                                      feature_set=FeatureSet([value_sum]),
                                      time_last=None)
    result = calculator.run(np.array([0]))
    # A Dask result is computed before it is indexed.
    if isinstance(result, dd.DataFrame):
        result = result.compute()

    total = result[value_sum.get_name()][0]
    assert total == 50
def test_full_entity_trans_of_agg(es):
    """CumSum of the per-customer Sum of log values is 82 for customer 1."""
    value_sum = ft.Feature(es['log']['value'],
                           parent_entity=es['customers'],
                           primitive=Sum)
    cumulative = ft.Feature(value_sum, primitive=CumSum)

    calculator = FeatureSetCalculator(es,
                                      feature_set=FeatureSet([cumulative]),
                                      time_last=None)
    result = calculator.run(np.array([1]))

    running_total = result[cumulative.get_name()][1]
    assert running_total == 82
Ejemplo n.º 30
0
def test_two_kinds_of_dependents(es):
    """One filtered aggregation feeding both a Percentile and an Absolute.

    A Sum of log values where the product is 'coke zero' is aggregated onto
    customers and used by two transforms; a second filtered Sum on sessions,
    summed again onto customers, is calculated alongside them. Only the two
    transform columns are asserted, for customers 0 and 1.
    """
    value = ft.Feature(es['log']['value'])
    product = ft.Feature(es['log']['product_id'])

    customer_sum = ft.Feature(value,
                              parent_entity=es['customers'],
                              where=product == 'coke zero',
                              primitive=Sum)
    percentile = ft.Feature(customer_sum, primitive=Percentile)
    absolute = ft.Feature(customer_sum, primitive=Absolute)

    session_sum = ft.Feature(value,
                             parent_entity=es['sessions'],
                             where=product == 'coke zero',
                             primitive=Sum)
    stacked_sum = ft.Feature(session_sum,
                             parent_entity=es['customers'],
                             primitive=Sum)

    calculator = FeatureSetCalculator(
        es, FeatureSet([percentile, absolute, stacked_sum]))
    result = calculator.run(np.array([0, 1]))

    percentile_values = result[percentile.get_name()].tolist()
    assert percentile_values == [2. / 3, 1.0]
    absolute_values = result[absolute.get_name()].tolist()
    assert absolute_values == [15, 26]