def test_approximate_dfeat_of_need_all_values(entityset):
    es = entityset
    p = Percentile(es['log']['value'])
    agg_feat = Sum(p, es['sessions'])
    agg_feat2 = Sum(agg_feat, es['customers'])
    dfeat = DirectFeature(agg_feat2, es['sessions'])
    times = [datetime(2011, 4, 9, 10, 31, 19), datetime(2011, 4, 9, 11, 0, 0)]
    cutoff_time = pd.DataFrame({'time': times, 'instance_id': [0, 2]})
    feature_matrix = calculate_feature_matrix([dfeat, agg_feat],
                                              entityset,
                                              approximate=Timedelta(10, 's'),
                                              cutoff_time_in_index=True,
                                              cutoff_time=cutoff_time)
    log_df = es['log'].df
    instances = [0, 2]
    cutoffs = [pd.Timestamp('2011-04-09 10:31:19'), pd.Timestamp('2011-04-09 11:00:00')]
    approxes = [pd.Timestamp('2011-04-09 10:31:10'), pd.Timestamp('2011-04-09 11:00:00')]
    true_vals = []
    true_vals_approx = []
    for instance, cutoff, approx in zip(instances, cutoffs, approxes):
        log_data_cutoff = log_df[log_df['datetime'] < cutoff]
        log_data_cutoff['percentile'] = log_data_cutoff['value'].rank(pct=True)
        true_agg = log_data_cutoff.loc[log_data_cutoff['session_id'] == instance, 'percentile'].fillna(0).sum()
        true_vals.append(round(true_agg, 3))

        log_data_approx = log_df[log_df['datetime'] < approx]
        log_data_approx['percentile'] = log_data_approx['value'].rank(pct=True)
        true_agg_approx = log_data_approx.loc[log_data_approx['session_id'].isin([0, 1, 2]), 'percentile'].fillna(0).sum()
        true_vals_approx.append(round(true_agg_approx, 3))
    lapprox = [round(x, 3) for x in feature_matrix[dfeat.get_name()].tolist()]
    test_list = [round(x, 3) for x in feature_matrix[agg_feat.get_name()].tolist()]
    assert lapprox == true_vals_approx
    assert test_list == true_vals
def test_approximate_dfeat_of_need_all_values(entityset):
    es = entityset
    p = Percentile(es['log']['value'])
    agg_feat = Sum(p, es['sessions'])
    agg_feat2 = Sum(agg_feat, es['customers'])
    dfeat = DirectFeature(agg_feat2, es['sessions'])

    feature_matrix = calculate_feature_matrix([dfeat, agg_feat],
                                              entityset,
                                              instance_ids=[0, 2],
                                              approximate=Timedelta(10, 's'),
                                              cutoff_time_in_index=True,
                                              cutoff_time=[datetime(2011, 4, 9, 10, 31, 19),
                                                           datetime(2011, 4, 9, 11, 0, 0)])
    log_df = es['log'].df
    instances = [0, 2]
    cutoffs = [pd.Timestamp('2011-04-09 10:31:19'), pd.Timestamp('2011-04-09 11:00:00')]
    approxes = [pd.Timestamp('2011-04-09 10:31:10'), pd.Timestamp('2011-04-09 11:00:00')]
    true_vals = []
    true_vals_approx = []
    for instance, cutoff, approx in zip(instances, cutoffs, approxes):
        log_data_cutoff = log_df[log_df['datetime'] < cutoff]
        log_data_cutoff['percentile'] = log_data_cutoff['value'].rank(pct=True)
        true_agg = log_data_cutoff.loc[log_data_cutoff['session_id'] == instance, 'percentile'].fillna(0).sum()
        true_vals.append(round(true_agg, 3))

        log_data_approx = log_df[log_df['datetime'] < approx]
        log_data_approx['percentile'] = log_data_approx['value'].rank(pct=True)
        true_agg_approx = log_data_approx.loc[log_data_approx['session_id'].isin([0, 1, 2]), 'percentile'].fillna(0).sum()
        true_vals_approx.append(round(true_agg_approx, 3))
    lapprox = [round(x, 3) for x in feature_matrix[dfeat.get_name()].tolist()]
    test_list = [round(x, 3) for x in feature_matrix[agg_feat.get_name()].tolist()]
    assert lapprox == true_vals_approx
    assert test_list == true_vals
def test_copy_features_does_not_copy_entityset(es):
    agg = Sum(es['log']['value'], es['sessions'])
    agg_where = Sum(es['log']['value'], es['sessions'],
                    where=IdentityFeature(es['log']['value']) == 2)
    agg_use_previous = Sum(es['log']['value'], es['sessions'],
                           use_previous='4 days')
    agg_use_previous_where = Sum(es['log']['value'], es['sessions'],
                                 where=IdentityFeature(es['log']['value']) == 2,
                                 use_previous='4 days')
    features = [agg, agg_where, agg_use_previous, agg_use_previous_where]
    in_memory_size = asizeof(locals())
    copied = [f.copy() for f in features]
    new_in_memory_size = asizeof(locals())
    assert new_in_memory_size < 2 * in_memory_size

    for f, c in zip(features, copied):
        assert f.entityset
        assert c.entityset
        assert id(f.entityset) == id(c.entityset)
        if f.where:
            assert c.where
            assert id(f.where.entityset) == id(c.where.entityset)
        for bf, bf_c in zip(f.base_features, c.base_features):
            assert id(bf.entityset) == id(bf_c.entityset)
            if bf.where:
                assert bf_c.where
                assert id(bf.where.entityset) == id(bf_c.where.entityset)
def test_uses_full_entity_feat_of_approximate(entityset):
    es = entityset
    agg_feat = Sum(es['log']['value'], es['sessions'])
    agg_feat2 = Sum(agg_feat, es['customers'])
    agg_feat3 = Min(agg_feat, es['customers'])
    dfeat = DirectFeature(agg_feat2, es['sessions'])
    dfeat2 = DirectFeature(agg_feat3, es['sessions'])
    p = Percentile(dfeat)

    # only dfeat2 should be approximated
    # because Percentile needs all values
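    # (Percentile is computed over all instances at the exact cutoff time, so
    # dfeat, which feeds it, must also be calculated exactly; dfeat2 has no
    # such dependent and can use the binned, approximate cutoff times.)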

    feature_matrix_only_dfeat2 = calculate_feature_matrix(
        [dfeat2],
        instance_ids=[0, 2],
        approximate=Timedelta(10, 's'),
        cutoff_time_in_index=True,
        cutoff_time=[
            datetime(2011, 4, 9, 10, 31, 19),
            datetime(2011, 4, 9, 11, 0, 0)
        ])
    assert feature_matrix_only_dfeat2[dfeat2.get_name()].tolist() == [1, 0]

    feature_matrix_approx = calculate_feature_matrix(
        [p, dfeat, dfeat2, agg_feat],
        instance_ids=[0, 2],
        approximate=Timedelta(10, 's'),
        cutoff_time_in_index=True,
        cutoff_time=[
            datetime(2011, 4, 9, 10, 31, 19),
            datetime(2011, 4, 9, 11, 0, 0)
        ])
    assert (feature_matrix_only_dfeat2[dfeat2.get_name()].tolist() ==
            feature_matrix_approx[dfeat2.get_name()].tolist())

    feature_matrix_small_approx = calculate_feature_matrix(
        [p, dfeat, dfeat2, agg_feat],
        instance_ids=[0, 2],
        approximate=Timedelta(10, 'ms'),
        cutoff_time_in_index=True,
        cutoff_time=[
            datetime(2011, 4, 9, 10, 31, 19),
            datetime(2011, 4, 9, 11, 0, 0)
        ])

    feature_matrix_no_approx = calculate_feature_matrix(
        [p, dfeat, dfeat2, agg_feat],
        instance_ids=[0, 2],
        cutoff_time_in_index=True,
        cutoff_time=[
            datetime(2011, 4, 9, 10, 31, 19),
            datetime(2011, 4, 9, 11, 0, 0)
        ])
    for f in [p, dfeat, agg_feat]:
        for fm1, fm2 in combinations([
                feature_matrix_approx, feature_matrix_small_approx,
                feature_matrix_no_approx
        ], 2):
            assert fm1[f.get_name()].tolist() == fm2[f.get_name()].tolist()
def test_make_agg_feat_of_identity_variable(entityset, backend):
    agg_feat = Sum(entityset['log']['value'],
                   parent_entity=entityset['sessions'])

    pandas_backend = backend([agg_feat])
    df = pandas_backend.calculate_all_features(instance_ids=[0],
                                               time_last=None)
    v = df[agg_feat.get_name()][0]
    assert (v == 50)
def test_make_agg_feat_of_agg_feat(entityset, backend):
    log_count_feat = Count(entityset['log']['id'],
                           parent_entity=entityset['sessions'])

    customer_sum_feat = Sum(log_count_feat,
                            parent_entity=entityset['customers'])

    pandas_backend = backend([customer_sum_feat])
    df = pandas_backend.calculate_all_features(instance_ids=[0],
                                               time_last=None)
    v = df[customer_sum_feat.get_name()][0]
    assert (v == 10)
def test_agg_percentile(es):
    v = Feature(es['log']['value'])
    p = Percentile(v)
    agg = Sum(p, es['sessions'])
    pandas_backend = PandasBackend(es, [agg])
    df = pandas_backend.calculate_all_features([0, 1], None)

    log_vals = es['log'].df[[v.get_name(), 'session_id']]
    log_vals['percentile'] = log_vals[v.get_name()].rank(pct=True)
    true_p = log_vals.groupby('session_id')['percentile'].sum()[[0, 1]]
    for t, a in zip(true_p.values, df[agg.get_name()].values):
        assert (pd.isnull(t) and pd.isnull(a)) or t == a
def test_set_data_path(es):
    key = "primitive_data_folder"

    # Don't change orig_path
    orig_path = config.get(key)
    new_path = "/example/new/directory"
    filename = "test.csv"

    # Test that default path works
    sum_prim = Sum()
    assert sum_prim.get_filepath(filename) == os.path.join(orig_path, filename)

    # Test that new path works
    config.set({key: new_path})
    assert sum_prim.get_filepath(filename) == os.path.join(new_path, filename)

    # Test that new path with trailing / works
    new_path += "/"
    config.set({key: new_path})
    assert sum_prim.get_filepath(filename) == os.path.join(new_path, filename)

    # Test that the path is correct on newly defined feature
    sum_prim2 = Sum()
    assert sum_prim2.get_filepath(filename) == os.path.join(new_path, filename)

    # Ensure path was reset
    config.set({key: orig_path})
    assert config.get(key) == orig_path
def test_get_dependencies(es):
    f = Feature(es['log']['value'])
    agg1 = Sum(f, es['sessions'])
    agg2 = Sum(agg1, es['customers'])
    d1 = Feature(agg2, es['sessions'])
    shallow = d1.get_dependencies(deep=False, ignored=None)
    deep = d1.get_dependencies(deep=True, ignored=None)
    ignored = set([agg1.hash()])
    deep_ignored = d1.get_dependencies(deep=True, ignored=ignored)
    assert [s.hash() for s in shallow] == [agg2.hash()]
    assert [d.hash() for d in deep] == [agg2.hash(), agg1.hash(), f.hash()]
    assert [d.hash() for d in deep_ignored] == [agg2.hash()]
def test_string_time_values_in_cutoff_time(entityset):
    times = ['2011-04-09 10:31:27', '2011-04-09 10:30:18']
    cutoff_time = pd.DataFrame({'time': times, 'instance_id': [0, 0]})
    agg_feature = Sum(entityset['log']['value'], entityset['customers'])

    with pytest.raises(TypeError):
        calculate_feature_matrix([agg_feature], entityset, cutoff_time=cutoff_time)
def test_to_dictionary_agg(es):
    primitive = Sum()
    actual = ft.Feature(
        es["customers"].ww["age"], primitive=primitive, parent_dataframe_name="cohorts"
    ).to_dictionary()

    expected = {
        "type": "AggregationFeature",
        "dependencies": ["customers: age"],
        "arguments": {
            "name": "SUM(customers.age)",
            "base_features": ["customers: age"],
            "relationship_path": [
                {
                    "parent_dataframe_name": "cohorts",
                    "child_dataframe_name": "customers",
                    "parent_column_name": "cohort",
                    "child_column_name": "cohort",
                }
            ],
            "primitive": primitive,
            "where": None,
            "use_previous": None,
        },
    }

    assert expected == actual
def test_to_dictionary_where(es):
    primitive = Sum()
    actual = ft.Feature(
        es["log"].ww["value"],
        parent_dataframe_name="sessions",
        where=ft.IdentityFeature(es["log"].ww["value"]) == 2,
        primitive=primitive,
    ).to_dictionary()

    expected = {
        "type": "AggregationFeature",
        "dependencies": ["log: value", "log: value = 2"],
        "arguments": {
            "name": "SUM(log.value WHERE value = 2)",
            "base_features": ["log: value"],
            "relationship_path": [
                {
                    "parent_dataframe_name": "sessions",
                    "child_dataframe_name": "log",
                    "parent_column_name": "id",
                    "child_column_name": "session_id",
                }
            ],
            "primitive": primitive,
            "where": "log: value = 2",
            "use_previous": None,
        },
    }

    assert expected == actual
def test_approximate_time_split_returns_the_same_result(entityset):
    es = entityset
    agg_feat = Count(es['log']['id'], es['sessions'])
    agg_feat2 = Sum(agg_feat, es['customers'])
    dfeat = DirectFeature(agg_feat2, es['sessions'])

    cutoff_df = pd.DataFrame({'time': [pd.Timestamp('2011-04-09 10:07:30'),
                                       pd.Timestamp('2011-04-09 10:07:40')],
                              'instance_id': [0, 0]})

    feature_matrix_at_once = calculate_feature_matrix([dfeat, agg_feat],
                                                      entityset,
                                                      approximate=Timedelta(10, 's'),
                                                      cutoff_time=cutoff_df)
    divided_matrices = []
    separate_cutoff = [cutoff_df.iloc[0:1], cutoff_df.iloc[1:]]
    # Make sure indexes are different
    # Note that this step is unnecessary and is done only to showcase the issue here
    separate_cutoff[0].index = [0]
    separate_cutoff[1].index = [1]
    for ct in separate_cutoff:
        fm = calculate_feature_matrix([dfeat, agg_feat],
                                      entityset,
                                      approximate=Timedelta(10, 's'),
                                      cutoff_time=ct)
        divided_matrices.append(fm)
    feature_matrix_from_split = pd.concat(divided_matrices)
    assert feature_matrix_from_split.shape == feature_matrix_at_once.shape
    for i1, i2 in zip(feature_matrix_at_once.index, feature_matrix_from_split.index):
        assert (pd.isnull(i1) and pd.isnull(i2)) or (i1 == i2)
    for c in feature_matrix_from_split:
        for i1, i2 in zip(feature_matrix_at_once[c], feature_matrix_from_split[c]):
            assert (pd.isnull(i1) and pd.isnull(i2)) or (i1 == i2)
def test_cfm_no_cutoff_time_index(entityset):
    es = entityset
    agg_feat = Count(es['log']['id'], es['sessions'])
    agg_feat4 = Sum(agg_feat, es['customers'])
    dfeat = DirectFeature(agg_feat4, es['sessions'])
    feature_matrix = calculate_feature_matrix([dfeat, agg_feat],
                                              entityset,
                                              instance_ids=[0, 2],
                                              cutoff_time_in_index=False,
                                              approximate=Timedelta(12, 's'),
                                              cutoff_time=[datetime(2013, 4, 9, 10, 31, 19),
                                                           datetime(2013, 4, 9, 11, 0, 0)])
    assert feature_matrix.index.name == 'id'
    assert feature_matrix.index.values.tolist() == [0, 2]
    assert feature_matrix[dfeat.get_name()].tolist() == [10, 10]
    assert feature_matrix[agg_feat.get_name()].tolist() == [5, 1]
    feature_matrix_2 = calculate_feature_matrix([dfeat, agg_feat],
                                                entityset,
                                                instance_ids=[0, 2],
                                                cutoff_time_in_index=False,
                                                approximate=Timedelta(10, 's'),
                                                cutoff_time=[datetime(2011, 4, 9, 10, 31, 19),
                                                             datetime(2011, 4, 9, 11, 0, 0)])
    assert feature_matrix_2.index.name == 'id'
    assert feature_matrix_2.index.tolist() == [0, 2]
    assert feature_matrix_2[dfeat.get_name()].tolist() == [7, 10]
    assert feature_matrix_2[agg_feat.get_name()].tolist() == [5, 1]
def test_override_cmp(es):
    # P TODO:
    return
    count = Count(es['log']['value'], es['sessions'])
    _sum = Sum(es['log']['value'], es['sessions'])
    gt_lo = count > 1
    gt_other = count > _sum
    ge_lo = count >= 1
    ge_other = count >= _sum
    lt_hi = count < 10
    lt_other = count < _sum
    le_hi = count <= 10
    le_other = count <= _sum
    ne_lo = count != 1
    ne_other = count != _sum

    to_test = [[True, True, False], [False, False, True], [True, True, True],
               [False, False, True], [True, True, True], [True, True, False],
               [True, True, True], [True, True, False]]
    features = [
        gt_lo, gt_other, ge_lo, ge_other, lt_hi, lt_other, le_hi, le_other,
        ne_lo, ne_other
    ]
    pandas_backend = PandasBackend(es, features)
    df = pandas_backend.calculate_all_features(instance_ids=[0, 1, 2],
                                               time_last=None)
    for i, test in enumerate(to_test):
        v = df[features[i].get_name()].values.tolist()
        assert v == test
def test_two_kinds_of_dependents(es):
    v = Feature(es['log']['value'])
    product = Feature(es['log']['product_id'])
    agg = Sum(v, es['customers'], where=product == 'coke zero')
    p = Percentile(agg)
    g = Absolute(agg)
    agg2 = Sum(v, es['sessions'], where=product == 'coke zero')
    # Adding this feature tests line 218 in pandas_backend
    # where we remove columns in result_frame that already exist
    # in the output entity_frames in preparation for pd.concat
    # In a prior version, this failed because we changed the result_frame
    # variable itself, rather than making a new variable _result_frame.
    # When len(output_frames) > 1, the second iteration won't have
    # all the necessary columns because they were removed in the first
    agg3 = Sum(agg2, es['customers'])
    pandas_backend = PandasBackend(es, [p, g, agg3])
    df = pandas_backend.calculate_all_features([0, 1], None)
    assert df[p.get_name()].tolist() == [0.5, 1.0]
    assert df[g.get_name()].tolist() == [15, 26]
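# A minimal, hypothetical sketch of the backend pattern the comment above
# refers to (illustrative only, not the actual pandas_backend code): columns
# already present in an output frame are dropped on a *new* variable, so
# later iterations still see the full result_frame.
import pandas as pd


def merge_result_frame(output_frames, result_frame):
    merged = []
    for frame in output_frames:
        overlap = [c for c in result_frame.columns if c in frame.columns]
        # drop() returns a new DataFrame; result_frame itself stays intact
        _result_frame = result_frame.drop(columns=overlap)
        merged.append(pd.concat([frame, _result_frame], axis=1))
    return merged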
def test_approximate_dfeat_of_dfeat_of_agg_on_target(entityset):
    es = entityset
    agg_feat = Count(es['log']['id'], es['sessions'])
    agg_feat2 = Sum(agg_feat, es['customers'])
    dfeat = DirectFeature(agg_feat2, es['log'])
    times = [datetime(2011, 4, 9, 10, 31, 19), datetime(2011, 4, 9, 11, 0, 0)]
    cutoff_time = pd.DataFrame({'time': times, 'instance_id': [0, 2]})
    feature_matrix = calculate_feature_matrix([dfeat],
                                              entityset,
                                              approximate=Timedelta(10, 's'),
                                              cutoff_time=cutoff_time)
    assert feature_matrix[dfeat.get_name()].tolist() == [7, 10]
def test_get_depth(es):
    log_id_feat = es['log']['id']
    customer_id_feat = es['customers']['id']
    count_logs = Count(log_id_feat, parent_entity=es['sessions'])
    sum_count_logs = Sum(count_logs, parent_entity=es['customers'])
    num_logs_greater_than_5 = sum_count_logs > 5
    count_customers = Count(customer_id_feat,
                            parent_entity=es[u'régions'],
                            where=num_logs_greater_than_5)
    num_customers_region = Feature(count_customers, es["customers"])

    depth = num_customers_region.get_depth()
    assert depth == 5
def test_agg_same_method_name(es):
    """
        Pandas relies on the function name when calculating aggregations. This means if a two
        primitives with the same function name are applied to the same column, pandas
        can't differentiate them. We have a work around to this based on the name property
        that we test here.
    """

    # test with normally defined functions
    def custom_primitive(x):
        return x.sum()

    Sum = make_agg_primitive(custom_primitive, input_types=[Numeric],
                             return_type=Numeric, name="sum")

    def custom_primitive(x):
        return x.max()

    Max = make_agg_primitive(custom_primitive, input_types=[Numeric],
                             return_type=Numeric, name="max")

    f_sum = Sum(es["log"]["value"], es["customers"])
    f_max = Max(es["log"]["value"], es["customers"])

    fm = ft.calculate_feature_matrix([f_sum, f_max], entityset=es)
    assert fm.columns.tolist() == [f_sum.get_name(), f_max.get_name()]

    # test with lambdas
    Sum = make_agg_primitive(lambda x: x.sum(), input_types=[Numeric],
                             return_type=Numeric, name="sum")
    Max = make_agg_primitive(lambda x: x.max(), input_types=[Numeric],
                             return_type=Numeric, name="max")

    f_sum = Sum(es["log"]["value"], es["customers"])
    f_max = Max(es["log"]["value"], es["customers"])
    fm = ft.calculate_feature_matrix([f_sum, f_max], entityset=es)
    assert fm.columns.tolist() == [f_sum.get_name(), f_max.get_name()]
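# For context on the docstring above: pandas keys aggregation output on the
# function's __name__, so two different functions that share a name collide.
# A small standalone sketch of that pandas behavior (plain pandas, no
# featuretools; the exact failure mode depends on the pandas version):
import pandas as pd


def custom_primitive(x):
    return x.sum()


first = custom_primitive


def custom_primitive(x):  # noqa: F811 -- deliberately redefined with the same name
    return x.max()


second = custom_primitive  # both callables report __name__ == 'custom_primitive'

df = pd.DataFrame({'g': [0, 0, 1], 'v': [1.0, 2.0, 3.0]})
try:
    df.groupby('g')['v'].agg([first, second])
except Exception as err:
    # Newer pandas raises here (duplicate function names); older versions may
    # instead return ambiguous duplicate columns. Either way the two results
    # cannot be told apart, hence the name-based workaround above.
    print(err)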
def test_approximate_multiple_instances_per_cutoff_time(entityset):
    es = entityset
    agg_feat = Count(es['log']['id'], es['sessions'])
    agg_feat2 = Sum(agg_feat, es['customers'])
    dfeat = DirectFeature(agg_feat2, es['sessions'])
    times = [datetime(2011, 4, 9, 10, 31, 19), datetime(2011, 4, 9, 11, 0, 0)]
    cutoff_time = pd.DataFrame({'time': times, 'instance_id': [0, 2]})
    feature_matrix = calculate_feature_matrix([dfeat, agg_feat],
                                              entityset,
                                              approximate=Timedelta(1, 'week'),
                                              cutoff_time=cutoff_time,
                                              chunk_size="cutoff time")
    assert feature_matrix.shape[0] == 2
    assert feature_matrix[dfeat.get_name()].dropna().shape[0] == 0
    assert feature_matrix[agg_feat.get_name()].tolist() == [5, 1]
def test_direct_agg_percentile(es):
    v = Feature(es['log']['value'])
    p = Percentile(v)
    agg = Sum(p, es['customers'])
    d = Feature(agg, es['sessions'])
    pandas_backend = PandasBackend(es, [d])
    df = pandas_backend.calculate_all_features([0, 1], None)

    log_vals = es['log'].df[[v.get_name(), 'session_id']]
    log_vals['percentile'] = log_vals[v.get_name()].rank(pct=True)
    log_vals['customer_id'] = [0] * 10 + [1] * 5 + [2] * 2
    true_p = log_vals.groupby('customer_id')['percentile'].sum().fillna(0)
    true_p = true_p[[0, 0]]
    for t, a in zip(true_p.values, df[d.get_name()].values):
        assert (pd.isnull(t) and pd.isnull(a)) or round(t, 3) == round(a, 3)
def test_empty_path_approximate_partial(entityset):
    es = copy.deepcopy(entityset)
    es['sessions'].df['customer_id'] = [0, 0, np.nan, 1, 1, 2]
    agg_feat = Count(es['log']['id'], es['sessions'])
    agg_feat2 = Sum(agg_feat, es['customers'])
    dfeat = DirectFeature(agg_feat2, es['sessions'])
    times = [datetime(2011, 4, 9, 10, 31, 19), datetime(2011, 4, 9, 11, 0, 0)]
    cutoff_time = pd.DataFrame({'time': times, 'instance_id': [0, 2]})
    feature_matrix = calculate_feature_matrix([dfeat, agg_feat],
                                              es,
                                              approximate=Timedelta(10, 's'),
                                              cutoff_time=cutoff_time)
    vals1 = feature_matrix[dfeat.get_name()].tolist()
    assert vals1[0] == 7
    assert np.isnan(vals1[1])
    assert feature_matrix[agg_feat.get_name()].tolist() == [5, 1]
def test_approx_base_feature_is_also_first_class_feature(entityset):
    es = entityset
    log_to_products = DirectFeature(es['products']['rating'], es['log'])
    # This should still be computed properly
    agg_feat = Min(log_to_products, es['sessions'])
    customer_agg_feat = Sum(agg_feat, es['customers'])
    # This is to be approximated
    sess_to_cust = DirectFeature(customer_agg_feat, es['sessions'])
    times = [datetime(2011, 4, 9, 10, 31, 19), datetime(2011, 4, 9, 11, 0, 0)]
    cutoff_time = pd.DataFrame({'time': times, 'instance_id': [0, 2]})
    feature_matrix = calculate_feature_matrix([sess_to_cust, agg_feat],
                                              entityset,
                                              approximate=Timedelta(10, 's'),
                                              cutoff_time=cutoff_time)
    vals1 = feature_matrix[sess_to_cust.get_name()].tolist()
    assert vals1 == [8.5, 7]
    vals2 = feature_matrix[agg_feat.get_name()].tolist()
    assert vals2 == [4, 1.5]