def test_empty_child_dataframe():
    """Aggregations with no contributing child rows yield their empty values.

    Builds a one-row parent entity and a three-row child entity, then checks
    that Count is 0 and Trend is NaN both when the cutoff time precedes every
    child row and when the ``where`` clause matches no child row.
    """
    parents = pd.DataFrame({"id": [1]})
    children = pd.DataFrame({
        "id": [1, 2, 3],
        "parent_id": [1, 1, 1],
        "time_index": pd.date_range(start='1/1/2018', periods=3),
        "value": [10, 5, 2],
    })

    es = ft.EntitySet(id="blah")
    es.entity_from_dataframe(entity_id="parent",
                             dataframe=parents,
                             index="id")
    es.entity_from_dataframe(entity_id="child",
                             dataframe=children,
                             index="id",
                             time_index="time_index")
    es.add_relationship(
        ft.Relationship(es["parent"]["id"], es["child"]["parent_id"]))

    # plain aggregation
    count = Count(es["child"]['id'], es["parent"])

    # aggregation that takes more than one input variable
    trend = Trend([es["child"]['value'], es["child"]['time_index']],
                  es["parent"])

    # the same aggregations restricted by a where clause; no child row has
    # value == 1, so the clause filters everything out
    where = ft.Feature(es["child"]["value"]) == 1
    count_where = Count(es["child"]['id'], es["parent"], where=where)
    trend_where = Trend([es["child"]['value'], es["child"]['time_index']],
                        es["parent"],
                        where=where)

    # cutoff time before all rows
    all_features = [count, count_where, trend, trend_where]
    fm = ft.calculate_feature_matrix(entityset=es,
                                     features=all_features,
                                     cutoff_time=pd.Timestamp("12/31/2017"))
    columns = [feature.get_name() for feature in all_features]
    assert_array_equal(fm[columns], [[0, 0, np.nan, np.nan]])

    # cutoff time after all rows, but where clause filters all rows
    where_features = [count_where, trend_where]
    fm2 = ft.calculate_feature_matrix(entityset=es,
                                      features=where_features,
                                      cutoff_time=pd.Timestamp("1/4/2018"))
    columns = [feature.get_name() for feature in where_features]
    assert_array_equal(fm2[columns], [[0, np.nan]])
def test_training_window(entityset):
    """Check feature values computed with a '2 hours' training window.

    Also checks that an observation-based ``Timedelta`` training window
    raises ``AssertionError``.
    """
    property_feature = Count(entityset['log']['id'], entityset['customers'])
    top_level_agg = Count(entityset['customers']['id'], entityset[u'régions'])

    # Include a direct feature of a higher-level aggregation so there are
    # multiple "filter eids" in get_pandas_data_slice and the loop that
    # pulls data with a training_window param runs more than once.
    dagg = DirectFeature(top_level_agg, entityset['customers'])

    cutoff_time = pd.DataFrame({
        'time': [datetime(2011, 4, 9, 12, 31),
                 datetime(2011, 4, 10, 11),
                 datetime(2011, 4, 10, 13, 10, 1)],
        'instance_id': [0, 1, 2],
    })

    def two_hour_window_matrix():
        return calculate_feature_matrix([property_feature, dagg],
                                        entityset,
                                        cutoff_time=cutoff_time,
                                        training_window='2 hours')

    # for now, this only warns if last_time_index is not present
    two_hour_window_matrix()

    entityset.add_last_time_indexes()

    observation_window = Timedelta(2, 'observations', entity='log')
    with pytest.raises(AssertionError):
        calculate_feature_matrix([property_feature],
                                 entityset,
                                 cutoff_time=cutoff_time,
                                 training_window=observation_window)

    feature_matrix = two_hour_window_matrix()
    property_column = feature_matrix[property_feature.get_name()]
    dagg_column = feature_matrix[dagg.get_name()]
    assert (property_column == [5, 5, 1]).values.all()
    assert (dagg_column == [3, 2, 1]).values.all()
def test_cfm_no_cutoff_time_index(entityset):
    """With ``cutoff_time_in_index=False`` the matrix is indexed by 'id' only.

    Runs the same two features against two sets of cutoff times (2013 and
    2011) and checks the index name, index values and feature values.
    """
    es = entityset
    agg_feat = Count(es['log']['id'], es['sessions'])
    agg_feat4 = Sum(agg_feat, es['customers'])
    dfeat = DirectFeature(agg_feat4, es['sessions'])

    def matrix_for(times, approximate):
        cutoff_time = pd.DataFrame({'time': times, 'instance_id': [0, 2]})
        return calculate_feature_matrix([dfeat, agg_feat],
                                        entityset,
                                        cutoff_time_in_index=False,
                                        approximate=approximate,
                                        cutoff_time=cutoff_time)

    feature_matrix = matrix_for(
        [datetime(2013, 4, 9, 10, 31, 19), datetime(2013, 4, 9, 11, 0, 0)],
        Timedelta(12, 's'))
    assert feature_matrix.index.name == 'id'
    assert feature_matrix.index.values.tolist() == [0, 2]
    assert feature_matrix[dfeat.get_name()].tolist() == [10, 10]
    assert feature_matrix[agg_feat.get_name()].tolist() == [5, 1]

    feature_matrix_2 = matrix_for(
        [datetime(2011, 4, 9, 10, 31, 19), datetime(2011, 4, 9, 11, 0, 0)],
        Timedelta(10, 's'))
    assert feature_matrix_2.index.name == 'id'
    assert feature_matrix_2.index.tolist() == [0, 2]
    assert feature_matrix_2[dfeat.get_name()].tolist() == [7, 10]
    assert feature_matrix_2[agg_feat.get_name()].tolist() == [5, 1]
def test_make_agg_feat_of_grandchild_entity(entityset, backend):
    """Count of log ids aggregated to customers is 10 for customer 0."""
    log_count = Count(entityset['log']['id'],
                      parent_entity=entityset['customers'])

    calculator = backend([log_count])
    result = calculator.calculate_all_features(instance_ids=[0],
                                               time_last=None)
    assert result[log_count.get_name()][0] == 10
def test_make_agg_feat_of_identity_index_variable(entityset, backend):
    """Count of log ids aggregated to sessions is 5 for session 0."""
    log_count = Count(entityset['log']['id'],
                      parent_entity=entityset['sessions'])

    calculator = backend([log_count])
    result = calculator.calculate_all_features(instance_ids=[0],
                                               time_last=None)
    assert result[log_count.get_name()][0] == 5
def test_make_agg_feat_where_count(entityset, backend):
    """Count of session 0's logs whose product_id is 'coke zero' is 3."""
    is_coke_zero = IdentityFeature(entityset['log']['product_id']) == 'coke zero'
    filtered_count = Count(entityset['log']['id'],
                           parent_entity=entityset['sessions'],
                           where=is_coke_zero)

    calculator = backend([filtered_count])
    result = calculator.calculate_all_features(instance_ids=[0],
                                               time_last=None)
    assert result[filtered_count.get_name()][0] == 3
def test_approximate_dfeat_of_agg_on_target(entityset):
    """Check approximated direct-feature values alongside a session Count.

    The direct feature pulls a customer-level Sum of per-session log counts
    back down to sessions; it is computed with a 10 second approximate
    window for instances 0 and 2.
    """
    es = entityset
    agg_feat = Count(es['log']['id'], es['sessions'])
    agg_feat2 = Sum(agg_feat, es['customers'])
    dfeat = DirectFeature(agg_feat2, es['sessions'])

    cutoff_times = [datetime(2011, 4, 9, 10, 31, 19),
                    datetime(2011, 4, 9, 11, 0, 0)]
    feature_matrix = calculate_feature_matrix([dfeat, agg_feat],
                                              instance_ids=[0, 2],
                                              approximate=Timedelta(10, 's'),
                                              cutoff_time=cutoff_times)

    expected_by_feature = [(dfeat, [7, 10]), (agg_feat, [5, 1])]
    for feature, expected in expected_by_feature:
        assert feature_matrix[feature.get_name()].tolist() == expected
def test_approximate_multiple_instances_per_cutoff_time(entityset):
    """With a one-week approximate window the direct feature is all null.

    The unapproximated session-level Count still returns its exact values,
    and the matrix keeps one row per cutoff entry.
    """
    es = entityset
    agg_feat = Count(es['log']['id'], es['sessions'])
    agg_feat2 = Sum(agg_feat, es['customers'])
    dfeat = DirectFeature(agg_feat2, es['sessions'])

    cutoff_time = pd.DataFrame({
        'time': [datetime(2011, 4, 9, 10, 31, 19),
                 datetime(2011, 4, 9, 11, 0, 0)],
        'instance_id': [0, 2],
    })
    feature_matrix = calculate_feature_matrix([dfeat, agg_feat],
                                              entityset,
                                              approximate=Timedelta(1, 'week'),
                                              cutoff_time=cutoff_time,
                                              chunk_size="cutoff time")

    row_count = feature_matrix.shape[0]
    non_null_dfeat = feature_matrix[dfeat.get_name()].dropna()
    assert row_count == 2
    assert non_null_dfeat.shape[0] == 0
    assert feature_matrix[agg_feat.get_name()].tolist() == [5, 1]
def test_seed_features(es):
    """DFS output contains a seed feature and an aggregation built on one.

    The session-level seed feature itself and a ``Last`` aggregation of the
    log-level seed feature must both appear in the built features.
    """
    seed_feature_sessions = Count(es['log']["id"], es['sessions']) > 2
    seed_feature_log = Hour(es['log']['datetime'])
    session_agg = Last(seed_feature_log, es['sessions'])

    synthesis = DeepFeatureSynthesis(
        target_entity_id='sessions',
        entityset=es,
        agg_primitives=[Last],
        trans_primitives=[],
        max_depth=2,
        seed_features=[seed_feature_sessions, seed_feature_log],
    )
    built_names = {feature.get_name()
                   for feature in synthesis.build_features()}

    assert seed_feature_sessions.get_name() in built_names
    assert session_agg.get_name() in built_names
def test_empty_path_approximate_partial(entityset):
    """A session whose customer_id is NaN gets NaN for the approximated feature.

    Works on a deep copy of the entityset so the overwritten ``customer_id``
    column does not leak into the shared fixture.
    """
    es = copy.deepcopy(entityset)
    es['sessions'].df['customer_id'] = [0, 0, np.nan, 1, 1, 2]

    agg_feat = Count(es['log']['id'], es['sessions'])
    agg_feat2 = Sum(agg_feat, es['customers'])
    dfeat = DirectFeature(agg_feat2, es['sessions'])

    cutoff_time = pd.DataFrame({
        'time': [datetime(2011, 4, 9, 10, 31, 19),
                 datetime(2011, 4, 9, 11, 0, 0)],
        'instance_id': [0, 2],
    })
    feature_matrix = calculate_feature_matrix([dfeat, agg_feat],
                                              es,
                                              approximate=Timedelta(10, 's'),
                                              cutoff_time=cutoff_time)

    first_value, second_value = feature_matrix[dfeat.get_name()].tolist()[:2]
    assert first_value == 7
    assert np.isnan(second_value)
    assert feature_matrix[agg_feat.get_name()].tolist() == [5, 1]
def test_approximate_child_aggs_handled_correctly(entityset):
    """Approximation works for one instance at two different cutoff times.

    Computes the matrix once with only the approximated direct feature and
    once with an additional customer-level aggregation, then checks one
    column from each result.
    """
    es = entityset
    agg_feat = Count(es['customers']['id'], es['regions'])
    dfeat = DirectFeature(agg_feat, es['customers'])
    agg_feat_2 = Count(es['log']['value'], es['customers'])

    cutoff_df = pd.DataFrame({
        'time': [pd.Timestamp('2011-04-08 10:30:00'),
                 pd.Timestamp('2011-04-09 10:30:06')],
        'instance_id': [0, 0],
    })

    def approximated(features):
        return calculate_feature_matrix(features,
                                        approximate=Timedelta(10, 's'),
                                        cutoff_time=cutoff_df)

    fm = approximated([dfeat])
    fm_2 = approximated([dfeat, agg_feat_2])

    assert fm[dfeat.get_name()].tolist() == [2, 3]
    assert fm_2[agg_feat_2.get_name()].tolist() == [0, 2]
def test_make_agg_feat_using_prev_time(entityset, backend):
    """Count with ``use_previous`` of 10 seconds at two ``time_last`` values."""
    windowed_count = Count(entityset['log']['id'],
                           parent_entity=entityset['sessions'],
                           use_previous=Timedelta(10, 's'))
    calculator = backend([windowed_count])

    # (time_last, expected count for session 0)
    cases = [
        (datetime(2011, 4, 9, 10, 30, 10), 2),
        (datetime(2011, 4, 9, 10, 30, 30), 1),
    ]
    for time_last, expected in cases:
        result = calculator.calculate_all_features(instance_ids=[0],
                                                   time_last=time_last)
        assert result[windowed_count.get_name()][0] == expected
def test_make_agg_feat_multiple_dtypes(entityset, backend):
    """Numeric and string aggregations can be computed in one backend call.

    Both features share the same ``where`` clause (product_id equal to
    'coke zero'): a Count of log ids and a Mode of product ids.
    """
    is_coke_zero = IdentityFeature(entityset['log']['product_id']) == 'coke zero'
    sessions = entityset['sessions']

    count_feat = Count(entityset['log']['id'],
                       parent_entity=sessions,
                       where=is_coke_zero)
    mode_feat = Mode(entityset['log']['product_id'],
                     parent_entity=sessions,
                     where=is_coke_zero)

    calculator = backend([count_feat, mode_feat])
    result = calculator.calculate_all_features(instance_ids=[0],
                                               time_last=None)

    assert result[count_feat.get_name()][0] == 3
    assert result[mode_feat.get_name()][0] == 'coke zero'
def test_training_window_recent_time_index(entityset):
    """Training-window results when a customer with no sessions is added.

    Appends a fourth customer (id 3) to the customers entity, recomputes the
    last time indexes, and checks the feature values produced with a
    '2 hours' training window for all four customers.
    """
    # customer with no sessions
    row = {
        'id': [3],
        'age': [73],
        u'région_id': ['United States'],
        'cohort': [1],
        'cancel_reason': ["I am finally awake!!"],
        'loves_ice_cream': [True],
        'favorite_quote': ["Who is John Galt?"],
        'signup_date': [datetime(2011, 4, 10)],
        'upgrade_date': [datetime(2011, 4, 12)],
        'cancel_date': [datetime(2011, 5, 13)],
        'date_of_birth': [datetime(1938, 2, 1)],
        'engagement_level': [2],
    }
    new_customer = pd.DataFrame(row)
    new_customer.index = range(3, 4)
    # DataFrame.append is deprecated and was removed in pandas 2.0;
    # pd.concat performs the same row-wise concatenation.
    df = pd.concat([entityset['customers'].df, new_customer], sort=False)
    entityset['customers'].update_data(df=df,
                                       recalculate_last_time_indexes=False)
    entityset.add_last_time_indexes()

    property_feature = Count(entityset['log']['id'], entityset['customers'])
    top_level_agg = Count(entityset['customers']['id'], entityset[u'régions'])
    dagg = DirectFeature(top_level_agg, entityset['customers'])
    instance_ids = [0, 1, 2, 3]
    times = [datetime(2011, 4, 9, 12, 31), datetime(2011, 4, 10, 11),
             datetime(2011, 4, 10, 13, 10, 1), datetime(2011, 4, 10, 1, 59, 59)]
    cutoff_time = pd.DataFrame({'time': times, 'instance_id': instance_ids})
    feature_matrix = calculate_feature_matrix(
        [property_feature, dagg],
        entityset,
        cutoff_time=cutoff_time,
        training_window='2 hours'
    )
    prop_values = [5, 5, 1, 0]
    dagg_values = [3, 2, 1, 3]
    # Sort by index so rows line up with the expected lists, which are
    # written in instance-id order.
    feature_matrix.sort_index(inplace=True)
    assert (feature_matrix[property_feature.get_name()] == prop_values).values.all()
    assert (feature_matrix[dagg.get_name()] == dagg_values).values.all()
def test_dfs_builds_on_seed_features_more_than_max_depth(es):
    """DFS stacks on seed features but stops at ``max_depth`` beyond them.

    With ``max_depth=1`` the seed feature and a one-level aggregation of a
    seed feature are built, while a feature two levels above that
    aggregation is not.
    """
    seed_feature_sessions = Count(es['log']["id"], es['sessions']) > 2
    seed_feature_log = Hour(es['log']['datetime'])
    session_agg = Last(seed_feature_log, es['sessions'])

    # Depth of this feat is 2 relative to session_agg, the seed feature,
    # which is greater than max_depth so it shouldn't be built
    customer_mode = Mode(session_agg, es['customers'])
    session_agg_trans = DirectFeature(customer_mode, es['sessions'])

    synthesis = DeepFeatureSynthesis(
        target_entity_id='sessions',
        entityset=es,
        agg_primitives=[Last, Count],
        trans_primitives=[],
        max_depth=1,
        seed_features=[seed_feature_sessions, seed_feature_log],
    )
    built_names = {feature.get_name()
                   for feature in synthesis.build_features()}

    assert seed_feature_sessions.get_name() in built_names
    assert session_agg.get_name() in built_names
    assert session_agg_trans.get_name() not in built_names
def test_make_agg_feat_where_count_feat(entityset, backend):
    """
    Feature we're creating is:
    Number of sessions for each customer where the
    number of logs in the session is greater than 1
    """
    # NOTE: this sets a class attribute on Count and does not restore it.
    Count.max_stack_depth = 2
    logs_per_session = Count(entityset['log']['id'],
                             parent_entity=entityset['sessions'])
    has_multiple_logs = logs_per_session > 1

    feat = Count(entityset['sessions']['id'],
                 parent_entity=entityset['customers'],
                 where=has_multiple_logs)

    calculator = backend([feat])
    result = calculator.calculate_all_features(instance_ids=[0, 1],
                                               time_last=None)
    first, second = result[feat.get_name()][0:2]
    assert first == 2
    assert second == 2
def test_make_agg_feat_where_count_or_device_type_feat(entityset, backend):
    """
    Feature we're creating is:
    Number of sessions for each customer where the
    number of logs in the session is greater than 1
    OR the session's device_type equals 1
    """
    # NOTE: this sets a class attribute on Count and does not restore it.
    Count.max_stack_depth = 2
    logs_per_session = Count(entityset['log']['id'],
                             parent_entity=entityset['sessions'])

    has_multiple_logs = logs_per_session > 1
    is_device_type_one = IdentityFeature(entityset['sessions']['device_type']) == 1
    either_condition = has_multiple_logs.OR(is_device_type_one)

    feat = Count(entityset['sessions']['id'],
                 parent_entity=entityset['customers'],
                 where=either_condition)

    calculator = backend([feat])
    result = calculator.calculate_all_features(instance_ids=[0],
                                               time_last=None)
    assert result[feat.get_name()][0] == 3
def test_base_of_and_stack_on_heuristic(es, test_primitive):
    """Exercise ``check_stacking`` over all stack_on / base_of combinations.

    Each attribute takes one of three settings: an empty list, ``None``, or
    a list naming the other feature. Stacking is expected to be rejected
    only when both ``stack_on`` and ``base_of`` are empty lists.
    """
    child = Count(es["sessions"]["id"], es["customers"])

    # Factories rather than values, so every case gets a fresh list object.
    stack_on_settings = [
        ("empty", lambda: []),
        ("none", lambda: None),
        ("child", lambda: [child]),
    ]
    base_of_settings = [
        ("empty", lambda: []),
        ("none", lambda: None),
        ("primitive", lambda: [test_primitive]),
    ]

    for stack_label, make_stack_on in stack_on_settings:
        for base_label, make_base_of in base_of_settings:
            test_primitive.stack_on = make_stack_on()
            child.base_of = make_base_of()
            outcome = check_stacking(test_primitive, [child])
            if stack_label == "empty" and base_label == "empty":
                assert not outcome
            else:
                assert outcome
def test_count_null_and_make_agg_primitive(es):
    """A primitive from ``make_agg_primitive`` accepts a ``count_null`` kwarg.

    The custom count fills nulls with 0 before counting when ``count_null``
    is true, and supplies its own ``generate_name`` through
    ``cls_attributes``.
    """
    def count_func(values, count_null=False):
        if len(values) == 0:
            return 0
        # Filling nulls first makes them contribute to the count.
        counted = values.fillna(0) if count_null else values
        return counted.count()

    def count_generate_name(self):
        name_parts = (self.child_entity.id,
                      self._where_str(),
                      self._use_prev_str())
        return u"COUNT(%s%s%s)" % name_parts

    CountWithNulls = make_agg_primitive(
        count_func,
        [[Index], [Variable]],
        Numeric,
        name="count",
        stack_on_self=False,
        cls_attributes={"generate_name": count_generate_name},
    )
    count_null = CountWithNulls(es['log']['value'],
                                es['sessions'],
                                count_null=True)
    feature_matrix = ft.calculate_feature_matrix([count_null], entityset=es)

    expected = [5, 4, 1, 2, 3, 2]
    assert (expected == feature_matrix[count_null.get_name()]).all()
# Example #20
0
def test_cfm_returns_original_time_indexes(entityset):
    """The (instance, time) index keeps the original cutoff times.

    For runs without approximation and with approximate windows of "1 m"
    and "2 d", both index levels must equal the cutoff dataframe sorted by
    time and then instance id.
    """
    es = entityset

    agg_feat = Count(es['customers']['id'], es['regions'])
    dfeat = DirectFeature(agg_feat, es['customers'])
    agg_feat_2 = Count(es['sessions']['id'], es['customers'])
    cutoff_df = pd.DataFrame({
        'time': [
            pd.Timestamp('2011-04-09 10:30:06'),
            pd.Timestamp('2011-04-09 10:30:03'),
            pd.Timestamp('2011-04-08 10:30:00')
        ],
        'instance_id': [0, 1, 0]
    })
    sorted_df = cutoff_df.sort_values(['time', 'instance_id'],
                                      kind='mergesort')

    def assert_index_matches_cutoffs(matrix):
        instance_level = matrix.index.get_level_values(0).values
        time_level = matrix.index.get_level_values(1).values
        assert (instance_level == sorted_df['instance_id'].values).all()
        assert (time_level == sorted_df['time'].values).all()

    # no approximate
    fm = calculate_feature_matrix([dfeat],
                                  entityset,
                                  cutoff_time=cutoff_df,
                                  cutoff_time_in_index=True)
    assert_index_matches_cutoffs(fm)

    # With "1 m" the cutoff times land in different approximate windows;
    # with "2 d" they share one. Each window size is tried without and
    # with an additional unapproximated aggregation.
    approximate_cases = [
        ([dfeat], "1 m"),
        ([dfeat, agg_feat_2], "1 m"),
        ([dfeat], "2 d"),
        ([dfeat, agg_feat_2], "2 d"),
    ]
    for features, window in approximate_cases:
        approximated = calculate_feature_matrix(features,
                                                entityset,
                                                cutoff_time=cutoff_df,
                                                cutoff_time_in_index=True,
                                                approximate=window)
        assert_index_matches_cutoffs(approximated)
# Example #21
0
def test_feature_takes_timedelta_string(es):
    """A ``use_previous`` string such as "1 day" equals ``Timedelta(1, 'd')``."""
    expected_window = Timedelta(1, 'd')
    feature = Count(es['log']['id'],
                    es['customers'],
                    use_previous="1 day")
    assert feature.use_previous == expected_window
# Example #22
0
def test_count_null_and_make_agg_primitive(es):
    """A primitive from ``make_agg_primitive`` accepts a ``count_null`` kwarg.

    The primitive is instantiated with ``count_null=True`` and attached to a
    feature through ``ft.Feature``; it fills nulls with 0 before counting
    and supplies its own ``generate_name`` through ``cls_attributes``.
    """
    def count_func(values, count_null=False):
        if len(values) == 0:
            return 0
        # Filling nulls first makes them contribute to the count.
        counted = values.fillna(0) if count_null else values
        return counted.count()

    def count_generate_name(self, base_feature_names, relationship_path_name,
                            parent_entity_id, where_str, use_prev_str):
        name_parts = (relationship_path_name, where_str, use_prev_str)
        return u"COUNT(%s%s%s)" % name_parts

    CountWithNulls = make_agg_primitive(
        count_func,
        [[Index], [Variable]],
        Numeric,
        name="count",
        stack_on_self=False,
        cls_attributes={"generate_name": count_generate_name},
    )
    count_null = ft.Feature(es['log']['value'],
                            parent_entity=es['sessions'],
                            primitive=CountWithNulls(count_null=True))
    feature_matrix = ft.calculate_feature_matrix([count_null], entityset=es)

    expected = [5, 4, 1, 2, 3, 2]
    assert (expected == feature_matrix[count_null.get_name()]).all()
# Example #23
0
def child(es, child_entity):
    """Return a Count of session ids with ``child_entity`` as the parent entity."""
    session_ids = es['sessions']['id']
    return Count(session_ids, parent_entity=child_entity)