Exemple #1
0
def test_isin_feat_other_syntax_int(es):
    isin = Feature(es['log']['value']).isin([5, 10])
    features = [isin]
    pandas_backend = PandasBackend(es, features)
    df = pandas_backend.calculate_all_features(range(8), None)
    true = [False, True, True, False, False, False, False, False]
    v = df[isin.get_name()].values.tolist()
    assert true == v
Exemple #2
0
def test_make_trans_feat(es):
    f = Hour(es['log']['datetime'])

    pandas_backend = PandasBackend(es, [f])
    df = pandas_backend.calculate_all_features(instance_ids=[0],
                                               time_last=None)
    v = df[f.get_name()][0]
    assert v == 10
Exemple #3
0
def test_isin_feat_other_syntax(es):
    isin = Feature(es['log']['product_id']).isin(["toothpaste", "coke zero"])
    features = [isin]
    pandas_backend = PandasBackend(es, features)
    df = pandas_backend.calculate_all_features(range(8), None)
    true = [True, True, True, False, False, True, True, True]
    v = df[isin.get_name()].values.tolist()
    assert true == v
def test_percentile(es):
    v = Feature(es['log']['value'])
    p = Percentile(v)
    pandas_backend = PandasBackend(es, [v, p])
    df = pandas_backend.calculate_all_features(range(17), None)
    true = df[v.get_name()].rank(pct=True)
    for t, a in zip(true.values, df[p.get_name()].values):
        assert (pd.isnull(t) and pd.isnull(a)) or t == a
def test_direct_from_variable(es):
    # should be same behavior as test_direct_from_identity
    d = DirectFeature(base_feature=es['sessions']['device_type'],
                      child_entity=es['log'])
    pandas_backend = PandasBackend(es, [d])
    df = pandas_backend.calculate_all_features(instance_ids=[0, 5],
                                               time_last=None)
    v = df[d.get_name()].tolist()
    assert v == [0, 1]
def test_diff_single_value(es):
    diff = ft.Feature(
        [es['stores']['num_square_feet'], es['stores'][u'région_id']],
        primitive=Diff)
    pandas_backend = PandasBackend(es, [diff])
    df = pandas_backend.calculate_all_features(instance_ids=[5],
                                               time_last=None)
    assert df.shape[0] == 1
    assert df[diff.get_name()].dropna().shape[0] == 0
def test_percentile(es):
    v = ft.Feature(es['log']['value'])
    p = ft.Feature(v, primitive=Percentile)
    pandas_backend = PandasBackend(es, [p])
    df = pandas_backend.calculate_all_features(range(10, 17), None)
    true = es['log'].df[v.get_name()].rank(pct=True)
    true = true.loc[range(10, 17)]
    for t, a in zip(true.values, df[p.get_name()].values):
        assert (pd.isnull(t) and pd.isnull(a)) or t == a
    def check(feature):
        pandas_backend = PandasBackend(es, [feature])
        df_1 = pandas_backend.calculate_all_features(instance_ids=[0, 1, 2],
                                                     time_last=None)
        df_2 = pandas_backend.calculate_all_features(instance_ids=[2, 4],
                                                     time_last=None)

        # check that the value for instance id 2 matches
        assert (df_2.loc[2] == df_1.loc[2]).all()
def test_compare_all_nans(es):
    nan_feat = Mode(es['log']['product_id'], es['sessions'])
    compare = nan_feat == 'brown bag'
    # before all data
    time_last = pd.Timestamp('1/1/1993')
    pandas_backend = PandasBackend(es, [nan_feat, compare])
    df = pandas_backend.calculate_all_features(instance_ids=[0, 1, 2],
                                               time_last=time_last)
    assert df[nan_feat.get_name()].dropna().shape[0] == 0
    assert not df[compare.get_name()].any()
def test_not_feature(es):
    likes_ice_cream = es['customers']['loves_ice_cream']
    not_feat = Not(likes_ice_cream)
    features = [not_feat]
    pandas_backend = PandasBackend(es, features)
    df = pandas_backend.calculate_all_features(instance_ids=[0, 1],
                                               time_last=None)
    v = df[not_feat.get_name()].values
    assert not v[0]
    assert v[1]
def test_isin_feat_custom(es):
    def pd_is_in(array, list_of_outputs=None):
        if list_of_outputs is None:
            list_of_outputs = []
        return pd.Series(array).isin(list_of_outputs)

    def isin_generate_name(self):
        return u"%s.isin(%s)" % (self.base_features[0].get_name(),
                                 str(self.kwargs['list_of_outputs']))

    IsIn = make_trans_primitive(
        pd_is_in,
        [Variable],
        Boolean,
        name="is_in",
        description="For each value of the base feature, checks whether it is "
        "in a list that is provided.",
        cls_attributes={"generate_name": isin_generate_name})

    isin = IsIn(es['log']['product_id'],
                list_of_outputs=["toothpaste", "coke zero"])
    features = [isin]
    pandas_backend = PandasBackend(es, features)
    df = pandas_backend.calculate_all_features(range(8), None)
    true = [True, True, True, False, False, True, True, True]
    v = df[isin.get_name()].values.tolist()
    assert true == v

    isin = Feature(es['log']['product_id']).isin(["toothpaste", "coke zero"])
    features = [isin]
    pandas_backend = PandasBackend(es, features)
    df = pandas_backend.calculate_all_features(range(8), None)
    true = [True, True, True, False, False, True, True, True]
    v = df[isin.get_name()].values.tolist()
    assert true == v

    isin = Feature(es['log']['value']).isin([5, 10])
    features = [isin]
    pandas_backend = PandasBackend(es, features)
    df = pandas_backend.calculate_all_features(range(8), None)
    true = [False, True, True, False, False, False, False, False]
    v = df[isin.get_name()].values.tolist()
    assert true == v
def test_cum_count(es):
    log_id_feat = es['log']['id']
    cum_count = CumCount(log_id_feat, es['log']['session_id'])
    features = [cum_count]
    pandas_backend = PandasBackend(es, features)
    df = pandas_backend.calculate_all_features(instance_ids=range(15),
                                               time_last=None)
    cvalues = df[cum_count.get_name()].values
    assert len(cvalues) == 15
    cum_count_values = [1, 2, 3, 4, 5, 1, 2, 3, 4, 1, 1, 2, 1, 2, 3]
    for i, v in enumerate(cum_count_values):
        assert v == cvalues[i]
def test_cum_max(es):
    log_value_feat = es['log']['value']
    cum_max = CumMax(log_value_feat, es['log']['session_id'])
    features = [cum_max]
    pandas_backend = PandasBackend(es, features)
    df = pandas_backend.calculate_all_features(instance_ids=range(15),
                                               time_last=None)
    cvalues = df[cum_max.get_name()].values
    assert len(cvalues) == 15
    cum_max_values = [0, 5, 10, 15, 20, 0, 1, 2, 3, 0, 0, 5, 0, 7, 14]
    for i, v in enumerate(cum_max_values):
        assert v == cvalues[i]
Exemple #14
0
def test_direct_percentile(es):
    v = Feature(es['customers']['age'])
    p = Percentile(v)
    d = Feature(p, es['sessions'])
    pandas_backend = PandasBackend(es, [d])
    df = pandas_backend.calculate_all_features([0, 1], None)

    cust_vals = es['customers'].df[[v.get_name()]]
    cust_vals['percentile'] = cust_vals[v.get_name()].rank(pct=True)
    true_p = cust_vals['percentile'].loc[[0, 0]]
    for t, a in zip(true_p.values, df[d.get_name()].values):
        assert (pd.isnull(t) and pd.isnull(a)) or t == a
Exemple #15
0
def test_agg_percentile(es):
    v = Feature(es['log']['value'])
    p = Percentile(v)
    agg = Sum(p, es['sessions'])
    pandas_backend = PandasBackend(es, [agg])
    df = pandas_backend.calculate_all_features([0, 1], None)

    log_vals = es['log'].df[[v.get_name(), 'session_id']]
    log_vals['percentile'] = log_vals[v.get_name()].rank(pct=True)
    true_p = log_vals.groupby('session_id')['percentile'].sum()[[0, 1]]
    for t, a in zip(true_p.values, df[agg.get_name()].values):
        assert (pd.isnull(t) and pd.isnull(a)) or t == a
def test_cum_mean(es):
    log_value_feat = es['log']['value']
    cum_mean = CumMean(log_value_feat, es['log']['session_id'])
    features = [cum_mean]
    pandas_backend = PandasBackend(es, features)
    df = pandas_backend.calculate_all_features(instance_ids=range(15),
                                               time_last=None)
    cvalues = df[cum_mean.get_name()].values
    assert len(cvalues) == 15
    cum_mean_values = [0, 2.5, 5, 7.5, 10, 0, .5, 1, 1.5, 0, 0, 2.5, 0, 3.5, 7]
    for i, v in enumerate(cum_mean_values):
        assert v == cvalues[i]
def test_percentile_agg(es):
    v = ft.Feature(es['log']['value'])
    agg = ft.Feature(v, parent_entity=es['sessions'], primitive=Sum)
    pagg = ft.Feature(agg, primitive=Percentile)
    pandas_backend = PandasBackend(es, [pagg])
    df = pandas_backend.calculate_all_features([0, 1], None)

    log_vals = es['log'].df[[v.get_name(), 'session_id']]
    true_p = log_vals.groupby('session_id')[v.get_name()].sum().fillna(0)
    true_p = true_p.rank(pct=True)[[0, 1]]

    for t, a in zip(true_p.values, df[pagg.get_name()].values):
        assert (pd.isnull(t) and pd.isnull(a)) or t == a
def test_isnull_feat(es):
    value = IdentityFeature(es['log']['value'])
    diff = Diff(value, es['log']['session_id'])
    isnull = IsNull(diff)
    features = [isnull]
    pandas_backend = PandasBackend(es, features)
    df = pandas_backend.calculate_all_features(range(15), None)
    # correct_vals_diff = [
    #     np.nan, 5, 5, 5, 5, np.nan, 1, 1, 1, np.nan, np.nan, 5, np.nan, 7, 7]
    correct_vals = [True, False, False, False, False, True, False, False,
                    False, True, True, False, True, False, False]
    values = df[isnull.get_name()].values.tolist()
    assert correct_vals == values
def test_override_cmp_from_variable(es):
    count_lo = IdentityFeature(es['log']['value']) > 1

    to_test = [False, True, True]

    features = [count_lo]

    pandas_backend = PandasBackend(es, features)
    df = pandas_backend.calculate_all_features(instance_ids=[0, 1, 2],
                                               time_last=None)
    v = df[count_lo.get_name()].values.tolist()
    for i, test in enumerate(to_test):
        assert v[i] == test
def test_cum_sum_use_previous(es):
    log_value_feat = es['log']['value']
    cum_sum = CumSum(log_value_feat, es['log']['session_id'],
                     use_previous=Timedelta(3, 'observations',
                                            entity=es['log']))
    features = [cum_sum]
    pandas_backend = PandasBackend(es, features)
    df = pandas_backend.calculate_all_features(instance_ids=range(15),
                                               time_last=None)
    cvalues = df[cum_sum.get_name()].values
    assert len(cvalues) == 15
    cum_sum_values = [0, 5, 15, 30, 45, 0, 1, 3, 6, 0, 0, 5, 0, 7, 21]
    for i, v in enumerate(cum_sum_values):
        assert v == cvalues[i]
Exemple #21
0
def test_direct_agg_percentile(es):
    v = Feature(es['log']['value'])
    p = Percentile(v)
    agg = Sum(p, es['customers'])
    d = Feature(agg, es['sessions'])
    pandas_backend = PandasBackend(es, [d])
    df = pandas_backend.calculate_all_features([0, 1], None)

    log_vals = es['log'].df[[v.get_name(), 'session_id']]
    log_vals['percentile'] = log_vals[v.get_name()].rank(pct=True)
    log_vals['customer_id'] = [0] * 10 + [1] * 5 + [2] * 2
    true_p = log_vals.groupby('customer_id')['percentile'].sum().fillna(0)
    true_p = true_p[[0, 0]]
    for t, a in zip(true_p.values, df[d.get_name()].values):
        assert (pd.isnull(t) and pd.isnull(a)) or round(t, 3) == round(a, 3)
def test_cum_sum_where(es):
    log_value_feat = es['log']['value']
    compare_feat = GreaterThan(log_value_feat, 3)
    dfeat = Feature(es['sessions']['customer_id'], es['log'])
    cum_sum = CumSum(log_value_feat, dfeat, where=compare_feat)
    features = [cum_sum]
    pandas_backend = PandasBackend(es, features)
    df = pandas_backend.calculate_all_features(instance_ids=range(15),
                                               time_last=None)
    cvalues = df[cum_sum.get_name()].values
    assert len(cvalues) == 15
    cum_sum_values = [0, 5, 15, 30, 50, 50, 50, 50, 50, 50, 0, 5, 5, 12, 26]
    for i, v in enumerate(cum_sum_values):
        if not np.isnan(v):
            assert v == cvalues[i]
        else:
            assert (np.isnan(cvalues[i]))
Exemple #23
0
def test_haversine(es):
    log_latlong_feat = es['log']['latlong']
    log_latlong_feat2 = es['log']['latlong2']
    haversine = Haversine(log_latlong_feat, log_latlong_feat2)
    features = [haversine]
    pandas_backend = PandasBackend(es, features)
    df = pandas_backend.calculate_all_features(instance_ids=range(15),
                                               time_last=None)
    values = df[haversine.get_name()].values
    real = [
        0., 524.15585776, 1043.00845747, 1551.12130243, 2042.79840241, 0.,
        137.86000883, 275.59396684, 413.07563177, 0., 0., 524.15585776, 0.,
        739.93819145, 1464.27975511
    ]
    assert len(values) == 15
    for i, v in enumerate(real):
        assert v - values[i] < .0001
def test_cum_sum_use_previous_and_where_absolute(es):
    log_value_feat = es['log']['value']
    compare_feat = GreaterThan(log_value_feat, 3)
    dfeat = Feature(es['sessions']['customer_id'], es['log'])
    cum_sum = CumSum(log_value_feat, dfeat, es["log"]["datetime"],
                     where=compare_feat,
                     use_previous=Timedelta(40, 'seconds'))
    features = [cum_sum]
    pandas_backend = PandasBackend(es, features)
    df = pandas_backend.calculate_all_features(instance_ids=range(15),
                                               time_last=None)

    cum_sum_values = [0, 5, 15, 30, 50, 0, 0, 0, 0, 0,
                      0, 5, 0, 7, 21]
    cvalues = df[cum_sum.get_name()].values
    assert len(cvalues) == 15
    for i, v in enumerate(cum_sum_values):
        assert v == cvalues[i]
Exemple #25
0
def test_latlong(es):
    log_latlong_feat = es['log']['latlong']
    latitude = Latitude(log_latlong_feat)
    longitude = Longitude(log_latlong_feat)
    features = [latitude, longitude]
    pandas_backend = PandasBackend(es, features)
    df = pandas_backend.calculate_all_features(instance_ids=range(15),
                                               time_last=None)
    latvalues = df[latitude.get_name()].values
    lonvalues = df[longitude.get_name()].values
    assert len(latvalues) == 15
    assert len(lonvalues) == 15
    real_lats = [0, 5, 10, 15, 20, 0, 1, 2, 3, 0, 0, 5, 0, 7, 14]
    real_lons = [0, 2, 4, 6, 8, 0, 1, 2, 3, 0, 0, 2, 0, 3, 6]
    for i, v, in enumerate(real_lats):
        assert v == latvalues[i]
    for i, v, in enumerate(real_lons):
        assert v == lonvalues[i]
def test_compare_of_transform(es):
    day = Day(es['log']['datetime'])
    to_test = [(Equals, [False, True]), (NotEquals, [True, False]),
               (LessThan, [True, False]), (LessThanEqualTo, [True, True]),
               (GreaterThan, [False, False]),
               (GreaterThanEqualTo, [False, True])]

    features = []
    for test in to_test:
        features.append(test[0](day, 10))

    pandas_backend = PandasBackend(es, features)
    df = pandas_backend.calculate_all_features(instance_ids=[0, 14],
                                               time_last=None)

    for i, test in enumerate(to_test):
        v = df[features[i].get_name()].values.tolist()
        assert v == test[1]
Exemple #27
0
def test_override_boolean(es):
    count = Count(es['log']['value'], es['sessions'])
    count_lo = GreaterThan(count, 1)
    count_hi = LessThan(count, 10)

    to_test = [[True, True, True], [True, True, False], [False, False, True]]

    features = []
    features.append(count_lo.OR(count_hi))
    features.append(count_lo.AND(count_hi))
    features.append(~(count_lo.AND(count_hi)))

    pandas_backend = PandasBackend(es, features)
    df = pandas_backend.calculate_all_features(instance_ids=[0, 1, 2],
                                               time_last=None)
    for i, test in enumerate(to_test):
        v = df[features[i].get_name()].values.tolist()
        assert v == test
Exemple #28
0
def test_two_kinds_of_dependents(es):
    v = Feature(es['log']['value'])
    product = Feature(es['log']['product_id'])
    agg = Sum(v, es['customers'], where=product == 'coke zero')
    p = Percentile(agg)
    g = Absolute(agg)
    agg2 = Sum(v, es['sessions'], where=product == 'coke zero')
    # Adding this feature in tests line 218 in pandas_backend
    # where we remove columns in result_frame that already exist
    # in the output entity_frames in preparation for pd.concat
    # In a prior version, this failed because we changed the result_frame
    # variable itself, rather than making a new variable _result_frame.
    # When len(output_frames) > 1, the second iteration won't have
    # all the necessary columns because they were removed in the first
    agg3 = Sum(agg2, es['customers'])
    pandas_backend = PandasBackend(es, [p, g, agg3])
    df = pandas_backend.calculate_all_features([0, 1], None)
    assert df[p.get_name()].tolist() == [0.5, 1.0]
    assert df[g.get_name()].tolist() == [15, 26]
def test_arithmetic_of_agg(es):
    customer_id_feat = es['customers']['id']
    store_id_feat = es['stores']['id']
    count_customer = Count(customer_id_feat, parent_entity=es['regions'])
    count_stores = Count(store_id_feat, parent_entity=es['regions'])
    to_test = [(Add, [6, 2]), (Subtract, [0, -2]), (Multiply, [9, 0]),
               (Divide, [1, 0])]

    features = []
    for test in to_test:
        features.append(test[0](count_customer, count_stores))

    pandas_backend = PandasBackend(es, features)
    df = pandas_backend.calculate_all_features(
        instance_ids=['United States', 'Mexico'], time_last=None)

    for i, test in enumerate(to_test):
        v = df[features[i].get_name()].values.tolist()
        assert v == test[1]
def test_cum_sum_group_on_nan(es):
    log_value_feat = es['log']['value']
    es['log'].df['product_id'] = (['coke zero'] * 3 + ['car'] * 2 +
                                  ['toothpaste'] * 3 + ['brown bag'] * 2 +
                                  ['shoes'] + [np.nan] * 4 + ['coke_zero'] * 2)
    cum_sum = CumSum(log_value_feat, es['log']['product_id'])
    features = [cum_sum]
    pandas_backend = PandasBackend(es, features)
    df = pandas_backend.calculate_all_features(instance_ids=range(15),
                                               time_last=None)
    cvalues = df[cum_sum.get_name()].values
    assert len(cvalues) == 15
    cum_sum_values = [
        0, 5, 15, 15, 35, 0, 1, 3, 3, 3, 0, np.nan, np.nan, np.nan, np.nan
    ]
    for i, v in enumerate(cum_sum_values):
        if np.isnan(v):
            assert (np.isnan(cvalues[i]))
        else:
            assert v == cvalues[i]