def test_cum_sum_use_previous_group_on_nan(es): # TODO: Figure out how to test where `df` # in pd_rolling get_function() has multiindex log_value_feat = es['log']['value'] es['log'].df['product_id'] = (['coke zero'] * 3 + ['car'] * 2 + ['toothpaste'] * 3 + ['brown bag'] * 2 + ['shoes'] + [np.nan] * 4 + ['coke_zero'] * 2) cum_sum = CumSum(log_value_feat, es['log']['product_id'], es["log"]["datetime"], use_previous=Timedelta(40, 'seconds')) features = [cum_sum] pandas_backend = PandasBackend(es, features) df = pandas_backend.calculate_all_features(instance_ids=range(15), time_last=None) cvalues = df[cum_sum.get_name()].values assert len(cvalues) == 15 cum_sum_values = [0, 5, 15, 15, 35, 0, 1, 3, 3, 0, 0, np.nan, np.nan, np.nan, np.nan] for i, v in enumerate(cum_sum_values): if np.isnan(v): assert (np.isnan(cvalues[i])) else: assert v == cvalues[i]
def test_arithmetic_of_val(es): to_test = [(Add, [2.0, 7.0, 12.0, 17.0], [2.0, 7.0, 12.0, 17.0]), (Subtract, [-2.0, 3.0, 8.0, 13.0], [2.0, -3.0, -8.0, -13.0]), (Multiply, [0, 10, 20, 30], [0, 10, 20, 30]), (Divide, [0, 2.5, 5, 7.5], [np.inf, 0.4, 0.2, 2 / 15.0], [np.nan, np.inf, np.inf, np.inf])] features = [] logs = es['log'] for test in to_test: features.append(test[0](logs['value'], 2)) features.append(test[0](2, logs['value'])) features.append(Divide(logs['value'], 0)) pandas_backend = PandasBackend(es, features) df = pandas_backend.calculate_all_features(instance_ids=[0, 1, 2, 3], time_last=None) for i, test in enumerate(to_test): v = df[features[2 * i].get_name()].values.tolist() assert v == test[1] v = df[features[2 * i + 1].get_name()].values.tolist() assert v == test[2] test = to_test[-1][-1] v = df[features[-1].get_name()].values.tolist() assert (np.isnan(v[0])) assert v[1:] == test[1:]
def check(feature): pandas_backend = PandasBackend(es, [feature]) df_1 = pandas_backend.calculate_all_features(instance_ids=[0, 1, 2], time_last=None) df_2 = pandas_backend.calculate_all_features(instance_ids=[2], time_last=None) # check that the value for instance id 2 matches assert (df_2.loc[2] == df_1.loc[2]).all()
def test_diff(es): value = IdentityFeature(es['log']['value']) customer_id_feat = \ DirectFeature(es['sessions']['customer_id'], child_entity=es['log']) diff1 = Diff(value, es['log']['session_id']) diff2 = Diff(value, customer_id_feat) pandas_backend = PandasBackend(es, [diff1, diff2]) df = pandas_backend.calculate_all_features(instance_ids=range(15), time_last=None) val1 = df[diff1.get_name()].values.tolist() val2 = df[diff2.get_name()].values.tolist() correct_vals1 = [ np.nan, 5, 5, 5, 5, np.nan, 1, 1, 1, np.nan, np.nan, 5, np.nan, 7, 7 ] correct_vals2 = [np.nan, 5, 5, 5, 5, -20, 1, 1, 1, -3, np.nan, 5, -5, 7, 7] for i, v in enumerate(val1): v1 = val1[i] if np.isnan(v1): assert (np.isnan(correct_vals1[i])) else: assert v1 == correct_vals1[i] v2 = val2[i] if np.isnan(v2): assert (np.isnan(correct_vals2[i])) else: assert v2 == correct_vals2[i]
def test_arithmetic_of_direct(es): rating = es['products']['rating'] log_rating = DirectFeature(rating, child_entity=es['log']) customer_age = es['customers']['age'] session_age = DirectFeature(customer_age, child_entity=es['sessions']) log_age = DirectFeature(session_age, child_entity=es['log']) to_test = [(Add, [38, 37, 37.5, 37.5]), (Subtract, [28, 29, 28.5, 28.5]), (Multiply, [165, 132, 148.5, 148.5]), (Divide, [6.6, 8.25, 22. / 3, 22. / 3])] features = [] for test in to_test: features.append(test[0](log_age, log_rating)) pandas_backend = PandasBackend(es, features) df = pandas_backend.calculate_all_features(instance_ids=[0, 3, 5, 7], time_last=None) for i, test in enumerate(to_test): v = df[features[i].get_name()].values.tolist() assert v == test[1]
def test_override_cmp(es): count = Count(es['log']['id'], es['sessions']) _sum = Sum(es['log']['value'], es['sessions']) gt_lo = count > 1 gt_other = count > _sum ge_lo = count >= 1 ge_other = count >= _sum lt_hi = count < 10 lt_other = count < _sum le_hi = count <= 10 le_other = count <= _sum ne_lo = count != 1 ne_other = count != _sum to_test = [[True, True, False], [False, False, True], [True, True, True], [False, False, True], [True, True, True], [True, True, False], [True, True, True], [True, True, False]] features = [gt_lo, gt_other, ge_lo, ge_other, lt_hi, lt_other, le_hi, le_other, ne_lo, ne_other] pandas_backend = PandasBackend(es, features) df = pandas_backend.calculate_all_features(instance_ids=[0, 1, 2], time_last=None) for i, test in enumerate(to_test): v = df[features[i].get_name()].values.tolist() assert v == test
def test_diff_single_value(es): diff = Diff(es['stores']['num_square_feet'], es['stores'][u'région_id']) pandas_backend = PandasBackend(es, [diff]) df = pandas_backend.calculate_all_features(instance_ids=[5], time_last=None) assert df.shape[0] == 1 assert df[diff.get_name()].dropna().shape[0] == 0
def test_percentile_with_cutoff(es): v = Feature(es['log']['value']) p = Percentile(v) pandas_backend = PandasBackend(es, [p]) df = pandas_backend.calculate_all_features( [2], pd.Timestamp('2011/04/09 10:30:13')) assert df[p.get_name()].tolist()[0] == 1.0
def test_cum_sum_group_on_nan(es): log_value_feat = es['log']['value'] es['log'].df['product_id'] = (['coke zero'] * 3 + ['car'] * 2 + ['toothpaste'] * 3 + ['brown bag'] * 2 + ['shoes'] + [np.nan] * 4 + ['coke_zero'] * 2) cum_sum = CumSum(log_value_feat, es['log']['product_id']) features = [cum_sum] pandas_backend = PandasBackend(es, features) df = pandas_backend.calculate_all_features(instance_ids=range(15), time_last=None) cvalues = df[cum_sum.get_name()].values assert len(cvalues) == 15 cum_sum_values = [0, 5, 15, 15, 35, 0, 1, 3, 3, 3, 0, np.nan, np.nan, np.nan, np.nan] for i, v in enumerate(cum_sum_values): if np.isnan(v): assert (np.isnan(cvalues[i])) else: assert v == cvalues[i]
def test_make_trans_feat(es): f = Hour(es['log']['datetime']) pandas_backend = PandasBackend(es, [f]) df = pandas_backend.calculate_all_features(instance_ids=[0], time_last=None) v = df[f.get_name()][0] assert v == 10
def test_isin_feat_other_syntax(es): isin = Feature(es['log']['product_id']).isin(["toothpaste", "coke zero"]) features = [isin] pandas_backend = PandasBackend(es, features) df = pandas_backend.calculate_all_features(range(8), None) true = [True, True, True, False, False, True, True, True] v = df[isin.get_name()].values.tolist() assert true == v
def test_isin_feat_other_syntax_int(es): isin = Feature(es['log']['value']).isin([5, 10]) features = [isin] pandas_backend = PandasBackend(es, features) df = pandas_backend.calculate_all_features(range(8), None) true = [False, True, True, False, False, False, False, False] v = df[isin.get_name()].values.tolist() assert true == v
def test_percentile(es): v = Feature(es['log']['value']) p = Percentile(v) pandas_backend = PandasBackend(es, [p]) df = pandas_backend.calculate_all_features(range(10, 17), None) true = es['log'].df[v.get_name()].rank(pct=True) true = true.loc[range(10, 17)] for t, a in zip(true.values, df[p.get_name()].values): assert (pd.isnull(t) and pd.isnull(a)) or t == a
def test_direct_from_variable(es): # should be same behavior as test_direct_from_identity d = DirectFeature(base_feature=es['sessions']['device_type'], child_entity=es['log']) pandas_backend = PandasBackend(es, [d]) df = pandas_backend.calculate_all_features(instance_ids=[0, 5], time_last=None) v = df[d.get_name()].tolist() assert v == [0, 1]
def test_not_feature(es): likes_ice_cream = es['customers']['loves_ice_cream'] not_feat = Not(likes_ice_cream) features = [not_feat] pandas_backend = PandasBackend(es, features) df = pandas_backend.calculate_all_features(instance_ids=[0, 1], time_last=None) v = df[not_feat.get_name()].values assert not v[0] assert v[1]
def test_compare_all_nans(es): nan_feat = Mode(es['log']['product_id'], es['sessions']) compare = nan_feat == 'brown bag' # before all data time_last = pd.Timestamp('1/1/1993') pandas_backend = PandasBackend(es, [nan_feat, compare]) df = pandas_backend.calculate_all_features(instance_ids=[0, 1, 2], time_last=time_last) assert df[nan_feat.get_name()].dropna().shape[0] == 0 assert not df[compare.get_name()].any()
def test_direct_from_variable(es): # should be same behavior as test_direct_from_identity device = es['sessions']['device_type'] d = DirectFeature(base_feature=device, child_entity=es['log']) pandas_backend = PandasBackend(es, [d]) df = pandas_backend.calculate_all_features(instance_ids=[0, 5], time_last=None) v = df[d.get_name()].tolist() assert v == [0, 1]
def test_dependent_percentile(es): v = Feature(es['log']['value']) p = Percentile(v) p2 = Percentile(p - 1) pandas_backend = PandasBackend(es, [p, p2]) df = pandas_backend.calculate_all_features(range(10, 17), None) true = es['log'].df[v.get_name()].rank(pct=True) true = true.loc[range(10, 17)] for t, a in zip(true.values, df[p.get_name()].values): assert (pd.isnull(t) and pd.isnull(a)) or t == a
def test_cum_mean(es): log_value_feat = es['log']['value'] cum_mean = CumMean(log_value_feat, es['log']['session_id']) features = [cum_mean] pandas_backend = PandasBackend(es, features) df = pandas_backend.calculate_all_features(instance_ids=range(15), time_last=None) cvalues = df[cum_mean.get_name()].values assert len(cvalues) == 15 cum_mean_values = [0, 2.5, 5, 7.5, 10, 0, .5, 1, 1.5, 0, 0, 2.5, 0, 3.5, 7] for i, v in enumerate(cum_mean_values): assert v == cvalues[i]
def test_cum_max(es): log_value_feat = es['log']['value'] cum_max = CumMax(log_value_feat, es['log']['session_id']) features = [cum_max] pandas_backend = PandasBackend(es, features) df = pandas_backend.calculate_all_features(instance_ids=range(15), time_last=None) cvalues = df[cum_max.get_name()].values assert len(cvalues) == 15 cum_max_values = [0, 5, 10, 15, 20, 0, 1, 2, 3, 0, 0, 5, 0, 7, 14] for i, v in enumerate(cum_max_values): assert v == cvalues[i]
def test_cum_count(es): log_id_feat = es['log']['id'] cum_count = CumCount(log_id_feat, es['log']['session_id']) features = [cum_count] pandas_backend = PandasBackend(es, features) df = pandas_backend.calculate_all_features(instance_ids=range(15), time_last=None) cvalues = df[cum_count.get_name()].values assert len(cvalues) == 15 cum_count_values = [1, 2, 3, 4, 5, 1, 2, 3, 4, 1, 1, 2, 1, 2, 3] for i, v in enumerate(cum_count_values): assert v == cvalues[i]
def test_direct_percentile(es): v = Feature(es['customers']['age']) p = Percentile(v) d = Feature(p, es['sessions']) pandas_backend = PandasBackend(es, [d]) df = pandas_backend.calculate_all_features([0, 1], None) cust_vals = es['customers'].df[[v.get_name()]] cust_vals['percentile'] = cust_vals[v.get_name()].rank(pct=True) true_p = cust_vals['percentile'].loc[[0, 0]] for t, a in zip(true_p.values, df[d.get_name()].values): assert (pd.isnull(t) and pd.isnull(a)) or t == a
def test_agg_percentile(es): v = Feature(es['log']['value']) p = Percentile(v) agg = Sum(p, es['sessions']) pandas_backend = PandasBackend(es, [agg]) df = pandas_backend.calculate_all_features([0, 1], None) log_vals = es['log'].df[[v.get_name(), 'session_id']] log_vals['percentile'] = log_vals[v.get_name()].rank(pct=True) true_p = log_vals.groupby('session_id')['percentile'].sum()[[0, 1]] for t, a in zip(true_p.values, df[agg.get_name()].values): assert (pd.isnull(t) and pd.isnull(a)) or t == a
def test_override_cmp_from_variable(es): count_lo = IdentityFeature(es['log']['value']) > 1 to_test = [False, True, True] features = [count_lo] pandas_backend = PandasBackend(es, features) df = pandas_backend.calculate_all_features(instance_ids=[0, 1, 2], time_last=None) v = df[count_lo.get_name()].values.tolist() for i, test in enumerate(to_test): assert v[i] == test
def test_isnull_feat(es): value = IdentityFeature(es['log']['value']) diff = Diff(value, es['log']['session_id']) isnull = IsNull(diff) features = [isnull] pandas_backend = PandasBackend(es, features) df = pandas_backend.calculate_all_features(range(15), None) # correct_vals_diff = [ # np.nan, 5, 5, 5, 5, np.nan, 1, 1, 1, np.nan, np.nan, 5, np.nan, 7, 7] correct_vals = [True, False, False, False, False, True, False, False, False, True, True, False, True, False, False] values = df[isnull.get_name()].values.tolist() assert correct_vals == values
def test_percentile_agg(es): v = ft.Feature(es['log']['value']) agg = ft.Feature(v, parent_entity=es['sessions'], primitive=Sum) pagg = ft.Feature(agg, primitive=Percentile) pandas_backend = PandasBackend(es, [pagg]) df = pandas_backend.calculate_all_features([0, 1], None) log_vals = es['log'].df[[v.get_name(), 'session_id']] true_p = log_vals.groupby('session_id')[v.get_name()].sum().fillna(0) true_p = true_p.rank(pct=True)[[0, 1]] for t, a in zip(true_p.values, df[pagg.get_name()].values): assert (pd.isnull(t) and pd.isnull(a)) or t == a
def test_cum_sum_use_previous(es): log_value_feat = es['log']['value'] cum_sum = CumSum(log_value_feat, es['log']['session_id'], use_previous=Timedelta(3, 'observations', entity=es['log'])) features = [cum_sum] pandas_backend = PandasBackend(es, features) df = pandas_backend.calculate_all_features(instance_ids=range(15), time_last=None) cvalues = df[cum_sum.get_name()].values assert len(cvalues) == 15 cum_sum_values = [0, 5, 15, 30, 45, 0, 1, 3, 6, 0, 0, 5, 0, 7, 21] for i, v in enumerate(cum_sum_values): assert v == cvalues[i]
def test_direct_agg_percentile(es): v = Feature(es['log']['value']) p = Percentile(v) agg = Sum(p, es['customers']) d = Feature(agg, es['sessions']) pandas_backend = PandasBackend(es, [d]) df = pandas_backend.calculate_all_features([0, 1], None) log_vals = es['log'].df[[v.get_name(), 'session_id']] log_vals['percentile'] = log_vals[v.get_name()].rank(pct=True) log_vals['customer_id'] = [0] * 10 + [1] * 5 + [2] * 2 true_p = log_vals.groupby('customer_id')['percentile'].sum().fillna(0) true_p = true_p[[0, 0]] for t, a in zip(true_p.values, df[d.get_name()].values): assert (pd.isnull(t) and pd.isnull(a)) or round(t, 3) == round(a, 3)
def test_haversine(es): log_latlong_feat = es['log']['latlong'] log_latlong_feat2 = es['log']['latlong2'] haversine = Haversine(log_latlong_feat, log_latlong_feat2) features = [haversine] pandas_backend = PandasBackend(es, features) df = pandas_backend.calculate_all_features(instance_ids=range(15), time_last=None) values = df[haversine.get_name()].values real = [0., 524.15585776, 1043.00845747, 1551.12130243, 2042.79840241, 0., 137.86000883, 275.59396684, 413.07563177, 0., 0., 524.15585776, 0., 739.93819145, 1464.27975511] assert len(values) == 15 for i, v in enumerate(real): assert v - values[i] < .0001
def test_haversine(es): log_latlong_feat = es['log']['latlong'] log_latlong_feat2 = es['log']['latlong2'] haversine = Haversine(log_latlong_feat, log_latlong_feat2) features = [haversine] pandas_backend = PandasBackend(es, features) df = pandas_backend.calculate_all_features(instance_ids=range(15), time_last=None) values = df[haversine.get_name()].values real = [ 0., 524.15585776, 1043.00845747, 1551.12130243, 2042.79840241, 0., 137.86000883, 275.59396684, 413.07563177, 0., 0., 524.15585776, 0., 739.93819145, 1464.27975511 ] assert len(values) == 15 for i, v in enumerate(real): assert v - values[i] < .0001
def test_cum_sum_where(es): log_value_feat = es['log']['value'] compare_feat = GreaterThan(log_value_feat, 3) dfeat = Feature(es['sessions']['customer_id'], es['log']) cum_sum = CumSum(log_value_feat, dfeat, where=compare_feat) features = [cum_sum] pandas_backend = PandasBackend(es, features) df = pandas_backend.calculate_all_features(instance_ids=range(15), time_last=None) cvalues = df[cum_sum.get_name()].values assert len(cvalues) == 15 cum_sum_values = [0, 5, 15, 30, 50, 50, 50, 50, 50, 50, 0, 5, 5, 12, 26] for i, v in enumerate(cum_sum_values): if not np.isnan(v): assert v == cvalues[i] else: assert (np.isnan(cvalues[i]))
def test_cum_sum_use_previous_and_where_absolute(es): log_value_feat = es['log']['value'] compare_feat = GreaterThan(log_value_feat, 3) dfeat = Feature(es['sessions']['customer_id'], es['log']) cum_sum = CumSum(log_value_feat, dfeat, es["log"]["datetime"], where=compare_feat, use_previous=Timedelta(40, 'seconds')) features = [cum_sum] pandas_backend = PandasBackend(es, features) df = pandas_backend.calculate_all_features(instance_ids=range(15), time_last=None) cum_sum_values = [0, 5, 15, 30, 50, 0, 0, 0, 0, 0, 0, 5, 0, 7, 21] cvalues = df[cum_sum.get_name()].values assert len(cvalues) == 15 for i, v in enumerate(cum_sum_values): assert v == cvalues[i]
def test_latlong(es): log_latlong_feat = es['log']['latlong'] latitude = Latitude(log_latlong_feat) longitude = Longitude(log_latlong_feat) features = [latitude, longitude] pandas_backend = PandasBackend(es, features) df = pandas_backend.calculate_all_features(instance_ids=range(15), time_last=None) latvalues = df[latitude.get_name()].values lonvalues = df[longitude.get_name()].values assert len(latvalues) == 15 assert len(lonvalues) == 15 real_lats = [0, 5, 10, 15, 20, 0, 1, 2, 3, 0, 0, 5, 0, 7, 14] real_lons = [0, 2, 4, 6, 8, 0, 1, 2, 3, 0, 0, 2, 0, 3, 6] for i, v, in enumerate(real_lats): assert v == latvalues[i] for i, v, in enumerate(real_lons): assert v == lonvalues[i]
def test_compare_of_transform(es): day = Day(es['log']['datetime']) to_test = [(Equals, [False, True]), (NotEquals, [True, False]), (LessThan, [True, False]), (LessThanEqualTo, [True, True]), (GreaterThan, [False, False]), (GreaterThanEqualTo, [False, True])] features = [] for test in to_test: features.append(test[0](day, 10)) pandas_backend = PandasBackend(es, features) df = pandas_backend.calculate_all_features(instance_ids=[0, 14], time_last=None) for i, test in enumerate(to_test): v = df[features[i].get_name()].values.tolist() assert v == test[1]
def test_override_boolean(es): count = Count(es['log']['value'], es['sessions']) count_lo = GreaterThan(count, 1) count_hi = LessThan(count, 10) to_test = [[True, True, True], [True, True, False], [False, False, True]] features = [] features.append(count_lo.OR(count_hi)) features.append(count_lo.AND(count_hi)) features.append(~(count_lo.AND(count_hi))) pandas_backend = PandasBackend(es, features) df = pandas_backend.calculate_all_features(instance_ids=[0, 1, 2], time_last=None) for i, test in enumerate(to_test): v = df[features[i].get_name()].values.tolist() assert v == test
def test_two_kinds_of_dependents(es): v = Feature(es['log']['value']) product = Feature(es['log']['product_id']) agg = Sum(v, es['customers'], where=product == 'coke zero') p = Percentile(agg) g = Absolute(agg) agg2 = Sum(v, es['sessions'], where=product == 'coke zero') # Adding this feature in tests line 218 in pandas_backend # where we remove columns in result_frame that already exist # in the output entity_frames in preparation for pd.concat # In a prior version, this failed because we changed the result_frame # variable itself, rather than making a new variable _result_frame. # When len(output_frames) > 1, the second iteration won't have # all the necessary columns because they were removed in the first agg3 = Sum(agg2, es['customers']) pandas_backend = PandasBackend(es, [p, g, agg3]) df = pandas_backend.calculate_all_features([0, 1], None) assert df[p.get_name()].tolist() == [0.5, 1.0] assert df[g.get_name()].tolist() == [15, 26]
def test_compare_of_identity(es): to_test = [(Equals, [False, False, True, False]), (NotEquals, [True, True, False, True]), (LessThan, [True, True, False, False]), (LessThanEqualTo, [True, True, True, False]), (GreaterThan, [False, False, False, True]), (GreaterThanEqualTo, [False, False, True, True])] features = [] for test in to_test: features.append(test[0](es['log']['value'], 10)) pandas_backend = PandasBackend(es, features) df = pandas_backend.calculate_all_features(instance_ids=[0, 1, 2, 3], time_last=None) for i, test in enumerate(to_test): v = df[features[i].get_name()].values.tolist() assert v == test[1]
def test_arithmetic_of_agg(es): customer_id_feat = es['customers']['id'] store_id_feat = es['stores']['id'] count_customer = Count(customer_id_feat, parent_entity=es['regions']) count_stores = Count(store_id_feat, parent_entity=es['regions']) to_test = [(Add, [6, 2]), (Subtract, [0, -2]), (Multiply, [9, 0]), (Divide, [1, 0])] features = [] for test in to_test: features.append(test[0](count_customer, count_stores)) pandas_backend = PandasBackend(es, features) df = pandas_backend.calculate_all_features( instance_ids=['United States', 'Mexico'], time_last=None) for i, test in enumerate(to_test): v = df[features[i].get_name()].values.tolist() assert v == test[1]
def test_cum_mean_use_previous_and_where(es): log_value_feat = es['log']['value'] compare_feat = GreaterThan(log_value_feat, 3) # todo should this be cummean? dfeat = Feature(es['sessions']['customer_id'], es['log']) cum_mean = CumMean(log_value_feat, dfeat, where=compare_feat, use_previous=Timedelta(2, 'observations', entity=es['log'])) features = [cum_mean] pandas_backend = PandasBackend(es, features) df = pandas_backend.calculate_all_features(instance_ids=range(15), time_last=None) cum_mean_values = [0, 5, 7.5, 12.5, 17.5, 17.5, 17.5, 17.5, 17.5, 17.5, 0, 5, 5, 6, 10.5] cvalues = df[cum_mean.get_name()].values assert len(cvalues) == 15 for i, v in enumerate(cum_mean_values): assert v == cvalues[i]
def test_text_primitives(es): words = NumWords(es['log']['comments']) chars = NumCharacters(es['log']['comments']) features = [words, chars] pandas_backend = PandasBackend(es, features) df = pandas_backend.calculate_all_features(instance_ids=range(15), time_last=None) word_counts = [514, 3, 3, 644, 1268, 1269, 177, 172, 79, 240, 1239, 3, 3, 3, 3] char_counts = [3392, 10, 10, 4116, 7961, 7580, 992, 957, 437, 1325, 6322, 10, 10, 10, 10] word_values = df[words.get_name()].values char_values = df[chars.get_name()].values assert len(word_values) == 15 for i, v in enumerate(word_values): assert v == word_counts[i] for i, v in enumerate(char_values): assert v == char_counts[i]
def test_cum_sum_group_on_nan(es): log_value_feat = es['log']['value'] es['log'].df['product_id'] = (['coke zero'] * 3 + ['car'] * 2 + ['toothpaste'] * 3 + ['brown bag'] * 2 + ['shoes'] + [np.nan] * 4 + ['coke_zero'] * 2) cum_sum = CumSum(log_value_feat, es['log']['product_id']) features = [cum_sum] pandas_backend = PandasBackend(es, features) df = pandas_backend.calculate_all_features(instance_ids=range(15), time_last=None) cvalues = df[cum_sum.get_name()].values assert len(cvalues) == 15 cum_sum_values = [ 0, 5, 15, 15, 35, 0, 1, 3, 3, 3, 0, np.nan, np.nan, np.nan, np.nan ] for i, v in enumerate(cum_sum_values): if np.isnan(v): assert (np.isnan(cvalues[i])) else: assert v == cvalues[i]
def test_compare_of_direct(es): log_rating = DirectFeature(es['products']['rating'], child_entity=es['log']) to_test = [(Equals, [False, False, False, False]), (NotEquals, [True, True, True, True]), (LessThan, [False, False, False, True]), (LessThanEqualTo, [False, False, False, True]), (GreaterThan, [True, True, True, False]), (GreaterThanEqualTo, [True, True, True, False])] features = [] for test in to_test: features.append(test[0](log_rating, 4.5)) pandas_backend = PandasBackend(es, features) df = pandas_backend.calculate_all_features(instance_ids=[0, 1, 2, 3], time_last=None) for i, test in enumerate(to_test): v = df[features[i].get_name()].values.tolist() assert v == test[1]
def test_arithmetic_of_transform(es): diff1 = ft.Feature([es['log']['value']], primitive=Diff) diff2 = ft.Feature([es['log']['value_2']], primitive=Diff) to_test = [(AddNumeric, [np.nan, 14., -7., 3.]), (SubtractNumeric, [np.nan, 6., -3., 1.]), (MultiplyNumeric, [np.nan, 40., 10., 2.]), (DivideNumeric, [np.nan, 2.5, 2.5, 2.])] features = [] for test in to_test: features.append(ft.Feature([diff1, diff2], primitive=test[0]())) pandas_backend = PandasBackend(es, features) df = pandas_backend.calculate_all_features(instance_ids=[0, 2, 11, 13], time_last=None) for i, test in enumerate(to_test): v = df[features[i].get_name()].values.tolist() assert np.isnan(v.pop(0)) assert np.isnan(test[1].pop(0)) assert v == test[1]
def test_arithmetic(es): # P TODO: return hour = Hour(es['log']['datetime']) day = Day(es['log']['datetime']) to_test = [(Add, [19, 19, 19, 19]), (Subtract, [-1, -1, -1, -1]), (Multiply, [90, 90, 90, 90]), (Divide, [.9, .9, .9, .9])] features = [] features.append(day + hour) features.append(day - hour) features.append(day * hour) features.append(day / hour) pandas_backend = PandasBackend(es, features) df = pandas_backend.calculate_all_features(instance_ids=[0, 3, 5, 7], time_last=None) for i, test in enumerate(to_test): v = df[features[i].get_name()].values.tolist() assert v == test[1]
def test_compare_of_agg(es): count_logs = Count(es['log']['id'], parent_entity=es['sessions']) to_test = [(Equals, [False, False, False, True]), (NotEquals, [True, True, True, False]), (LessThan, [False, False, True, False]), (LessThanEqualTo, [False, False, True, True]), (GreaterThan, [True, True, False, False]), (GreaterThanEqualTo, [True, True, False, True])] features = [] for test in to_test: features.append(test[0](count_logs, 2)) pandas_backend = PandasBackend(es, features) df = pandas_backend.calculate_all_features(instance_ids=[0, 1, 2, 3], time_last=None) for i, test in enumerate(to_test): v = df[features[i].get_name()].values.tolist() assert v == test[1]
def test_arithmetic_of_identity(es): logs = es['log'] to_test = [(Add, [0., 7., 14., 21.]), (Subtract, [0, 3, 6, 9]), (Multiply, [0, 10, 40, 90]), (Divide, [np.nan, 2.5, 2.5, 2.5])] features = [] for test in to_test: features.append(test[0](logs['value'], logs['value_2'])) pandas_backend = PandasBackend(es, features) df = pandas_backend.calculate_all_features(instance_ids=[0, 1, 2, 3], time_last=None) for i, test in enumerate(to_test[:-1]): v = df[features[i].get_name()].values.tolist() assert v == test[1] i, test = 3, to_test[-1] v = df[features[i].get_name()].values.tolist() assert (np.isnan(v[0])) assert v[1:] == test[1][1:]