def test_cum_sum_use_previous_and_where(es):
    """CumSum of log values per customer, filtered by a where clause and
    windowed to the previous 3 observations."""
    value = es['log']['value']
    over_three = GreaterThan(value, 3)
    # todo should this be cummean?
    customer_feat = Feature(es['sessions']['customer_id'], es['log'])
    cum_sum = CumSum(value, customer_feat,
                     where=over_three,
                     use_previous=Timedelta(3, 'observations',
                                            entity=es['log']))
    backend = PandasBackend(es, [cum_sum])
    df = backend.calculate_all_features(instance_ids=range(15),
                                        time_last=None)
    expected = [0, 5, 15, 30, 45, 45, 45, 45, 45, 45, 0, 5, 5, 12, 26]
    actual = df[cum_sum.get_name()].values
    assert len(actual) == 15
    # compare element-wise against the precomputed running sums
    for want, got in zip(expected, actual):
        assert want == got
def test_approx_base_feature_is_also_first_class_feature(entityset):
    """A feature that is both the base of an approximated feature and
    requested directly must still be computed exactly."""
    es = entityset
    log_to_products = DirectFeature(es['products']['rating'], es['log'])
    # This should still be computed properly
    agg_feat = Min(log_to_products, es['sessions'])
    customer_agg_feat = Sum(agg_feat, es['customers'])
    # This is to be approximated
    sess_to_cust = DirectFeature(customer_agg_feat, es['sessions'])
    cutoffs = [datetime(2011, 4, 9, 10, 31, 19),
               datetime(2011, 4, 9, 11, 0, 0)]
    fm = calculate_feature_matrix([sess_to_cust, agg_feat],
                                  instance_ids=[0, 2],
                                  approximate=Timedelta(10, 's'),
                                  cutoff_time=cutoffs)
    # approximated direct feature
    assert fm[sess_to_cust.get_name()].tolist() == [8.5, 7]
    # exact (non-approximated) base feature
    assert fm[agg_feat.get_name()].tolist() == [4, 1.5]
def test_training_window(entityset):
    """training_window limits the data considered before each cutoff time;
    observation-based windows on another entity raise an AssertionError."""
    property_feature = Count(entityset['log']['id'], entityset['customers'])
    top_level_agg = Count(entityset['customers']['id'], entityset[u'régions'])
    # make sure features that have a direct to a higher level agg
    # so we have multiple "filter eids" in get_pandas_data_slice,
    # and we go through the loop to pull data with a training_window param more than once
    dagg = DirectFeature(top_level_agg, entityset['customers'])

    # for now, warns if last_time_index not present
    feature_matrix = calculate_feature_matrix(
        [property_feature, dagg],
        entityset,
        instance_ids=[0, 1, 2],
        cutoff_time=[datetime(2011, 4, 9, 12, 31),
                     datetime(2011, 4, 10, 11),
                     datetime(2011, 4, 10, 13, 10, 1)],
        training_window='2 hours')

    entityset.add_last_time_indexes()

    # an observation-count window tied to a different entity is rejected
    with pytest.raises(AssertionError):
        feature_matrix = calculate_feature_matrix(
            [property_feature],
            entityset,
            instance_ids=[0, 1, 2],
            cutoff_time=[datetime(2011, 4, 9, 12, 31),
                         datetime(2011, 4, 10, 11),
                         datetime(2011, 4, 10, 13, 10, 1)],
            training_window=Timedelta(2, 'observations', entity='log'))

    feature_matrix = calculate_feature_matrix(
        [property_feature, dagg],
        entityset,
        instance_ids=[0, 1, 2, 4],
        cutoff_time=[datetime(2011, 4, 9, 12, 31),
                     datetime(2011, 4, 10, 11),
                     datetime(2011, 4, 10, 13, 10, 1)],
        training_window='2 hours')

    prop_values = [5, 5, 1]
    dagg_values = [3, 2, 1]
    assert (feature_matrix[property_feature.get_name()] == prop_values).values.all()
    assert (feature_matrix[dagg.get_name()] == dagg_values).values.all()
def test_make_agg_feat_using_prev_time(es):
    """Count of log ids per session restricted to the 10 seconds before the
    cutoff time."""
    agg_feat = ft.Feature(es['log']['id'],
                          parent_entity=es['sessions'],
                          use_previous=Timedelta(10, 's'),
                          primitive=Count)
    feature_set = FeatureSet([agg_feat])

    # first cutoff: 2 events fall in the window; second cutoff: only 1
    cases = [(datetime(2011, 4, 9, 10, 30, 10), 2),
             (datetime(2011, 4, 9, 10, 30, 30), 1)]
    for cutoff, expected in cases:
        calculator = FeatureSetCalculator(es,
                                          time_last=cutoff,
                                          feature_set=feature_set)
        df = calculator.run(np.array([0]))
        assert df[agg_feat.get_name()][0] == expected
def test_uses_full_entity_feat_of_approximate(entityset):
    """Percentile needs every value of its dfeat dependency, so that
    dependency must be computed exactly even when approximation is on."""
    es = entityset
    agg_feat = Sum(es['log']['value'], es['sessions'])
    agg_feat2 = Sum(agg_feat, es['customers'])
    agg_feat3 = Min(agg_feat, es['customers'])
    dfeat = DirectFeature(agg_feat2, es['sessions'])
    dfeat2 = DirectFeature(agg_feat3, es['sessions'])
    p = Percentile(dfeat)

    def run_cfm(features, approximate=None):
        # shared call shape for every matrix computed in this test
        kwargs = {}
        if approximate is not None:
            kwargs['approximate'] = approximate
        return calculate_feature_matrix(
            features,
            entityset,
            instance_ids=[0, 2],
            cutoff_time_in_index=True,
            cutoff_time=[datetime(2011, 4, 9, 10, 31, 19),
                         datetime(2011, 4, 9, 11, 0, 0)],
            **kwargs)

    # only dfeat2 should be approximated
    # because Percentile needs all values
    fm_only_dfeat2 = run_cfm([dfeat2], approximate=Timedelta(10, 's'))
    assert fm_only_dfeat2[dfeat2.get_name()].tolist() == [1, 0]

    fm_approx = run_cfm([p, dfeat, dfeat2, agg_feat],
                        approximate=Timedelta(10, 's'))
    assert fm_only_dfeat2[dfeat2.get_name()].tolist(
    ) == fm_approx[dfeat2.get_name()].tolist()

    fm_small_approx = run_cfm([p, dfeat, dfeat2, agg_feat],
                              approximate=Timedelta(10, 'ms'))
    fm_no_approx = run_cfm([p, dfeat, dfeat2, agg_feat])

    # the non-approximated features must agree across all three matrices
    for f in [p, dfeat, agg_feat]:
        for fm1, fm2 in combinations([fm_approx,
                                      fm_small_approx,
                                      fm_no_approx], 2):
            assert fm1[f.get_name()].tolist() == fm2[f.get_name()].tolist()