def test_encode_features_handles_pass_columns(entityset):
    """Extra cutoff-time columns must survive ``encode_features``.

    The cutoff-time frame carries a ``label`` column next to ``instance_id``
    and ``time``. The test asserts the column is present in the calculated
    feature matrix and still present after encoding, and that restricting
    ``to_encode`` changes the shape of the encoded matrix.
    """
    f1 = IdentityFeature(entityset["log"]["product_id"])
    f2 = IdentityFeature(entityset["log"]["value"])
    features = [f1, f2]

    cutoff_time = pd.DataFrame(
        {
            'instance_id': range(6),
            'time': entityset['log'].df['datetime'][0:6],
            'label': [i % 2 for i in range(6)],
        },
        columns=["instance_id", "time", "label"])

    # Pass cutoff_time by keyword: other tests in this module hand the
    # entityset over as the second positional argument, so a positional
    # DataFrame here is ambiguous.
    feature_matrix = calculate_feature_matrix(features,
                                              cutoff_time=cutoff_time)
    assert 'label' in feature_matrix.columns

    feature_matrix_encoded, features_encoded = encode_features(
        feature_matrix, features)
    feature_matrix_encoded_shape = feature_matrix_encoded.shape

    # to_encode should keep product_id as a string, and not create 3
    # additional columns
    to_encode = []
    feature_matrix_encoded, features_encoded = encode_features(
        feature_matrix, features, to_encode=to_encode)
    assert feature_matrix_encoded_shape != feature_matrix_encoded.shape

    to_encode = ['value']
    feature_matrix_encoded, features_encoded = encode_features(
        feature_matrix, features, to_encode=to_encode)
    assert feature_matrix_encoded_shape != feature_matrix_encoded.shape
    assert 'label' in feature_matrix_encoded.columns
def test_copy_features_does_not_copy_entityset(es):
    """Copied features must share the entityset of the originals.

    Two checks are made: the deep size of the local namespace must less than
    double after copying every feature, and each copy (including its ``where``
    clause and base features) must reference the very same entityset object.
    """
    # NOTE: asizeof(locals()) measures everything bound in this function, so
    # the local names defined before the measurement are left untouched.
    agg = Sum(es['log']['value'], es['sessions'])
    agg_where = Sum(es['log']['value'], es['sessions'],
                    where=IdentityFeature(es['log']['value']) == 2)
    agg_use_previous = Sum(es['log']['value'], es['sessions'],
                           use_previous='4 days')
    agg_use_previous_where = Sum(es['log']['value'], es['sessions'],
                                 where=IdentityFeature(es['log']['value']) == 2,
                                 use_previous='4 days')
    features = [agg, agg_where, agg_use_previous, agg_use_previous_where]

    in_memory_size = asizeof(locals())
    copied = [f.copy() for f in features]
    new_in_memory_size = asizeof(locals())
    assert new_in_memory_size < 2 * in_memory_size

    # Identity is checked with ``is`` rather than ``id(a) == id(b)``: comparing
    # ids of temporaries can match by accident once the first object has been
    # released and its address reused.
    for original, duplicate in zip(features, copied):
        assert original.entityset
        assert duplicate.entityset
        assert original.entityset is duplicate.entityset
        if original.where:
            assert duplicate.where
            assert original.where.entityset is duplicate.where.entityset
        for base, base_copy in zip(original.base_features,
                                   duplicate.base_features):
            assert base.entityset is base_copy.entityset
            if base.where:
                assert base_copy.where
                assert base.where.entityset is base_copy.where.entityset
def test_cfm_approximate_correct_ordering():
    """Exact and approximate feature matrices must agree value by value.

    Builds a 1000-row ``trips`` entity, normalizes a ``flights`` entity out of
    it, and computes the direct-of-aggregation features plus the trip id and
    flight time, once exactly and once with ``approximate=Timedelta(2, 'd')``.
    For both matrices the (instance, time) index must line up with the
    ``trip_id`` / ``flight_time`` columns, and every cell of the two matrices
    must be equal (or both null).
    """
    # NOTE(review): randint is not seeded here, so the generated data differ
    # between runs.
    trips = {
        'trip_id': [i for i in range(1000)],
        'flight_time': [datetime(1998, 4, 2) for i in range(350)] +
                       [datetime(1997, 4, 3) for i in range(650)],
        'flight_id': [randint(1, 25) for i in range(1000)],
        'trip_duration': [randint(1, 999) for i in range(1000)]
    }
    df = pd.DataFrame.from_dict(trips)
    es = EntitySet('flights')
    es.entity_from_dataframe("trips",
                             dataframe=df,
                             index="trip_id",
                             time_index='flight_time')
    es.normalize_entity(base_entity_id="trips",
                        new_entity_id="flights",
                        index="flight_id",
                        make_time_index=True)
    features = dfs(entityset=es, target_entity='trips', features_only=True)
    flight_features = [
        feature for feature in features
        if isinstance(feature, DirectFeature) and
        isinstance(feature.base_features[0], AggregationPrimitive)
    ]
    property_feature = IdentityFeature(es['trips']['trip_id'])
    cutoff_time = pd.DataFrame.from_dict({
        'instance_id': df['trip_id'],
        'time': df['flight_time']
    })
    time_feature = IdentityFeature(es['trips']['flight_time'])
    all_features = flight_features + [property_feature, time_feature]

    feature_matrix = calculate_feature_matrix(all_features,
                                              es,
                                              cutoff_time_in_index=True,
                                              cutoff_time=cutoff_time)
    feature_matrix.index.names = ['instance', 'time']
    index_values = feature_matrix.reset_index('time').reset_index()[
        ['instance', 'time']].values
    assert np.all(
        index_values == feature_matrix[['trip_id', 'flight_time']].values)

    feature_matrix_2 = calculate_feature_matrix(all_features,
                                                es,
                                                cutoff_time=cutoff_time,
                                                cutoff_time_in_index=True,
                                                approximate=Timedelta(2, 'd'))
    feature_matrix_2.index.names = ['instance', 'time']
    index_values_2 = feature_matrix_2.reset_index('time').reset_index()[
        ['instance', 'time']].values
    assert np.all(
        index_values_2 == feature_matrix_2[['trip_id', 'flight_time']].values)

    # A failing cell is reported through the assertion message; the test must
    # never drop into an interactive debugger.
    for column in feature_matrix:
        for x, y in zip(feature_matrix[column], feature_matrix_2[column]):
            assert (pd.isnull(x) and pd.isnull(y)) or (x == y), (
                "column %r differs: exact=%r approximate=%r" % (column, x, y))
def test_make_identity(entityset, backend):
    """Identity feature on ``log.datetime`` returns the expected timestamp
    for instance 0."""
    datetime_feature = IdentityFeature(entityset['log']['datetime'])
    calculator = backend([datetime_feature])
    result = calculator.calculate_all_features(instance_ids=[0],
                                               time_last=None)
    column = result[datetime_feature.get_name()]
    expected = datetime(2011, 4, 9, 10, 30, 0)
    assert column[0] == expected
def test_override_cmp_from_variable(es):
    """``IdentityFeature > 1`` evaluates to the expected booleans for the
    first three log instances."""
    greater_than_one = IdentityFeature(es['log']['value']) > 1
    expected = [False, True, True]

    calculator = PandasBackend(es, [greater_than_one])
    frame = calculator.calculate_all_features(instance_ids=[0, 1, 2],
                                              time_last=None)
    actual = frame[greater_than_one.get_name()].values.tolist()

    for position, wanted in enumerate(expected):
        assert actual[position] == wanted
def test_integer_time_index_passes_extra_columns(int_es):
    """With integer cutoff times, the extra ``labels`` column is passed
    through and matches the computed ``value > 10`` feature."""
    cutoff_values = list(range(8, 18)) + list(range(19, 23)) + [25, 24, 23]
    expected_labels = ([False] * 3 + [True] * 2 + [False] * 9 +
                       [False] * 2 + [True])
    instance_ids = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16, 15, 14]

    column_order = ['time', 'instance_id', 'labels']
    cutoff_df = pd.DataFrame({
        'time': cutoff_values,
        'instance_id': instance_ids,
        'labels': expected_labels,
    })[column_order]

    value_gt_10 = IdentityFeature(int_es['log']['value']) > 10
    fm = calculate_feature_matrix([value_gt_10],
                                  cutoff_time=cutoff_df,
                                  cutoff_time_in_index=True)

    matches = fm[value_gt_10.get_name()] == fm['labels']
    assert matches.all()
def test_dask_persisted_entityset(entityset, capsys):
    """Running twice against the same cluster reports that the persisted
    EntitySet is reused, and both runs produce the expected labels."""
    times = (
        [datetime(2011, 4, 9, 10, 30, sec * 6) for sec in range(5)] +
        [datetime(2011, 4, 9, 10, 31, sec * 9) for sec in range(4)] +
        [datetime(2011, 4, 9, 10, 40, 0)] +
        [datetime(2011, 4, 10, 10, 40, sec) for sec in range(2)] +
        [datetime(2011, 4, 10, 10, 41, sec * 3) for sec in range(3)] +
        [datetime(2011, 4, 10, 11, 10, sec * 3) for sec in range(2)]
    )
    labels = [False] * 3 + [True] * 2 + [False] * 9 + [True] + [False] * 2
    cutoff_time = pd.DataFrame({'time': times, 'instance_id': range(17)})
    property_feature = IdentityFeature(entityset['log']['value']) > 10

    with cluster() as (scheduler, [a, b]):
        dkwargs = {'cluster': scheduler['address']}
        # Both runs use exactly the same arguments.
        run_kwargs = dict(entityset=entityset,
                          cutoff_time=cutoff_time,
                          verbose=True,
                          chunk_size=.13,
                          dask_kwargs=dkwargs,
                          approximate='1 hour')

        feature_matrix = calculate_feature_matrix([property_feature],
                                                  **run_kwargs)
        assert (feature_matrix == labels).values.all()

        feature_matrix = calculate_feature_matrix([property_feature],
                                                  **run_kwargs)
        captured = capsys.readouterr()
        stdout_text = captured[0]
        assert ("Using EntitySet persisted on the cluster as dataset "
                in stdout_text)
        assert (feature_matrix == labels).values.all()
def test_make_dfeat_of_agg_feat_through_parent(entityset, backend):
    r"""
    The graph looks like this:

        R       C = Customers, the entity we're trying to predict on
       / \      R = Regions, a parent of customers
      S   C     S = Stores, a child of regions
          |
         etc.

    We're trying to calculate a DFeat from C to R on an agg_feat of R on S.
    """
    # The docstring is raw because the diagram contains a backslash, which
    # would otherwise be an invalid escape sequence.
    store_id_feat = IdentityFeature(entityset['stores']['id'])

    # Count of stores per region ...
    store_count_feat = Count(store_id_feat,
                             parent_entity=entityset[u'régions'])

    # ... exposed on each customer through a direct feature.
    num_stores_feat = DirectFeature(store_count_feat,
                                    child_entity=entityset['customers'])

    pandas_backend = backend([num_stores_feat])
    df = pandas_backend.calculate_all_features(instance_ids=[0],
                                               time_last=None)
    v = df[num_stores_feat.get_name()][0]
    assert (v == 3)
def test_datetime_index_mixed_cutoff(entityset):
    """Cutoff times mixing datetimes with other value types are rejected.

    Row 9 of the ``time`` column is set in turn to an integer, a non-date
    string, a date string and a numeric string; the expected exception type
    for each case is asserted below.
    """
    times = (
        [datetime(2011, 4, 9, 10, 30, i * 6) for i in range(5)] +
        [datetime(2011, 4, 9, 10, 31, i * 9) for i in range(4)] +
        [17] +
        [datetime(2011, 4, 10, 10, 40, i) for i in range(2)] +
        [datetime(2011, 4, 10, 10, 41, i * 3) for i in range(3)] +
        [datetime(2011, 4, 10, 11, 10, i * 3) for i in range(2)]
    )
    labels = [False] * 3 + [True] * 2 + [False] * 9 + [False] * 2 + [True]
    instances = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16, 15, 14]
    cutoff_df = pd.DataFrame(
        {
            'time': times,
            'instance_id': instances,
            'labels': labels,
        },
        columns=['time', 'instance_id', 'labels'])
    property_feature = IdentityFeature(entityset['log']['value']) > 10

    # Integer among datetimes.
    with pytest.raises(TypeError):
        calculate_feature_matrix([property_feature], cutoff_time=cutoff_df)

    # Non-date string among datetimes.
    times[9] = "foobar"
    cutoff_df['time'] = times
    with pytest.raises(ValueError):
        calculate_feature_matrix([property_feature], cutoff_time=cutoff_df)

    # Date string among datetimes. Written through a single .loc call:
    # chained ``df['time'].iloc[9] = ...`` may write to a temporary copy and
    # leave the frame unchanged. The frame has the default integer index, so
    # label 9 is row 9.
    cutoff_df.loc[9, 'time'] = '2018-04-02 18:50:45.453216'
    with pytest.raises(TypeError):
        calculate_feature_matrix([property_feature], cutoff_time=cutoff_df)

    # Numeric string among datetimes.
    times[9] = '17'
    cutoff_df['time'] = times
    with pytest.raises(ValueError):
        calculate_feature_matrix([property_feature], cutoff_time=cutoff_df)
def test_integer_time_index_mixed_cutoff(int_es):
    """Integer cutoff times mixed with any other value type raise TypeError.

    Row 9 of the ``time`` column is set in turn to a datetime, a non-date
    string, a date string and a numeric string; every variant must be
    rejected with ``TypeError``.
    """
    times_dt = list(range(8, 17)) + [
        datetime(2011, 1, 1), 19, 20, 21, 22, 25, 24, 23
    ]
    labels = [False] * 3 + [True] * 2 + [False] * 9 + [False] * 2 + [True]
    instances = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16, 15, 14]
    cutoff_df = pd.DataFrame({
        'time': times_dt,
        'instance_id': instances,
        'labels': labels
    })
    cutoff_df = cutoff_df[['time', 'instance_id', 'labels']]
    property_feature = IdentityFeature(int_es['log']['value']) > 10

    # Datetime among integers.
    with pytest.raises(TypeError):
        calculate_feature_matrix([property_feature], cutoff_time=cutoff_df)

    # Non-date string among integers.
    times_str = list(range(8, 17)) + ["foobar", 19, 20, 21, 22, 25, 24, 23]
    cutoff_df['time'] = times_str
    with pytest.raises(TypeError):
        calculate_feature_matrix([property_feature], cutoff_time=cutoff_df)

    # Date string among integers.
    times_date_str = list(range(8, 17)) + [
        '2018-04-02', 19, 20, 21, 22, 25, 24, 23
    ]
    cutoff_df['time'] = times_date_str
    with pytest.raises(TypeError):
        calculate_feature_matrix([property_feature], cutoff_time=cutoff_df)

    # Numeric string among integers.
    times_int_str = list(range(8, 17)) + ['17', 19, 20, 21, 22, 25, 24, 23]
    cutoff_df['time'] = times_int_str
    with pytest.raises(TypeError):
        calculate_feature_matrix([property_feature], cutoff_time=cutoff_df)
def test_saveprogress(entityset):
    """``save_progress`` writes one CSV per cutoff time with matching content.

    The matrix computed with ``save_progress`` is compared against the
    concatenation of the files written to the directory and against a matrix
    computed without saving. The temporary directory is removed even when an
    assertion fails.
    """
    times = (
        [datetime(2011, 4, 9, 10, 30, i * 6) for i in range(5)] +
        [datetime(2011, 4, 9, 10, 31, i * 9) for i in range(4)] +
        [datetime(2011, 4, 9, 10, 40, 0)] +
        [datetime(2011, 4, 10, 10, 40, i) for i in range(2)] +
        [datetime(2011, 4, 10, 10, 41, i * 3) for i in range(3)] +
        [datetime(2011, 4, 10, 11, 10, i * 3) for i in range(2)]
    )
    property_feature = IdentityFeature(entityset['log']['value']) > 10
    save_progress = tempfile.mkdtemp()
    try:
        fm_save = calculate_feature_matrix([property_feature],
                                           entityset,
                                           instance_ids=range(17),
                                           cutoff_time=times,
                                           save_progress=save_progress)

        # Only the files directly inside the directory are of interest.
        _, _, file_names = next(os.walk(save_progress))
        files = [os.path.join(save_progress, name) for name in file_names]

        # there are 17 datetime files created above
        assert len(files) == 17

        list_df = [pd.read_csv(path, index_col="id", header=0)
                   for path in files]
        merged_df = pd.concat(list_df)
        merged_df.set_index(pd.DatetimeIndex(times), append=True,
                            inplace=True)

        fm_no_save = calculate_feature_matrix([property_feature],
                                              entityset,
                                              instance_ids=range(17),
                                              cutoff_time=times)

        assert np.all((merged_df.sort_index().values) ==
                      (fm_save.sort_index().values))
        assert np.all((fm_no_save.sort_index().values) ==
                      (fm_save.sort_index().values))
        assert np.all((fm_no_save.sort_index().values) ==
                      (merged_df.sort_index().values))
    finally:
        shutil.rmtree(save_progress)
def test_calc_feature_matrix(entityset):
    """Happy path plus argument validation of ``calculate_feature_matrix``:
    invalid ``features`` raise AssertionError, an integer ``cutoff_time``
    raises TypeError."""
    times = (
        [datetime(2011, 4, 9, 10, 30, sec * 6) for sec in range(5)] +
        [datetime(2011, 4, 9, 10, 31, sec * 9) for sec in range(4)] +
        [datetime(2011, 4, 9, 10, 40, 0)] +
        [datetime(2011, 4, 10, 10, 40, sec) for sec in range(2)] +
        [datetime(2011, 4, 10, 10, 41, sec * 3) for sec in range(3)] +
        [datetime(2011, 4, 10, 11, 10, sec * 3) for sec in range(2)]
    )
    labels = [False] * 3 + [True] * 2 + [False] * 9 + [True] + [False] * 2
    property_feature = IdentityFeature(entityset['log']['value']) > 10

    feature_matrix = calculate_feature_matrix([property_feature],
                                              entityset,
                                              instance_ids=range(17),
                                              cutoff_time=times,
                                              verbose=True)
    assert (feature_matrix == labels).values.all()

    # A string, an empty list and a list of non-features are all rejected.
    for invalid_features in ('features', [], [1, 2, 3]):
        with pytest.raises(AssertionError):
            calculate_feature_matrix(invalid_features,
                                     entityset,
                                     instance_ids=range(17),
                                     cutoff_time=times)

    with pytest.raises(TypeError):
        calculate_feature_matrix([property_feature],
                                 entityset,
                                 instance_ids=range(17),
                                 cutoff_time=17)
def test_encode_features_catches_features_mismatch(entityset):
    """``encode_features`` raises AssertionError when given a feature that
    was not used to build the feature matrix."""
    log = entityset["log"]
    product_feat = IdentityFeature(log["product_id"])
    value_feat = IdentityFeature(log["value"])
    session_feat = IdentityFeature(log["session_id"])

    cutoff_time = pd.DataFrame(
        {
            'instance_id': range(6),
            'time': entityset['log'].df['datetime'][0:6],
            'label': [n % 2 for n in range(6)],
        },
        columns=["instance_id", "time", "label"])

    feature_matrix = calculate_feature_matrix([product_feat, value_feat],
                                              entityset,
                                              cutoff_time)
    assert 'label' in feature_matrix.columns

    # session_feat is not part of the matrix computed above.
    with pytest.raises(AssertionError):
        encode_features(feature_matrix, [product_feat, session_feat])
def test_diff(es):
    """``Diff`` of log value grouped by session id and by customer id yields
    the expected per-row differences (NaN where expected)."""
    value = IdentityFeature(es['log']['value'])
    customer_id_feat = DirectFeature(es['sessions']['customer_id'],
                                     child_entity=es['log'])
    by_session = Diff(value, es['log']['session_id'])
    by_customer = Diff(value, customer_id_feat)

    calculator = PandasBackend(es, [by_session, by_customer])
    frame = calculator.calculate_all_features(instance_ids=range(15),
                                              time_last=None)
    session_vals = frame[by_session.get_name()].values.tolist()
    customer_vals = frame[by_customer.get_name()].values.tolist()

    expected_session = [
        np.nan, 5, 5, 5, 5, np.nan, 1, 1, 1, np.nan, np.nan, 5, np.nan, 7, 7
    ]
    expected_customer = [
        np.nan, 5, 5, 5, 5, -20, 1, 1, 1, -3, np.nan, 5, -5, 7, 7
    ]

    def check(actual, expected):
        # NaN never compares equal, so it is matched with isnan instead.
        if np.isnan(actual):
            assert (np.isnan(expected))
        else:
            assert actual == expected

    for row, session_val in enumerate(session_vals):
        check(session_val, expected_session[row])
        check(customer_vals[row], expected_customer[row])
def test_encodes_features(entityset):
    """The number of encoded features is as expected for the default
    settings, for ``top_n=2`` and for ``include_unknown=False``."""
    log = entityset["log"]
    features = [
        IdentityFeature(log["product_id"]),
        IdentityFeature(log["purchased"]),
        IdentityFeature(log["value"]),
    ]
    feature_matrix = calculate_feature_matrix(
        features, entityset, instance_ids=[0, 1, 2, 3, 4, 5])

    _, encoded_default = encode_features(feature_matrix, features)
    assert len(encoded_default) == 6

    _, encoded_top_two = encode_features(feature_matrix, features, top_n=2)
    assert len(encoded_top_two) == 5

    _, encoded_no_unknown = encode_features(feature_matrix, features,
                                            include_unknown=False)
    assert len(encoded_no_unknown) == 5
def test_direct_squared(entityset, backend):
    """A feature multiplied by itself equals the square of its values.

    The backend is asked for ``[feature, squared]``; the first result column
    is compared, squared, against the second for every row.
    """
    feature = IdentityFeature(entityset['log']['value'])
    squared = feature * feature
    pandas_backend = backend([feature, squared])
    df = pandas_backend.calculate_all_features(instance_ids=[0, 1, 2],
                                               time_last=None)
    for _, row in df.iterrows():
        # Explicit positional access: plain row[0] on a label-indexed Series
        # relies on a deprecated positional fallback.
        base_value = row.iloc[0]
        squared_value = row.iloc[1]
        assert (base_value * base_value) == squared_value
def test_to_encode_features(entityset):
    """Restricting ``to_encode`` changes the shape of the encoded matrix
    compared with encoding everything."""
    log = entityset["log"]
    features = [IdentityFeature(log["product_id"]),
                IdentityFeature(log["value"])]
    feature_matrix = calculate_feature_matrix(
        features, entityset, instance_ids=[0, 1, 2, 3, 4, 5])

    fully_encoded, _ = encode_features(feature_matrix, features)
    baseline_shape = fully_encoded.shape

    # to_encode should keep product_id as a string, and not create 3
    # additional columns
    nothing_encoded, _ = encode_features(feature_matrix, features,
                                         to_encode=[])
    assert baseline_shape != nothing_encoded.shape

    value_only, _ = encode_features(feature_matrix, features,
                                    to_encode=['value'])
    assert baseline_shape != value_only.shape
def test_integer_time_index_datetime_cutoffs(int_es):
    """Datetime cutoff times against the integer-time-index entityset raise
    TypeError."""
    now = datetime.now()
    cutoff_df = pd.DataFrame({
        'time': [now] * 17,
        'instance_id': range(17),
    })
    value_gt_10 = IdentityFeature(int_es['log']['value']) > 10

    with pytest.raises(TypeError):
        calculate_feature_matrix([value_gt_10],
                                 cutoff_time=cutoff_df,
                                 cutoff_time_in_index=True)
def test_make_agg_feat_where_count(entityset, backend):
    """Count of log rows per session, restricted to ``product_id ==
    'coke zero'``, is 3 for session 0."""
    is_coke_zero = IdentityFeature(entityset['log']['product_id']) == 'coke zero'
    coke_zero_count = Count(entityset['log']['id'],
                            parent_entity=entityset['sessions'],
                            where=is_coke_zero)

    calculator = backend([coke_zero_count])
    frame = calculator.calculate_all_features(instance_ids=[0],
                                              time_last=None)
    result = frame[coke_zero_count.get_name()][0]
    assert (result == 3)
def test_isnull_feat(es):
    """``IsNull`` over a per-session ``Diff`` flags exactly the rows whose
    diff is NaN."""
    log_value = IdentityFeature(es['log']['value'])
    session_diff = Diff(log_value, es['log']['session_id'])
    diff_is_null = IsNull(session_diff)

    calculator = PandasBackend(es, [diff_is_null])
    frame = calculator.calculate_all_features(range(15), None)

    # For reference, the underlying diff values are:
    # [nan, 5, 5, 5, 5, nan, 1, 1, 1, nan, nan, 5, nan, 7, 7]
    expected = [True, False, False, False, False,
                True, False, False, False, True,
                True, False, True, False, False]
    actual = frame[diff_is_null.get_name()].values.tolist()
    assert expected == actual
def test_arithmetic_of_transform(es):
    """Add/Subtract/Multiply/Divide applied to two ``Diff`` features give the
    expected values; the first value of each result is NaN."""
    product_id = es['log']['product_id']
    diff1 = Diff(IdentityFeature(es['log']['value']),
                 IdentityFeature(product_id))
    diff2 = Diff(IdentityFeature(es['log']['value_2']),
                 IdentityFeature(product_id))

    cases = [
        (Add, [np.nan, 14., -7., 3.]),
        (Subtract, [np.nan, 6., -3., 1.]),
        (Multiply, [np.nan, 40., 10., 2.]),
        (Divide, [np.nan, 2.5, 2.5, 2.]),
    ]
    features = [primitive(diff1, diff2) for primitive, _ in cases]

    calculator = PandasBackend(es, features)
    frame = calculator.calculate_all_features(instance_ids=[0, 2, 11, 13],
                                              time_last=None)

    for feature, (_, expected) in zip(features, cases):
        actual = frame[feature.get_name()].values.tolist()
        # The leading NaN is checked separately because NaN != NaN.
        assert np.isnan(actual[0])
        assert np.isnan(expected[0])
        assert actual[1:] == expected[1:]
def test_integer_time_index(int_es):
    """Integer cutoff times produce the expected labels, and the time level
    of the result index follows the cutoff frame sorted by time and
    instance."""
    cutoff_values = list(range(8, 18)) + list(range(19, 26))
    labels = [False] * 3 + [True] * 2 + [False] * 9 + [True] + [False] * 2
    cutoff_df = pd.DataFrame({'time': cutoff_values,
                              'instance_id': range(17)})
    value_gt_10 = IdentityFeature(int_es['log']['value']) > 10

    feature_matrix = calculate_feature_matrix([value_gt_10],
                                              cutoff_time=cutoff_df,
                                              cutoff_time_in_index=True)

    index_times = feature_matrix.index.get_level_values(1).values
    expected_times = cutoff_df.sort_values(['time', 'instance_id'],
                                           kind='mergesort')['time'].values
    assert (index_times == expected_times).all()
    assert (feature_matrix == labels).values.all()
def test_inplace_encodes_features(entityset):
    """Without ``inplace`` the input matrix keeps its shape; with
    ``inplace=True`` the input and the returned matrix have the same
    shape."""
    features = [IdentityFeature(entityset["log"]["product_id"])]
    feature_matrix = calculate_feature_matrix(
        features, entityset, instance_ids=[0, 1, 2, 3, 4, 5])
    original_shape = feature_matrix.shape

    encoded, _ = encode_features(feature_matrix, features)
    assert encoded.shape != original_shape
    assert feature_matrix.shape == original_shape

    # inplace they should be the same
    encoded, _ = encode_features(feature_matrix, features, inplace=True)
    assert encoded.shape == feature_matrix.shape
def test_verbose_cutoff_time_chunks(entityset):
    """``chunk_size="cutoff time"`` with ``verbose=True`` produces the
    expected labels."""
    times = (
        [datetime(2011, 4, 9, 10, 30, sec * 6) for sec in range(5)] +
        [datetime(2011, 4, 9, 10, 31, sec * 9) for sec in range(4)] +
        [datetime(2011, 4, 9, 10, 40, 0)] +
        [datetime(2011, 4, 10, 10, 40, sec) for sec in range(2)] +
        [datetime(2011, 4, 10, 10, 41, sec * 3) for sec in range(3)] +
        [datetime(2011, 4, 10, 11, 10, sec * 3) for sec in range(2)]
    )
    labels = [False] * 3 + [True] * 2 + [False] * 9 + [True] + [False] * 2
    cutoff_time = pd.DataFrame({'time': times, 'instance_id': range(17)})
    value_gt_10 = IdentityFeature(entityset['log']['value']) > 10

    feature_matrix = calculate_feature_matrix([value_gt_10],
                                              entityset,
                                              cutoff_time=cutoff_time,
                                              chunk_size="cutoff time",
                                              verbose=True)
    assert (feature_matrix == labels).values.all()
def test_parallel_failure_raises_correct_error(entityset):
    """``n_jobs=0`` makes ``calculate_feature_matrix`` raise
    AssertionError."""
    times = (
        [datetime(2011, 4, 9, 10, 30, sec * 6) for sec in range(5)] +
        [datetime(2011, 4, 9, 10, 31, sec * 9) for sec in range(4)] +
        [datetime(2011, 4, 9, 10, 40, 0)] +
        [datetime(2011, 4, 10, 10, 40, sec) for sec in range(2)] +
        [datetime(2011, 4, 10, 10, 41, sec * 3) for sec in range(3)] +
        [datetime(2011, 4, 10, 11, 10, sec * 3) for sec in range(2)]
    )
    cutoff_time = pd.DataFrame({'time': times, 'instance_id': range(17)})
    value_gt_10 = IdentityFeature(entityset['log']['value']) > 10

    with pytest.raises(AssertionError):
        calculate_feature_matrix([value_gt_10],
                                 entityset=entityset,
                                 cutoff_time=cutoff_time,
                                 verbose=True,
                                 chunk_size=.13,
                                 n_jobs=0,
                                 approximate='1 hour')
def test_make_agg_feat_multiple_dtypes(entityset, backend):
    """A numeric (Count) and a categorical (Mode) aggregation sharing one
    ``where`` clause are computed together for session 0."""
    log = entityset['log']
    sessions = entityset['sessions']
    is_coke_zero = IdentityFeature(log['product_id']) == 'coke zero'

    count_feat = Count(log['id'],
                       parent_entity=sessions,
                       where=is_coke_zero)
    mode_feat = Mode(log['product_id'],
                     parent_entity=sessions,
                     where=is_coke_zero)

    calculator = backend([count_feat, mode_feat])
    frame = calculator.calculate_all_features(instance_ids=[0],
                                              time_last=None)

    count_value = frame[count_feat.get_name()][0]
    mode_value = frame[mode_feat.get_name()][0]
    assert (count_value == 3)
    assert (mode_value == 'coke zero')
def test_make_agg_feat_where_count_or_device_type_feat(entityset, backend):
    """
    Feature we're creating is:
    Number of sessions for each customer where the number of logs in the
    session is greater than 1 OR the session's device_type equals 1.

    ``Count.max_stack_depth`` is set to 2 for the duration of the test and
    put back afterwards so the change does not leak into other tests.
    """
    # Remember whether Count itself defines the attribute, so that cleanup
    # can either restore the old value or remove the override again.
    own_attrs = vars(Count)
    had_override = 'max_stack_depth' in own_attrs
    saved_depth = own_attrs.get('max_stack_depth')

    Count.max_stack_depth = 2
    try:
        log_count_feat = Count(entityset['log']['id'],
                               parent_entity=entityset['sessions'])

        compare_count = log_count_feat > 1
        compare_device_type = IdentityFeature(
            entityset['sessions']['device_type']) == 1
        or_feat = compare_count.OR(compare_device_type)
        feat = Count(entityset['sessions']['id'],
                     parent_entity=entityset['customers'],
                     where=or_feat)

        pandas_backend = backend([feat])
        df = pandas_backend.calculate_all_features(instance_ids=[0],
                                                   time_last=None)
        name = feat.get_name()
        instances = df[name]
        assert (instances[0] == 3)
    finally:
        if had_override:
            Count.max_stack_depth = saved_depth
        else:
            del Count.max_stack_depth