def test_partial() -> None:
    """Apply an extra feature view to an existing repo via the partial apply API.

    Verifies that both the table registered by the CLI apply and the newly
    applied table pass the read/write smoke test.
    """
    runner = CliRunner()
    with runner.local_repo(get_example_repo("example_feature_repo_1.py"), "bigquery") as store:
        locations_source = BigQuerySource(
            table_ref="rh_prod.ride_hailing_co.drivers",
            event_timestamp_column="event_timestamp",
            created_timestamp_column="created_timestamp",
        )

        extra_view = FeatureView(
            name="driver_locations_100",
            entities=["driver"],
            ttl=Duration(seconds=86400 * 1),
            features=[
                Feature(name="lat", dtype=ValueType.FLOAT),
                Feature(name="lon", dtype=ValueType.STRING),
                Feature(name="name", dtype=ValueType.STRING),
            ],
            online=True,
            input=locations_source,
            tags={},
        )

        # Partial apply: register the new view without touching existing ones.
        store.apply([extra_view])

        # Both the pre-existing view and the freshly applied one must pass RW.
        basic_rw_test(store, view_name="driver_locations")
        basic_rw_test(store, view_name="driver_locations_100")
def test_e2e_local() -> None:
    """End-to-end test using the local provider.

    1. Create a repo.
    2. Apply.
    3. Materialize parquet data into the online store.
    4. Read from the online store to make sure the data made it there.
    """
    runner = CliRunner()
    with tempfile.TemporaryDirectory() as data_dir:
        # Generate 15 days of hourly driver stats and dump them to parquet.
        end_date = datetime.now().replace(microsecond=0, second=0, minute=0)
        start_date = end_date - timedelta(days=15)

        driver_entities = [1001, 1002, 1003, 1004, 1005]
        driver_df = driver_data.create_driver_hourly_stats_df(
            driver_entities, start_date, end_date
        )

        driver_stats_path = os.path.join(data_dir, "driver_stats.parquet")
        driver_df.to_parquet(path=driver_stats_path, allow_truncated_timestamps=True)

        # The runner handles apply/teardown for us; we patch the repo
        # definition so its parquet source points at the file generated above.
        repo_definition = get_example_repo("example_feature_repo_2.py").replace(
            "%PARQUET_PATH%", driver_stats_path
        )
        with runner.local_repo(repo_definition, "file") as store:
            assert store.repo_path is not None
            repo_cwd = Path(store.repo_path)
            midpoint = end_date - timedelta(days=7)

            # feast materialize: load everything up to one week before end_date.
            proc = runner.run(
                ["materialize", start_date.isoformat(), midpoint.isoformat()],
                cwd=repo_cwd,
            )
            assert proc.returncode == 0
            _assert_online_features(store, driver_df, midpoint)

            # feast materialize-incremental: catch up to end_date.
            proc = runner.run(
                ["materialize-incremental", end_date.isoformat()],
                cwd=repo_cwd,
            )
            assert proc.returncode == 0
            _assert_online_features(store, driver_df, end_date)
def test_online() -> None:
    """Exercise online reads in local mode, including registry-cache TTL behavior.

    Flow:
      1. Write rows for three feature views directly through the provider.
      2. Read them back via ``get_online_features`` and check values.
      3. Verify behavior for missing keys and invalid feature references.
      4. Verify registry-cache expiry (short TTL) and infinite-TTL caching by
         renaming the registry file out from under a second FeatureStore.
    """
    runner = CliRunner()
    with runner.local_repo(
            get_example_repo("example_feature_repo_1.py")) as store:
        # Write some data to three tables via the provider's low-level API.
        driver_locations_fv = store.get_feature_view(name="driver_locations")
        customer_profile_fv = store.get_feature_view(name="customer_profile")
        customer_driver_combined_fv = store.get_feature_view(
            name="customer_driver_combined")
        provider = store._get_provider()

        # driver_locations: one row keyed by driver=1.
        driver_key = EntityKeyProto(join_keys=["driver"],
                                    entity_values=[ValueProto(int64_val=1)])
        provider.online_write_batch(
            project=store.config.project,
            table=driver_locations_fv,
            data=[(
                driver_key,
                {
                    "lat": ValueProto(double_val=0.1),
                    "lon": ValueProto(string_val="1.0"),
                },
                datetime.utcnow(),
                datetime.utcnow(),
            )],
            progress=None,
        )

        # customer_profile: one row keyed by customer=5.
        customer_key = EntityKeyProto(join_keys=["customer"],
                                      entity_values=[ValueProto(int64_val=5)])
        provider.online_write_batch(
            project=store.config.project,
            table=customer_profile_fv,
            data=[(
                customer_key,
                {
                    "avg_orders_day": ValueProto(float_val=1.0),
                    "name": ValueProto(string_val="John"),
                    "age": ValueProto(int64_val=3),
                },
                datetime.utcnow(),
                datetime.utcnow(),
            )],
            progress=None,
        )

        # customer_driver_combined: composite key (customer=5, driver=1).
        customer_key = EntityKeyProto(
            join_keys=["customer", "driver"],
            entity_values=[ValueProto(int64_val=5), ValueProto(int64_val=1)],
        )
        provider.online_write_batch(
            project=store.config.project,
            table=customer_driver_combined_fv,
            data=[(
                customer_key,
                {
                    "trips": ValueProto(int64_val=7)
                },
                datetime.utcnow(),
                datetime.utcnow(),
            )],
            progress=None,
        )

        # Read four features back for two (identical) entity rows.
        result = store.get_online_features(
            feature_refs=[
                "driver_locations:lon",
                "customer_profile:avg_orders_day",
                "customer_profile:name",
                "customer_driver_combined:trips",
            ],
            entity_rows=[{
                "driver": 1,
                "customer": 5
            }, {
                "driver": 1,
                "customer": 5
            }],
        ).to_dict()

        assert "driver_locations__lon" in result
        assert "customer_profile__avg_orders_day" in result
        assert "customer_profile__name" in result
        assert result["driver"] == [1, 1]
        assert result["customer"] == [5, 5]
        assert result["driver_locations__lon"] == ["1.0", "1.0"]
        assert result["customer_profile__avg_orders_day"] == [1.0, 1.0]
        assert result["customer_profile__name"] == ["John", "John"]
        assert result["customer_driver_combined__trips"] == [7, 7]

        # Ensure feature columns are still present in the result even when the
        # requested entity keys were never written.
        result = store.get_online_features(
            feature_refs=["customer_driver_combined:trips"],
            entity_rows=[{
                "driver": 0,
                "customer": 0
            }],
        ).to_dict()
        assert "customer_driver_combined__trips" in result

        # A reference to an unknown feature view must raise.
        with pytest.raises(FeatureViewNotFoundException):
            store.get_online_features(
                feature_refs=["driver_locations_bad:lon"],
                entity_rows=[{
                    "driver": 1
                }],
            )

        # Create a new FeatureStore object with fast registry-cache invalidation.
        cache_ttl = 1  # seconds
        fs_fast_ttl = FeatureStore(config=RepoConfig(
            registry=RegistryConfig(path=store.config.registry,
                                    cache_ttl_seconds=cache_ttl),
            online_store=store.config.online_store,
            project=store.config.project,
            provider=store.config.provider,
        ))

        # First read downloads the registry and fills the cache.
        result = fs_fast_ttl.get_online_features(
            feature_refs=[
                "driver_locations:lon",
                "customer_profile:avg_orders_day",
                "customer_profile:name",
                "customer_driver_combined:trips",
            ],
            entity_rows=[{
                "driver": 1,
                "customer": 5
            }],
        ).to_dict()
        assert result["driver_locations__lon"] == ["1.0"]
        assert result["customer_driver_combined__trips"] == [7]

        # Rename registry.db so that it can't be used for refreshes.
        os.rename(store.config.registry, store.config.registry + "_fake")

        # Wait for the registry cache to expire.
        time.sleep(cache_ttl)

        # The expired cache forces a reload, which must fail because we moved
        # the actual registry file away.
        with pytest.raises(FileNotFoundError):
            fs_fast_ttl.get_online_features(
                feature_refs=[
                    "driver_locations:lon",
                    "customer_profile:avg_orders_day",
                    "customer_profile:name",
                    "customer_driver_combined:trips",
                ],
                entity_rows=[{
                    "driver": 1,
                    "customer": 5
                }],
            ).to_dict()

        # Restore registry.db so that we can see whether it actually reloads.
        os.rename(store.config.registry + "_fake", store.config.registry)

        # The registry should reload and reads should succeed again.
        result = fs_fast_ttl.get_online_features(
            feature_refs=[
                "driver_locations:lon",
                "customer_profile:avg_orders_day",
                "customer_profile:name",
                "customer_driver_combined:trips",
            ],
            entity_rows=[{
                "driver": 1,
                "customer": 5
            }],
        ).to_dict()
        assert result["driver_locations__lon"] == ["1.0"]
        assert result["customer_driver_combined__trips"] == [7]

        # Registry with an infinite cache (cache_ttl_seconds=0) — for users
        # who want to refresh the registry manually.
        fs_infinite_ttl = FeatureStore(config=RepoConfig(
            registry=RegistryConfig(path=store.config.registry,
                                    cache_ttl_seconds=0),
            online_store=store.config.online_store,
            project=store.config.project,
            provider=store.config.provider,
        ))

        # Should return results (and fill the registry cache).
        result = fs_infinite_ttl.get_online_features(
            feature_refs=[
                "driver_locations:lon",
                "customer_profile:avg_orders_day",
                "customer_profile:name",
                "customer_driver_combined:trips",
            ],
            entity_rows=[{
                "driver": 1,
                "customer": 5
            }],
        ).to_dict()
        assert result["driver_locations__lon"] == ["1.0"]
        assert result["customer_driver_combined__trips"] == [7]

        # Wait a bit so that any finite TTL would have taken effect by now.
        time.sleep(2)

        # Rename registry.db so that it can't be used for refreshes.
        os.rename(store.config.registry, store.config.registry + "_fake")

        # TTL is infinite, so this read must be served from the registry cache.
        result = fs_infinite_ttl.get_online_features(
            feature_refs=[
                "driver_locations:lon",
                "customer_profile:avg_orders_day",
                "customer_profile:name",
                "customer_driver_combined:trips",
            ],
            entity_rows=[{
                "driver": 1,
                "customer": 5
            }],
        ).to_dict()
        assert result["driver_locations__lon"] == ["1.0"]
        assert result["customer_driver_combined__trips"] == [7]

        # Forcing a registry reload should fail because the file is missing.
        with pytest.raises(FileNotFoundError):
            fs_infinite_ttl.refresh_registry()

        # Restore registry.db so that teardown works.
        os.rename(store.config.registry + "_fake", store.config.registry)
def test_online() -> None:
    """Read features back from the local online store.

    NOTE(review): this redefines ``test_online`` from earlier in the file, so
    only this definition is collected by pytest — confirm the duplication is
    intentional.
    """
    runner = CliRunner()
    with runner.local_repo(get_example_repo("example_feature_repo_1.py")) as store:
        # Look up the two feature views we are going to populate.
        registry = store._get_registry()
        first_view = registry.get_feature_view(
            project=store.config.project, name="driver_locations"
        )
        second_view = registry.get_feature_view(
            project=store.config.project, name="driver_locations_2"
        )
        provider = store._get_provider()

        key = EntityKeyProto(
            entity_names=["driver"], entity_values=[ValueProto(int64_val=1)]
        )

        # Write one row (driver=1) to each view, with distinct lat/lon values.
        for view, lat_value, lon_value in (
            (first_view, 0.1, "1.0"),
            (second_view, 2.0, "2.0"),
        ):
            provider.online_write_batch(
                project=store.config.project,
                table=view,
                data=[
                    (
                        key,
                        {
                            "lat": ValueProto(double_val=lat_value),
                            "lon": ValueProto(string_val=lon_value),
                        },
                        datetime.utcnow(),
                        datetime.utcnow(),
                    )
                ],
                progress=None,
            )

        # One existing key (driver=1) and one that was never written (driver=123).
        response = store.get_online_features(
            feature_refs=["driver_locations:lon", "driver_locations_2:lon"],
            entity_rows=[{"driver": 1}, {"driver": 123}],
        )
        feature_dict = response.to_dict()
        assert "driver_locations:lon" in feature_dict
        assert feature_dict["driver_locations:lon"] == ["1.0", None]
        assert feature_dict["driver_locations_2:lon"] == ["2.0", None]

        # A reference to an unknown feature view must raise.
        with pytest.raises(ValueError):
            store.get_online_features(
                feature_refs=["driver_locations_bad:lon"],
                entity_rows=[{"driver": 1}],
            )
def test_online_to_df():
    """Test online-response-to-dataframe conversion.

    Writes three feature views, then checks that ``to_df`` returns columns and
    rows in the same order as the request (entity rows are deliberately
    requested in reversed order).
    """
    driver_ids = [1, 2, 3]
    customer_ids = [4, 5, 6]
    name = "foo"
    lon_multiply = 1.0
    lat_multiply = 0.1
    age_multiply = 10
    avg_order_day_multiply = 1.0

    runner = CliRunner()
    with runner.local_repo(get_example_repo("example_feature_repo_1.py"), "bigquery") as store:
        # Write three tables to online store
        driver_locations_fv = store.get_feature_view(name="driver_locations")
        customer_profile_fv = store.get_feature_view(name="customer_profile")
        customer_driver_combined_fv = store.get_feature_view(
            name="customer_driver_combined")
        provider = store._get_provider()

        # One row per (driver, customer) pair; values derive from the ids via
        # the multipliers so expected results can be reconstructed below.
        for (d, c) in zip(driver_ids, customer_ids):
            """
            driver table:
            driver  driver_locations__lon  driver_locations__lat
                 1                    1.0                    0.1
                 2                    2.0                    0.2
                 3                    3.0                    0.3
            """
            driver_key = EntityKeyProto(
                join_keys=["driver"], entity_values=[ValueProto(int64_val=d)])
            provider.online_write_batch(
                config=store.config,
                table=driver_locations_fv,
                data=[(
                    driver_key,
                    {
                        "lat": ValueProto(double_val=d * lat_multiply),
                        "lon": ValueProto(string_val=str(d * lon_multiply)),
                    },
                    datetime.utcnow(),
                    datetime.utcnow(),
                )],
                progress=None,
            )

            """
            customer table
            customer  customer_profile__avg_orders_day  customer_profile__name  customer_profile__age
                   4                               4.0                    foo4                     40
                   5                               5.0                    foo5                     50
                   6                               6.0                    foo6                     60
            """
            customer_key = EntityKeyProto(
                join_keys=["customer"], entity_values=[ValueProto(int64_val=c)])
            provider.online_write_batch(
                config=store.config,
                table=customer_profile_fv,
                data=[(
                    customer_key,
                    {
                        "avg_orders_day":
                            ValueProto(float_val=c * avg_order_day_multiply),
                        "name": ValueProto(string_val=name + str(c)),
                        "age": ValueProto(int64_val=c * age_multiply),
                    },
                    datetime.utcnow(),
                    datetime.utcnow(),
                )],
                progress=None,
            )

            """
            customer_driver_combined table
            customer  driver  customer_driver_combined__trips
                   4       1                                4
                   5       2                               10
                   6       3                               18
            """
            combo_keys = EntityKeyProto(
                join_keys=["customer", "driver"],
                entity_values=[
                    ValueProto(int64_val=c),
                    ValueProto(int64_val=d)
                ],
            )
            provider.online_write_batch(
                config=store.config,
                table=customer_driver_combined_fv,
                data=[(
                    combo_keys,
                    {
                        "trips": ValueProto(int64_val=c * d)
                    },
                    datetime.utcnow(),
                    datetime.utcnow(),
                )],
                progress=None,
            )

        # Get online features in dataframe form.
        result_df = store.get_online_features(
            feature_refs=[
                "driver_locations:lon",
                "driver_locations:lat",
                "customer_profile:avg_orders_day",
                "customer_profile:name",
                "customer_profile:age",
                "customer_driver_combined:trips",
            ],
            # Reverse the row order
            entity_rows=[{
                "driver": d,
                "customer": c
            } for (d, c) in zip(reversed(driver_ids), reversed(customer_ids))],
        ).to_df()

        """
        Construct the expected dataframe with reversed row order like so:
        driver  customer  driver_locations__lon  driver_locations__lat  customer_profile__avg_orders_day  customer_profile__name  customer_profile__age  customer_driver_combined__trips
             3         6                    3.0                    0.3                               6.0                    foo6                     60                               18
             2         5                    2.0                    0.2                               5.0                    foo5                     50                               10
             1         4                    1.0                    0.1                               4.0                    foo4                     40                                4
        """
        # Columns in natural (ascending-id) order; they get reversed below to
        # match the reversed request order.
        df_dict = {
            "driver": driver_ids,
            "customer": customer_ids,
            "driver_locations__lon": [str(d * lon_multiply) for d in driver_ids],
            "driver_locations__lat": [d * lat_multiply for d in driver_ids],
            "customer_profile__avg_orders_day":
                [c * avg_order_day_multiply for c in customer_ids],
            "customer_profile__name": [name + str(c) for c in customer_ids],
            "customer_profile__age": [c * age_multiply for c in customer_ids],
            "customer_driver_combined__trips":
                [d * c for (d, c) in zip(driver_ids, customer_ids)],
        }
        # Requested column order
        ordered_column = [
            "driver",
            "customer",
            "driver_locations__lon",
            "driver_locations__lat",
            "customer_profile__avg_orders_day",
            "customer_profile__name",
            "customer_profile__age",
            "customer_driver_combined__trips",
        ]
        # NOTE(review): reversed() yields an iterator, not a list; presumably
        # this pandas version materializes it into a column — verify if pandas
        # is upgraded.
        expected_df = pd.DataFrame(
            {k: reversed(v) for (k, v) in df_dict.items()})
        assert_frame_equal(result_df[ordered_column], expected_df)