def retrieve_online(feature_store: FeatureStore, dataset: pd.DataFrame) -> dict:
    """Pick one random hospital number from `dataset` and fetch its online features.

    Args:
        feature_store: The Feast feature store to query.
        dataset: Frame containing a "Hospital Number" column to sample from.

    Returns:
        The online-feature response for the sampled entity.
    """
    chosen_id = random.choice(dataset["Hospital Number"])
    logger.info(f"Hospital Number chosen for inference is: {chosen_id}")
    return feature_store.get_online_features(
        FEAST_FEATURES, [{"Hospital Number": chosen_id}]
    )
def main():
    """Demo: deploy the repo to AWS, materialize, then read from the online store."""
    # Widen pandas output so the final frame prints without truncation.
    pd.set_option("display.max_columns", None)
    pd.set_option("display.width", 1000)

    # Load the feature store from the current path
    store = FeatureStore(repo_path=".")

    # Deploy the feature store to AWS
    print("Deploying feature store to AWS...")
    store.apply([driver, driver_hourly_stats_view])

    # Select features
    refs = [
        "driver_hourly_stats:conv_rate",
        "driver_hourly_stats:acc_rate",
    ]

    print("Loading features into the online store...")
    store.materialize_incremental(end_date=datetime.now())

    print("Retrieving online features...")
    # Retrieve features from the online store (DynamoDB)
    rows = [{"driver_id": entity_id} for entity_id in (1001, 1002)]
    feature_dict = store.get_online_features(
        feature_refs=refs,
        entity_rows=rows,
    ).to_dict()
    print(pd.DataFrame.from_dict(feature_dict))
def _assert_online_features(store: FeatureStore, driver_df: pd.DataFrame,
                            max_date: datetime):
    """Assert that features in online store are up to date with `max_date` date."""
    # Read features back
    response = store.get_online_features(
        features=[
            "driver_hourly_stats:conv_rate",
            "driver_hourly_stats:avg_daily_trips",
            "global_daily_stats:num_rides",
            "global_daily_stats:avg_ride_length",
        ],
        entity_rows=[{"driver_id": 1001}],
        full_feature_names=True,
    )

    # Float features should still be floats from the online store...
    feature_names = list(response.proto.metadata.feature_names.val)
    conv_rate_idx = feature_names.index("driver_hourly_stats__conv_rate")
    assert response.proto.results[conv_rate_idx].values[0].float_val > 0

    result = response.to_dict()
    assert len(result) == 5
    assert "driver_hourly_stats__avg_daily_trips" in result
    assert "driver_hourly_stats__conv_rate" in result
    expected = _get_last_feature_row(driver_df, 1001, max_date)["conv_rate"]
    assert abs(result["driver_hourly_stats__conv_rate"][0] - expected) < 0.01
    assert "global_daily_stats__num_rides" in result
    assert "global_daily_stats__avg_ride_length" in result
def _assert_online_features(store: FeatureStore, driver_df: pd.DataFrame,
                            max_date: datetime):
    """Assert that features in online store are up to date with `max_date` date."""
    # Read features back for a single driver entity.
    result = store.get_online_features(
        features=[
            "driver_hourly_stats:conv_rate",
            "driver_hourly_stats:avg_daily_trips",
            "global_daily_stats:num_rides",
            "global_daily_stats:avg_ride_length",
        ],
        entity_rows=[{"driver_id": 1001}],
        full_feature_names=True,
    ).to_dict()

    assert len(result) == 5
    assert "driver_hourly_stats__avg_daily_trips" in result
    assert "driver_hourly_stats__conv_rate" in result
    expected = _get_last_feature_row(driver_df, 1001, max_date)["conv_rate"]
    assert abs(result["driver_hourly_stats__conv_rate"][0] - expected) < 0.01
    assert "global_daily_stats__num_rides" in result
    assert "global_daily_stats__avg_ride_length" in result
def check_offline_and_online_features(
    fs: FeatureStore,
    fv: FeatureView,
    driver_id: int,
    event_timestamp: datetime,
    expected_value: Optional[float],
    full_feature_names: bool,
    check_offline_store: bool = True,
) -> None:
    """Assert that the online (and optionally offline) store serve `expected_value`.

    Args:
        fs: Feature store under test.
        fv: Feature view whose `value` feature is checked.
        driver_id: Entity key to look up.
        event_timestamp: Timestamp for the historical (offline) lookup.
        expected_value: Expected feature value; None means "no value expected".
        full_feature_names: Whether responses use `<view>__value` keys.
        check_offline_store: Also verify the offline store when True.
    """
    # The response key depends only on full_feature_names; compute it once
    # instead of duplicating every assertion in two branches.
    feature_key = f"{fv.name}__value" if full_feature_names else "value"

    # Check online store
    response_dict = fs.get_online_features(
        [f"{fv.name}:value"],
        [{"driver_id": driver_id}],
        full_feature_names=full_feature_names,
    ).to_dict()

    # BUG FIX: compare against None explicitly. The original used
    # `if expected_value:`, which misroutes a legitimate expected value of
    # 0.0 (falsy) into the "expect None" branch.
    if expected_value is not None:
        assert (
            abs(response_dict[feature_key][0] - expected_value) < 1e-6
        ), f"Response: {response_dict}, Expected: {expected_value}"
    else:
        assert response_dict[feature_key][0] is None

    # Check offline store
    if check_offline_store:
        df = fs.get_historical_features(
            entity_df=pd.DataFrame.from_dict({
                "driver_id": [driver_id],
                "event_timestamp": [event_timestamp]
            }),
            features=[f"{fv.name}:value"],
            full_feature_names=full_feature_names,
        ).to_df()
        # Hoisted: the original rebuilt df.to_dict(orient="list") up to
        # three times per branch.
        values = df.to_dict(orient="list")[feature_key]
        if expected_value is not None:
            assert abs(values[0] - expected_value) < 1e-6
        else:
            # Missing offline values show up as an empty column or NaN.
            assert not values or math.isnan(values[0])
class DriverRankingModel:
    """Scores candidate drivers with a pre-trained model fed by Feast features."""

    def __init__(self):
        # Load model
        self.model = load("driver_model.bin")
        # Set up feature store
        self.fs = FeatureStore(repo_path="driver_ranking/")

    def predict(self, driver_ids):
        """Return the id of the highest-scoring driver among `driver_ids`."""
        # Read features from Feast
        response = self.fs.get_online_features(
            entity_rows=[{"driver_id": candidate} for candidate in driver_ids],
            feature_refs=[
                "driver_hourly_stats:conv_rate",
                "driver_hourly_stats:acc_rate",
                "driver_hourly_stats:avg_daily_trips",
            ],
        )
        frame = pd.DataFrame.from_dict(response.to_dict())

        # Make prediction
        frame["prediction"] = self.model.predict(frame)

        # Choose best driver
        best_position = frame["prediction"].argmax()

        # return best driver
        return frame["driver_id"].iloc[best_position]
def main():
    """Demo: deploy to Snowflake, build training data, materialize, read online."""
    pd.set_option("display.max_columns", None)
    pd.set_option("display.width", 1000)

    # Load the feature store from the current path
    store = FeatureStore(repo_path=".")

    # Deploy the feature store to Snowflake
    print("Deploying feature store to Snowflake...")
    store.apply([driver, driver_stats_fv])

    # Select features
    selected = [
        "driver_hourly_stats:conv_rate", "driver_hourly_stats:acc_rate"
    ]

    # Create an entity dataframe. This is the dataframe that will be enriched with historical features
    timestamps = [
        pd.Timestamp(dt, unit="ms", tz="UTC").round("ms")
        for dt in pd.date_range(
            start=datetime.now() - timedelta(days=3),
            end=datetime.now(),
            periods=3,
        )
    ]
    entity_df = pd.DataFrame({
        "event_timestamp": timestamps,
        "driver_id": [1001, 1002, 1003],
    })

    print("Retrieving training data...")
    # Retrieve historical features by joining the entity dataframe to the Snowflake table source
    training_df = store.get_historical_features(
        features=selected, entity_df=entity_df
    ).to_df()
    print()
    print(training_df)
    print()

    print("Loading features into the online store...")
    store.materialize_incremental(end_date=datetime.now())
    print()

    print("Retrieving online features...")
    # Retrieve features from the online store
    online_dict = store.get_online_features(
        features=selected,
        entity_rows=[{"driver_id": 1001}, {"driver_id": 1002}],
    ).to_dict()
    print()
    print(pd.DataFrame.from_dict(online_dict))
def run_demo():
    """Print historical features from a saved dataset, then online features."""
    store = FeatureStore(repo_path=".")

    print("--- Historical features (from saved dataset) ---")
    saved = store.get_saved_dataset("my_training_ds")
    print(saved.to_df())

    print("\n--- Online features ---")
    online = store.get_online_features(
        features=store.get_feature_service("credit_score_v3"),
        entity_rows=[
            {"zipcode": 30721, "dob_ssn": "19530219_5179", "transaction_amt": 1023}
        ],
    ).to_dict()
    # Sorted by feature name for stable, readable output.
    for name in sorted(online):
        print(name, " : ", online[name])
def _assert_online_features(store: FeatureStore, driver_df: pd.DataFrame,
                            max_date: datetime):
    """Assert that features in online store are up to date with `max_date` date.

    Args:
        store: Feature store to read online features from.
        driver_df: Source dataframe used to derive the expected value.
        max_date: Upper bound passed to `_get_last_feature_row`.
    """
    # Read features back
    response = store.get_online_features(
        feature_refs=[
            "driver_hourly_stats:conv_rate",
            "driver_hourly_stats:avg_daily_trips",
        ],
        entity_rows=[{"driver_id": 1001}],
    )
    # Convert once: the original called result.to_dict() on every assertion,
    # rebuilding the dict three times.
    result = response.to_dict()
    assert "driver_hourly_stats__avg_daily_trips" in result
    assert "driver_hourly_stats__conv_rate" in result
    assert (abs(result["driver_hourly_stats__conv_rate"][0] -
                _get_last_feature_row(driver_df, 1001, max_date)["conv_rate"])
            < 0.01)
def test_online() -> None:
    """
    Test reading from the online store in local mode.
    """
    runner = CliRunner()
    with runner.local_repo(
            get_example_repo("example_feature_repo_1.py")) as store:
        # Write some data to two tables
        driver_locations_fv = store.get_feature_view(name="driver_locations")
        customer_profile_fv = store.get_feature_view(name="customer_profile")
        customer_driver_combined_fv = store.get_feature_view(
            name="customer_driver_combined")
        # NOTE(review): writes go through the provider's private API so the
        # test can seed the online store without a materialization run.
        provider = store._get_provider()

        driver_key = EntityKeyProto(join_keys=["driver"],
                                    entity_values=[ValueProto(int64_val=1)])
        provider.online_write_batch(
            project=store.config.project,
            table=driver_locations_fv,
            data=[(
                driver_key,
                {
                    "lat": ValueProto(double_val=0.1),
                    "lon": ValueProto(string_val="1.0"),
                },
                datetime.utcnow(),
                datetime.utcnow(),
            )],
            progress=None,
        )

        customer_key = EntityKeyProto(join_keys=["customer"],
                                      entity_values=[ValueProto(int64_val=5)])
        provider.online_write_batch(
            project=store.config.project,
            table=customer_profile_fv,
            data=[(
                customer_key,
                {
                    "avg_orders_day": ValueProto(float_val=1.0),
                    "name": ValueProto(string_val="John"),
                    "age": ValueProto(int64_val=3),
                },
                datetime.utcnow(),
                datetime.utcnow(),
            )],
            progress=None,
        )

        # Composite entity key: this view joins on both customer and driver.
        customer_key = EntityKeyProto(
            join_keys=["customer", "driver"],
            entity_values=[ValueProto(int64_val=5),
                           ValueProto(int64_val=1)],
        )
        provider.online_write_batch(
            project=store.config.project,
            table=customer_driver_combined_fv,
            data=[(
                customer_key,
                {
                    "trips": ValueProto(int64_val=7)
                },
                datetime.utcnow(),
                datetime.utcnow(),
            )],
            progress=None,
        )

        # Retrieve two features using two keys, one valid one non-existing
        result = store.get_online_features(
            feature_refs=[
                "driver_locations:lon",
                "customer_profile:avg_orders_day",
                "customer_profile:name",
                "customer_driver_combined:trips",
            ],
            entity_rows=[{
                "driver": 1,
                "customer": 5
            }, {
                "driver": 1,
                "customer": 5
            }],
        ).to_dict()
        assert "driver_locations__lon" in result
        assert "customer_profile__avg_orders_day" in result
        assert "customer_profile__name" in result
        assert result["driver"] == [1, 1]
        assert result["customer"] == [5, 5]
        assert result["driver_locations__lon"] == ["1.0", "1.0"]
        assert result["customer_profile__avg_orders_day"] == [1.0, 1.0]
        assert result["customer_profile__name"] == ["John", "John"]
        assert result["customer_driver_combined__trips"] == [7, 7]

        # Ensure features are still in result when keys not found
        result = store.get_online_features(
            feature_refs=["customer_driver_combined:trips"],
            entity_rows=[{
                "driver": 0,
                "customer": 0
            }],
        ).to_dict()
        assert "customer_driver_combined__trips" in result

        # invalid table reference
        with pytest.raises(FeatureViewNotFoundException):
            store.get_online_features(
                feature_refs=["driver_locations_bad:lon"],
                entity_rows=[{
                    "driver": 1
                }],
            )

        # Create new FeatureStore object with fast cache invalidation
        cache_ttl = 1
        fs_fast_ttl = FeatureStore(config=RepoConfig(
            registry=RegistryConfig(path=store.config.registry,
                                    cache_ttl_seconds=cache_ttl),
            online_store=store.config.online_store,
            project=store.config.project,
            provider=store.config.provider,
        ))

        # Should download the registry and cache it permanently (or until manually refreshed)
        result = fs_fast_ttl.get_online_features(
            feature_refs=[
                "driver_locations:lon",
                "customer_profile:avg_orders_day",
                "customer_profile:name",
                "customer_driver_combined:trips",
            ],
            entity_rows=[{
                "driver": 1,
                "customer": 5
            }],
        ).to_dict()
        assert result["driver_locations__lon"] == ["1.0"]
        assert result["customer_driver_combined__trips"] == [7]

        # Rename the registry.db so that it cant be used for refreshes
        os.rename(store.config.registry, store.config.registry + "_fake")

        # Wait for registry to expire
        time.sleep(cache_ttl)

        # Will try to reload registry because it has expired (it will fail because we deleted the actual registry file)
        with pytest.raises(FileNotFoundError):
            fs_fast_ttl.get_online_features(
                feature_refs=[
                    "driver_locations:lon",
                    "customer_profile:avg_orders_day",
                    "customer_profile:name",
                    "customer_driver_combined:trips",
                ],
                entity_rows=[{
                    "driver": 1,
                    "customer": 5
                }],
            ).to_dict()

        # Restore registry.db so that we can see if it actually reloads registry
        os.rename(store.config.registry + "_fake", store.config.registry)

        # Test if registry is actually reloaded and whether results return
        result = fs_fast_ttl.get_online_features(
            feature_refs=[
                "driver_locations:lon",
                "customer_profile:avg_orders_day",
                "customer_profile:name",
                "customer_driver_combined:trips",
            ],
            entity_rows=[{
                "driver": 1,
                "customer": 5
            }],
        ).to_dict()
        assert result["driver_locations__lon"] == ["1.0"]
        assert result["customer_driver_combined__trips"] == [7]

        # Create a registry with infinite cache (for users that want to manually refresh the registry)
        fs_infinite_ttl = FeatureStore(config=RepoConfig(
            registry=RegistryConfig(path=store.config.registry,
                                    cache_ttl_seconds=0),
            online_store=store.config.online_store,
            project=store.config.project,
            provider=store.config.provider,
        ))

        # Should return results (and fill the registry cache)
        result = fs_infinite_ttl.get_online_features(
            feature_refs=[
                "driver_locations:lon",
                "customer_profile:avg_orders_day",
                "customer_profile:name",
                "customer_driver_combined:trips",
            ],
            entity_rows=[{
                "driver": 1,
                "customer": 5
            }],
        ).to_dict()
        assert result["driver_locations__lon"] == ["1.0"]
        assert result["customer_driver_combined__trips"] == [7]

        # Wait a bit so that an arbitrary TTL would take effect
        time.sleep(2)

        # Rename the registry.db so that it cant be used for refreshes
        os.rename(store.config.registry, store.config.registry + "_fake")

        # TTL is infinite so this method should use registry cache
        result = fs_infinite_ttl.get_online_features(
            feature_refs=[
                "driver_locations:lon",
                "customer_profile:avg_orders_day",
                "customer_profile:name",
                "customer_driver_combined:trips",
            ],
            entity_rows=[{
                "driver": 1,
                "customer": 5
            }],
        ).to_dict()
        assert result["driver_locations__lon"] == ["1.0"]
        assert result["customer_driver_combined__trips"] == [7]

        # Force registry reload (should fail because file is missing)
        with pytest.raises(FileNotFoundError):
            fs_infinite_ttl.refresh_registry()

        # Restore registry.db so that teardown works
        os.rename(store.config.registry + "_fake", store.config.registry)