def test_partial() -> None:
    """Partially apply a new feature view on top of an existing repo and verify
    that both the CLI-applied view and the newly added one pass the RW test."""
    runner = CliRunner()
    example = get_example_repo("example_feature_repo_1.py")
    with runner.local_repo(example, "bigquery") as store:
        source = BigQuerySource(
            table="feast-oss.public.drivers",
            timestamp_field="event_timestamp",
            created_timestamp_column="created_timestamp",
        )
        fields = [
            Field(name="lat", dtype=Float32),
            Field(name="lon", dtype=String),
            Field(name="name", dtype=String),
        ]
        view = FeatureView(
            name="driver_locations_100",
            entities=["driver"],
            ttl=timedelta(days=1),
            schema=fields,
            online=True,
            batch_source=source,
            tags={},
        )
        store.apply([view])
        # Both the pre-existing and the partially applied view must round-trip.
        basic_rw_test(store, view_name="driver_locations")
        basic_rw_test(store, view_name="driver_locations_100")
def test_basic(self) -> None:
    """Apply an extra feature view via the partial apply API and confirm that
    both the CLI-applied table and the new table pass the RW test."""
    runner = CliRunner()
    repo = get_example_repo("example_feature_repo_1.py")
    with runner.local_repo(repo) as store:
        source = BigQuerySource(
            table_ref="rh_prod.ride_hailing_co.drivers",
            event_timestamp_column="event_timestamp",
            created_timestamp_column="created_timestamp",
        )
        feats = [
            Feature(name="lat", dtype=ValueType.FLOAT),
            Feature(name="lon", dtype=ValueType.STRING),
            Feature(name="name", dtype=ValueType.STRING),
        ]
        view = FeatureView(
            name="driver_locations_100",
            entities=["driver"],
            ttl=Duration(seconds=86400 * 1),
            features=feats,
            online=True,
            input=source,
            tags={},
        )
        store.apply([view])
        # Both the original and the partially applied view must round-trip.
        basic_rw_test(store, view_name="driver_locations")
        basic_rw_test(store, view_name="driver_locations_100")
def store_offline(feature_store: FeatureStore, dataframe: FlyteSchema) -> FeatureStore:
    """Register the horse-colic entity and stats feature view on the store.

    The batch source points at the remote parquet written upstream; the
    definitions are applied (not materialized) and the same store is returned.
    """
    hospital_number = Entity(name="Hospital Number", value_type=ValueType.STRING)
    stat_features = [
        Feature(name=col, dtype=kind)
        for col, kind in (
            ("rectal temperature", ValueType.FLOAT),
            ("total protein", ValueType.FLOAT),
            ("peripheral pulse", ValueType.FLOAT),
            ("surgical lesion", ValueType.STRING),
            ("abdominal distension", ValueType.FLOAT),
            ("nasogastric tube", ValueType.STRING),
            ("outcome", ValueType.STRING),
            ("packed cell volume", ValueType.FLOAT),
            ("nasogastric reflux PH", ValueType.FLOAT),
        )
    ]
    stats_view = FeatureView(
        name="horse_colic_stats",
        entities=["Hospital Number"],
        features=stat_features,
        batch_source=FileSource(
            path=str(dataframe.remote_path),
            event_timestamp_column="timestamp",
        ),
        ttl=timedelta(days=1),
    )
    # Ingest the definitions into feast.
    feature_store.apply([hospital_number, stats_view])
    return feature_store
def setup_feature_store():
    """Prepares the local environment for a FeatureStore docstring test."""
    from datetime import datetime, timedelta

    from feast import Entity, Feature, FeatureStore, FeatureView, FileSource, ValueType
    from feast.repo_operations import init_repo

    init_repo("feature_repo", "local")
    store = FeatureStore(repo_path="feature_repo")
    driver_entity = Entity(
        name="driver_id",
        value_type=ValueType.INT64,
        description="driver id",
    )
    stats_source = FileSource(
        path="feature_repo/data/driver_stats.parquet",
        event_timestamp_column="event_timestamp",
        created_timestamp_column="created",
    )
    stats_view = FeatureView(
        name="driver_hourly_stats",
        entities=["driver_id"],
        ttl=timedelta(seconds=86400 * 1),
        features=[
            Feature(name="conv_rate", dtype=ValueType.FLOAT),
            Feature(name="acc_rate", dtype=ValueType.FLOAT),
            Feature(name="avg_daily_trips", dtype=ValueType.INT64),
        ],
        batch_source=stats_source,
    )
    store.apply([stats_view, driver_entity])
    # Materialize a recent window so online lookups have data to serve.
    store.materialize(
        start_date=datetime.utcnow() - timedelta(hours=3),
        end_date=datetime.utcnow() - timedelta(minutes=10),
    )
def create_order_feature_view(source, infer_features: bool = False):
    """Build the "order" feature view; the schema is omitted when inferring."""
    fields = None if infer_features else [Field(name="order_is_success", dtype=Int32)]
    return FeatureView(
        name="order",
        entities=["driver", "customer_id"],
        schema=fields,
        source=source,
        ttl=timedelta(days=2),
    )
def create_order_feature_view(source, infer_features: bool = False):
    """Build the "order" view via the legacy Feature/batch_source API."""
    feats = (
        None
        if infer_features
        else [Feature(name="order_is_success", dtype=ValueType.INT32)]
    )
    return FeatureView(
        name="order",
        entities=["driver", "customer_id"],
        features=feats,
        batch_source=source,
        ttl=timedelta(days=2),
    )
def driver_feature_view(data_source: DataSource, name="test_correctness") -> FeatureView:
    """Return a driver feature view carrying a single float "value" feature."""
    value_feature = Feature("value", ValueType.FLOAT)
    return FeatureView(
        name=name,
        entities=["driver"],
        features=[value_feature],
        ttl=timedelta(days=5),
        input=data_source,
    )
def create_field_mapping_feature_view(source):
    """Build an entity-less "field_mapping" view using the legacy Feature API."""
    # Deliberately exercises the Feature path to make sure it still works
    # for FeatureViews.
    legacy_features = [Feature(name="feature_name", dtype=ValueType.INT32)]
    return FeatureView(
        name="field_mapping",
        entities=[],
        features=legacy_features,
        source=source,
        ttl=timedelta(days=2),
    )
def create_location_stats_feature_view(source, infer_features: bool = False):
    """Return the "location_stats" view; the schema is omitted when inferring."""
    fields = None if infer_features else [Field(name="temperature", dtype=Int32)]
    return FeatureView(
        name="location_stats",
        entities=[location()],
        schema=fields,
        source=source,
        ttl=timedelta(days=2),
    )
def create_location_stats_feature_view(source, infer_features: bool = False):
    """Return the "location_stats" view built with the legacy Feature API."""
    feats = (
        None
        if infer_features
        else [Feature(name="temperature", dtype=ValueType.INT32)]
    )
    return FeatureView(
        name="location_stats",
        entities=["location_id"],
        features=feats,
        batch_source=source,
        ttl=timedelta(days=2),
    )
def create_item_embeddings_feature_view(source, infer_features: bool = False):
    """Return the "item_embeddings" view with two array-typed embedding fields."""
    if infer_features:
        fields = None
    else:
        fields = [
            Field(name="embedding_double", dtype=Array(Float64)),
            Field(name="embedding_float", dtype=Array(Float32)),
        ]
    return FeatureView(
        name="item_embeddings",
        entities=["item"],
        schema=fields,
        batch_source=source,
        ttl=timedelta(hours=2),
    )
def create_global_stats_feature_view(source, infer_features: bool = False):
    """Entity-less "global_stats" view; features are omitted when inferring."""
    if infer_features:
        feats = None
    else:
        feats = [
            Feature(name="num_rides", dtype=ValueType.INT32),
            Feature(name="avg_ride_length", dtype=ValueType.FLOAT),
        ]
    return FeatureView(
        name="global_stats",
        entities=[],
        features=feats,
        batch_source=source,
        ttl=timedelta(days=2),
    )
def create_item_embeddings_feature_view(source, infer_features: bool = False):
    """Return the "item_embeddings" view with list-typed features (legacy API)."""
    feats = None
    if not infer_features:
        feats = [
            Feature(name="embedding_double", dtype=ValueType.DOUBLE_LIST),
            Feature(name="embedding_float", dtype=ValueType.FLOAT_LIST),
        ]
    return FeatureView(
        name="item_embeddings",
        entities=["item"],
        features=feats,
        batch_source=source,
        ttl=timedelta(hours=2),
    )
def create_customer_daily_profile_feature_view(source):
    """Return the "customer_profile" view (legacy Feature API)."""
    profile_features = [
        Feature(name=col, dtype=kind)
        for col, kind in (
            ("current_balance", ValueType.FLOAT),
            ("avg_passenger_count", ValueType.FLOAT),
            ("lifetime_trip_count", ValueType.INT32),
        )
    ]
    return FeatureView(
        name="customer_profile",
        entities=["customer_id"],
        features=profile_features,
        batch_source=source,
        ttl=timedelta(days=2),
    )
def driver_feature_view(
    data_source: DataSource,
    name="test_correctness",
    infer_features: bool = False,
    dtype: FeastType = Float32,
) -> FeatureView:
    """Build a driver view with a single "value" field of the requested dtype."""
    fields = None if infer_features else [Field(name="value", dtype=dtype)]
    return FeatureView(
        name=name,
        entities=["driver"],
        schema=fields,
        ttl=timedelta(days=5),
        source=data_source,
    )
def create_pushable_feature_view(batch_source: DataSource):
    """Wrap `batch_source` in a PushSource and return a pushable stats view."""
    push_source = PushSource(
        name="location_stats_push_source",
        batch_source=batch_source,
    )
    # Uses the legacy Feature API on purpose to confirm it still works
    # for FeatureViews.
    return FeatureView(
        name="pushable_location_stats",
        entities=["location_id"],
        features=[Feature(name="temperature", dtype=ValueType.INT32)],
        ttl=timedelta(days=2),
        source=push_source,
    )
def create_driver_hourly_stats_feature_view(source):
    """Return the "driver_stats" view using the legacy Feature API."""
    stats = [
        Feature(name=col, dtype=kind)
        for col, kind in (
            ("conv_rate", ValueType.FLOAT),
            ("acc_rate", ValueType.FLOAT),
            ("avg_daily_trips", ValueType.INT32),
        )
    ]
    return FeatureView(
        name="driver_stats",
        entities=["driver"],
        features=stats,
        batch_source=source,
        ttl=timedelta(hours=2),
    )
def global_feature_view(
    data_source: DataSource,
    name="test_entityless",
    infer_features: bool = False,
    value_type: ValueType = ValueType.INT32,
) -> FeatureView:
    """Build an entity-less view carrying one "entityless_value" feature."""
    feats = None if infer_features else [Feature("entityless_value", value_type)]
    return FeatureView(
        name=name,
        entities=[],
        features=feats,
        ttl=timedelta(days=5),
        input=data_source,
    )
def driver_feature_view(
    data_source: DataSource,
    name="test_correctness",
    infer_features: bool = False,
    value_type: ValueType = ValueType.FLOAT,
) -> FeatureView:
    """Driver view with one "value" feature of the requested value type."""
    feats = None if infer_features else [Feature("value", value_type)]
    return FeatureView(
        name=name,
        entities=["driver"],
        features=feats,
        ttl=timedelta(days=5),
        input=data_source,
    )
def create_driver_hourly_stats_feature_view(source, infer_features: bool = False):
    """Return the "driver_stats" view; the schema is omitted when inferring."""
    if infer_features:
        fields = None
    else:
        fields = [
            Field(name="conv_rate", dtype=Float32),
            Field(name="acc_rate", dtype=Float32),
            Field(name="avg_daily_trips", dtype=Int32),
        ]
    return FeatureView(
        name="driver_stats",
        entities=["driver"],
        schema=fields,
        source=source,
        ttl=timedelta(hours=2),
    )
def create_customer_daily_profile_feature_view(source, infer_features: bool = False):
    """Return the "customer_profile" view; the schema is omitted when inferring."""
    if infer_features:
        fields = None
    else:
        fields = [
            Field(name="current_balance", dtype=Float32),
            Field(name="avg_passenger_count", dtype=Float32),
            Field(name="lifetime_trip_count", dtype=Int32),
        ]
    return FeatureView(
        name="customer_profile",
        entities=["customer_id"],
        schema=fields,
        source=source,
        ttl=timedelta(days=2),
    )
def global_feature_view(
    data_source: DataSource,
    name="test_entityless",
    infer_features: bool = False,
    value_type: ValueType = ValueType.INT32,
) -> FeatureView:
    """Entity-less view; exercises the legacy Feature API on purpose to make
    sure it still works for FeatureViews."""
    feats = (
        None
        if infer_features
        else [Feature(name="entityless_value", dtype=value_type)]
    )
    return FeatureView(
        name=name,
        entities=[],
        features=feats,
        ttl=timedelta(days=5),
        source=data_source,
    )
# Feature views are a grouping based on how features are stored in either the # online or offline store. driver_stats_fv = FeatureView( # The unique name of this feature view. Two feature views in a single # project cannot have the same name name="driver_hourly_stats", # The list of entities specifies the keys required for joining or looking # up features from this feature view. The reference provided in this field # correspond to the name of a defined entity (or entities) entities=["driver"], # The timedelta is the maximum age that each feature value may have # relative to its lookup time. For historical features (used in training), # TTL is relative to each timestamp provided in the entity dataframe. # TTL also allows for eviction of keys from online stores and limits the # amount of historical scanning required for historical feature values # during retrieval ttl=timedelta(weeks=52), # The list of features defined below act as a schema to both define features # for both materialization of features into a store, and are used as references # during retrieval for building a training dataset or serving features schema=[ Field(name="conv_rate", dtype=Float32), Field(name="acc_rate", dtype=Float32), Field(name="avg_daily_trips", dtype=Int64), ], # Batch sources are used to find feature values. In the case of this feature # view we will query a source table on Redshift for driver statistics # features batch_source=driver_stats_source, )
# Define an entity for the driver. You can think of entity as a primary key used to # fetch features. driver = Entity( name="driver_id", value_type=ValueType.INT64, description="driver id", ) # Our parquet files contain sample data that includes a driver_id column, timestamps and # three feature column. Here we define a Feature View that will allow us to serve this # data to our model online. driver_hourly_stats_view = FeatureView( name="driver_hourly_stats", entities=["driver_id"], ttl=Duration(seconds=86400 * 365), features=[ Feature(name="conv_rate", dtype=ValueType.DOUBLE), Feature(name="acc_rate", dtype=ValueType.FLOAT), Feature(name="avg_daily_trips", dtype=ValueType.INT64), ], online=True, batch_source=driver_hourly_stats, tags={}, ) fs = FeatureStore("") fs.apply([driver_hourly_stats_view, driver]) now = datetime.now() fs.materialize_incremental(now)
# if its parquet, it can just be a folder of parquet files, based on the parquet # format - then you can keep appending to the folder as required. batch_source = FileSource( path="/home/chapman/Documents/feast-start/feature_multi/data/events", event_timestamp_column="event_timestamp", created_timestamp_column="created_timestamp", ) # Define an entity for the driver. You can think of entity as a primary key used to # fetch features. customer = Entity( name="user_id", value_type=ValueType.INT64, description="customer id for transactions", ) # Our parquet files contain sample data that includes a driver_id column, timestamps and # three feature column. Here we define a Feature View that will allow us to serve this # data to our model online. customer_events = FeatureView( name="customer_events", entities=["user_id"], ttl=Duration(seconds=86400 * 1), features=[ Feature(name="event", dtype=ValueType.STRING), ], online=True, input=batch_source, tags={}, )
from datetime import timedelta

from feast import BigQuerySource, Entity, Feature, FeatureView, ValueType

# Source pointing at a table that does not exist (note the empty timestamp
# column) — presumably used to exercise validation/error handling; confirm
# against the test that applies this repo.
nonexistent_source = BigQuerySource(
    table_ref="project.dataset.nonexistent_table", event_timestamp_column=""
)

driver = Entity(name="driver", value_type=ValueType.INT64, description="driver id",)

# Feature view backed by the nonexistent source above.
nonexistent_features = FeatureView(
    name="driver_locations",
    entities=["driver"],
    ttl=timedelta(days=1),
    features=[
        Feature(name="lat", dtype=ValueType.FLOAT),
        Feature(name="lon", dtype=ValueType.STRING),
    ],
    input=nonexistent_source,
)
def test_write_to_online_store_event_check(local_redis_environment):
    """Verify that write_to_online_store only overwrites a stored feature value
    when the incoming row's event timestamp is at least as recent.

    Flow:
      1. Write a recent row for id=123 and confirm it is served.
      2. Write stale (1-hour-old) rows: the stale id=123 row must NOT clobber
         the fresher value, while previously-unseen ids are written.
      3. Write a strictly newer row for id=123: it MUST overwrite.
      4. Materialize from the batch source; its newest values win for 567/890
         but not for 123, whose online value is newer still.
    """
    # Skip when running purely locally (no Redis available).
    if os.getenv("FEAST_IS_LOCAL_TEST", "False") == "True":
        return
    fs = local_redis_environment.feature_store

    # Three UTC timestamps (ms precision): one hour ago, now, and one second
    # in the future so it is strictly the latest.
    now = pd.Timestamp(datetime.datetime.utcnow()).round("ms")
    hour_ago = pd.Timestamp(datetime.datetime.utcnow() - timedelta(hours=1)).round("ms")
    latest = pd.Timestamp(datetime.datetime.utcnow() + timedelta(seconds=1)).round("ms")

    data = {
        "id": [123, 567, 890],
        "string_col": ["OLD_FEATURE", "LATEST_VALUE2", "LATEST_VALUE3"],
        "ts_1": [hour_ago, now, now],
    }
    dataframe_source = pd.DataFrame(data)
    with prep_file_source(df=dataframe_source, event_timestamp_column="ts_1") as file_source:
        e = Entity(name="id", value_type=ValueType.STRING)

        # Create Feature View
        fv1 = FeatureView(
            name="feature_view_123",
            schema=[Field(name="string_col", dtype=String)],
            entities=["id"],
            batch_source=file_source,
            ttl=timedelta(minutes=5),
        )
        # Register Feature View and Entity
        fs.apply([fv1, e])

        # 1. Directly ingest a recent row for id=123 into the online store.
        data = {
            "id": [123],
            "string_col": ["hi_123"],
            "ts_1": [now],
        }
        df_data = pd.DataFrame(data)
        fs.write_to_online_store("feature_view_123", df_data)
        df = fs.get_online_features(
            features=["feature_view_123:string_col"], entity_rows=[{"id": 123}]
        ).to_df()
        assert df["string_col"].iloc[0] == "hi_123"

        # 2. One-hour-delayed data: must NOT overwrite id=123 (less recent),
        # but does populate the previously-unseen ids 567 and 890.
        data = {
            "id": [123, 567, 890],
            "string_col": ["bye_321", "hello_123", "greetings_321"],
            "ts_1": [hour_ago, hour_ago, hour_ago],
        }
        df_data = pd.DataFrame(data)
        fs.write_to_online_store("feature_view_123", df_data)
        df = fs.get_online_features(
            features=["feature_view_123:string_col"],
            entity_rows=[{"id": 123}, {"id": 567}, {"id": 890}],
        ).to_df()
        assert df["string_col"].iloc[0] == "hi_123"
        assert df["string_col"].iloc[1] == "hello_123"
        assert df["string_col"].iloc[2] == "greetings_321"

        # 3. A strictly newer event timestamp must overwrite id=123.
        data = {
            "id": [123],
            "string_col": ["LATEST_VALUE"],
            "ts_1": [latest],
        }
        df_data = pd.DataFrame(data)
        fs.write_to_online_store("feature_view_123", df_data)
        df = fs.get_online_features(
            features=["feature_view_123:string_col"],
            entity_rows=[{"id": 123}, {"id": 567}, {"id": 890}],
        ).to_df()
        assert df["string_col"].iloc[0] == "LATEST_VALUE"
        assert df["string_col"].iloc[1] == "hello_123"
        assert df["string_col"].iloc[2] == "greetings_321"

        # 4. Materialize from the file source (rows stamped `now`/`hour_ago`).
        # BUG FIX: both window bounds now use utcnow(); the original mixed a
        # naive local-time now() for start_date with naive-UTC data timestamps,
        # shifting the materialization window by the local UTC offset.
        fs.materialize(
            start_date=datetime.datetime.utcnow() - timedelta(hours=12),
            end_date=datetime.datetime.utcnow(),
        )
        df = fs.get_online_features(
            features=["feature_view_123:string_col"],
            entity_rows=[{"id": 123}, {"id": 567}, {"id": 890}],
        ).to_df()
        # id=123's online value (`latest`) is newer than the source row (`hour_ago`).
        assert df["string_col"].iloc[0] == "LATEST_VALUE"
        assert df["string_col"].iloc[1] == "LATEST_VALUE2"
        assert df["string_col"].iloc[2] == "LATEST_VALUE3"
driver = Entity( name="driver_id", value_type=ValueType.INT64, description="driver id", ) # Our parquet files contain sample data that includes a driver_id column, timestamps and # three feature column. Here we define a Feature View that will allow us to serve this # data to our model online. driver_hourly_stats_view = FeatureView( name="driver_hourly_stats", entities=["driver_id"], ttl=Duration(seconds=86400 * 7), features=[ Feature(name="conv_rate", dtype=ValueType.DOUBLE), Feature(name="acc_rate", dtype=ValueType.FLOAT), Feature(name="avg_daily_trips", dtype=ValueType.INT64), ], online=True, batch_source=driver_hourly_stats, tags={}, ) # For Benchmarks # Please read more in Feast RFC-031 (link https://docs.google.com/document/d/12UuvTQnTTCJhdRgy6h10zSbInNGSyEJkIxpOcgOen1I/edit) # about this benchmark setup def generate_data(num_rows: int, num_features: int, key_space: int, destination: str) -> pd.DataFrame: features = [f"feature_{i}" for i in range(num_features)] columns = ["entity", "event_timestamp"] + features
    # NOTE(review): the opening of this Entity(...) call lies before this chunk.
    value_type=ValueType.INT64,
    description="driver id",
)
customer = Entity(
    name="customer",  # The name is derived from this argument, not object name.
    value_type=ValueType.STRING,
)
# View serving driver latitude/longitude.
driver_locations = FeatureView(
    name="driver_locations",
    entities=["driver"],
    ttl=timedelta(days=1),
    features=[
        Feature(name="lat", dtype=ValueType.FLOAT),
        Feature(name="lon", dtype=ValueType.STRING),
    ],
    online=True,
    batch_source=driver_locations_source,
    tags={},
)
# View serving basic customer profile attributes.
customer_profile = FeatureView(
    name="customer_profile",
    entities=["customer"],
    ttl=timedelta(days=1),
    features=[
        Feature(name="avg_orders_day", dtype=ValueType.FLOAT),
        Feature(name="name", dtype=ValueType.STRING),
        Feature(name="age", dtype=ValueType.INT64),
    ],
    # NOTE(review): this FeatureView(...) call continues beyond this chunk.
customer_daily_profile = SparkSource( name="customer_daily_profile", path=f"{CURRENT_DIR}/data/customer_daily_profile.parquet", file_format="parquet", timestamp_field="event_timestamp", created_timestamp_column="created", ) # Feature Views driver_hourly_stats_view = FeatureView( name="driver_hourly_stats", entities=["driver"], ttl=timedelta(days=7), schema=[ Field(name="conv_rate", dtype=Float32), Field(name="acc_rate", dtype=Float32), Field(name="avg_daily_trips", dtype=Int64), ], online=True, source=driver_hourly_stats, tags={}, ) customer_daily_profile_view = FeatureView( name="customer_daily_profile", entities=["customer"], ttl=timedelta(days=7), schema=[ Field(name="current_balance", dtype=Float32), Field(name="avg_passenger_count", dtype=Float32), Field(name="lifetime_trip_count", dtype=Int64), ],