def test_partial() -> None:
    """
    Add another table to existing repo using partial apply API. Make sure both the
    table applied via CLI apply and the new table are passing RW test.
    """
    runner = CliRunner()
    with runner.local_repo(
        get_example_repo("example_feature_repo_1.py"), "bigquery"
    ) as store:
        driver_locations_source = BigQuerySource(
            table="feast-oss.public.drivers",
            timestamp_field="event_timestamp",
            created_timestamp_column="created_timestamp",
        )

        driver_locations_100 = FeatureView(
            name="driver_locations_100",
            entities=["driver"],
            ttl=timedelta(days=1),
            schema=[
                Field(name="lat", dtype=Float32),
                Field(name="lon", dtype=String),
                Field(name="name", dtype=String),
            ],
            online=True,
            batch_source=driver_locations_source,
            tags={},
        )

        store.apply([driver_locations_100])

        basic_rw_test(store, view_name="driver_locations")
        basic_rw_test(store, view_name="driver_locations_100")
def create_item_embeddings_feature_view(source, infer_features: bool = False):
    item_embeddings_feature_view = FeatureView(
        name="item_embeddings",
        entities=["item"],
        schema=None
        if infer_features
        else [
            Field(name="embedding_double", dtype=Array(Float64)),
            Field(name="embedding_float", dtype=Array(Float32)),
        ],
        batch_source=source,
        ttl=timedelta(hours=2),
    )
    return item_embeddings_feature_view
def create_customer_daily_profile_feature_view(source, infer_features: bool = False):
    customer_profile_feature_view = FeatureView(
        name="customer_profile",
        entities=["customer_id"],
        schema=None
        if infer_features
        else [
            Field(name="current_balance", dtype=Float32),
            Field(name="avg_passenger_count", dtype=Float32),
            Field(name="lifetime_trip_count", dtype=Int32),
        ],
        source=source,
        ttl=timedelta(days=2),
    )
    return customer_profile_feature_view
def create_driver_hourly_stats_feature_view(source, infer_features: bool = False):
    driver_stats_feature_view = FeatureView(
        name="driver_stats",
        entities=["driver"],
        schema=None
        if infer_features
        else [
            Field(name="conv_rate", dtype=Float32),
            Field(name="acc_rate", dtype=Float32),
            Field(name="avg_daily_trips", dtype=Int32),
        ],
        source=source,
        ttl=timedelta(hours=2),
    )
    return driver_stats_feature_view
def conv_rate_plus_100_feature_view(
    sources: Dict[str, Union[RequestSource, FeatureView]],
    infer_features: bool = False,
    features: Optional[List[Field]] = None,
) -> OnDemandFeatureView:
    # Test that positional arguments and Features still work for ODFVs.
    _features = features or [
        Field(name="conv_rate_plus_100", dtype=Float64),
        Field(name="conv_rate_plus_val_to_add", dtype=Float64),
        Field(name="conv_rate_plus_100_rounded", dtype=Int32),
    ]
    return OnDemandFeatureView(
        name=conv_rate_plus_100.__name__,
        schema=[] if infer_features else _features,
        sources=sources,
        udf=conv_rate_plus_100,
    )
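# The `conv_rate_plus_100` udf referenced above is not shown in this section.
# A minimal sketch of what it likely looks like, assuming the usual Feast ODFV
# convention of a pandas-in/pandas-out transformation whose output columns
# match the declared schema (the exact body is an assumption):
import pandas as pd


def conv_rate_plus_100(features_df: pd.DataFrame) -> pd.DataFrame:
    df = pd.DataFrame()
    # Derive each declared output feature from the input feature view and
    # the `val_to_add` request source column.
    df["conv_rate_plus_100"] = features_df["conv_rate"] + 100.0
    df["conv_rate_plus_val_to_add"] = (
        features_df["conv_rate"] + features_df["val_to_add"]
    )
    df["conv_rate_plus_100_rounded"] = (
        df["conv_rate_plus_100"].round().astype("int32")
    )
    return df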
def similarity_feature_view(
    sources: Dict[str, Union[RequestSource, FeatureView]],
    infer_features: bool = False,
    features: Optional[List[Feature]] = None,
) -> OnDemandFeatureView:
    _fields = [
        Field(name="cos_double", dtype=Float64),
        Field(name="cos_float", dtype=Float32),
    ]
    if features is not None:
        _fields = [Field.from_feature(feature) for feature in features]

    return OnDemandFeatureView(
        name=similarity.__name__,
        sources=sources,
        schema=[] if infer_features else _fields,
        udf=similarity,
    )
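# As with `conv_rate_plus_100`, the `similarity` udf is defined elsewhere.
# A hedged sketch, assuming it computes cosine similarity between the stored
# embedding from `create_item_embeddings_feature_view` and a request-provided
# vector; the request column name `vector_double` is hypothetical:
import numpy as np
import pandas as pd


def similarity(features_df: pd.DataFrame) -> pd.DataFrame:
    def cosine(a, b):
        a = np.asarray(a, dtype=float)
        b = np.asarray(b, dtype=float)
        return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

    df = pd.DataFrame()
    df["cos_double"] = [
        cosine(a, b)
        for a, b in zip(features_df["embedding_double"], features_df["vector_double"])
    ]
    df["cos_float"] = df["cos_double"].astype("float32")
    return df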
def create_order_feature_view(source, infer_features: bool = False):
    return FeatureView(
        name="order",
        entities=["driver", "customer_id"],
        schema=None
        if infer_features
        else [Field(name="order_is_success", dtype=Int32)],
        source=source,
        ttl=timedelta(days=2),
    )
def create_location_stats_feature_view(source, infer_features: bool = False):
    location_stats_feature_view = FeatureView(
        name="location_stats",
        entities=[location()],
        schema=None if infer_features else [Field(name="temperature", dtype=Int32)],
        source=source,
        ttl=timedelta(days=2),
    )
    return location_stats_feature_view
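# `location()` above is an entity factory defined elsewhere in the test
# utilities. A plausible sketch following the same pattern as the other
# entities in this section; the join key and description are assumptions:
from feast import Entity, ValueType


def location():
    return Entity(
        name="location_id",
        value_type=ValueType.INT64,
        description="location id",
    )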
def driver_feature_view(
    data_source: DataSource,
    name="test_correctness",
    infer_features: bool = False,
    dtype: FeastType = Float32,
) -> FeatureView:
    return FeatureView(
        name=name,
        entities=["driver"],
        schema=None if infer_features else [Field(name="value", dtype=dtype)],
        ttl=timedelta(days=5),
        source=data_source,
    )
def setup_feature_store():
    """Prepares the local environment for a FeatureStore docstring test."""
    from datetime import datetime, timedelta

    from feast import Entity, FeatureStore, FeatureView, Field, FileSource, ValueType
    from feast.repo_operations import init_repo
    from feast.types import Float32, Int64

    init_repo("feature_repo", "local")
    fs = FeatureStore(repo_path="feature_repo")
    driver = Entity(
        name="driver_id",
        value_type=ValueType.INT64,
        description="driver id",
    )
    driver_hourly_stats = FileSource(
        path="feature_repo/data/driver_stats.parquet",
        timestamp_field="event_timestamp",
        created_timestamp_column="created",
    )
    driver_hourly_stats_view = FeatureView(
        name="driver_hourly_stats",
        entities=["driver_id"],
        ttl=timedelta(seconds=86400 * 1),
        schema=[
            Field(name="conv_rate", dtype=Float32),
            Field(name="acc_rate", dtype=Float32),
            Field(name="avg_daily_trips", dtype=Int64),
        ],
        batch_source=driver_hourly_stats,
    )
    fs.apply([driver_hourly_stats_view, driver])
    fs.materialize(
        start_date=datetime.utcnow() - timedelta(hours=3),
        end_date=datetime.utcnow() - timedelta(minutes=10),
    )
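# Once `setup_feature_store()` has applied and materialized the view, a
# docstring test can read features back through the online path. A minimal
# usage sketch; the entity row value 1001 is illustrative:
def read_back_example():
    from feast import FeatureStore

    fs = FeatureStore(repo_path="feature_repo")
    online_response = fs.get_online_features(
        features=[
            "driver_hourly_stats:conv_rate",
            "driver_hourly_stats:acc_rate",
        ],
        entity_rows=[{"driver_id": 1001}],
    )
    print(online_response.to_dict())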
def test_infer_odfv_features_with_error(environment, universal_data_sources):
    store = environment.feature_store

    (entities, datasets, data_sources) = universal_data_sources

    features = [Field(name="conv_rate_plus_200", dtype=Float64)]
    driver_hourly_stats = create_driver_hourly_stats_batch_feature_view(
        data_sources.driver
    )
    request_source = create_conv_rate_request_source()
    driver_odfv = conv_rate_plus_100_feature_view(
        [driver_hourly_stats, request_source],
        features=features,
    )

    feast_objects = [driver_hourly_stats, driver_odfv, driver(), customer()]
    with pytest.raises(SpecifiedFeaturesNotPresentError):
        store.apply(feast_objects)
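# `driver()` and `customer()` above are entity factories defined elsewhere in
# the test utilities. A plausible sketch, matching the entity fragments that
# appear later in this section (the exact definitions are assumptions):
from feast import Entity, ValueType


def driver():
    return Entity(
        name="driver",
        join_keys=["driver_id"],
        value_type=ValueType.INT64,
        description="driver id",
    )


def customer():
    return Entity(
        name="customer",
        join_keys=["customer_id"],
        value_type=ValueType.STRING,
    )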
join_keys=["driver_id"], value_type=ValueType.INT64, description="driver id", ) customer = Entity( name="customer", # The name is derived from this argument, not object name. join_keys=["customer_id"], value_type=ValueType.STRING, ) driver_locations = FeatureView( name="driver_locations", entities=["driver"], ttl=timedelta(days=1), schema=[Field(name="lat", dtype=Float32), Field(name="lon", dtype=String)], online=True, batch_source=driver_locations_source, tags={}, ) pushed_driver_locations = FeatureView( name="pushed_driver_locations", entities=["driver"], ttl=timedelta(days=1), schema=[ Field(name="driver_lat", dtype=Float32), Field(name="driver_long", dtype=String), ], online=True,
# Feature views are a grouping based on how features are stored in either the
# online or offline store.
driver_stats_fv = FeatureView(
    # The unique name of this feature view. Two feature views in a single
    # project cannot have the same name.
    name="driver_hourly_stats",
    # The list of entities specifies the keys required for joining or looking
    # up features from this feature view. The references provided in this field
    # correspond to the names of defined entities.
    entities=["driver"],
    # The timedelta is the maximum age that each feature value may have
    # relative to its lookup time. For historical features (used in training),
    # TTL is relative to each timestamp provided in the entity dataframe.
    # TTL also allows for eviction of keys from online stores and limits the
    # amount of historical scanning required for historical feature values
    # during retrieval.
    ttl=timedelta(weeks=52),
    # The list of features defined below acts as a schema: it defines the
    # features materialized into a store, and it provides the references used
    # during retrieval when building a training dataset or serving features.
    schema=[
        Field(name="conv_rate", dtype=Float32),
        Field(name="acc_rate", dtype=Float32),
        Field(name="avg_daily_trips", dtype=Int64),
    ],
    # Batch sources are used to find feature values. In the case of this
    # feature view, we will query a source table on Redshift for driver
    # statistics features.
    batch_source=driver_stats_source,
)
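# Since the comments above describe how this feature view is consumed, a short
# hedged sketch of historical retrieval against `driver_stats_fv` may help;
# the repo path and entity dataframe contents are illustrative:
from datetime import datetime

import pandas as pd
from feast import FeatureStore


def historical_retrieval_example():
    store = FeatureStore(repo_path=".")
    # Each row supplies an entity key plus the timestamp at which features
    # should be looked up (subject to the 52-week TTL above).
    entity_df = pd.DataFrame(
        {
            "driver_id": [1001, 1002],
            "event_timestamp": [datetime(2022, 5, 1), datetime(2022, 5, 2)],
        }
    )
    training_df = store.get_historical_features(
        entity_df=entity_df,
        features=[
            "driver_hourly_stats:conv_rate",
            "driver_hourly_stats:acc_rate",
            "driver_hourly_stats:avg_daily_trips",
        ],
    ).to_df()
    return training_df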
def test_write_to_online_store_event_check(local_redis_environment):
    if os.getenv("FEAST_IS_LOCAL_TEST", "False") == "True":
        return
    fs = local_redis_environment.feature_store

    # Write the same data points 3 times with different timestamps.
    now = pd.Timestamp(datetime.datetime.utcnow()).round("ms")
    hour_ago = pd.Timestamp(datetime.datetime.utcnow() - timedelta(hours=1)).round("ms")
    latest = pd.Timestamp(datetime.datetime.utcnow() + timedelta(seconds=1)).round("ms")

    data = {
        "id": [123, 567, 890],
        "string_col": ["OLD_FEATURE", "LATEST_VALUE2", "LATEST_VALUE3"],
        "ts_1": [hour_ago, now, now],
    }
    dataframe_source = pd.DataFrame(data)
    with prep_file_source(
        df=dataframe_source, event_timestamp_column="ts_1"
    ) as file_source:
        e = Entity(name="id", value_type=ValueType.STRING)

        # Create the feature view.
        fv1 = FeatureView(
            name="feature_view_123",
            schema=[Field(name="string_col", dtype=String)],
            entities=["id"],
            batch_source=file_source,
            ttl=timedelta(minutes=5),
        )
        # Register the feature view and entity.
        fs.apply([fv1, e])

        # Data to ingest into the online store (recent).
        data = {
            "id": [123],
            "string_col": ["hi_123"],
            "ts_1": [now],
        }
        df_data = pd.DataFrame(data)

        # Directly ingest data into the online store.
        fs.write_to_online_store("feature_view_123", df_data)

        df = fs.get_online_features(
            features=["feature_view_123:string_col"], entity_rows=[{"id": 123}]
        ).to_df()
        assert df["string_col"].iloc[0] == "hi_123"

        # Data to ingest into the online store (1 hour delayed). It should NOT
        # overwrite the feature for id=123 because it is less recent data.
        data = {
            "id": [123, 567, 890],
            "string_col": ["bye_321", "hello_123", "greetings_321"],
            "ts_1": [hour_ago, hour_ago, hour_ago],
        }
        df_data = pd.DataFrame(data)

        # Directly ingest data into the online store.
        fs.write_to_online_store("feature_view_123", df_data)

        df = fs.get_online_features(
            features=["feature_view_123:string_col"],
            entity_rows=[{"id": 123}, {"id": 567}, {"id": 890}],
        ).to_df()
        assert df["string_col"].iloc[0] == "hi_123"
        assert df["string_col"].iloc[1] == "hello_123"
        assert df["string_col"].iloc[2] == "greetings_321"

        # Should overwrite string_col for id=123 because it is the most recent
        # value based on event_timestamp.
        data = {
            "id": [123],
            "string_col": ["LATEST_VALUE"],
            "ts_1": [latest],
        }
        df_data = pd.DataFrame(data)
        fs.write_to_online_store("feature_view_123", df_data)

        df = fs.get_online_features(
            features=["feature_view_123:string_col"],
            entity_rows=[{"id": 123}, {"id": 567}, {"id": 890}],
        ).to_df()
        assert df["string_col"].iloc[0] == "LATEST_VALUE"
        assert df["string_col"].iloc[1] == "hello_123"
        assert df["string_col"].iloc[2] == "greetings_321"

        # Write to the online store via data source (dataframe_source) materialization.
        fs.materialize(
            start_date=datetime.datetime.now() - timedelta(hours=12),
            end_date=datetime.datetime.utcnow(),
        )

        df = fs.get_online_features(
            features=["feature_view_123:string_col"],
            entity_rows=[{"id": 123}, {"id": 567}, {"id": 890}],
        ).to_df()
        assert df["string_col"].iloc[0] == "LATEST_VALUE"
        assert df["string_col"].iloc[1] == "LATEST_VALUE2"
        assert df["string_col"].iloc[2] == "LATEST_VALUE3"
)
customer_daily_profile = SparkSource(
    name="customer_daily_profile",
    path=f"{CURRENT_DIR}/data/customer_daily_profile.parquet",
    file_format="parquet",
    timestamp_field="event_timestamp",
    created_timestamp_column="created",
)

# Feature Views
driver_hourly_stats_view = FeatureView(
    name="driver_hourly_stats",
    entities=["driver"],
    ttl=timedelta(days=7),
    schema=[
        Field(name="conv_rate", dtype=Float32),
        Field(name="acc_rate", dtype=Float32),
        Field(name="avg_daily_trips", dtype=Int64),
    ],
    online=True,
    source=driver_hourly_stats,
    tags={},
)
customer_daily_profile_view = FeatureView(
    name="customer_daily_profile",
    entities=["customer"],
    ttl=timedelta(days=7),
    schema=[
        Field(name="current_balance", dtype=Float32),
        Field(name="avg_passenger_count", dtype=Float32),
        Field(name="lifetime_trip_count", dtype=Int64),
timestamp_field="event_timestamp", created_timestamp_column="created", ) driver = Entity( name="driver_id", value_type=ValueType.INT64, description="driver id", ) driver_hourly_stats_view = FeatureView( name="driver_hourly_stats", entities=["driver_id"], ttl=timedelta(days=1), schema=[ Field(name="conv_rate", dtype=Float32), Field(name="acc_rate", dtype=Float32), Field(name="avg_daily_trips", dtype=Int64), ], online=True, batch_source=driver_hourly_stats, tags={}, ) global_daily_stats = FileSource( path="%PARQUET_PATH_GLOBAL%", # placeholder to be replaced by the test timestamp_field="event_timestamp", created_timestamp_column="created", ) global_stats_feature_view = FeatureView(
def create_conv_rate_request_source():
    return RequestSource(
        name="conv_rate_input",
        schema=[Field(name="val_to_add", dtype=Int32)],
    )
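# Request sources carry values supplied at request time rather than read from
# a batch source. A hedged sketch of how `val_to_add` would be passed
# alongside entity keys when reading the ODFV defined earlier (the store
# argument and entity id are illustrative):
def request_source_usage_example(store):
    features = store.get_online_features(
        features=["conv_rate_plus_100:conv_rate_plus_val_to_add"],
        entity_rows=[{"driver_id": 1001, "val_to_add": 5}],
    )
    return features.to_dict()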
labels={"owner": "*****@*****.**", "team": "hack week",}, ) zipcode_source = FileSource( name="zipcode", path="data/zipcode_table.parquet", timestamp_field="event_timestamp", created_timestamp_column="created_timestamp", ) zipcode_features = FeatureView( name="zipcode_features", entities=["zipcode"], ttl=timedelta(days=3650), schema=[ Field(name="city", dtype=String), Field(name="state", dtype=String), Field(name="location_type", dtype=String), Field(name="tax_returns_filed", dtype=Int64), Field(name="population", dtype=Int64), Field(name="total_wages", dtype=Int64), ], batch_source=zipcode_source, tags={ "date_added": "2022-02-7", "experiments": "experiment-A,experiment-B,experiment-C", "access_group": "*****@*****.**", }, online=True, )