def test_telemetry_on():
    """Verify that applying an object with telemetry enabled records the
    forced telemetry UUID (checked afterwards via BigQuery with retries).

    Sets FEAST_FORCE_TELEMETRY_UUID / FEAST_IS_TELEMETRY_TEST / FEAST_TELEMETRY
    for the duration of the apply, then restores the original environment.
    """
    old_environ = dict(os.environ)
    test_telemetry_id = str(uuid.uuid4())
    os.environ["FEAST_FORCE_TELEMETRY_UUID"] = test_telemetry_id
    os.environ["FEAST_IS_TELEMETRY_TEST"] = "True"
    os.environ["FEAST_TELEMETRY"] = "True"

    try:
        with tempfile.TemporaryDirectory() as temp_dir:
            test_feature_store = FeatureStore(
                config=RepoConfig(
                    registry=os.path.join(temp_dir, "registry.db"),
                    project="fake_project",
                    provider="local",
                    online_store=SqliteOnlineStoreConfig(
                        path=os.path.join(temp_dir, "online.db")
                    ),
                )
            )
            entity = Entity(
                name="driver_car_id",
                description="Car driver id",
                value_type=ValueType.STRING,
                labels={"team": "matchmaking"},
            )

            # apply() is the call expected to emit the telemetry event.
            test_feature_store.apply([entity])
    finally:
        # Restore the environment even if store construction or apply()
        # raises, so later tests do not inherit forced-telemetry settings.
        os.environ.clear()
        os.environ.update(old_environ)

    ensure_bigquery_telemetry_id_with_retry(test_telemetry_id)
def prep_local_fs_and_fv() -> Iterator[Tuple[FeatureStore, FeatureView]]:
    """Yield a (FeatureStore, FeatureView) pair backed by a local parquet
    FileSource, with a temporary SQLite registry and online store.
    """
    with tempfile.NamedTemporaryFile(suffix=".parquet") as f:
        df = create_dataset()
        # Close the handle before writing so to_parquet can (re)create the
        # file at f.name. NOTE(review): NamedTemporaryFile deletes the file
        # on close (delete defaults to True), so the parquet file written
        # below is never cleaned up automatically — confirm this is intended.
        f.close()
        df.to_parquet(f.name)
        file_source = FileSource(
            file_format=ParquetFormat(),
            file_url=f"file://{f.name}",
            event_timestamp_column="ts",
            created_timestamp_column="created_ts",
            date_partition_column="",
            # Map dataset column names onto the names the feature view expects.
            field_mapping={"ts_1": "ts", "id": "driver_id"},
        )
        fv = get_feature_view(file_source)
        e = Entity(
            name="driver",
            description="id for driver",
            join_key="driver_id",
            value_type=ValueType.INT32,
        )
        with tempfile.TemporaryDirectory() as repo_dir_name, tempfile.TemporaryDirectory() as data_dir_name:
            # Randomized project name avoids collisions between concurrent runs.
            config = RepoConfig(
                registry=str(Path(repo_dir_name) / "registry.db"),
                project=f"test_bq_correctness_{str(uuid.uuid4()).replace('-', '')}",
                provider="local",
                online_store=SqliteOnlineStoreConfig(
                    path=str(Path(data_dir_name) / "online_store.db")
                ),
            )
            fs = FeatureStore(config=config)
            fs.apply([fv, e])
            # Yield inside both context managers so temp dirs survive the test.
            yield fs, fv
def feature_store_with_local_registry():
    """Return a FeatureStore using temp files for the registry and the
    SQLite online store.
    """
    registry_fd, registry_path = mkstemp()
    online_store_fd, online_store_path = mkstemp()
    # mkstemp() returns an already-open OS-level file descriptor; the
    # original code overwrote the first fd and leaked both. Only the paths
    # are needed here, so close the descriptors immediately.
    os.close(registry_fd)
    os.close(online_store_fd)
    return FeatureStore(
        config=RepoConfig(
            registry=registry_path,
            project="default",
            provider="local",
            online_store=SqliteOnlineStoreConfig(path=online_store_path),
        )
    )
def test_apply_remote_repo():
    """Return a FeatureStore using temp files for the registry and the
    SQLite online store.
    """
    registry_fd, registry_path = mkstemp()
    online_store_fd, online_store_path = mkstemp()
    # mkstemp() returns an already-open OS-level file descriptor; the
    # original code overwrote the first fd and leaked both. Only the paths
    # are needed here, so close the descriptors immediately.
    os.close(registry_fd)
    os.close(online_store_fd)
    return FeatureStore(
        config=RepoConfig(
            registry=registry_path,
            project="default",
            provider="local",
            online_store=SqliteOnlineStoreConfig(path=online_store_path),
        )
    )
def test_historical_features_from_bigquery_sources(
    provider_type, infer_event_timestamp_col
):
    """Point-in-time-correct historical retrieval against BigQuery sources.

    Stages orders/driver/customer data into a throwaway BigQuery dataset,
    builds a FeatureStore for the given provider_type ("local", "gcp", or
    "gcp_custom_offline_config"), and compares get_historical_features()
    output — from both a SQL entity_df and a DataFrame entity_df — against
    a locally computed expected training DataFrame.
    """
    start_date = datetime.now().replace(microsecond=0, second=0, minute=0)
    (
        customer_entities,
        driver_entities,
        end_date,
        orders_df,
        start_date,
    ) = generate_entities(start_date, infer_event_timestamp_col)

    # bigquery_dataset = "test_hist_retrieval_static"
    # Unique dataset name per run so concurrent test runs do not collide.
    bigquery_dataset = (
        f"test_hist_retrieval_{int(time.time_ns())}_{random.randint(1000, 9999)}"
    )

    with BigQueryDataSet(bigquery_dataset), TemporaryDirectory() as temp_dir:
        gcp_project = bigquery.Client().project

        # Orders Query
        table_id = f"{bigquery_dataset}.orders"
        stage_orders_bigquery(orders_df, table_id)
        entity_df_query = f"SELECT * FROM {gcp_project}.{table_id}"

        # Driver Feature View
        driver_df = driver_data.create_driver_hourly_stats_df(
            driver_entities, start_date, end_date
        )
        driver_table_id = f"{gcp_project}.{bigquery_dataset}.driver_hourly"
        stage_driver_hourly_stats_bigquery_source(driver_df, driver_table_id)
        driver_source = BigQuerySource(
            table_ref=driver_table_id,
            event_timestamp_column="datetime",
            created_timestamp_column="created",
        )
        driver_fv = create_driver_hourly_stats_feature_view(driver_source)

        # Customer Feature View (no created-timestamp column, unlike driver)
        customer_df = driver_data.create_customer_daily_profile_df(
            customer_entities, start_date, end_date
        )
        customer_table_id = f"{gcp_project}.{bigquery_dataset}.customer_profile"
        stage_customer_daily_profile_bigquery_source(customer_df, customer_table_id)
        customer_source = BigQuerySource(
            table_ref=customer_table_id,
            event_timestamp_column="datetime",
            created_timestamp_column="",
        )
        customer_fv = create_customer_daily_profile_feature_view(customer_source)

        driver = Entity(name="driver", join_key="driver_id", value_type=ValueType.INT64)
        customer = Entity(name="customer_id", value_type=ValueType.INT64)

        # Build the store for the provider variant under test.
        if provider_type == "local":
            store = FeatureStore(
                config=RepoConfig(
                    registry=os.path.join(temp_dir, "registry.db"),
                    project="default",
                    provider="local",
                    online_store=SqliteOnlineStoreConfig(
                        path=os.path.join(temp_dir, "online_store.db"),
                    ),
                    offline_store=BigQueryOfflineStoreConfig(type="bigquery",),
                )
            )
        elif provider_type == "gcp":
            store = FeatureStore(
                config=RepoConfig(
                    registry=os.path.join(temp_dir, "registry.db"),
                    # Random project name to avoid collisions between runs.
                    project="".join(
                        random.choices(string.ascii_uppercase + string.digits, k=10)
                    ),
                    provider="gcp",
                    offline_store=BigQueryOfflineStoreConfig(type="bigquery",),
                )
            )
        elif provider_type == "gcp_custom_offline_config":
            store = FeatureStore(
                config=RepoConfig(
                    registry=os.path.join(temp_dir, "registry.db"),
                    project="".join(
                        random.choices(string.ascii_uppercase + string.digits, k=10)
                    ),
                    provider="gcp",
                    # Custom dataset name; asserted against job_from_df.query below.
                    offline_store=BigQueryOfflineStoreConfig(
                        type="bigquery", dataset="foo"
                    ),
                )
            )
        else:
            raise Exception("Invalid provider used as part of test configuration")

        store.apply([driver, customer, driver_fv, customer_fv])

        # generate_entities may or may not have used the default timestamp
        # column name, depending on infer_event_timestamp_col.
        event_timestamp = (
            DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL
            if DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL in orders_df.columns
            else "e_ts"
        )
        expected_df = get_expected_training_df(
            customer_df, customer_fv, driver_df, driver_fv, orders_df, event_timestamp,
        )

        # Retrieval path 1: entity dataframe supplied as a SQL query.
        job_from_sql = store.get_historical_features(
            entity_df=entity_df_query,
            feature_refs=[
                "driver_stats:conv_rate",
                "driver_stats:avg_daily_trips",
                "customer_profile:current_balance",
                "customer_profile:avg_passenger_count",
                "customer_profile:lifetime_trip_count",
            ],
        )
        actual_df_from_sql_entities = job_from_sql.to_df()
        # Sort both frames before comparing; retrieval order is not guaranteed.
        assert_frame_equal(
            expected_df.sort_values(
                by=[event_timestamp, "order_id", "driver_id", "customer_id"]
            ).reset_index(drop=True),
            actual_df_from_sql_entities.sort_values(
                by=[event_timestamp, "order_id", "driver_id", "customer_id"]
            ).reset_index(drop=True),
            check_dtype=False,
        )

        # Retrieval path 2: entity dataframe supplied as a pandas DataFrame.
        job_from_df = store.get_historical_features(
            entity_df=orders_df,
            feature_refs=[
                "driver_stats:conv_rate",
                "driver_stats:avg_daily_trips",
                "customer_profile:current_balance",
                "customer_profile:avg_passenger_count",
                "customer_profile:lifetime_trip_count",
            ],
        )

        if provider_type == "gcp_custom_offline_config":
            # Make sure that custom dataset name is being used from the offline_store config
            assertpy.assert_that(job_from_df.query).contains("foo.entity_df")
        else:
            # If the custom dataset name isn't provided in the config, use default `feast` name
            assertpy.assert_that(job_from_df.query).contains("feast.entity_df")

        actual_df_from_df_entities = job_from_df.to_df()
        assert_frame_equal(
            expected_df.sort_values(
                by=[event_timestamp, "order_id", "driver_id", "customer_id"]
            ).reset_index(drop=True),
            actual_df_from_df_entities.sort_values(
                by=[event_timestamp, "order_id", "driver_id", "customer_id"]
            ).reset_index(drop=True),
            check_dtype=False,
        )
def test_historical_features_from_parquet_sources(infer_event_timestamp_col):
    """Point-in-time-correct historical retrieval against parquet sources.

    Stages driver/customer stats as parquet files, runs
    get_historical_features() with a local provider, and compares the
    result to a locally computed expected training DataFrame.
    """
    base_date = datetime.now().replace(microsecond=0, second=0, minute=0)
    (
        customer_entities,
        driver_entities,
        end_date,
        orders_df,
        start_date,
    ) = generate_entities(base_date, infer_event_timestamp_col)

    with TemporaryDirectory() as work_dir:
        # Stage source data as parquet and wrap it in feature views.
        driver_stats_df = driver_data.create_driver_hourly_stats_df(
            driver_entities, start_date, end_date
        )
        driver_view = create_driver_hourly_stats_feature_view(
            stage_driver_hourly_stats_parquet_source(work_dir, driver_stats_df)
        )

        customer_profile_df = driver_data.create_customer_daily_profile_df(
            customer_entities, start_date, end_date
        )
        customer_view = create_customer_daily_profile_feature_view(
            stage_customer_daily_profile_parquet_source(work_dir, customer_profile_df)
        )

        driver_entity = Entity(
            name="driver", join_key="driver_id", value_type=ValueType.INT64
        )
        customer_entity = Entity(name="customer_id", value_type=ValueType.INT64)

        store = FeatureStore(
            config=RepoConfig(
                registry=os.path.join(work_dir, "registry.db"),
                project="default",
                provider="local",
                online_store=SqliteOnlineStoreConfig(
                    path=os.path.join(work_dir, "online_store.db")
                ),
            )
        )
        store.apply([driver_entity, customer_entity, driver_view, customer_view])

        requested_features = [
            "driver_stats:conv_rate",
            "driver_stats:avg_daily_trips",
            "customer_profile:current_balance",
            "customer_profile:avg_passenger_count",
            "customer_profile:lifetime_trip_count",
        ]
        actual_df = store.get_historical_features(
            entity_df=orders_df, feature_refs=requested_features,
        ).to_df()

        # generate_entities may or may not have used the default timestamp
        # column name, depending on infer_event_timestamp_col.
        if DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL in orders_df.columns:
            event_timestamp = DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL
        else:
            event_timestamp = "e_ts"

        expected_df = get_expected_training_df(
            customer_profile_df,
            customer_view,
            driver_stats_df,
            driver_view,
            orders_df,
            event_timestamp,
        )

        # Sort both frames before comparing; retrieval order is not guaranteed.
        sort_cols = [event_timestamp, "order_id", "driver_id", "customer_id"]
        assert_frame_equal(
            expected_df.sort_values(by=sort_cols).reset_index(drop=True),
            actual_df.sort_values(by=sort_cols).reset_index(drop=True),
        )