def test_historical_features_with_missing_request_data(
    environment, universal_data_sources, full_feature_names
):
    store = environment.feature_store
    (_, datasets, data_sources) = universal_data_sources
    feature_views = construct_universal_feature_views(data_sources)

    store.apply([driver(), customer(), location(), *feature_views.values()])

    # If request data is missing that's needed for on demand transform, throw an error
    with pytest.raises(RequestDataNotFoundInEntityDfException):
        store.get_historical_features(
            entity_df=datasets.entity_df,
            features=[
                "customer_profile:current_balance",
                "customer_profile:avg_passenger_count",
                "customer_profile:lifetime_trip_count",
                "conv_rate_plus_100:conv_rate_plus_100",
                "conv_rate_plus_100:conv_rate_plus_val_to_add",
                "global_stats:num_rides",
                "global_stats:avg_ride_length",
                "field_mapping:feature_name",
            ],
            full_feature_names=full_feature_names,
        )
def test_push_features_and_read(environment, universal_data_sources):
    store = environment.feature_store
    (_, datasets, data_sources) = universal_data_sources
    feature_views = construct_universal_feature_views(data_sources)

    store.apply([driver(), customer(), location(), *feature_views.values()])

    data = {
        "location_id": [1],
        "temperature": [4],
        "event_timestamp": [pd.Timestamp(datetime.datetime.utcnow()).round("ms")],
        "created": [pd.Timestamp(datetime.datetime.utcnow()).round("ms")],
    }
    df_ingest = pd.DataFrame(data)

    store.push("location_stats_push_source", df_ingest)

    online_resp = store.get_online_features(
        features=["pushable_location_stats:temperature"],
        entity_rows=[{"location_id": 1}],
    )
    online_resp_dict = online_resp.to_dict()
    assert online_resp_dict["location_id"] == [1]
    assert online_resp_dict["temperature"] == [4]
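
# For context, a minimal sketch (not the repo's actual definition) of the push
# source and feature view that test_push_features_and_read assumes. The string
# names match the test above; `_location_entity` and the `batch_source`
# parameter are illustrative placeholders.
def _example_pushable_location_stats(location_batch_source):
    from feast import Entity, FeatureView, Field, PushSource
    from feast.types import Int64

    _location_entity = Entity(name="location", join_keys=["location_id"])
    push_source = PushSource(
        name="location_stats_push_source",
        batch_source=location_batch_source,  # any offline batch source works here
    )
    return FeatureView(
        name="pushable_location_stats",
        entities=[_location_entity],
        schema=[Field(name="temperature", dtype=Int64)],
        source=push_source,
    )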
def test_registration_and_retrieval_from_custom_s3_endpoint(universal_data_sources):
    config = IntegrationTestRepoConfig(
        offline_store_creator="tests.integration.feature_repos.universal.data_sources.file.S3FileDataSourceCreator"
    )

    import os

    if "AWS_ACCESS_KEY_ID" in os.environ:
        raise Exception(
            "AWS_ACCESS_KEY_ID has already been set in the environment. Setting it again may cause a conflict. "
            "It may be better to deduplicate AWS configuration or use sub-processes for isolation"
        )

    os.environ["AWS_ACCESS_KEY_ID"] = "AKIAIOSFODNN7EXAMPLE"
    os.environ["AWS_SECRET_ACCESS_KEY"] = "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY"

    with construct_test_environment(config) as environment:
        fs = environment.feature_store

        entities, datasets, data_sources = universal_data_sources
        feature_views = construct_universal_feature_views(data_sources)

        feast_objects = []
        feast_objects.extend(feature_views.values())
        feast_objects.extend([driver(), customer()])
        fs.apply(feast_objects)
        fs.materialize(environment.start_date, environment.end_date)

        out = fs.get_online_features(
            features=["driver_stats:conv_rate"], entity_rows=[{"driver": 5001}]
        ).to_dict()
        assert out["conv_rate"][0] is not None

    del os.environ["AWS_ACCESS_KEY_ID"]
    del os.environ["AWS_SECRET_ACCESS_KEY"]
def test_historical_retrieval_with_validation(environment, universal_data_sources):
    store = environment.feature_store
    (entities, datasets, data_sources) = universal_data_sources
    feature_views = construct_universal_feature_views(data_sources)
    store.apply([driver(), customer(), location(), *feature_views.values()])

    # Create two identical retrieval jobs
    entity_df = datasets.entity_df.drop(
        columns=["order_id", "origin_id", "destination_id"]
    )
    reference_job = store.get_historical_features(
        entity_df=entity_df,
        features=_features,
    )
    job = store.get_historical_features(
        entity_df=entity_df,
        features=_features,
    )

    # Save a dataset using the reference job and retrieve it
    store.create_saved_dataset(
        from_=reference_job,
        name="my_training_dataset",
        storage=environment.data_source_creator.create_saved_dataset_destination(),
    )
    saved_dataset = store.get_saved_dataset("my_training_dataset")

    # If validation passes, no exception is raised at this point
    reference = saved_dataset.as_reference(profiler=configurable_profiler)
    job.to_df(validation_reference=reference)
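
# `configurable_profiler` is defined elsewhere in the test suite. A plausible
# minimal sketch, assuming Feast's great_expectations-based DQM profiler and
# GE's UserConfigurableProfiler; the ignored columns and excluded expectations
# below are illustrative, not the repo's exact configuration.
from great_expectations.core import ExpectationSuite
from great_expectations.dataset import PandasDataset
from great_expectations.profile.user_configurable_profiler import (
    UserConfigurableProfiler,
)

from feast.dqm.profilers.ge_profiler import ge_profiler


@ge_profiler
def example_configurable_profiler(dataset: PandasDataset) -> ExpectationSuite:
    # Profile every feature column, skipping the timestamp column and overly
    # strict table-shape expectations that would fail on any resampled data.
    return UserConfigurableProfiler(
        profile_dataset=dataset,
        ignored_columns=["event_timestamp"],
        excluded_expectations=[
            "expect_table_columns_to_match_ordered_list",
            "expect_table_row_count_to_be_between",
        ],
    ).build_suite()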
def test_online_retrieval_with_event_timestamps(
    environment, universal_data_sources, full_feature_names
):
    fs = environment.feature_store
    entities, datasets, data_sources = universal_data_sources
    feature_views = construct_universal_feature_views(data_sources)

    fs.apply([driver(), feature_views.driver, feature_views.global_fv])

    # fake data to ingest into the Online Store
    data = {
        "driver_id": [1, 2],
        "conv_rate": [0.5, 0.3],
        "acc_rate": [0.6, 0.4],
        "avg_daily_trips": [4, 5],
        "event_timestamp": [
            pd.to_datetime(1646263500, utc=True, unit="s"),
            pd.to_datetime(1646263600, utc=True, unit="s"),
        ],
        "created": [
            pd.to_datetime(1646263500, unit="s"),
            pd.to_datetime(1646263600, unit="s"),
        ],
    }
    df_ingest = pd.DataFrame(data)

    # directly ingest data into the Online Store
    fs.write_to_online_store("driver_stats", df_ingest)

    response = fs.get_online_features(
        features=[
            "driver_stats:avg_daily_trips",
            "driver_stats:acc_rate",
            "driver_stats:conv_rate",
        ],
        entity_rows=[{"driver_id": 1}, {"driver_id": 2}],
    )
    df = response.to_df(True)
    assertpy.assert_that(len(df)).is_equal_to(2)
    assertpy.assert_that(df["driver_id"].iloc[0]).is_equal_to(1)
    assertpy.assert_that(df["driver_id"].iloc[1]).is_equal_to(2)
    assertpy.assert_that(
        df["avg_daily_trips" + TIMESTAMP_POSTFIX].iloc[0]
    ).is_equal_to(1646263500)
    assertpy.assert_that(
        df["avg_daily_trips" + TIMESTAMP_POSTFIX].iloc[1]
    ).is_equal_to(1646263600)
    assertpy.assert_that(df["acc_rate" + TIMESTAMP_POSTFIX].iloc[0]).is_equal_to(
        1646263500
    )
    assertpy.assert_that(df["acc_rate" + TIMESTAMP_POSTFIX].iloc[1]).is_equal_to(
        1646263600
    )
    assertpy.assert_that(df["conv_rate" + TIMESTAMP_POSTFIX].iloc[0]).is_equal_to(
        1646263500
    )
    assertpy.assert_that(df["conv_rate" + TIMESTAMP_POSTFIX].iloc[1]).is_equal_to(
        1646263600
    )
def test_online_retrieval(environment, universal_data_sources, benchmark):
    fs = environment.feature_store
    entities, datasets, data_sources = universal_data_sources
    feature_views = construct_universal_feature_views(data_sources)

    feature_service = FeatureService(
        "convrate_plus100",
        features=[
            feature_views["driver"][["conv_rate"]],
            feature_views["driver_odfv"],
        ],
    )

    feast_objects = []
    feast_objects.extend(feature_views.values())
    feast_objects.extend([driver(), customer(), location(), feature_service])
    fs.apply(feast_objects)
    fs.materialize(environment.start_date, environment.end_date)

    sample_drivers = random.sample(entities["driver"], 10)
    sample_customers = random.sample(entities["customer"], 10)

    entity_rows = [
        {"driver": d, "customer_id": c, "val_to_add": 50}
        for (d, c) in zip(sample_drivers, sample_customers)
    ]

    feature_refs = [
        "driver_stats:conv_rate",
        "driver_stats:avg_daily_trips",
        "customer_profile:current_balance",
        "customer_profile:avg_passenger_count",
        "customer_profile:lifetime_trip_count",
        "conv_rate_plus_100:conv_rate_plus_100",
        "conv_rate_plus_100:conv_rate_plus_val_to_add",
        "global_stats:num_rides",
        "global_stats:avg_ride_length",
    ]
    unprefixed_feature_refs = [f.rsplit(":", 1)[-1] for f in feature_refs if ":" in f]
    # Remove the on demand feature view output features, since they're not present in the source dataframe
    unprefixed_feature_refs.remove("conv_rate_plus_100")
    unprefixed_feature_refs.remove("conv_rate_plus_val_to_add")

    benchmark(
        fs.get_online_features,
        features=feature_refs,
        entity_rows=entity_rows,
    )
@pytest.fixture
def setup_python_fs_client():
    config = IntegrationTestRepoConfig()
    environment = construct_test_environment(config)
    fs = environment.feature_store
    try:
        entities, datasets, data_sources = construct_universal_test_data(environment)
        feature_views = construct_universal_feature_views(data_sources)
        feast_objects: List[FeastObject] = []
        feast_objects.extend(feature_views.values())
        feast_objects.extend([driver(), customer(), location()])
        fs.apply(feast_objects)
        fs.materialize(environment.start_date, environment.end_date)
        client = TestClient(get_app(fs))
        yield client
    finally:
        fs.teardown()
        environment.data_source_creator.teardown()
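
# Example of how a test might exercise the fixture above: POST to the python
# feature server's /get-online-features endpoint via the TestClient. The
# payload shape follows Feast's HTTP API (features list plus a mapping of
# join keys to value lists); treat the exact fields and the sample entity id
# as a sketch rather than a contract.
def test_get_online_features_via_http(setup_python_fs_client):
    client = setup_python_fs_client
    response = client.post(
        "/get-online-features",
        json={
            "features": ["driver_stats:conv_rate"],
            "entities": {"driver_id": [5001]},
        },
    )
    assert response.status_code == 200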
def test_historical_retrieval_fails_on_validation(environment, universal_data_sources):
    store = environment.feature_store
    (entities, datasets, data_sources) = universal_data_sources
    feature_views = construct_universal_feature_views(data_sources)
    store.apply([driver(), customer(), location(), *feature_views.values()])

    entity_df = datasets.entity_df.drop(
        columns=["order_id", "origin_id", "destination_id"]
    )
    reference_job = store.get_historical_features(
        entity_df=entity_df,
        features=_features,
    )
    store.create_saved_dataset(
        from_=reference_job,
        name="my_other_dataset",
        storage=environment.data_source_creator.create_saved_dataset_destination(),
    )

    job = store.get_historical_features(
        entity_df=entity_df,
        features=_features,
    )

    with pytest.raises(ValidationFailed) as exc_info:
        job.to_df(
            validation_reference=store.get_saved_dataset(
                "my_other_dataset"
            ).as_reference(profiler=profiler_with_unrealistic_expectations)
        )

    failed_expectations = exc_info.value.report.errors
    assert len(failed_expectations) == 2

    assert failed_expectations[0].check_name == "expect_column_max_to_be_between"
    assert failed_expectations[0].column_name == "current_balance"

    assert failed_expectations[1].check_name == "expect_column_values_to_be_in_set"
    assert failed_expectations[1].column_name == "avg_passenger_count"
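
# `profiler_with_unrealistic_expectations` must produce exactly the two
# failures asserted above. A minimal sketch with deliberately impossible
# bounds; the concrete numbers are illustrative, not the repo's actual values.
@ge_profiler
def example_profiler_with_unrealistic_expectations(
    dataset: PandasDataset,
) -> ExpectationSuite:
    # current_balance is generated as a non-negative amount, so a negative
    # max range can never hold
    dataset.expect_column_max_to_be_between("current_balance", -1000, -100)
    # avg_passenger_count is likewise never negative
    dataset.expect_column_values_to_be_in_set(
        "avg_passenger_count", value_set={-1, -2}
    )
    return dataset.get_expectation_suite()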
def test_online_store_cleanup(environment, universal_data_sources):
    """
    Some online store implementations (like Redis) store features from different
    feature views, but with common entities, together. This can lead to deletion
    of all features attached to the entity when only one feature view was the
    deletion target (see https://github.com/feast-dev/feast/issues/2150).

    Plan:
    1. Register two feature views with the common entity "driver"
    2. Materialize data
    3. Check that features are available (via online retrieval)
    4. Delete one feature view
    5. Check that features for the other are still available
    6. Delete the other feature view (and create it again)
    7. Verify that features for both feature views were deleted
    """
    fs = environment.feature_store
    entities, datasets, data_sources = universal_data_sources
    driver_stats_fv = construct_universal_feature_views(data_sources).driver

    driver_entities = entities.driver_vals
    df = pd.DataFrame(
        {
            "ts_1": [environment.end_date] * len(driver_entities),
            "created_ts": [environment.end_date] * len(driver_entities),
            "driver_id": driver_entities,
            "value": np.random.random(size=len(driver_entities)),
        }
    )

    ds = environment.data_source_creator.create_data_source(
        df, destination_name="simple_driver_dataset"
    )

    simple_driver_fv = driver_feature_view(
        data_source=ds, name="test_universal_online_simple_driver"
    )

    fs.apply([driver(), simple_driver_fv, driver_stats_fv])
    fs.materialize(
        environment.start_date - timedelta(days=1),
        environment.end_date + timedelta(days=1),
    )
    expected_values = df.sort_values(by="driver_id")

    features = [f"{simple_driver_fv.name}:value"]
    entity_rows = [{"driver_id": driver_id} for driver_id in sorted(driver_entities)]

    online_features = fs.get_online_features(
        features=features, entity_rows=entity_rows
    ).to_dict()
    assert np.allclose(expected_values["value"], online_features["value"])

    fs.apply(
        objects=[simple_driver_fv], objects_to_delete=[driver_stats_fv], partial=False
    )

    online_features = fs.get_online_features(
        features=features, entity_rows=entity_rows
    ).to_dict()
    assert np.allclose(expected_values["value"], online_features["value"])

    fs.apply(objects=[], objects_to_delete=[simple_driver_fv], partial=False)

    def eventually_apply() -> Tuple[None, bool]:
        try:
            fs.apply([simple_driver_fv])
        except BotoCoreError:
            return None, False

        return None, True

    # The online store backend might apply schema updates with eventual
    # consistency, so recreating a table that was just deleted may need retries
    wait_retry_backoff(eventually_apply, timeout_secs=60)

    online_features = fs.get_online_features(
        features=features, entity_rows=entity_rows
    ).to_dict()
    assert all(v is None for v in online_features["value"])
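
# A minimal sketch of the `wait_retry_backoff` helper as used above, derived
# from its call site: retry a function returning (result, done) with
# exponential backoff until it reports done or the timeout elapses. The
# backoff constants are illustrative.
import time


def example_wait_retry_backoff(retry_fn, timeout_secs=60):
    delay = 1.0
    start = time.monotonic()
    while True:
        result, done = retry_fn()
        if done:
            return result
        if time.monotonic() - start > timeout_secs:
            raise TimeoutError("retry_fn did not succeed within timeout")
        time.sleep(delay)
        delay = min(delay * 2, 10.0)  # cap the backoff interval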
def test_online_retrieval(environment, universal_data_sources, full_feature_names):
    fs = environment.feature_store
    entities, datasets, data_sources = universal_data_sources
    feature_views = construct_universal_feature_views(data_sources)

    feature_service = FeatureService(
        "convrate_plus100",
        features=[
            feature_views.driver[["conv_rate"]],
            feature_views.driver_odfv,
            feature_views.customer[["current_balance"]],
        ],
    )
    feature_service_entity_mapping = FeatureService(
        name="entity_mapping",
        features=[
            feature_views.location.with_name("origin").with_join_key_map(
                {"location_id": "origin_id"}
            ),
            feature_views.location.with_name("destination").with_join_key_map(
                {"location_id": "destination_id"}
            ),
        ],
    )

    feast_objects = []
    feast_objects.extend(feature_views.values())
    feast_objects.extend(
        [
            driver(),
            customer(),
            location(),
            feature_service,
            feature_service_entity_mapping,
        ]
    )
    fs.apply(feast_objects)
    fs.materialize(
        environment.start_date - timedelta(days=1),
        environment.end_date + timedelta(days=1),
    )

    entity_sample = datasets.orders_df.sample(10)[
        ["customer_id", "driver_id", "order_id", "event_timestamp"]
    ]
    orders_df = datasets.orders_df[
        (
            datasets.orders_df["customer_id"].isin(entity_sample["customer_id"])
            & datasets.orders_df["driver_id"].isin(entity_sample["driver_id"])
        )
    ]

    sample_drivers = entity_sample["driver_id"]
    drivers_df = datasets.driver_df[
        datasets.driver_df["driver_id"].isin(sample_drivers)
    ]

    sample_customers = entity_sample["customer_id"]
    customers_df = datasets.customer_df[
        datasets.customer_df["customer_id"].isin(sample_customers)
    ]

    location_pairs = np.array(list(itertools.permutations(entities.location_vals, 2)))
    sample_location_pairs = location_pairs[
        np.random.choice(len(location_pairs), 10)
    ].T.tolist()
    origins_df = datasets.location_df[
        datasets.location_df["location_id"].isin(sample_location_pairs[0])
    ]
    destinations_df = datasets.location_df[
        datasets.location_df["location_id"].isin(sample_location_pairs[1])
    ]

    global_df = datasets.global_df

    entity_rows = [
        {"driver_id": d, "customer_id": c, "val_to_add": 50}
        for (d, c) in zip(sample_drivers, sample_customers)
    ]

    feature_refs = [
        "driver_stats:conv_rate",
        "driver_stats:avg_daily_trips",
        "customer_profile:current_balance",
        "customer_profile:avg_passenger_count",
        "customer_profile:lifetime_trip_count",
        "conv_rate_plus_100:conv_rate_plus_100",
        "conv_rate_plus_100:conv_rate_plus_val_to_add",
        "order:order_is_success",
        "global_stats:num_rides",
        "global_stats:avg_ride_length",
    ]
    unprefixed_feature_refs = [f.rsplit(":", 1)[-1] for f in feature_refs if ":" in f]
    # Remove the on demand feature view output features, since they're not present in the source dataframe
    unprefixed_feature_refs.remove("conv_rate_plus_100")
    unprefixed_feature_refs.remove("conv_rate_plus_val_to_add")

    online_features_dict = get_online_features_dict(
        environment=environment,
        features=feature_refs,
        entity_rows=entity_rows,
        full_feature_names=full_feature_names,
    )

    # Test that the on demand feature views compute properly even if the
    # dependent conv_rate feature isn't requested.
    online_features_no_conv_rate = get_online_features_dict(
        environment=environment,
        features=[ref for ref in feature_refs if ref != "driver_stats:conv_rate"],
        entity_rows=entity_rows,
        full_feature_names=full_feature_names,
    )
    assert online_features_no_conv_rate is not None

    keys = set(online_features_dict.keys())
    expected_keys = set(
        f.replace(":", "__") if full_feature_names else f.split(":")[-1]
        for f in feature_refs
    ) | {"customer_id", "driver_id"}
    assert (
        keys == expected_keys
    ), f"Response keys are different from expected: {keys - expected_keys} (extra) and {expected_keys - keys} (missing)"

    tc = unittest.TestCase()
    for i, entity_row in enumerate(entity_rows):
        df_features = get_latest_feature_values_from_dataframes(
            driver_df=drivers_df,
            customer_df=customers_df,
            orders_df=orders_df,
            global_df=global_df,
            entity_row=entity_row,
        )

        assert df_features["customer_id"] == online_features_dict["customer_id"][i]
        assert df_features["driver_id"] == online_features_dict["driver_id"][i]
        tc.assertAlmostEqual(
            online_features_dict[
                response_feature_name(
                    "conv_rate_plus_100", feature_refs, full_feature_names
                )
            ][i],
            df_features["conv_rate"] + 100,
            delta=0.0001,
        )
        tc.assertAlmostEqual(
            online_features_dict[
                response_feature_name(
                    "conv_rate_plus_val_to_add", feature_refs, full_feature_names
                )
            ][i],
            df_features["conv_rate"] + df_features["val_to_add"],
            delta=0.0001,
        )
        for unprefixed_feature_ref in unprefixed_feature_refs:
            tc.assertAlmostEqual(
                df_features[unprefixed_feature_ref],
                online_features_dict[
                    response_feature_name(
                        unprefixed_feature_ref, feature_refs, full_feature_names
                    )
                ][i],
                delta=0.0001,
            )

    # Check what happens for missing values
    missing_responses_dict = get_online_features_dict(
        environment=environment,
        features=feature_refs,
        entity_rows=[{"driver_id": 0, "customer_id": 0, "val_to_add": 100}],
        full_feature_names=full_feature_names,
    )
    assert missing_responses_dict is not None
    for unprefixed_feature_ref in unprefixed_feature_refs:
        if unprefixed_feature_ref not in {"num_rides", "avg_ride_length"}:
            tc.assertIsNone(
                missing_responses_dict[
                    response_feature_name(
                        unprefixed_feature_ref, feature_refs, full_feature_names
                    )
                ][0]
            )

    # Check what happens for missing request data
    with pytest.raises(RequestDataNotFoundInEntityRowsException):
        get_online_features_dict(
            environment=environment,
            features=feature_refs,
            entity_rows=[{"driver_id": 0, "customer_id": 0}],
            full_feature_names=full_feature_names,
        )

    assert_feature_service_correctness(
        environment,
        feature_service,
        entity_rows,
        full_feature_names,
        drivers_df,
        customers_df,
        orders_df,
        global_df,
    )

    entity_rows = [
        {"origin_id": origin, "destination_id": destination}
        for (_driver, _customer, origin, destination) in zip(
            sample_drivers, sample_customers, *sample_location_pairs
        )
    ]
    assert_feature_service_entity_mapping_correctness(
        environment,
        feature_service_entity_mapping,
        entity_rows,
        full_feature_names,
        origins_df,
        destinations_df,
    )
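
# `response_feature_name` is a shared helper; a minimal sketch consistent with
# the three-argument form called above: resolve an unprefixed feature name to
# the response key, which is "<feature_view>__<name>" when full_feature_names
# is enabled and the bare name otherwise.
from typing import List


def example_response_feature_name(
    feature: str, feature_refs: List[str], full_feature_names: bool
) -> str:
    if full_feature_names:
        for ref in feature_refs:
            view_name, _, feature_name = ref.partition(":")
            if feature_name == feature:
                return f"{view_name}__{feature_name}"
    return feature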
def test_historical_features_persisting(
    environment, universal_data_sources, full_feature_names
):
    store = environment.feature_store
    (entities, datasets, data_sources) = universal_data_sources
    feature_views = construct_universal_feature_views(data_sources)

    store.apply([driver(), customer(), location(), *feature_views.values()])

    entity_df = datasets.entity_df.drop(
        columns=["order_id", "origin_id", "destination_id"]
    )

    job = store.get_historical_features(
        entity_df=entity_df,
        features=[
            "customer_profile:current_balance",
            "customer_profile:avg_passenger_count",
            "customer_profile:lifetime_trip_count",
            "order:order_is_success",
            "global_stats:num_rides",
            "global_stats:avg_ride_length",
            "field_mapping:feature_name",
        ],
        full_feature_names=full_feature_names,
    )

    saved_dataset = store.create_saved_dataset(
        from_=job,
        name="saved_dataset",
        storage=environment.data_source_creator.create_saved_dataset_destination(),
        tags={"env": "test"},
    )

    event_timestamp = DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL
    expected_df = get_expected_training_df(
        datasets.customer_df,
        feature_views.customer,
        datasets.driver_df,
        feature_views.driver,
        datasets.orders_df,
        feature_views.order,
        datasets.location_df,
        feature_views.location,
        datasets.global_df,
        feature_views.global_fv,
        datasets.field_mapping_df,
        feature_views.field_mapping,
        entity_df,
        event_timestamp,
        full_feature_names,
    ).drop(
        columns=[
            response_feature_name("conv_rate_plus_100", full_feature_names),
            response_feature_name("conv_rate_plus_100_rounded", full_feature_names),
            response_feature_name("avg_daily_trips", full_feature_names),
            response_feature_name("conv_rate", full_feature_names),
            "origin__temperature",
            "destination__temperature",
        ]
    )

    assert_frame_equal(
        expected_df,
        saved_dataset.to_df(),
        keys=[event_timestamp, "driver_id", "customer_id"],
    )
    assert_frame_equal(
        job.to_df(),
        saved_dataset.to_df(),
        keys=[event_timestamp, "driver_id", "customer_id"],
    )
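
# pandas.testing.assert_frame_equal has no `keys` parameter, so the calls
# above go through a local test helper. A minimal sketch, assuming it sorts
# and reindexes both frames by the given key columns (mirroring the
# sort/drop_duplicates pattern used elsewhere in this suite) before delegating
# to pandas:
from pandas.testing import assert_frame_equal as pd_assert_frame_equal


def example_assert_frame_equal(expected_df, actual_df, keys):
    expected = (
        expected_df.sort_values(by=keys).drop_duplicates().reset_index(drop=True)
    )
    actual = (
        actual_df[expected_df.columns]
        .sort_values(by=keys)
        .drop_duplicates()
        .reset_index(drop=True)
    )
    pd_assert_frame_equal(expected, actual, check_dtype=False)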
def test_historical_features_with_entities_from_query(
    environment, universal_data_sources, full_feature_names
):
    store = environment.feature_store
    (entities, datasets, data_sources) = universal_data_sources
    feature_views = construct_universal_feature_views(data_sources)

    orders_table = table_name_from_data_source(data_sources.orders)
    if not orders_table:
        pytest.skip("Offline source is not sql-based")

    data_source_creator = environment.test_repo_config.offline_store_creator
    if data_source_creator.__name__ == SnowflakeDataSourceCreator.__name__:
        entity_df_query = f"""
        SELECT "customer_id", "driver_id", "order_id", "origin_id", "destination_id", "event_timestamp"
        FROM "{orders_table}"
        """
    else:
        entity_df_query = f"""
        SELECT customer_id, driver_id, order_id, origin_id, destination_id, event_timestamp
        FROM {orders_table}
        """

    store.apply([driver(), customer(), location(), *feature_views.values()])

    job_from_sql = store.get_historical_features(
        entity_df=entity_df_query,
        features=[
            "customer_profile:current_balance",
            "customer_profile:avg_passenger_count",
            "customer_profile:lifetime_trip_count",
            "order:order_is_success",
            "global_stats:num_rides",
            "global_stats:avg_ride_length",
            "field_mapping:feature_name",
        ],
        full_feature_names=full_feature_names,
    )

    start_time = datetime.utcnow()
    actual_df_from_sql_entities = job_from_sql.to_df()
    end_time = datetime.utcnow()
    print(f"\nTime to execute job_from_sql.to_df() = '{(end_time - start_time)}'")

    event_timestamp = (
        DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL
        if DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL in datasets.orders_df.columns
        else "e_ts"
    )
    full_expected_df = get_expected_training_df(
        datasets.customer_df,
        feature_views.customer,
        datasets.driver_df,
        feature_views.driver,
        datasets.orders_df,
        feature_views.order,
        datasets.location_df,
        feature_views.location,
        datasets.global_df,
        feature_views.global_fv,
        datasets.field_mapping_df,
        feature_views.field_mapping,
        datasets.entity_df,
        event_timestamp,
        full_feature_names,
    )

    # Not requesting the on demand transform with an entity_df query (can't add request data in them)
    expected_df_query = full_expected_df.drop(
        columns=[
            response_feature_name("conv_rate_plus_100", full_feature_names),
            response_feature_name("conv_rate_plus_100_rounded", full_feature_names),
            response_feature_name("avg_daily_trips", full_feature_names),
            response_feature_name("conv_rate", full_feature_names),
            "origin__temperature",
            "destination__temperature",
        ]
    )
    assert_frame_equal(
        expected_df_query,
        actual_df_from_sql_entities,
        keys=[event_timestamp, "order_id", "driver_id", "customer_id"],
    )

    table_from_sql_entities = job_from_sql.to_arrow().to_pandas()
    for col in table_from_sql_entities.columns:
        expected_df_query[col] = expected_df_query[col].astype(
            table_from_sql_entities[col].dtype
        )

    assert_frame_equal(
        expected_df_query,
        table_from_sql_entities,
        keys=[event_timestamp, "order_id", "driver_id", "customer_id"],
    )
def test_historical_features(environment, universal_data_sources, full_feature_names):
    store = environment.feature_store
    (entities, datasets, data_sources) = universal_data_sources
    feature_views = construct_universal_feature_views(data_sources)

    entity_df_with_request_data = datasets.entity_df.copy(deep=True)
    entity_df_with_request_data["val_to_add"] = [
        i for i in range(len(entity_df_with_request_data))
    ]
    entity_df_with_request_data["driver_age"] = [
        i + 100 for i in range(len(entity_df_with_request_data))
    ]

    feature_service = FeatureService(
        name="convrate_plus100",
        features=[feature_views.driver[["conv_rate"]], feature_views.driver_odfv],
    )
    feature_service_entity_mapping = FeatureService(
        name="entity_mapping",
        features=[
            feature_views.location.with_name("origin").with_join_key_map(
                {"location_id": "origin_id"}
            ),
            feature_views.location.with_name("destination").with_join_key_map(
                {"location_id": "destination_id"}
            ),
        ],
    )

    store.apply(
        [
            driver(),
            customer(),
            location(),
            feature_service,
            feature_service_entity_mapping,
            *feature_views.values(),
        ]
    )

    event_timestamp = (
        DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL
        if DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL in datasets.orders_df.columns
        else "e_ts"
    )
    full_expected_df = get_expected_training_df(
        datasets.customer_df,
        feature_views.customer,
        datasets.driver_df,
        feature_views.driver,
        datasets.orders_df,
        feature_views.order,
        datasets.location_df,
        feature_views.location,
        datasets.global_df,
        feature_views.global_fv,
        datasets.field_mapping_df,
        feature_views.field_mapping,
        entity_df_with_request_data,
        event_timestamp,
        full_feature_names,
    )

    # The shadow entity features are only needed for the FeatureService test
    expected_df = full_expected_df.drop(
        columns=["origin__temperature", "destination__temperature"],
    )

    job_from_df = store.get_historical_features(
        entity_df=entity_df_with_request_data,
        features=[
            "driver_stats:conv_rate",
            "driver_stats:avg_daily_trips",
            "customer_profile:current_balance",
            "customer_profile:avg_passenger_count",
            "customer_profile:lifetime_trip_count",
            "conv_rate_plus_100:conv_rate_plus_100",
            "conv_rate_plus_100:conv_rate_plus_100_rounded",
            "conv_rate_plus_100:conv_rate_plus_val_to_add",
            "order:order_is_success",
            "global_stats:num_rides",
            "global_stats:avg_ride_length",
            "field_mapping:feature_name",
        ],
        full_feature_names=full_feature_names,
    )

    start_time = datetime.utcnow()
    actual_df_from_df_entities = job_from_df.to_df()
    print(f"actual_df_from_df_entities shape: {actual_df_from_df_entities.shape}")
    end_time = datetime.utcnow()
    print(f"Time to execute job_from_df.to_df() = '{(end_time - start_time)}'\n")

    assert sorted(expected_df.columns) == sorted(actual_df_from_df_entities.columns)
    assert_frame_equal(
        expected_df,
        actual_df_from_df_entities,
        keys=[event_timestamp, "order_id", "driver_id", "customer_id"],
    )

    assert_feature_service_correctness(
        store,
        feature_service,
        full_feature_names,
        entity_df_with_request_data,
        expected_df,
        event_timestamp,
    )
    assert_feature_service_entity_mapping_correctness(
        store,
        feature_service_entity_mapping,
        full_feature_names,
        entity_df_with_request_data,
        full_expected_df,
        event_timestamp,
    )

    table_from_df_entities: pd.DataFrame = job_from_df.to_arrow().to_pandas()
    assert_frame_equal(
        expected_df,
        table_from_df_entities,
        keys=[event_timestamp, "order_id", "driver_id", "customer_id"],
    )
def test_online_retrieval(environment, universal_data_sources, full_feature_names):
    fs = environment.feature_store
    entities, datasets, data_sources = universal_data_sources
    feature_views = construct_universal_feature_views(data_sources)

    feature_service = FeatureService(
        "convrate_plus100",
        features=[
            feature_views["driver"][["conv_rate"]],
            feature_views["driver_odfv"],
            feature_views["driver_age_request_fv"],
        ],
    )
    feature_service_entity_mapping = FeatureService(
        name="entity_mapping",
        features=[
            feature_views["location"].with_name("origin").with_join_key_map(
                {"location_id": "origin_id"}
            ),
            feature_views["location"].with_name("destination").with_join_key_map(
                {"location_id": "destination_id"}
            ),
        ],
    )

    feast_objects = []
    feast_objects.extend(feature_views.values())
    feast_objects.extend(
        [
            driver(),
            customer(),
            location(),
            feature_service,
            feature_service_entity_mapping,
        ]
    )
    fs.apply(feast_objects)
    fs.materialize(
        environment.start_date - timedelta(days=1),
        environment.end_date + timedelta(days=1),
    )

    entity_sample = datasets["orders"].sample(10)[
        ["customer_id", "driver_id", "order_id", "event_timestamp"]
    ]
    orders_df = datasets["orders"][
        (
            datasets["orders"]["customer_id"].isin(entity_sample["customer_id"])
            & datasets["orders"]["driver_id"].isin(entity_sample["driver_id"])
        )
    ]

    sample_drivers = entity_sample["driver_id"]
    drivers_df = datasets["driver"][
        datasets["driver"]["driver_id"].isin(sample_drivers)
    ]

    sample_customers = entity_sample["customer_id"]
    customers_df = datasets["customer"][
        datasets["customer"]["customer_id"].isin(sample_customers)
    ]

    location_pairs = np.array(list(itertools.permutations(entities["location"], 2)))
    sample_location_pairs = location_pairs[
        np.random.choice(len(location_pairs), 10)
    ].T
    origins_df = datasets["location"][
        datasets["location"]["location_id"].isin(sample_location_pairs[0])
    ]
    destinations_df = datasets["location"][
        datasets["location"]["location_id"].isin(sample_location_pairs[1])
    ]

    global_df = datasets["global"]

    entity_rows = [
        {"driver": d, "customer_id": c, "val_to_add": 50, "driver_age": 25}
        for (d, c) in zip(sample_drivers, sample_customers)
    ]

    feature_refs = [
        "driver_stats:conv_rate",
        "driver_stats:avg_daily_trips",
        "customer_profile:current_balance",
        "customer_profile:avg_passenger_count",
        "customer_profile:lifetime_trip_count",
        "conv_rate_plus_100:conv_rate_plus_100",
        "conv_rate_plus_100:conv_rate_plus_val_to_add",
        "order:order_is_success",
        "global_stats:num_rides",
        "global_stats:avg_ride_length",
        "driver_age:driver_age",
    ]
    unprefixed_feature_refs = [f.rsplit(":", 1)[-1] for f in feature_refs if ":" in f]
    # Remove the on demand feature view output features, since they're not present in the source dataframe
    unprefixed_feature_refs.remove("conv_rate_plus_100")
    unprefixed_feature_refs.remove("conv_rate_plus_val_to_add")

    online_features = fs.get_online_features(
        features=feature_refs,
        entity_rows=entity_rows,
        full_feature_names=full_feature_names,
    )
    assert online_features is not None

    online_features_dict = online_features.to_dict()
    keys = online_features_dict.keys()
    assert (
        len(keys) == len(feature_refs) + 3
    )  # Add three for the driver id and customer id entity keys, plus the val_to_add request data.
    for feature in feature_refs:
        # full_feature_names does not apply to request feature views
        if full_feature_names and feature != "driver_age:driver_age":
            assert feature.replace(":", "__") in keys
        else:
            assert feature.rsplit(":", 1)[-1] in keys

    assert (
        "driver_stats" not in keys
        and "customer_profile" not in keys
        and "order" not in keys
        and "global_stats" not in keys
    )

    tc = unittest.TestCase()
    for i, entity_row in enumerate(entity_rows):
        df_features = get_latest_feature_values_from_dataframes(
            driver_df=drivers_df,
            customer_df=customers_df,
            orders_df=orders_df,
            global_df=global_df,
            entity_row=entity_row,
        )

        assert df_features["customer_id"] == online_features_dict["customer_id"][i]
        assert df_features["driver_id"] == online_features_dict["driver_id"][i]
        tc.assertAlmostEqual(
            online_features_dict[
                response_feature_name("conv_rate_plus_100", full_feature_names)
            ][i],
            df_features["conv_rate"] + 100,
            delta=0.0001,
        )
        tc.assertAlmostEqual(
            online_features_dict[
                response_feature_name("conv_rate_plus_val_to_add", full_feature_names)
            ][i],
            df_features["conv_rate"] + df_features["val_to_add"],
            delta=0.0001,
        )
        for unprefixed_feature_ref in unprefixed_feature_refs:
            tc.assertAlmostEqual(
                df_features[unprefixed_feature_ref],
                online_features_dict[
                    response_feature_name(unprefixed_feature_ref, full_feature_names)
                ][i],
                delta=0.0001,
            )

    # Check what happens for missing values
    missing_responses_dict = fs.get_online_features(
        features=feature_refs,
        entity_rows=[
            {"driver": 0, "customer_id": 0, "val_to_add": 100, "driver_age": 125}
        ],
        full_feature_names=full_feature_names,
    ).to_dict()
    assert missing_responses_dict is not None
    for unprefixed_feature_ref in unprefixed_feature_refs:
        if unprefixed_feature_ref not in {
            "num_rides",
            "avg_ride_length",
            "driver_age",
        }:
            tc.assertIsNone(
                missing_responses_dict[
                    response_feature_name(unprefixed_feature_ref, full_feature_names)
                ][0]
            )

    # Check what happens for missing request data
    with pytest.raises(RequestDataNotFoundInEntityRowsException):
        fs.get_online_features(
            features=feature_refs,
            entity_rows=[{"driver": 0, "customer_id": 0}],
            full_feature_names=full_feature_names,
        ).to_dict()

    # Also when only part of the request data is provided
    with pytest.raises(RequestDataNotFoundInEntityRowsException):
        fs.get_online_features(
            features=feature_refs,
            entity_rows=[{"driver": 0, "customer_id": 0, "val_to_add": 20}],
            full_feature_names=full_feature_names,
        ).to_dict()

    assert_feature_service_correctness(
        fs,
        feature_service,
        entity_rows,
        full_feature_names,
        drivers_df,
        customers_df,
        orders_df,
        global_df,
    )

    entity_rows = [
        {
            "driver": driver_val,
            "customer_id": customer_val,
            "origin_id": origin,
            "destination_id": destination,
        }
        for (driver_val, customer_val, origin, destination) in zip(
            sample_drivers, sample_customers, *sample_location_pairs
        )
    ]
    assert_feature_service_entity_mapping_correctness(
        fs,
        feature_service_entity_mapping,
        entity_rows,
        full_feature_names,
        drivers_df,
        customers_df,
        orders_df,
        origins_df,
        destinations_df,
    )
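
# `get_latest_feature_values_from_dataframes` is shared test tooling; a
# minimal sketch consistent with its call sites above: for one entity row,
# take each source dataframe's most recent record (by event_timestamp) for
# that entity and merge them into one dict of expected feature values. The
# entity-key fallback and empty-order handling are illustrative.
def example_get_latest_feature_values_from_dataframes(
    driver_df, customer_df, orders_df, global_df, entity_row
):
    # entity rows key the driver by join key or by entity name, depending on
    # the test suite version
    driver_id = entity_row.get("driver_id", entity_row.get("driver"))
    customer_id = entity_row["customer_id"]

    latest = {}
    driver_rows = driver_df[driver_df["driver_id"] == driver_id]
    latest.update(driver_rows.sort_values("event_timestamp").iloc[-1].to_dict())

    customer_rows = customer_df[customer_df["customer_id"] == customer_id]
    latest.update(customer_rows.sort_values("event_timestamp").iloc[-1].to_dict())

    order_rows = orders_df[
        (orders_df["driver_id"] == driver_id)
        & (orders_df["customer_id"] == customer_id)
    ]
    if not order_rows.empty:
        latest.update(order_rows.sort_values("event_timestamp").iloc[-1].to_dict())

    # global stats have no entity key; just take the latest row
    latest.update(global_df.sort_values("event_timestamp").iloc[-1].to_dict())

    # request data (val_to_add, driver_age) passes straight through
    latest.update(entity_row)
    return latest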
def test_historical_features(environment, universal_data_sources, full_feature_names):
    store = environment.feature_store
    (entities, datasets, data_sources) = universal_data_sources
    feature_views = construct_universal_feature_views(data_sources)

    customer_df, driver_df, location_df, orders_df, global_df, entity_df = (
        datasets["customer"],
        datasets["driver"],
        datasets["location"],
        datasets["orders"],
        datasets["global"],
        datasets["entity"],
    )

    entity_df_with_request_data = entity_df.copy(deep=True)
    entity_df_with_request_data["val_to_add"] = [
        i for i in range(len(entity_df_with_request_data))
    ]
    entity_df_with_request_data["driver_age"] = [
        i + 100 for i in range(len(entity_df_with_request_data))
    ]

    (
        customer_fv,
        driver_fv,
        driver_odfv,
        location_fv,
        order_fv,
        global_fv,
        driver_age_request_fv,
    ) = (
        feature_views["customer"],
        feature_views["driver"],
        feature_views["driver_odfv"],
        feature_views["location"],
        feature_views["order"],
        feature_views["global"],
        feature_views["driver_age_request_fv"],
    )

    feature_service = FeatureService(
        name="convrate_plus100",
        features=[
            feature_views["driver"][["conv_rate"]],
            driver_odfv,
            driver_age_request_fv,
        ],
    )
    feature_service_entity_mapping = FeatureService(
        name="entity_mapping",
        features=[
            location_fv.with_name("origin").with_join_key_map(
                {"location_id": "origin_id"}
            ),
            location_fv.with_name("destination").with_join_key_map(
                {"location_id": "destination_id"}
            ),
        ],
    )

    feast_objects = []
    feast_objects.extend(
        [
            customer_fv,
            driver_fv,
            driver_odfv,
            location_fv,
            order_fv,
            global_fv,
            driver_age_request_fv,
            driver(),
            customer(),
            location(),
            feature_service,
            feature_service_entity_mapping,
        ]
    )
    store.apply(feast_objects)

    entity_df_query = None
    orders_table = table_name_from_data_source(data_sources["orders"])
    if orders_table:
        entity_df_query = f"SELECT customer_id, driver_id, order_id, origin_id, destination_id, event_timestamp FROM {orders_table}"

    event_timestamp = (
        DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL
        if DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL in orders_df.columns
        else "e_ts"
    )
    full_expected_df = get_expected_training_df(
        customer_df,
        customer_fv,
        driver_df,
        driver_fv,
        orders_df,
        order_fv,
        location_df,
        location_fv,
        global_df,
        global_fv,
        entity_df_with_request_data,
        event_timestamp,
        full_feature_names,
    )

    # The shadow entity features are only needed for the FeatureService test
    expected_df = full_expected_df.drop(
        columns=["origin__temperature", "destination__temperature"],
    )

    if entity_df_query:
        job_from_sql = store.get_historical_features(
            entity_df=entity_df_query,
            features=[
                "driver_stats:conv_rate",
                "driver_stats:avg_daily_trips",
                "customer_profile:current_balance",
                "customer_profile:avg_passenger_count",
                "customer_profile:lifetime_trip_count",
                "order:order_is_success",
                "global_stats:num_rides",
                "global_stats:avg_ride_length",
            ],
            full_feature_names=full_feature_names,
        )

        start_time = datetime.utcnow()
        actual_df_from_sql_entities = job_from_sql.to_df()
        end_time = datetime.utcnow()
        print(f"\nTime to execute job_from_sql.to_df() = '{(end_time - start_time)}'")

        # Not requesting the on demand transform with an entity_df query (can't add request data in them)
        expected_df_query = expected_df.drop(
            columns=[
                "conv_rate_plus_100",
                "conv_rate_plus_100_rounded",
                "val_to_add",
                "conv_rate_plus_val_to_add",
                "driver_age",
            ]
        )
        assert sorted(expected_df_query.columns) == sorted(
            actual_df_from_sql_entities.columns
        )

        actual_df_from_sql_entities = (
            actual_df_from_sql_entities[expected_df_query.columns]
            .sort_values(by=[event_timestamp, "order_id", "driver_id", "customer_id"])
            .drop_duplicates()
            .reset_index(drop=True)
        )
        expected_df_query = (
            expected_df_query.sort_values(
                by=[event_timestamp, "order_id", "driver_id", "customer_id"]
            )
            .drop_duplicates()
            .reset_index(drop=True)
        )
        assert_frame_equal(
            actual_df_from_sql_entities,
            expected_df_query,
            check_dtype=False,
        )

        table_from_sql_entities = job_from_sql.to_arrow()
        df_from_sql_entities = (
            table_from_sql_entities.to_pandas()[expected_df_query.columns]
            .sort_values(by=[event_timestamp, "order_id", "driver_id", "customer_id"])
            .drop_duplicates()
            .reset_index(drop=True)
        )

        for col in df_from_sql_entities.columns:
            expected_df_query[col] = expected_df_query[col].astype(
                df_from_sql_entities[col].dtype
            )

        assert_frame_equal(expected_df_query, df_from_sql_entities)

    job_from_df = store.get_historical_features(
        entity_df=entity_df_with_request_data,
        features=[
            "driver_stats:conv_rate",
            "driver_stats:avg_daily_trips",
            "customer_profile:current_balance",
            "customer_profile:avg_passenger_count",
            "customer_profile:lifetime_trip_count",
            "conv_rate_plus_100:conv_rate_plus_100",
            "conv_rate_plus_100:conv_rate_plus_100_rounded",
            "conv_rate_plus_100:conv_rate_plus_val_to_add",
            "order:order_is_success",
            "global_stats:num_rides",
            "global_stats:avg_ride_length",
            "driver_age:driver_age",
        ],
        full_feature_names=full_feature_names,
    )

    start_time = datetime.utcnow()
    actual_df_from_df_entities = job_from_df.to_df()
    print(f"actual_df_from_df_entities shape: {actual_df_from_df_entities.shape}")
    end_time = datetime.utcnow()
    print(f"Time to execute job_from_df.to_df() = '{(end_time - start_time)}'\n")

    assert sorted(expected_df.columns) == sorted(actual_df_from_df_entities.columns)
    expected_df: pd.DataFrame = (
        expected_df.sort_values(
            by=[event_timestamp, "order_id", "driver_id", "customer_id"]
        )
        .drop_duplicates()
        .reset_index(drop=True)
    )
    actual_df_from_df_entities = (
        actual_df_from_df_entities[expected_df.columns]
        .sort_values(by=[event_timestamp, "order_id", "driver_id", "customer_id"])
        .drop_duplicates()
        .reset_index(drop=True)
    )

    assert_frame_equal(
        expected_df,
        actual_df_from_df_entities,
        check_dtype=False,
    )
    assert_feature_service_correctness(
        store,
        feature_service,
        full_feature_names,
        entity_df_with_request_data,
        expected_df,
        event_timestamp,
    )
    assert_feature_service_entity_mapping_correctness(
        store,
        feature_service_entity_mapping,
        full_feature_names,
        entity_df_with_request_data,
        full_expected_df,
        event_timestamp,
    )

    table_from_df_entities: pd.DataFrame = job_from_df.to_arrow().to_pandas()

    columns_expected_in_table = expected_df.columns.tolist()
    table_from_df_entities = (
        table_from_df_entities[columns_expected_in_table]
        .sort_values(by=[event_timestamp, "order_id", "driver_id", "customer_id"])
        .drop_duplicates()
        .reset_index(drop=True)
    )
    assert_frame_equal(actual_df_from_df_entities, table_from_df_entities)

    # If request data is missing that's needed for on demand transform, throw an error
    with pytest.raises(RequestDataNotFoundInEntityDfException):
        store.get_historical_features(
            entity_df=entity_df,
            features=[
                "driver_stats:conv_rate",
                "driver_stats:avg_daily_trips",
                "customer_profile:current_balance",
                "customer_profile:avg_passenger_count",
                "customer_profile:lifetime_trip_count",
                "conv_rate_plus_100:conv_rate_plus_100",
                "conv_rate_plus_100:conv_rate_plus_val_to_add",
                "global_stats:num_rides",
                "global_stats:avg_ride_length",
            ],
            full_feature_names=full_feature_names,
        )

    # If request data is missing that's needed for a request feature view, throw an error
    with pytest.raises(RequestDataNotFoundInEntityDfException):
        store.get_historical_features(
            entity_df=entity_df,
            features=[
                "driver_stats:conv_rate",
                "driver_stats:avg_daily_trips",
                "customer_profile:current_balance",
                "customer_profile:avg_passenger_count",
                "customer_profile:lifetime_trip_count",
                "driver_age:driver_age",
                "global_stats:num_rides",
                "global_stats:avg_ride_length",
            ],
            full_feature_names=full_feature_names,
        )
def test_online_retrieval(environment, universal_data_sources, full_feature_names):
    fs = environment.feature_store
    entities, datasets, data_sources = universal_data_sources
    feature_views = construct_universal_feature_views(data_sources)

    feature_service = FeatureService(
        "convrate_plus100",
        features=[
            feature_views["driver"][["conv_rate"]],
            feature_views["driver_odfv"],
        ],
    )

    feast_objects = []
    feast_objects.extend(feature_views.values())
    feast_objects.extend([driver(), customer(), feature_service])
    fs.apply(feast_objects)
    fs.materialize(
        environment.start_date - timedelta(days=1),
        environment.end_date + timedelta(days=1),
    )

    entity_sample = datasets["orders"].sample(10)[
        ["customer_id", "driver_id", "order_id", "event_timestamp"]
    ]
    orders_df = datasets["orders"][
        (
            datasets["orders"]["customer_id"].isin(entity_sample["customer_id"])
            & datasets["orders"]["driver_id"].isin(entity_sample["driver_id"])
        )
    ]

    sample_drivers = entity_sample["driver_id"]
    drivers_df = datasets["driver"][
        datasets["driver"]["driver_id"].isin(sample_drivers)
    ]

    sample_customers = entity_sample["customer_id"]
    customers_df = datasets["customer"][
        datasets["customer"]["customer_id"].isin(sample_customers)
    ]

    global_df = datasets["global"]

    entity_rows = [
        {"driver": d, "customer_id": c, "val_to_add": 50}
        for (d, c) in zip(sample_drivers, sample_customers)
    ]

    feature_refs = [
        "driver_stats:conv_rate",
        "driver_stats:avg_daily_trips",
        "customer_profile:current_balance",
        "customer_profile:avg_passenger_count",
        "customer_profile:lifetime_trip_count",
        "conv_rate_plus_100:conv_rate_plus_100",
        "conv_rate_plus_100:conv_rate_plus_val_to_add",
        "order:order_is_success",
        "global_stats:num_rides",
        "global_stats:avg_ride_length",
    ]
    unprefixed_feature_refs = [f.rsplit(":", 1)[-1] for f in feature_refs if ":" in f]
    # Remove the on demand feature view output features, since they're not present in the source dataframe
    unprefixed_feature_refs.remove("conv_rate_plus_100")
    unprefixed_feature_refs.remove("conv_rate_plus_val_to_add")

    online_features = fs.get_online_features(
        features=feature_refs,
        entity_rows=entity_rows,
        full_feature_names=full_feature_names,
    )
    assert online_features is not None

    online_features_dict = online_features.to_dict()
    keys = online_features_dict.keys()
    assert (
        len(keys) == len(feature_refs) + 3
    )  # Add three for the driver id and customer id entity keys, plus the val_to_add request data.
    for feature in feature_refs:
        if full_feature_names:
            assert feature.replace(":", "__") in keys
        else:
            assert feature.rsplit(":", 1)[-1] in keys

    assert (
        "driver_stats" not in keys
        and "customer_profile" not in keys
        and "order" not in keys
        and "global_stats" not in keys
    )

    tc = unittest.TestCase()
    for i, entity_row in enumerate(entity_rows):
        df_features = get_latest_feature_values_from_dataframes(
            drivers_df, customers_df, orders_df, global_df, entity_row
        )

        assert df_features["customer_id"] == online_features_dict["customer_id"][i]
        assert df_features["driver_id"] == online_features_dict["driver_id"][i]
        assert (
            online_features_dict[
                response_feature_name("conv_rate_plus_100", full_feature_names)
            ][i]
            == df_features["conv_rate"] + 100
        )
        assert (
            online_features_dict[
                response_feature_name("conv_rate_plus_val_to_add", full_feature_names)
            ][i]
            == df_features["conv_rate"] + df_features["val_to_add"]
        )
        for unprefixed_feature_ref in unprefixed_feature_refs:
            tc.assertEqual(
                df_features[unprefixed_feature_ref],
                online_features_dict[
                    response_feature_name(unprefixed_feature_ref, full_feature_names)
                ][i],
            )

    # Check what happens for missing values
    missing_responses_dict = fs.get_online_features(
        features=feature_refs,
        entity_rows=[{"driver": 0, "customer_id": 0, "val_to_add": 100}],
        full_feature_names=full_feature_names,
    ).to_dict()
    assert missing_responses_dict is not None
    for unprefixed_feature_ref in unprefixed_feature_refs:
        if unprefixed_feature_ref not in {"num_rides", "avg_ride_length"}:
            tc.assertIsNone(
                missing_responses_dict[
                    response_feature_name(unprefixed_feature_ref, full_feature_names)
                ][0]
            )

    # Check what happens for missing request data
    with pytest.raises(RequestDataNotFoundInEntityRowsException):
        fs.get_online_features(
            features=feature_refs,
            entity_rows=[{"driver": 0, "customer_id": 0}],
            full_feature_names=full_feature_names,
        ).to_dict()

    assert_feature_service_correctness(
        fs,
        feature_service,
        entity_rows,
        full_feature_names,
        drivers_df,
        customers_df,
        orders_df,
        global_df,
    )
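
# `assert_feature_service_correctness` is shared test tooling; a minimal
# sketch consistent with the call above, assuming the two-argument form of
# response_feature_name used in this file. It retrieves features through the
# FeatureService handle (get_online_features accepts a FeatureService in
# place of raw feature refs) and spot-checks the on demand output against
# values recomputed from the source dataframes.
def example_assert_feature_service_correctness(
    fs,
    feature_service,
    entity_rows,
    full_feature_names,
    drivers_df,
    customers_df,
    orders_df,
    global_df,
):
    service_response = fs.get_online_features(
        features=feature_service,
        entity_rows=entity_rows,
        full_feature_names=full_feature_names,
    ).to_dict()
    key = response_feature_name("conv_rate_plus_100", full_feature_names)
    for i, entity_row in enumerate(entity_rows):
        expected = get_latest_feature_values_from_dataframes(
            drivers_df, customers_df, orders_df, global_df, entity_row
        )
        # the service includes conv_rate_plus_100 from the on demand view
        assert abs(service_response[key][i] - (expected["conv_rate"] + 100)) < 1e-4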