def test_diff_between_feature_views(simple_dataset_1):
    """diff_between reports no property diffs for identical feature views,
    and exactly one diff (the changed ``tags``) for a tags-only change."""
    with prep_file_source(df=simple_dataset_1, event_timestamp_column="ts_1") as file_source:
        before_proto = FeatureView(
            name="fv2",
            entities=["id"],
            batch_source=file_source,
            ttl=None,
            tags={"when": "before"},
        ).to_proto()
        after_proto = FeatureView(
            name="fv2",
            entities=["id"],
            batch_source=file_source,
            ttl=None,
            tags={"when": "after"},
        ).to_proto()

        # Comparing a proto against itself must yield an empty diff.
        same_diff = diff_between(before_proto, before_proto, "feature view")
        assert len(same_diff.fco_property_diffs) == 0

        # Only the tags differ -> exactly one property diff describing them.
        changed_diff = diff_between(before_proto, after_proto, "feature view")
        assert len(changed_diff.fco_property_diffs) == 1
        tag_diff = changed_diff.fco_property_diffs[0]
        assert tag_diff.property_name == "tags"
        assert tag_diff.val_existing == {"when": "before"}
        assert tag_diff.val_declared == {"when": "after"}
def test_feature_view_inference_success(test_feature_store, dataframe_source):
    """Feature schemas inferred from a file source and from both BigQuery
    source styles should all agree on the expected columns and types."""
    with prep_file_source(
        df=dataframe_source, event_timestamp_column="ts_1"
    ) as file_source:

        def build_view(name, source):
            # All three views share the same shape; only name/source differ.
            return FeatureView(
                name=name,
                entities=["id"],
                ttl=timedelta(minutes=5),
                online=True,
                batch_source=source,
                tags={},
            )

        fv1 = build_view("fv1", file_source)
        fv2 = build_view(
            "fv2", simple_bq_source_using_table_ref_arg(dataframe_source, "ts_1")
        )
        fv3 = build_view(
            "fv3", simple_bq_source_using_query_arg(dataframe_source, "ts_1")
        )

        # Register Feature Views
        test_feature_store.apply([fv1, fv2, fv3])

        stored = test_feature_store.list_feature_views()
        schemas = [
            {(feature.name, feature.dtype) for feature in view.features}
            for view in stored[:3]
        ]

        expected = {
            ("float_col", ValueType.DOUBLE),
            ("int64_col", ValueType.INT64),
            ("string_col", ValueType.STRING),
        }
        # Every source type must produce the same inferred schema.
        assert schemas[0] == schemas[1] == schemas[2] == expected

        test_feature_store.teardown()
def test_update_entities_with_inferred_types_from_feature_views(
    simple_dataset_1, simple_dataset_2
):
    """Entity value types are inferred from feature view sources; conflicting
    inferences across multiple views raise RegistryInferenceFailure."""
    with prep_file_source(
        df=simple_dataset_1, event_timestamp_column="ts_1"
    ) as file_source, prep_file_source(
        df=simple_dataset_2, event_timestamp_column="ts_1"
    ) as file_source_2:
        view_a = FeatureView(
            name="fv1",
            entities=["id"],
            batch_source=file_source,
            ttl=None,
        )
        view_b = FeatureView(
            name="fv2",
            entities=["id"],
            batch_source=file_source_2,
            ttl=None,
        )

        entity_a = Entity(name="id", join_key="id_join_key")
        entity_b = Entity(name="id", join_key="id_join_key")

        update_entities_with_inferred_types_from_feature_views(
            [entity_a], [view_a], RepoConfig(provider="local", project="test")
        )
        update_entities_with_inferred_types_from_feature_views(
            [entity_b], [view_b], RepoConfig(provider="local", project="test")
        )

        # Each dataset drives a different inferred value type for "id".
        assert entity_a == Entity(
            name="id", join_key="id_join_key", value_type=ValueType.INT64
        )
        assert entity_b == Entity(
            name="id", join_key="id_join_key", value_type=ValueType.STRING
        )

        with pytest.raises(RegistryInferenceFailure):
            # two viable data types
            update_entities_with_inferred_types_from_feature_views(
                [Entity(name="id", join_key="id_join_key")],
                [view_a, view_b],
                RepoConfig(provider="local", project="test"),
            )
def test_reapply_feature_view_success(test_feature_store, dataframe_source):
    """Re-applying an unchanged feature view keeps its materialization
    intervals; applying a modified schema resets them to zero."""
    with prep_file_source(
        df=dataframe_source, event_timestamp_column="ts_1"
    ) as file_source:
        entity = Entity(
            name="id", join_keys=["id_join_key"], value_type=ValueType.STRING
        )

        def build_view(field):
            # Same view name/source throughout; only the schema field varies.
            return FeatureView(
                name="my_feature_view_1",
                schema=[field],
                entities=["id"],
                batch_source=file_source,
                ttl=timedelta(minutes=5),
            )

        def interval_count():
            # Re-fetch the stored view so we observe registry state.
            stored = test_feature_store.get_feature_view("my_feature_view_1")
            return len(stored.materialization_intervals)

        # Create and register Feature View
        fv1 = build_view(Field(name="string_col", dtype=String))
        test_feature_store.apply([fv1, entity])
        assert interval_count() == 0

        # Materialization records one interval
        test_feature_store.materialize(datetime(2020, 1, 1), datetime(2021, 1, 1))
        assert interval_count() == 1

        # Re-applying the identical view must not clear the interval
        test_feature_store.apply([fv1])
        assert interval_count() == 1

        # Changing the schema and re-applying resets materialization state
        fv1 = build_view(Field(name="int64_col", dtype=Int64))
        test_feature_store.apply([fv1])
        assert interval_count() == 0

        test_feature_store.teardown()
def test_tag_objects_for_keep_delete_update_add(simple_dataset_1):
    """Registry objects are partitioned into keep/delete/update/add buckets
    when existing objects are compared against newly declared ones."""
    with prep_file_source(
        df=simple_dataset_1, event_timestamp_column="ts_1"
    ) as file_source:

        def build_view(name, **extra):
            # Shared skeleton; ``extra`` carries per-view kwargs (e.g. tags).
            return FeatureView(
                name=name,
                entities=["id"],
                batch_source=file_source,
                ttl=None,
                **extra,
            )

        to_delete = build_view("to_delete")
        unchanged_fv = build_view("fv1")
        pre_changed = build_view("fv2", tags={"when": "before"})
        post_changed = build_view("fv2", tags={"when": "after"})
        to_add = build_view("to_add")

        keep, delete, update, add = tag_objects_for_keep_delete_update_add(
            [unchanged_fv, pre_changed, to_delete],
            [unchanged_fv, post_changed, to_add],
        )

        # "keep": names present on both sides, keyed by the EXISTING object.
        assert len(list(keep)) == 2
        assert unchanged_fv in keep
        assert pre_changed in keep
        assert post_changed not in keep

        # "delete": present only in the existing set.
        assert len(list(delete)) == 1
        assert to_delete in delete

        # "update": the DECLARED versions of objects present on both sides.
        assert len(list(update)) == 2
        assert unchanged_fv in update
        assert post_changed in update
        assert pre_changed not in update

        # "add": present only in the declared set.
        assert len(list(add)) == 1
        assert to_add in add
def test_update_data_sources_with_inferred_event_timestamp_col(simple_dataset_1):
    """A single viable timestamp column is inferred for every source type;
    two viable candidates raise RegistryInferenceFailure."""
    # Clone the dataset and duplicate ts_1 so two candidates exist.
    two_ts_df = simple_dataset_1.copy(deep=True)
    two_ts_df["ts_2"] = simple_dataset_1["ts_1"]

    with prep_file_source(df=simple_dataset_1) as file_source:
        data_sources = [
            file_source,
            simple_bq_source_using_table_ref_arg(simple_dataset_1),
            simple_bq_source_using_query_arg(simple_dataset_1),
        ]
        update_data_sources_with_inferred_event_timestamp_col(
            data_sources, RepoConfig(provider="local", project="test")
        )
        inferred = [source.event_timestamp_column for source in data_sources]
        assert inferred == ["ts_1", "ts_1", "ts_1"]

    with prep_file_source(df=two_ts_df) as file_source:
        with pytest.raises(RegistryInferenceFailure):
            # two viable event_timestamp_columns
            update_data_sources_with_inferred_event_timestamp_col(
                [file_source], RepoConfig(provider="local", project="test")
            )
def test_write_to_online_store_event_check(local_redis_environment):
    """Verify event-timestamp upsert semantics of ``write_to_online_store``:
    a write replaces a stored feature value only when its event timestamp is
    at least as recent as the value already stored; stale writes are ignored.

    Fixed defect: the original comment before the hour-delayed write claimed it
    "should now overwrite" id=123, but the assertion right after it checks the
    value is UNCHANGED ("hi_123") — the comment now matches the asserted
    behavior.
    """
    # Skip in purely local test runs (no Redis available).
    if os.getenv("FEAST_IS_LOCAL_TEST", "False") == "True":
        return
    fs = local_redis_environment.feature_store

    # Write the same data points three times with different timestamps.
    now = pd.Timestamp(datetime.datetime.utcnow()).round("ms")
    hour_ago = pd.Timestamp(datetime.datetime.utcnow() - timedelta(hours=1)).round("ms")
    latest = pd.Timestamp(datetime.datetime.utcnow() + timedelta(seconds=1)).round("ms")

    data = {
        "id": [123, 567, 890],
        "string_col": ["OLD_FEATURE", "LATEST_VALUE2", "LATEST_VALUE3"],
        "ts_1": [hour_ago, now, now],
    }
    dataframe_source = pd.DataFrame(data)
    with prep_file_source(
        df=dataframe_source, event_timestamp_column="ts_1"
    ) as file_source:
        e = Entity(name="id", value_type=ValueType.STRING)

        # Create Feature View
        fv1 = FeatureView(
            name="feature_view_123",
            schema=[Field(name="string_col", dtype=String)],
            entities=["id"],
            batch_source=file_source,
            ttl=timedelta(minutes=5),
        )
        # Register Feature View and Entity
        fs.apply([fv1, e])

        # data to ingest into Online Store (recent)
        data = {
            "id": [123],
            "string_col": ["hi_123"],
            "ts_1": [now],
        }
        df_data = pd.DataFrame(data)

        # directly ingest data into the Online Store
        fs.write_to_online_store("feature_view_123", df_data)

        df = fs.get_online_features(
            features=["feature_view_123:string_col"],
            entity_rows=[{"id": 123}],
        ).to_df()
        assert df["string_col"].iloc[0] == "hi_123"

        # data to ingest into Online Store (1 hour delayed data).
        # NOTE: this must NOT overwrite the feature for id=123, because its
        # event timestamp is OLDER than the value already stored; ids 567 and
        # 890 are fresh inserts and do get written.
        data = {
            "id": [123, 567, 890],
            "string_col": ["bye_321", "hello_123", "greetings_321"],
            "ts_1": [hour_ago, hour_ago, hour_ago],
        }
        df_data = pd.DataFrame(data)

        # directly ingest data into the Online Store
        fs.write_to_online_store("feature_view_123", df_data)

        df = fs.get_online_features(
            features=["feature_view_123:string_col"],
            entity_rows=[{"id": 123}, {"id": 567}, {"id": 890}],
        ).to_df()
        # id=123 keeps its newer value; the other ids take the delayed values.
        assert df["string_col"].iloc[0] == "hi_123"
        assert df["string_col"].iloc[1] == "hello_123"
        assert df["string_col"].iloc[2] == "greetings_321"

        # should overwrite string_col for id=123 because it's most recent
        # based on event_timestamp
        data = {
            "id": [123],
            "string_col": ["LATEST_VALUE"],
            "ts_1": [latest],
        }
        df_data = pd.DataFrame(data)
        fs.write_to_online_store("feature_view_123", df_data)

        df = fs.get_online_features(
            features=["feature_view_123:string_col"],
            entity_rows=[{"id": 123}, {"id": 567}, {"id": 890}],
        ).to_df()
        assert df["string_col"].iloc[0] == "LATEST_VALUE"
        assert df["string_col"].iloc[1] == "hello_123"
        assert df["string_col"].iloc[2] == "greetings_321"

        # writes to online store via datasource (dataframe_source) materialization
        fs.materialize(
            start_date=datetime.datetime.now() - timedelta(hours=12),
            end_date=datetime.datetime.utcnow(),
        )

        df = fs.get_online_features(
            features=["feature_view_123:string_col"],
            entity_rows=[{"id": 123}, {"id": 567}, {"id": 890}],
        ).to_df()
        # Materialized source rows replace stored values only where their
        # event timestamps are newer (ids 567/890); id=123 keeps LATEST_VALUE.
        assert df["string_col"].iloc[0] == "LATEST_VALUE"
        assert df["string_col"].iloc[1] == "LATEST_VALUE2"
        assert df["string_col"].iloc[2] == "LATEST_VALUE3"