def test_apply_feature_view_success(test_feature_store):
    # Create Feature Views
    batch_source = FileSource(
        file_format=ParquetFormat(),
        path="file://feast/*",
        timestamp_field="ts_col",
        created_timestamp_column="timestamp",
        date_partition_column="date_partition_col",
    )
    fv1 = FeatureView(
        name="my_feature_view_1",
        schema=[
            Field(name="fs1_my_feature_1", dtype=Int64),
            Field(name="fs1_my_feature_2", dtype=String),
            Field(name="fs1_my_feature_3", dtype=Array(String)),
            Field(name="fs1_my_feature_4", dtype=Array(Bytes)),
        ],
        entities=["fs1_my_entity_1"],
        tags={"team": "matchmaking"},
        batch_source=batch_source,
        ttl=timedelta(minutes=5),
    )

    # Register Feature View
    test_feature_store.apply([fv1])

    feature_views = test_feature_store.list_feature_views()

    # List Feature Views
    assert (
        len(feature_views) == 1
        and feature_views[0].name == "my_feature_view_1"
        and feature_views[0].features[0].name == "fs1_my_feature_1"
        and feature_views[0].features[0].dtype == Int64
        and feature_views[0].features[1].name == "fs1_my_feature_2"
        and feature_views[0].features[1].dtype == String
        and feature_views[0].features[2].name == "fs1_my_feature_3"
        and feature_views[0].features[2].dtype == Array(String)
        and feature_views[0].features[3].name == "fs1_my_feature_4"
        and feature_views[0].features[3].dtype == Array(Bytes)
        and feature_views[0].entities[0] == "fs1_my_entity_1"
    )

    test_feature_store.teardown()
def test_apply_feature_view_success(test_feature_store):
    # Create Feature Views (legacy API variant of the test above: Feature/ValueType
    # and the `input`/`file_url` arguments instead of Field/schema/batch_source/path)
    batch_source = FileSource(
        file_format=ParquetFormat(),
        file_url="file://feast/*",
        event_timestamp_column="ts_col",
        created_timestamp_column="timestamp",
        date_partition_column="date_partition_col",
    )
    fv1 = FeatureView(
        name="my_feature_view_1",
        features=[
            Feature(name="fs1_my_feature_1", dtype=ValueType.INT64),
            Feature(name="fs1_my_feature_2", dtype=ValueType.STRING),
            Feature(name="fs1_my_feature_3", dtype=ValueType.STRING_LIST),
            Feature(name="fs1_my_feature_4", dtype=ValueType.BYTES_LIST),
        ],
        entities=["fs1_my_entity_1"],
        tags={"team": "matchmaking"},
        input=batch_source,
        ttl=timedelta(minutes=5),
    )

    # Register Feature View
    test_feature_store.apply([fv1])

    feature_views = test_feature_store.list_feature_views()

    # List Feature Views
    assert (
        len(feature_views) == 1
        and feature_views[0].name == "my_feature_view_1"
        and feature_views[0].features[0].name == "fs1_my_feature_1"
        and feature_views[0].features[0].dtype == ValueType.INT64
        and feature_views[0].features[1].name == "fs1_my_feature_2"
        and feature_views[0].features[1].dtype == ValueType.STRING
        and feature_views[0].features[2].name == "fs1_my_feature_3"
        and feature_views[0].features[2].dtype == ValueType.STRING_LIST
        and feature_views[0].features[3].name == "fs1_my_feature_4"
        and feature_views[0].features[3].dtype == ValueType.BYTES_LIST
        and feature_views[0].entities[0] == "fs1_my_entity_1"
    )
def get_feature_view(self, name: str, project: str) -> FeatureView:
    """
    Retrieves a feature view.

    Args:
        name: Name of feature view
        project: Feast project that this feature view belongs to

    Returns:
        The specified feature view, if it exists; raises an exception otherwise.
    """
    registry_proto = self._get_registry_proto()
    for feature_view_proto in registry_proto.feature_views:
        if (
            feature_view_proto.spec.name == name
            and feature_view_proto.spec.project == project
        ):
            return FeatureView.from_proto(feature_view_proto)
    raise Exception(f"Feature view {name} does not exist in project {project}")
def list_feature_views(
    self, project: str, allow_cache: bool = False
) -> List[FeatureView]:
    """
    Retrieve a list of feature views from the registry.

    Args:
        allow_cache: Allow returning feature views from the cached registry
        project: Filter feature views based on project name

    Returns:
        List of feature views
    """
    registry_proto = self._get_registry_proto(allow_cache=allow_cache)
    feature_views = []
    for feature_view_proto in registry_proto.feature_views:
        if feature_view_proto.spec.project == project:
            feature_views.append(FeatureView.from_proto(feature_view_proto))
    return feature_views
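# A minimal usage sketch for the method above (hedged: `registry` is an assumed
# Registry instance and "project" an existing project, not part of the original):
cached_views = registry.list_feature_views("project", allow_cache=True)
for view in cached_views:
    # Each element is a full FeatureView reconstructed from its proto.
    print(view.name, [feature.name for feature in view.features])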
def from_proto(cls, on_demand_feature_view_proto: OnDemandFeatureViewProto):
    """
    Creates an on demand feature view from a protobuf representation.

    Args:
        on_demand_feature_view_proto: A protobuf representation of an on-demand
            feature view.

    Returns:
        An OnDemandFeatureView object based on the on-demand feature view protobuf.
    """
    inputs = {}
    for (
        input_name,
        on_demand_input,
    ) in on_demand_feature_view_proto.spec.inputs.items():
        if on_demand_input.WhichOneof("input") == "feature_view":
            inputs[input_name] = FeatureView.from_proto(on_demand_input.feature_view)
        else:
            inputs[input_name] = RequestDataSource.from_proto(
                on_demand_input.request_data_source
            )
    on_demand_feature_view_obj = cls(
        name=on_demand_feature_view_proto.spec.name,
        features=[
            Feature(
                name=feature.name,
                dtype=ValueType(feature.value_type),
                labels=dict(feature.labels),
            )
            for feature in on_demand_feature_view_proto.spec.features
        ],
        inputs=inputs,
        udf=dill.loads(
            on_demand_feature_view_proto.spec.user_defined_function.body
        ),
    )

    # FeatureViewProjections are not saved in the OnDemandFeatureView proto.
    # Create the default projection.
    on_demand_feature_view_obj.projection = FeatureViewProjection.from_definition(
        on_demand_feature_view_obj
    )

    return on_demand_feature_view_obj
def updater(registry_proto: RegistryProto):
    for idx, existing_feature_view_proto in enumerate(
        registry_proto.feature_views
    ):
        if (
            existing_feature_view_proto.spec.name == feature_view.name
            and existing_feature_view_proto.spec.project == project
        ):
            existing_feature_view = FeatureView.from_proto(
                existing_feature_view_proto
            )
            existing_feature_view.materialization_intervals.append(
                (start_date, end_date)
            )
            feature_view_proto = existing_feature_view.to_proto()
            feature_view_proto.spec.project = project
            del registry_proto.feature_views[idx]
            registry_proto.feature_views.append(feature_view_proto)
            return registry_proto
    raise FeatureViewNotFoundException(feature_view.name, project)
def apply_materialization(
    self,
    feature_view: FeatureView,
    project: str,
    start_date: datetime,
    end_date: datetime,
    commit: bool = True,
):
    """
    Updates materialization intervals tracked for a single feature view in Feast.

    Args:
        feature_view: Feature view that will be updated with an additional
            materialization interval tracked
        project: Feast project that this feature view belongs to
        start_date (datetime): Start date of the materialization interval to track
        end_date (datetime): End date of the materialization interval to track
        commit: Whether the change should be persisted immediately
    """
    self._prepare_registry_for_changes()
    assert self.cached_registry_proto

    for idx, existing_feature_view_proto in enumerate(
        self.cached_registry_proto.feature_views
    ):
        if (
            existing_feature_view_proto.spec.name == feature_view.name
            and existing_feature_view_proto.spec.project == project
        ):
            existing_feature_view = FeatureView.from_proto(
                existing_feature_view_proto
            )
            existing_feature_view.materialization_intervals.append(
                (start_date, end_date)
            )
            existing_feature_view.last_updated_timestamp = datetime.utcnow()
            feature_view_proto = existing_feature_view.to_proto()
            feature_view_proto.spec.project = project
            del self.cached_registry_proto.feature_views[idx]
            self.cached_registry_proto.feature_views.append(feature_view_proto)
            if commit:
                self.commit()
            return

    raise FeatureViewNotFoundException(feature_view.name, project)
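# A minimal usage sketch (hedged: `registry`, `driver_fv`, and the interval
# bounds are assumed names, not part of the original snippet). It records one
# materialization interval and reads it back via the registry getter above:
registry.apply_materialization(
    feature_view=driver_fv,
    project="project",
    start_date=datetime(2022, 1, 1),
    end_date=datetime(2022, 1, 2),
)
stored = registry.get_feature_view(driver_fv.name, "project")
assert (datetime(2022, 1, 1), datetime(2022, 1, 2)) in stored.materialization_intervals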
def updater(registry_proto: RegistryProto):
    for idx, existing_feature_view_proto in enumerate(
        registry_proto.feature_views
    ):
        if (
            existing_feature_view_proto.spec.name == feature_view_proto.spec.name
            and existing_feature_view_proto.spec.project == project
        ):
            # Do not update if the feature view has not changed; updating would
            # erase tracked materialization intervals.
            if (
                FeatureView.from_proto(existing_feature_view_proto)
                == feature_view
            ):
                return registry_proto
            else:
                del registry_proto.feature_views[idx]
                registry_proto.feature_views.append(feature_view_proto)
                return registry_proto
    registry_proto.feature_views.append(feature_view_proto)
    return registry_proto
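# A minimal idempotence sketch for the "do not update if unchanged" branch above
# (hedged: `registry` and `driver_fv` are assumed names; `apply_feature_view` is
# the registry method used by the tests later in this section):
registry.apply_feature_view(driver_fv, "project")
registry.apply_feature_view(driver_fv, "project")  # no-op: proto is unchanged
assert len(registry.list_feature_views("project")) == 1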
def _localize_feature_view(self, feature_view: FeatureView):
    """
    Ensures that the `FeatureView` object points to files on the local disk.
    """
    if not isinstance(feature_view.batch_source, FileSource):
        return

    # Copy parquet file to a local file
    file_source: FileSource = feature_view.batch_source
    random_local_path = FlyteContext.current_context().file_access.get_random_local_path(
        file_source.path
    )
    FlyteContext.current_context().file_access.get_data(
        file_source.path,
        random_local_path,
        is_multipart=True,
    )
    feature_view.batch_source = FileSource(
        path=random_local_path,
        event_timestamp_column=file_source.event_timestamp_column,
    )
def get_feature_view(
    self, name: str, project: str, allow_cache: bool = False
) -> FeatureView:
    """
    Retrieves a feature view.

    Args:
        name: Name of feature view
        project: Feast project that this feature view belongs to
        allow_cache: Allow returning feature view from the cached registry

    Returns:
        The specified feature view, if it exists; raises an exception otherwise.
    """
    registry_proto = self._get_registry_proto(allow_cache=allow_cache)
    for feature_view_proto in registry_proto.feature_views:
        if (
            feature_view_proto.spec.name == name
            and feature_view_proto.spec.project == project
        ):
            return FeatureView.from_proto(feature_view_proto)
    raise FeatureViewNotFoundException(name, project)
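# A minimal usage sketch (hedged: `registry` is an assumed Registry instance;
# FeatureViewNotFoundException is the exception type raised above):
try:
    fv = registry.get_feature_view("my_feature_view_1", "project", allow_cache=True)
except FeatureViewNotFoundException:
    fv = None  # fall back, e.g. apply the view before reading it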
def test_hash():
    file_source = FileSource(name="my-file-source", path="test.parquet")
    feature_view = FeatureView(
        name="my-feature-view",
        entities=[],
        schema=[
            Field(name="feature1", dtype=Float32),
            Field(name="feature2", dtype=Float32),
        ],
        source=file_source,
    )
    feature_service_1 = FeatureService(
        name="my-feature-service", features=[feature_view[["feature1", "feature2"]]]
    )
    feature_service_2 = FeatureService(
        name="my-feature-service", features=[feature_view[["feature1", "feature2"]]]
    )
    feature_service_3 = FeatureService(
        name="my-feature-service", features=[feature_view[["feature1"]]]
    )
    feature_service_4 = FeatureService(
        name="my-feature-service",
        features=[feature_view[["feature1"]]],
        description="test",
    )

    # Identical definitions collapse to a single set entry.
    s1 = {feature_service_1, feature_service_2}
    assert len(s1) == 1

    # A different feature list makes the services distinct.
    s2 = {feature_service_1, feature_service_3}
    assert len(s2) == 2

    # Differing only in description is still a distinct definition.
    s3 = {feature_service_3, feature_service_4}
    assert len(s3) == 2

    s4 = {feature_service_1, feature_service_2, feature_service_3, feature_service_4}
    assert len(s4) == 3
def test_feature_view_inference_success(test_feature_store, dataframe_source):
    with prep_file_source(
        df=dataframe_source, event_timestamp_column="ts_1"
    ) as file_source:
        entity = Entity(name="id", join_key="id_join_key", value_type=ValueType.INT64)
        fv1 = FeatureView(
            name="fv1",
            entities=["id"],
            ttl=timedelta(minutes=5),
            online=True,
            batch_source=file_source,
            tags={},
        )
        fv2 = FeatureView(
            name="fv2",
            entities=["id"],
            ttl=timedelta(minutes=5),
            online=True,
            batch_source=simple_bq_source_using_table_ref_arg(dataframe_source, "ts_1"),
            tags={},
        )
        fv3 = FeatureView(
            name="fv3",
            entities=["id"],
            ttl=timedelta(minutes=5),
            online=True,
            batch_source=simple_bq_source_using_query_arg(dataframe_source, "ts_1"),
            tags={},
        )

        # Register Feature Views
        test_feature_store.apply([entity, fv1, fv2, fv3])

        feature_view_1 = test_feature_store.list_feature_views()[0]
        feature_view_2 = test_feature_store.list_feature_views()[1]
        feature_view_3 = test_feature_store.list_feature_views()[2]

        actual_file_source = {
            (feature.name, feature.dtype) for feature in feature_view_1.features
        }
        actual_bq_using_table_ref_arg_source = {
            (feature.name, feature.dtype) for feature in feature_view_2.features
        }
        actual_bq_using_query_arg_source = {
            (feature.name, feature.dtype) for feature in feature_view_3.features
        }
        expected = {
            ("float_col", ValueType.DOUBLE),
            ("int64_col", ValueType.INT64),
            ("string_col", ValueType.STRING),
        }

        assert (
            expected
            == actual_file_source
            == actual_bq_using_table_ref_arg_source
            == actual_bq_using_query_arg_source
        )

        test_feature_store.teardown()
def test_modify_feature_views_success(test_registry, request_source_schema):
    # Create Feature Views
    batch_source = FileSource(
        file_format=ParquetFormat(),
        path="file://feast/*",
        timestamp_field="ts_col",
        created_timestamp_column="timestamp",
    )

    request_source = RequestSource(
        name="request_source",
        schema=request_source_schema,
    )

    fv1 = FeatureView(
        name="my_feature_view_1",
        schema=[Field(name="fs1_my_feature_1", dtype=Int64)],
        entities=["fs1_my_entity_1"],
        tags={"team": "matchmaking"},
        batch_source=batch_source,
        ttl=timedelta(minutes=5),
    )

    @on_demand_feature_view(
        features=[
            Feature(name="odfv1_my_feature_1", dtype=ValueType.STRING),
            Feature(name="odfv1_my_feature_2", dtype=ValueType.INT32),
        ],
        sources=[request_source],
    )
    def odfv1(feature_df: pd.DataFrame) -> pd.DataFrame:
        data = pd.DataFrame()
        data["odfv1_my_feature_1"] = feature_df["my_input_1"].astype("category")
        data["odfv1_my_feature_2"] = feature_df["my_input_1"].astype("int32")
        return data

    project = "project"

    # Register Feature Views
    test_registry.apply_feature_view(odfv1, project)
    test_registry.apply_feature_view(fv1, project)

    # Modify odfv by changing a single feature dtype
    @on_demand_feature_view(
        features=[
            Feature(name="odfv1_my_feature_1", dtype=ValueType.FLOAT),
            Feature(name="odfv1_my_feature_2", dtype=ValueType.INT32),
        ],
        sources=[request_source],
    )
    def odfv1(feature_df: pd.DataFrame) -> pd.DataFrame:
        data = pd.DataFrame()
        data["odfv1_my_feature_1"] = feature_df["my_input_1"].astype("float")
        data["odfv1_my_feature_2"] = feature_df["my_input_1"].astype("int32")
        return data

    # Apply the modified odfv
    test_registry.apply_feature_view(odfv1, project)

    # Check odfv
    on_demand_feature_views = test_registry.list_on_demand_feature_views(project)
    assert (
        len(on_demand_feature_views) == 1
        and on_demand_feature_views[0].name == "odfv1"
        and on_demand_feature_views[0].features[0].name == "odfv1_my_feature_1"
        and on_demand_feature_views[0].features[0].dtype == Float32
        and on_demand_feature_views[0].features[1].name == "odfv1_my_feature_2"
        and on_demand_feature_views[0].features[1].dtype == Int32
    )

    request_schema = on_demand_feature_views[0].get_request_data_schema()
    assert (
        list(request_schema.keys())[0] == "my_input_1"
        and list(request_schema.values())[0] == ValueType.INT32
    )

    feature_view = test_registry.get_on_demand_feature_view("odfv1", project)
    assert (
        feature_view.name == "odfv1"
        and feature_view.features[0].name == "odfv1_my_feature_1"
        and feature_view.features[0].dtype == Float32
        and feature_view.features[1].name == "odfv1_my_feature_2"
        and feature_view.features[1].dtype == Int32
    )

    request_schema = feature_view.get_request_data_schema()
    assert (
        list(request_schema.keys())[0] == "my_input_1"
        and list(request_schema.values())[0] == ValueType.INT32
    )

    # Make sure fv1 is untouched
    feature_views = test_registry.list_feature_views(project)

    # List Feature Views
    assert (
        len(feature_views) == 1
        and feature_views[0].name == "my_feature_view_1"
        and feature_views[0].features[0].name == "fs1_my_feature_1"
        and feature_views[0].features[0].dtype == Int64
        and feature_views[0].entities[0] == "fs1_my_entity_1"
    )

    feature_view = test_registry.get_feature_view("my_feature_view_1", project)
    assert (
        feature_view.name == "my_feature_view_1"
        and feature_view.features[0].name == "fs1_my_feature_1"
        and feature_view.features[0].dtype == Int64
        and feature_view.entities[0] == "fs1_my_entity_1"
    )

    test_registry.teardown()

    # Will try to reload registry, which will fail because the file has been deleted
    with pytest.raises(FileNotFoundError):
        test_registry._get_registry_proto()
def test_apply_feature_view_success(test_registry):
    # Create Feature Views
    batch_source = FileSource(
        file_format=ParquetFormat(),
        path="file://feast/*",
        timestamp_field="ts_col",
        created_timestamp_column="timestamp",
    )
    fv1 = FeatureView(
        name="my_feature_view_1",
        schema=[
            Field(name="fs1_my_feature_1", dtype=Int64),
            Field(name="fs1_my_feature_2", dtype=String),
            Field(name="fs1_my_feature_3", dtype=Array(String)),
            Field(name="fs1_my_feature_4", dtype=Array(Bytes)),
        ],
        entities=["fs1_my_entity_1"],
        tags={"team": "matchmaking"},
        batch_source=batch_source,
        ttl=timedelta(minutes=5),
    )

    project = "project"

    # Register Feature View
    test_registry.apply_feature_view(fv1, project)

    feature_views = test_registry.list_feature_views(project)

    # List Feature Views
    assert (
        len(feature_views) == 1
        and feature_views[0].name == "my_feature_view_1"
        and feature_views[0].features[0].name == "fs1_my_feature_1"
        and feature_views[0].features[0].dtype == Int64
        and feature_views[0].features[1].name == "fs1_my_feature_2"
        and feature_views[0].features[1].dtype == String
        and feature_views[0].features[2].name == "fs1_my_feature_3"
        and feature_views[0].features[2].dtype == Array(String)
        and feature_views[0].features[3].name == "fs1_my_feature_4"
        and feature_views[0].features[3].dtype == Array(Bytes)
        and feature_views[0].entities[0] == "fs1_my_entity_1"
    )

    feature_view = test_registry.get_feature_view("my_feature_view_1", project)
    assert (
        feature_view.name == "my_feature_view_1"
        and feature_view.features[0].name == "fs1_my_feature_1"
        and feature_view.features[0].dtype == Int64
        and feature_view.features[1].name == "fs1_my_feature_2"
        and feature_view.features[1].dtype == String
        and feature_view.features[2].name == "fs1_my_feature_3"
        and feature_view.features[2].dtype == Array(String)
        and feature_view.features[3].name == "fs1_my_feature_4"
        and feature_view.features[3].dtype == Array(Bytes)
        and feature_view.entities[0] == "fs1_my_entity_1"
    )

    test_registry.delete_feature_view("my_feature_view_1", project)
    feature_views = test_registry.list_feature_views(project)
    assert len(feature_views) == 0

    test_registry.teardown()

    # Will try to reload registry, which will fail because the file has been deleted
    with pytest.raises(FileNotFoundError):
        test_registry._get_registry_proto()
def test_historical_features_from_bigquery_sources_containing_backfills(environment):
    store = environment.feature_store

    now = datetime.now().replace(microsecond=0, second=0, minute=0)
    tomorrow = now + timedelta(days=1)
    day_after_tomorrow = now + timedelta(days=2)

    entity_df = pd.DataFrame(
        data=[
            {"driver_id": 1001, "event_timestamp": day_after_tomorrow},
            {"driver_id": 1002, "event_timestamp": day_after_tomorrow},
        ]
    )

    driver_stats_df = pd.DataFrame(
        data=[
            # Duplicated rows simple case
            {
                "driver_id": 1001,
                "avg_daily_trips": 10,
                "event_timestamp": now,
                "created": now,
            },
            {
                "driver_id": 1001,
                "avg_daily_trips": 20,
                "event_timestamp": now,
                "created": tomorrow,
            },
            # Duplicated rows after a backfill
            {
                "driver_id": 1002,
                "avg_daily_trips": 30,
                "event_timestamp": now,
                "created": tomorrow,
            },
            {
                "driver_id": 1002,
                "avg_daily_trips": 40,
                "event_timestamp": tomorrow,
                "created": now,
            },
        ]
    )

    expected_df = pd.DataFrame(
        data=[
            {
                "driver_id": 1001,
                "event_timestamp": day_after_tomorrow,
                "avg_daily_trips": 20,
            },
            {
                "driver_id": 1002,
                "event_timestamp": day_after_tomorrow,
                "avg_daily_trips": 40,
            },
        ]
    )

    driver_stats_data_source = environment.data_source_creator.create_data_source(
        df=driver_stats_df,
        destination_name=f"test_driver_stats_{int(time.time_ns())}_{random.randint(1000, 9999)}",
        timestamp_field="event_timestamp",
        created_timestamp_column="created",
    )

    driver = Entity(name="driver", join_keys=["driver_id"], value_type=ValueType.INT64)
    driver_fv = FeatureView(
        name="driver_stats",
        entities=["driver"],
        schema=[Field(name="avg_daily_trips", dtype=Int32)],
        batch_source=driver_stats_data_source,
        ttl=None,
    )

    store.apply([driver, driver_fv])

    offline_job = store.get_historical_features(
        entity_df=entity_df,
        features=["driver_stats:avg_daily_trips"],
        full_feature_names=False,
    )

    start_time = datetime.utcnow()
    actual_df = offline_job.to_df()
    print(f"actual_df shape: {actual_df.shape}")
    end_time = datetime.utcnow()
    print(f"Time to execute job_from_df.to_df() = '{(end_time - start_time)}'\n")

    assert sorted(expected_df.columns) == sorted(actual_df.columns)
    assert_frame_equal(expected_df, actual_df, keys=["driver_id"])
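# Note: `assert_frame_equal(..., keys=...)` is not the pandas API; it is
# presumably a local test helper. A sketch of what such a helper might look
# like, modeled on the inline comparison used in the capsys variant of this
# test further below (sort by key columns, then compare ignoring dtypes):
from pandas.testing import assert_frame_equal as pd_assert_frame_equal


def assert_frame_equal(expected_df, actual_df, keys):
    # Align row order on the key columns before comparing contents.
    expected = expected_df.sort_values(by=keys).reset_index(drop=True)
    actual = (
        actual_df[expected_df.columns].sort_values(by=keys).reset_index(drop=True)
    )
    pd_assert_frame_equal(expected, actual, check_dtype=False)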
def test_bigquery_table_to_datastore_correctness(self):
    # create dataset
    now = datetime.utcnow()
    ts = pd.Timestamp(now).round("ms")
    data = {
        "id": [1, 2, 1, 3, 3],
        "value": [0.1, 0.2, 0.3, 4, 5],
        "ts_1": [
            ts - timedelta(seconds=4),
            ts,
            ts - timedelta(seconds=3),
            ts - timedelta(seconds=4),
            ts - timedelta(seconds=1),
        ],
        "created_ts": [ts, ts, ts, ts, ts],
    }
    df = pd.DataFrame.from_dict(data)

    # load dataset into BigQuery
    job_config = bigquery.LoadJobConfig()
    table_id = (
        f"{self.gcp_project}.{self.bigquery_dataset}.table_correctness_{int(time.time())}"
    )
    job = self.client.load_table_from_dataframe(df, table_id, job_config=job_config)
    job.result()

    # create FeatureView
    fv = FeatureView(
        name="test_bq_table_correctness",
        entities=["driver_id"],
        features=[Feature("value", ValueType.FLOAT)],
        ttl=timedelta(minutes=5),
        input=BigQuerySource(
            event_timestamp_column="ts",
            table_ref=table_id,
            created_timestamp_column="created_ts",
            field_mapping={"ts_1": "ts", "id": "driver_id"},
            date_partition_column="",
        ),
    )
    config = RepoConfig(
        metadata_store="./metadata.db",
        project=f"test_bq_table_correctness_{int(time.time())}",
        provider="gcp",
    )
    fs = FeatureStore(config=config)
    fs.apply([fv])

    # run materialize()
    fs.materialize(
        [fv.name],
        now - timedelta(seconds=5),
        now - timedelta(seconds=2),
    )

    # check result of materialize()
    response_dict = fs.get_online_features(
        [f"{fv.name}:value"], [{"driver_id": 1}]
    ).to_dict()
    assert abs(response_dict[f"{fv.name}:value"][0] - 0.3) < 1e-6

    # check prior value for materialize_incremental()
    response_dict = fs.get_online_features(
        [f"{fv.name}:value"], [{"driver_id": 3}]
    ).to_dict()
    assert abs(response_dict[f"{fv.name}:value"][0] - 4) < 1e-6

    # run materialize_incremental()
    fs.materialize_incremental(
        [fv.name],
        now - timedelta(seconds=0),
    )

    # check result of materialize_incremental()
    response_dict = fs.get_online_features(
        [f"{fv.name}:value"], [{"driver_id": 3}]
    ).to_dict()
    assert abs(response_dict[f"{fv.name}:value"][0] - 5) < 1e-6
driver = Entity(
    name="driver_id",
    value_type=ValueType.INT64,
    description="driver id",
)

# Our parquet files contain sample data that includes a driver_id column, timestamps and
# three feature columns. Here we define a Feature View that will allow us to serve this
# data to our model online.
driver_hourly_stats_view = FeatureView(
    name="driver_hourly_stats",
    entities=["driver_id"],
    ttl=Duration(seconds=86400 * 7),
    schema=[
        Field(name="conv_rate", dtype=Float64),
        Field(name="acc_rate", dtype=Float32),
        Field(name="avg_daily_trips", dtype=Int64),
    ],
    online=True,
    batch_source=driver_hourly_stats,
    tags={},
)

input_request = RequestSource(
    name="vals_to_add",
    schema=[
        Field(name="val_to_add", dtype=Int64),
        Field(name="val_to_add_2", dtype=Int64),
        Field(name="avg_daily_trips", dtype=Int64),
    ],
)
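# A hedged sketch of how `input_request` might be consumed by an on-demand
# feature view, following the decorator pattern shown elsewhere in this
# section (the name `transformed_conv_rate` and the output field names are
# assumptions, not part of the original definitions):
@on_demand_feature_view(
    sources=[driver_hourly_stats_view, input_request],
    schema=[
        Field(name="conv_rate_plus_val1", dtype=Float64),
        Field(name="conv_rate_plus_val2", dtype=Float64),
    ],
)
def transformed_conv_rate(inputs: pd.DataFrame) -> pd.DataFrame:
    # Combine a stored feature with request-time values.
    df = pd.DataFrame()
    df["conv_rate_plus_val1"] = inputs["conv_rate"] + inputs["val_to_add"]
    df["conv_rate_plus_val2"] = inputs["conv_rate"] + inputs["val_to_add_2"]
    return df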
def test_historical_features_from_bigquery_sources_containing_backfills(capsys):
    now = datetime.now().replace(microsecond=0, second=0, minute=0)
    tomorrow = now + timedelta(days=1)

    entity_dataframe = pd.DataFrame(
        data=[
            {"driver_id": 1001, "event_timestamp": now + timedelta(days=2)},
            {"driver_id": 1002, "event_timestamp": now + timedelta(days=2)},
        ]
    )

    driver_stats_df = pd.DataFrame(
        data=[
            # Duplicated rows simple case
            {
                "driver_id": 1001,
                "avg_daily_trips": 10,
                "event_timestamp": now,
                "created": tomorrow,
            },
            {
                "driver_id": 1001,
                "avg_daily_trips": 20,
                "event_timestamp": tomorrow,
                "created": tomorrow,
            },
            # Duplicated rows after a backfill
            {
                "driver_id": 1002,
                "avg_daily_trips": 30,
                "event_timestamp": now,
                "created": tomorrow,
            },
            {
                "driver_id": 1002,
                "avg_daily_trips": 40,
                "event_timestamp": tomorrow,
                "created": now,
            },
        ]
    )

    expected_df = pd.DataFrame(
        data=[
            {
                "driver_id": 1001,
                "event_timestamp": now + timedelta(days=2),
                "avg_daily_trips": 20,
            },
            {
                "driver_id": 1002,
                "event_timestamp": now + timedelta(days=2),
                "avg_daily_trips": 40,
            },
        ]
    )

    bigquery_dataset = (
        f"test_hist_retrieval_{int(time.time_ns())}_{random.randint(1000, 9999)}"
    )

    with BigQueryDataSet(bigquery_dataset), TemporaryDirectory() as temp_dir:
        gcp_project = bigquery.Client().project

        # Entity Dataframe SQL query
        table_id = f"{bigquery_dataset}.orders"
        stage_orders_bigquery(entity_dataframe, table_id)
        entity_df_query = f"SELECT * FROM {gcp_project}.{table_id}"

        # Driver Feature View
        driver_table_id = f"{gcp_project}.{bigquery_dataset}.driver_hourly"
        stage_driver_hourly_stats_bigquery_source(driver_stats_df, driver_table_id)

        store = FeatureStore(
            config=RepoConfig(
                registry=os.path.join(temp_dir, "registry.db"),
                project="".join(
                    random.choices(string.ascii_uppercase + string.digits, k=10)
                ),
                provider="gcp",
                offline_store=BigQueryOfflineStoreConfig(
                    type="bigquery", dataset=bigquery_dataset
                ),
            )
        )

        driver = Entity(
            name="driver", join_key="driver_id", value_type=ValueType.INT64
        )
        driver_fv = FeatureView(
            name="driver_stats",
            entities=["driver"],
            features=[Feature(name="avg_daily_trips", dtype=ValueType.INT32)],
            batch_source=BigQuerySource(
                table_ref=driver_table_id,
                event_timestamp_column="event_timestamp",
                created_timestamp_column="created",
            ),
            ttl=None,
        )

        store.apply([driver, driver_fv])

        try:
            job_from_sql = store.get_historical_features(
                entity_df=entity_df_query,
                features=["driver_stats:avg_daily_trips"],
                full_feature_names=False,
            )

            start_time = datetime.utcnow()
            actual_df_from_sql_entities = job_from_sql.to_df()
            end_time = datetime.utcnow()
            with capsys.disabled():
                print(
                    f"\nTime to execute job_from_sql.to_df() = '{(end_time - start_time)}'"
                )

            assert sorted(expected_df.columns) == sorted(
                actual_df_from_sql_entities.columns
            )
            assert_frame_equal(
                expected_df.sort_values(by=["driver_id"]).reset_index(drop=True),
                actual_df_from_sql_entities[expected_df.columns]
                .sort_values(by=["driver_id"])
                .reset_index(drop=True),
                check_dtype=False,
            )
        finally:
            store.teardown()
def test_bigquery_ingestion_correctness(self):
    # create dataset
    ts = pd.Timestamp.now(tz="UTC").round("ms")
    # Use a random value so the test cannot pass spuriously when nothing was
    # written to the online store.
    checked_value = random.random()
    data = {
        "id": [1, 2, 1],
        "value": [0.1, 0.2, checked_value],
        "ts_1": [ts - timedelta(minutes=2), ts, ts],
        "created_ts": [ts, ts, ts],
    }
    df = pd.DataFrame.from_dict(data)

    # load dataset into BigQuery
    job_config = bigquery.LoadJobConfig()
    table_id = (
        f"{self.gcp_project}.{self.bigquery_dataset}.correctness_{int(time.time())}"
    )
    job = self.client.load_table_from_dataframe(df, table_id, job_config=job_config)
    job.result()

    # create FeatureView
    fv = FeatureView(
        name="test_bq_correctness",
        entities=["driver_id"],
        features=[Feature("value", ValueType.FLOAT)],
        ttl=timedelta(minutes=5),
        input=BigQuerySource(
            event_timestamp_column="ts",
            table_ref=table_id,
            created_timestamp_column="created_ts",
            field_mapping={"ts_1": "ts", "id": "driver_id"},
            date_partition_column="",
        ),
    )
    config = RepoConfig(
        metadata_store="./metadata.db",
        project="default",
        provider="gcp",
        online_store=OnlineStoreConfig(
            local=LocalOnlineStoreConfig("online_store.db")
        ),
    )
    fs = FeatureStore(config=config)
    fs.apply([fv])

    # run materialize()
    fs.materialize(
        ["test_bq_correctness"],
        datetime.utcnow() - timedelta(minutes=5),
        datetime.utcnow() - timedelta(minutes=0),
    )

    # check result of materialize()
    entity_key = EntityKeyProto(
        entity_names=["driver_id"], entity_values=[ValueProto(int64_val=1)]
    )
    t, val = fs._get_provider().online_read("default", fv, entity_key)
    assert abs(val["value"].double_val - checked_value) < 1e-6
def from_proto(cls, on_demand_feature_view_proto: OnDemandFeatureViewProto):
    """
    Creates an on demand feature view from a protobuf representation.

    Args:
        on_demand_feature_view_proto: A protobuf representation of an on-demand
            feature view.

    Returns:
        An OnDemandFeatureView object based on the on-demand feature view protobuf.
    """
    sources = []
    for (
        _,
        on_demand_source,
    ) in on_demand_feature_view_proto.spec.sources.items():
        if on_demand_source.WhichOneof("source") == "feature_view":
            sources.append(
                FeatureView.from_proto(on_demand_source.feature_view).projection
            )
        elif on_demand_source.WhichOneof("source") == "feature_view_projection":
            sources.append(
                FeatureViewProjection.from_proto(
                    on_demand_source.feature_view_projection
                )
            )
        else:
            sources.append(
                RequestSource.from_proto(on_demand_source.request_data_source)
            )
    on_demand_feature_view_obj = cls(
        name=on_demand_feature_view_proto.spec.name,
        schema=[
            Field(
                name=feature.name,
                dtype=from_value_type(ValueType(feature.value_type)),
            )
            for feature in on_demand_feature_view_proto.spec.features
        ],
        sources=sources,
        udf=dill.loads(
            on_demand_feature_view_proto.spec.user_defined_function.body
        ),
        description=on_demand_feature_view_proto.spec.description,
        tags=dict(on_demand_feature_view_proto.spec.tags),
        owner=on_demand_feature_view_proto.spec.owner,
    )

    # FeatureViewProjections are not saved in the OnDemandFeatureView proto.
    # Create the default projection.
    on_demand_feature_view_obj.projection = FeatureViewProjection.from_definition(
        on_demand_feature_view_obj
    )

    if on_demand_feature_view_proto.meta.HasField("created_timestamp"):
        on_demand_feature_view_obj.created_timestamp = (
            on_demand_feature_view_proto.meta.created_timestamp.ToDatetime()
        )
    if on_demand_feature_view_proto.meta.HasField("last_updated_timestamp"):
        on_demand_feature_view_obj.last_updated_timestamp = (
            on_demand_feature_view_proto.meta.last_updated_timestamp.ToDatetime()
        )

    return on_demand_feature_view_obj
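# A minimal round-trip sketch for the deserializer above (hedged: assumes an
# existing OnDemandFeatureView instance `odfv`, and that `to_proto` is the
# serializing counterpart of `from_proto`):
proto = odfv.to_proto()
restored = OnDemandFeatureView.from_proto(proto)
assert restored.name == odfv.name
assert [f.name for f in restored.features] == [f.name for f in odfv.features]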
def test_apply_feature_view_integration(test_registry):
    # Create Feature Views
    batch_source = FileSource(
        file_format=ParquetFormat(),
        path="file://feast/*",
        event_timestamp_column="ts_col",
        created_timestamp_column="timestamp",
        date_partition_column="date_partition_col",
    )
    fv1 = FeatureView(
        name="my_feature_view_1",
        features=[
            Feature(name="fs1_my_feature_1", dtype=ValueType.INT64),
            Feature(name="fs1_my_feature_2", dtype=ValueType.STRING),
            Feature(name="fs1_my_feature_3", dtype=ValueType.STRING_LIST),
            Feature(name="fs1_my_feature_4", dtype=ValueType.BYTES_LIST),
        ],
        entities=["fs1_my_entity_1"],
        tags={"team": "matchmaking"},
        batch_source=batch_source,
        ttl=timedelta(minutes=5),
    )

    project = "project"

    # Register Feature View
    test_registry.apply_feature_view(fv1, project)

    feature_views = test_registry.list_feature_views(project)

    # List Feature Views
    assert (
        len(feature_views) == 1
        and feature_views[0].name == "my_feature_view_1"
        and feature_views[0].features[0].name == "fs1_my_feature_1"
        and feature_views[0].features[0].dtype == ValueType.INT64
        and feature_views[0].features[1].name == "fs1_my_feature_2"
        and feature_views[0].features[1].dtype == ValueType.STRING
        and feature_views[0].features[2].name == "fs1_my_feature_3"
        and feature_views[0].features[2].dtype == ValueType.STRING_LIST
        and feature_views[0].features[3].name == "fs1_my_feature_4"
        and feature_views[0].features[3].dtype == ValueType.BYTES_LIST
        and feature_views[0].entities[0] == "fs1_my_entity_1"
    )

    feature_view = test_registry.get_feature_view("my_feature_view_1", project)
    assert (
        feature_view.name == "my_feature_view_1"
        and feature_view.features[0].name == "fs1_my_feature_1"
        and feature_view.features[0].dtype == ValueType.INT64
        and feature_view.features[1].name == "fs1_my_feature_2"
        and feature_view.features[1].dtype == ValueType.STRING
        and feature_view.features[2].name == "fs1_my_feature_3"
        and feature_view.features[2].dtype == ValueType.STRING_LIST
        and feature_view.features[3].name == "fs1_my_feature_4"
        and feature_view.features[3].dtype == ValueType.BYTES_LIST
        and feature_view.entities[0] == "fs1_my_entity_1"
    )

    test_registry.delete_feature_view("my_feature_view_1", project)
    feature_views = test_registry.list_feature_views(project)
    assert len(feature_views) == 0

    test_registry.teardown()

    # Will try to reload registry, which will fail because the file has been deleted
    with pytest.raises(FileNotFoundError):
        test_registry._get_registry_proto()
def test_hash():
    file_source = FileSource(name="my-file-source", path="test.parquet")
    feature_view = FeatureView(
        name="my-feature-view",
        entities=[],
        schema=[
            Field(name="feature1", dtype=Float32),
            Field(name="feature2", dtype=Float32),
        ],
        source=file_source,
    )
    sources = [feature_view]
    on_demand_feature_view_1 = OnDemandFeatureView(
        name="my-on-demand-feature-view",
        sources=sources,
        schema=[
            Field(name="output1", dtype=Float32),
            Field(name="output2", dtype=Float32),
        ],
        udf=udf1,
    )
    on_demand_feature_view_2 = OnDemandFeatureView(
        name="my-on-demand-feature-view",
        sources=sources,
        schema=[
            Field(name="output1", dtype=Float32),
            Field(name="output2", dtype=Float32),
        ],
        udf=udf1,
    )
    on_demand_feature_view_3 = OnDemandFeatureView(
        name="my-on-demand-feature-view",
        sources=sources,
        schema=[
            Field(name="output1", dtype=Float32),
            Field(name="output2", dtype=Float32),
        ],
        udf=udf2,
    )
    on_demand_feature_view_4 = OnDemandFeatureView(
        name="my-on-demand-feature-view",
        sources=sources,
        schema=[
            Field(name="output1", dtype=Float32),
            Field(name="output2", dtype=Float32),
        ],
        udf=udf2,
        description="test",
    )

    # Identical definitions collapse to a single set entry.
    s1 = {on_demand_feature_view_1, on_demand_feature_view_2}
    assert len(s1) == 1

    # A different udf makes the views distinct.
    s2 = {on_demand_feature_view_1, on_demand_feature_view_3}
    assert len(s2) == 2

    # Differing only in description is still a distinct definition.
    s3 = {on_demand_feature_view_3, on_demand_feature_view_4}
    assert len(s3) == 2

    s4 = {
        on_demand_feature_view_1,
        on_demand_feature_view_2,
        on_demand_feature_view_3,
        on_demand_feature_view_4,
    }
    assert len(s4) == 3
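# `udf1` and `udf2` are defined outside this excerpt. A hypothetical stand-in
# for `udf1` (any two distinct pandas transformations producing the declared
# output columns would serve the hash test above):
def udf1(features_df: pd.DataFrame) -> pd.DataFrame:
    df = pd.DataFrame()
    df["output1"] = features_df["feature1"]
    df["output2"] = features_df["feature2"]
    return df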