def test_update_featureset_apply_featureset_and_ingest_first_subset(
    client, update_featureset_dataframe
):
    subset_columns = ["datetime", "entity_id", "update_feature1", "update_feature2"]
    subset_df = update_featureset_dataframe.iloc[:5][subset_columns]
    update_fs = FeatureSet(
        "update_fs",
        entities=[Entity(name="entity_id", dtype=ValueType.INT64)],
        max_age=Duration(seconds=432000),
    )
    update_fs.infer_fields_from_df(subset_df)
    client.apply(update_fs)

    client.ingest(feature_set=update_fs, source=subset_df)
    time.sleep(15)

    feature_retrieval_job = client.get_batch_features(
        entity_rows=update_featureset_dataframe[["datetime", "entity_id"]].iloc[:5],
        feature_refs=[
            f"{PROJECT_NAME}/update_feature1",
            f"{PROJECT_NAME}/update_feature2",
        ],
    )

    output = feature_retrieval_job.to_dataframe().sort_values(by=["entity_id"])
    print(output.head())

    assert (
        output["update_feature1"].to_list() == subset_df["update_feature1"].to_list()
    )
    assert (
        output["update_feature2"].to_list() == subset_df["update_feature2"].to_list()
    )

def test_update_featureset_update_featureset_and_ingest_second_subset(
    client, update_featureset_dataframe
):
    subset_columns = [
        "datetime",
        "entity_id",
        "update_feature1",
        "update_feature3",
        "update_feature4",
    ]
    subset_df = update_featureset_dataframe.iloc[5:][subset_columns]
    update_fs = FeatureSet(
        "update_fs",
        entities=[Entity(name="entity_id", dtype=ValueType.INT64)],
        max_age=Duration(seconds=432000),
    )
    update_fs.infer_fields_from_df(subset_df)
    client.apply(update_fs)

    # We keep retrying this ingestion until all values make it into the buffer.
    # This is a necessary step because BigQuery streaming caches table schemas
    # and, as a result, rows may be lost.
    while True:
        ingestion_id = client.ingest(feature_set=update_fs, source=subset_df)
        time.sleep(15)  # wait for rows to get written to bq
        rows_ingested = get_rows_ingested(client, update_fs, ingestion_id)
        if rows_ingested == len(subset_df):
            print(
                f"Number of rows successfully ingested: {rows_ingested}. Continuing."
            )
            break
        print(
            f"Number of rows successfully ingested: {rows_ingested}. Retrying ingestion."
        )
        time.sleep(30)

    def check():
        feature_retrieval_job = client.get_batch_features(
            entity_rows=update_featureset_dataframe[["datetime", "entity_id"]].iloc[
                5:
            ],
            feature_refs=["update_feature1", "update_feature3", "update_feature4"],
            project=PROJECT_NAME,
        )

        output = feature_retrieval_job.to_dataframe(timeout_sec=180).sort_values(
            by=["entity_id"]
        )
        print(output.head())

        assert (
            output["update_feature1"].to_list()
            == subset_df["update_feature1"].to_list()
        )
        assert (
            output["update_feature3"].to_list()
            == subset_df["update_feature3"].to_list()
        )
        assert (
            output["update_feature4"].to_list()
            == subset_df["update_feature4"].to_list()
        )

        clean_up_remote_files(feature_retrieval_job.get_avro_files())

    wait_for(check, timedelta(minutes=5))

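# The retry loop above relies on a `get_rows_ingested` helper whose definition
# lives elsewhere in the e2e suite. Below is a minimal sketch of what such a
# helper could look like, assuming a BigQuery historical store, hypothetical
# GCP_PROJECT/BQ_DATASET placeholders, a table named after the feature set, and
# an `ingestion_id` column on every ingested row -- all assumptions about the
# store layout, not the suite's confirmed implementation.
GCP_PROJECT = "my-gcp-project"  # hypothetical; read from the store config in practice
BQ_DATASET = "feast_dataset"  # hypothetical; read from the store config in practice


def get_rows_ingested(client, feature_set, ingestion_id) -> int:
    # `client` is unused in this sketch; the real helper may read the store
    # configuration from it.
    from google.cloud import bigquery

    bq_client = bigquery.Client(project=GCP_PROJECT)
    table = f"{GCP_PROJECT}.{BQ_DATASET}.{feature_set.name}"  # naming is an assumption
    # COUNT(*) always returns exactly one row; read it back by its alias.
    result = bq_client.query(
        f"SELECT COUNT(*) AS count FROM `{table}` "
        f"WHERE ingestion_id = '{ingestion_id}'"
    ).result()
    return list(result)[0]["count"]
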
def test_feature_set_ingest_failure(self, client, dataframe, exception):
    with pytest.raises(exception):
        # Create feature set
        driver_fs = FeatureSet("driver-feature-set")

        # Update based on dataset
        driver_fs.infer_fields_from_df(dataframe)

        # Register with Feast core
        client.apply(driver_fs)

        # Ingest data into Feast
        client.ingest(driver_fs, dataframe=dataframe)

def test_feature_set_ingest_failure(self, client, dataframe, exception):
    with pytest.raises(exception):
        # Create feature set
        driver_fs = FeatureSet("driver-feature-set")
        driver_fs.source = KafkaSource(
            topic="feature-topic", brokers="fake.broker.com"
        )
        # Mock out the Kafka producer so ingestion never hits a real broker
        client._message_producer = MagicMock()
        client._message_producer.produce = MagicMock()

        # Update based on dataset
        driver_fs.infer_fields_from_df(dataframe)

        # Register with Feast core
        client.apply(driver_fs)

        # Ingest data into Feast
        client.ingest(driver_fs, dataframe=dataframe)

def test_add_features_from_df_success(
    self,
    dataframe,
    feature_count,
    entity_count,
    discard_unused_fields,
    features,
    entities,
):
    my_feature_set = FeatureSet(
        name="my_feature_set",
        features=[Feature(name="dummy_f1", dtype=ValueType.INT64)],
        entities=[Entity(name="dummy_entity_1", dtype=ValueType.INT64)],
    )
    my_feature_set.infer_fields_from_df(
        dataframe,
        discard_unused_fields=discard_unused_fields,
        features=features,
        entities=entities,
    )
    assert len(my_feature_set.features) == feature_count
    assert len(my_feature_set.entities) == entity_count

def test_update_featureset_apply_featureset_and_ingest_first_subset(
    client, update_featureset_dataframe
):
    subset_columns = ["datetime", "entity_id", "update_feature1", "update_feature2"]
    subset_df = update_featureset_dataframe.iloc[:5][subset_columns]
    update_fs = FeatureSet(
        "update_fs",
        entities=[Entity(name="entity_id", dtype=ValueType.INT64)],
        max_age=Duration(seconds=432000),
    )
    update_fs.infer_fields_from_df(subset_df)
    client.apply(update_fs)

    client.ingest(feature_set=update_fs, source=subset_df)

    def check():
        feature_retrieval_job = client.get_historical_features(
            entity_rows=update_featureset_dataframe[["datetime", "entity_id"]].iloc[
                :5
            ],
            feature_refs=["update_feature1", "update_feature2"],
            project=PROJECT_NAME,
        )

        output = feature_retrieval_job.to_dataframe(timeout_sec=180).sort_values(
            by=["entity_id"]
        )
        print(output.head())

        assert (
            output["update_feature1"].to_list()
            == subset_df["update_feature1"].to_list()
        )
        assert (
            output["update_feature2"].to_list()
            == subset_df["update_feature2"].to_list()
        )

        clean_up_remote_files(feature_retrieval_job.get_avro_files())

    wait_for(check, timedelta(minutes=5))

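# `wait_for(check, timedelta(minutes=5))`, used here and in the second-subset
# test, also comes from the shared test utilities. A plausible minimal sketch,
# assuming it simply re-runs the callable until it stops raising or the timeout
# elapses (the 5-second poll interval is an assumption):
import time
from datetime import datetime, timedelta


def wait_for(fn, timeout: timedelta, interval_sec: int = 5):
    deadline = datetime.utcnow() + timeout
    while True:
        try:
            return fn()  # success: stop polling
        except Exception:
            if datetime.utcnow() >= deadline:
                raise  # out of time: surface the last failure
            time.sleep(interval_sec)
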
def test_update_from_source_failure(self):
    with pytest.raises(Exception):
        df = pd.DataFrame()
        fs = FeatureSet("driver-feature-set")
        fs.infer_fields_from_df(df)
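
# `clean_up_remote_files`, called at the end of each batch-retrieval check
# above, is likewise defined elsewhere in the suite. A rough sketch, assuming
# `get_avro_files()` returns urlparse()-style results pointing at gs:// URIs
# (an assumption about the staging location):
def clean_up_remote_files(files):
    from google.cloud import storage

    storage_client = storage.Client()
    for file_uri in files:
        if file_uri.scheme == "gs":
            storage.Blob.from_string(file_uri.geturl(), client=storage_client).delete()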