def test_offline_ingestion(
    feast_client: Client,
    feast_spark_client: SparkClient,
    batch_source: Union[BigQuerySource, FileSource],
):
    entity = Entity(
        name="s2id",
        description="S2id",
        value_type=ValueType.INT64,
    )

    feature_table = FeatureTable(
        name="drivers",
        entities=["s2id"],
        features=[Feature("unique_drivers", ValueType.INT64)],
        batch_source=batch_source,
    )

    feast_client.apply(entity)
    feast_client.apply(feature_table)

    original = generate_data()
    feast_client.ingest(feature_table, original)  # write to batch (offline) storage

    ingest_and_verify(feast_client, feast_spark_client, feature_table, original)

def transactions_feature_table(spark, client):
    schema = StructType([
        StructField("customer_id", IntegerType()),
        StructField("event_timestamp", TimestampType()),
        StructField("created_timestamp", TimestampType()),
        StructField("total_transactions", DoubleType()),
        StructField("is_vip", BooleanType()),
    ])
    df_data = [
        (1001, datetime(year=2020, month=9, day=1), datetime(year=2020, month=9, day=1), 50.0, True),
        (1001, datetime(year=2020, month=9, day=1), datetime(year=2020, month=9, day=2), 100.0, True),
        (2001, datetime(year=2020, month=9, day=1), datetime(year=2020, month=9, day=1), 400.0, False),
        (1001, datetime(year=2020, month=9, day=2), datetime(year=2020, month=9, day=1), 200.0, False),
        (1001, datetime(year=2020, month=9, day=4), datetime(year=2020, month=9, day=1), 300.0, False),
    ]
    temp_dir, file_uri = create_temp_parquet_file(spark, "transactions", schema, df_data)

    file_source = FileSource("event_timestamp", "created_timestamp", ParquetFormat(), file_uri)
    features = [
        Feature("total_transactions", ValueType.DOUBLE),
        Feature("is_vip", ValueType.BOOL),
    ]
    feature_table = FeatureTable(
        "transactions", ["customer_id"], features, batch_source=file_source
    )
    yield client.apply_feature_table(feature_table)
    shutil.rmtree(temp_dir)

def test_list_jobs_long_table_name(
    feast_client: Client,
    feast_spark_client: SparkClient,
    batch_source: Union[BigQuerySource, FileSource],
):
    entity = Entity(
        name="long_entity_name" * 10,
        description="S2id",
        value_type=ValueType.INT64,
    )

    feature_table = FeatureTable(
        name="just1a2featuretable3with4a5really6really7really8really9really10",
        entities=[entity.name],
        features=[Feature("unique_drivers", ValueType.INT64)],
        batch_source=batch_source,
    )

    feast_client.apply(entity)
    feast_client.apply(feature_table)
    data_sample = generate_data().rename(columns={"s2id": entity.name})
    feast_client.ingest(feature_table, data_sample)

    job = feast_spark_client.start_offline_to_online_ingestion(
        feature_table,
        data_sample.event_timestamp.min().to_pydatetime(),
        data_sample.event_timestamp.max().to_pydatetime() + timedelta(seconds=1),
    )

    wait_retry_backoff(
        lambda: (None, job.get_status() == SparkJobStatus.COMPLETED), 180
    )
    all_job_ids = [
        job.get_id()
        for job in feast_spark_client.list_jobs(
            include_terminated=True,
            project=feast_client.project,
            table_name=feature_table.name,
        )
    ]
    assert job.get_id() in all_job_ids

    features = feast_client.get_online_features(
        [f"{feature_table.name}:unique_drivers"],
        entity_rows=[{entity.name: key} for key in data_sample[entity.name].tolist()],
    ).to_dict()

    ingested = pd.DataFrame.from_dict(features)
    pd.testing.assert_frame_equal(
        ingested[[entity.name, f"{feature_table.name}:unique_drivers"]],
        data_sample[[entity.name, "unique_drivers"]].rename(
            columns={"unique_drivers": f"{feature_table.name}:unique_drivers"}
        ),
    )

def test_offline_ingestion(feast_client: Client, staging_path: str):
    entity = Entity(
        name="s2id",
        description="S2id",
        value_type=ValueType.INT64,
    )

    feature_table = FeatureTable(
        name="drivers",
        entities=["s2id"],
        features=[Feature("unique_drivers", ValueType.INT64)],
        batch_source=FileSource(
            "event_timestamp",
            "event_timestamp",
            ParquetFormat(),
            os.path.join(staging_path, "batch-storage"),
        ),
    )

    feast_client.apply_entity(entity)
    feast_client.apply_feature_table(feature_table)

    original = generate_data()
    feast_client.ingest(feature_table, original)  # write to batch (offline) storage

    job = feast_client.start_offline_to_online_ingestion(
        feature_table, datetime.today(), datetime.today() + timedelta(days=1)
    )

    status = wait_retry_backoff(
        lambda: (job.get_status(), job.get_status() != SparkJobStatus.IN_PROGRESS), 300
    )
    assert status == SparkJobStatus.COMPLETED

    features = feast_client.get_online_features(
        ["drivers:unique_drivers"],
        entity_rows=[{"s2id": s2_id} for s2_id in original["s2id"].tolist()],
    ).to_dict()

    ingested = pd.DataFrame.from_dict(features)
    pd.testing.assert_frame_equal(
        ingested[["s2id", "drivers:unique_drivers"]],
        original[["s2id", "unique_drivers"]].rename(
            columns={"unique_drivers": "drivers:unique_drivers"}
        ),
    )

def test_schedule_batch_ingestion_jobs(
    pytestconfig, feast_client: Client, feast_spark_client: SparkClient
):
    entity = Entity(
        name="s2id",
        description="S2id",
        value_type=ValueType.INT64,
    )
    batch_source = FileSource(
        file_format=ParquetFormat(),
        file_url="gs://example/feast/*",
        event_timestamp_column="datetime_col",
        created_timestamp_column="timestamp",
        date_partition_column="datetime",
    )
    feature_table = FeatureTable(
        name=f"schedule_{str(uuid.uuid4())}".replace("-", "_"),
        entities=["s2id"],
        features=[Feature("unique_drivers", ValueType.INT64)],
        batch_source=batch_source,
    )
    feast_client.apply(entity)
    feast_client.apply(feature_table)

    feast_spark_client.schedule_offline_to_online_ingestion(
        feature_table, 1, "0 0 * * *"
    )

    config.load_incluster_config()
    k8s_api = client.CustomObjectsApi()

    def get_scheduled_spark_application():
        job_hash = hashlib.md5(
            f"{feast_client.project}-{feature_table.name}".encode()
        ).hexdigest()
        resource_name = f"feast-{job_hash}"

        return k8s_api.get_namespaced_custom_object(
            group="sparkoperator.k8s.io",
            version="v1beta2",
            namespace=pytestconfig.getoption("k8s_namespace"),
            plural="scheduledsparkapplications",
            name=resource_name,
        )

    response = get_scheduled_spark_application()
    assert response["spec"]["schedule"] == "0 0 * * *"

    feast_spark_client.schedule_offline_to_online_ingestion(
        feature_table, 1, "1 0 * * *"
    )
    response = get_scheduled_spark_application()
    assert response["spec"]["schedule"] == "1 0 * * *"

    feast_spark_client.unschedule_offline_to_online_ingestion(feature_table)

def test_historical_features(
    feast_client: Client, batch_source: Union[BigQuerySource, FileSource]
):
    customer_entity = Entity(
        name="user_id", description="Customer", value_type=ValueType.INT64
    )
    feast_client.apply_entity(customer_entity)

    max_age = Duration()
    max_age.FromSeconds(2 * 86400)

    transactions_feature_table = FeatureTable(
        name="transactions",
        entities=["user_id"],
        features=[
            Feature("daily_transactions", ValueType.DOUBLE),
            Feature("total_transactions", ValueType.DOUBLE),
        ],
        batch_source=batch_source,
        max_age=max_age,
    )

    feast_client.apply_feature_table(transactions_feature_table)

    transactions_df, customers_df = generate_data()
    feast_client.ingest(transactions_feature_table, transactions_df)

    feature_refs = ["transactions:daily_transactions"]

    job = feast_client.get_historical_features(feature_refs, customers_df)
    output_dir = job.get_output_file_uri()
    joined_df = read_parquet(output_dir)

    expected_joined_df = pd.DataFrame(
        {
            "event_timestamp": customers_df.event_timestamp.tolist(),
            "user_id": customers_df.user_id.tolist(),
            "transactions__daily_transactions": transactions_df.daily_transactions.tolist()
            + [None] * transactions_df.shape[0],
        }
    )

    assert_frame_equal(
        joined_df.sort_values(by=["user_id", "event_timestamp"]).reset_index(drop=True),
        expected_joined_df.sort_values(by=["user_id", "event_timestamp"]).reset_index(
            drop=True
        ),
    )

def bookings_feature_table_with_mapping(spark, client):
    schema = StructType([
        StructField("id", IntegerType()),
        StructField("datetime", TimestampType()),
        StructField("created_datetime", TimestampType()),
        StructField("total_completed_bookings", IntegerType()),
    ])
    df_data = [
        (8001, datetime(year=2020, month=9, day=1, tzinfo=utc), datetime(year=2020, month=9, day=1, tzinfo=utc), 100),
        (8001, datetime(year=2020, month=9, day=2, tzinfo=utc), datetime(year=2020, month=9, day=2, tzinfo=utc), 150),
        (8002, datetime(year=2020, month=9, day=2, tzinfo=utc), datetime(year=2020, month=9, day=2, tzinfo=utc), 200),
    ]
    temp_dir, file_uri = create_temp_parquet_file(spark, "bookings", schema, df_data)

    file_source = FileSource(
        event_timestamp_column="datetime",
        created_timestamp_column="created_datetime",
        file_format=ParquetFormat(),
        file_url=file_uri,
        field_mapping={"id": "driver_id"},
    )
    features = [Feature("total_completed_bookings", ValueType.INT32)]
    max_age = Duration()
    max_age.FromSeconds(86400)
    feature_table = FeatureTable(
        "bookings", ["driver_id"], features, batch_source=file_source, max_age=max_age
    )
    yield client.apply(feature_table)
    shutil.rmtree(temp_dir)

def create_schema(kafka_broker, topic_name, feature_table_name):
    entity = Entity(name="key", description="Key", value_type=ValueType.INT64)
    feature_table = FeatureTable(
        name=feature_table_name,
        entities=["key"],
        features=[Feature("num", ValueType.INT64), Feature("set", ValueType.STRING)],
        batch_source=FileSource(
            event_timestamp_column="event_timestamp",
            file_format=ParquetFormat(),
            file_url="/dev/null",
        ),
        stream_source=KafkaSource(
            event_timestamp_column="event_timestamp",
            bootstrap_servers=kafka_broker,
            message_format=AvroFormat(avro_schema()),
            topic=topic_name,
        ),
    )
    return entity, feature_table

def test_list_jobs_long_table_name(
    feast_client: Client,
    feast_spark_client: SparkClient,
    batch_source: Union[BigQuerySource, FileSource],
):
    entity = Entity(
        name="s2id",
        description="S2id",
        value_type=ValueType.INT64,
    )

    feature_table = FeatureTable(
        name="just1a2featuretable3with4a5really6really7really8really9really10really11really12long13name",
        entities=["s2id"],
        features=[Feature("unique_drivers", ValueType.INT64)],
        batch_source=batch_source,
    )

    feast_client.apply(entity)
    feast_client.apply(feature_table)
    data_sample = generate_data()
    feast_client.ingest(feature_table, data_sample)

    job = feast_spark_client.start_offline_to_online_ingestion(
        feature_table,
        data_sample.event_timestamp.min().to_pydatetime(),
        data_sample.event_timestamp.max().to_pydatetime() + timedelta(seconds=1),
    )

    wait_retry_backoff(
        lambda: (None, job.get_status() == SparkJobStatus.COMPLETED), 180
    )
    all_job_ids = [
        job.get_id()
        for job in feast_spark_client.list_jobs(
            include_terminated=True,
            project=feast_client.project,
            table_name=feature_table.name,
        )
    ]
    assert job.get_id() in all_job_ids

def bookings_feature_table(spark, client):
    schema = StructType([
        StructField("driver_id", IntegerType()),
        StructField("event_timestamp", TimestampType()),
        StructField("created_timestamp", TimestampType()),
        StructField("total_completed_bookings", IntegerType()),
    ])
    df_data = [
        (8001, datetime(year=2020, month=9, day=1), datetime(year=2020, month=9, day=1), 100),
        (8001, datetime(year=2020, month=9, day=2), datetime(year=2020, month=9, day=2), 150),
        (8002, datetime(year=2020, month=9, day=2), datetime(year=2020, month=9, day=2), 200),
    ]
    temp_dir, file_uri = create_temp_parquet_file(spark, "bookings", schema, df_data)

    file_source = FileSource("event_timestamp", "created_timestamp", ParquetFormat(), file_uri)
    features = [Feature("total_completed_bookings", ValueType.INT32)]
    max_age = Duration()
    max_age.FromSeconds(86400)
    feature_table = FeatureTable(
        "bookings", ["driver_id"], features, batch_source=file_source, max_age=max_age
    )
    yield client.apply_feature_table(feature_table)
    shutil.rmtree(temp_dir)

def test_offline_ingestion_from_bq_view(
    pytestconfig, bq_dataset, feast_client: Client, feast_spark_client: SparkClient
):
    original = generate_data()
    bq_project = pytestconfig.getoption("bq_project")

    bq_client = bigquery.Client(project=bq_project)
    source_ref = bigquery.TableReference(
        bigquery.DatasetReference(bq_project, bq_dataset),
        f"ingestion_source_{datetime.now():%Y%m%d%H%M%s}",
    )
    bq_client.load_table_from_dataframe(original, source_ref).result()

    view_ref = bigquery.TableReference(
        bigquery.DatasetReference(bq_project, bq_dataset),
        f"ingestion_view_{datetime.now():%Y%m%d%H%M%s}",
    )
    view = bigquery.Table(view_ref)
    view.view_query = f"select * from `{source_ref.project}.{source_ref.dataset_id}.{source_ref.table_id}`"
    bq_client.create_table(view)

    entity = Entity(name="s2id", description="S2id", value_type=ValueType.INT64)
    feature_table = FeatureTable(
        name="bq_ingestion",
        entities=["s2id"],
        features=[Feature("unique_drivers", ValueType.INT64)],
        batch_source=BigQuerySource(
            event_timestamp_column="event_timestamp",
            table_ref=f"{view_ref.project}:{view_ref.dataset_id}.{view_ref.table_id}",
        ),
    )

    feast_client.apply(entity)
    feast_client.apply(feature_table)

    ingest_and_verify(feast_client, feast_spark_client, feature_table, original)

def test_historical_features(
    feast_client: Client,
    tfrecord_feast_client: Client,
    batch_source: Union[BigQuerySource, FileSource],
):
    customer_entity = Entity(
        name="user_id", description="Customer", value_type=ValueType.INT64
    )
    feast_client.apply(customer_entity)

    max_age = Duration()
    max_age.FromSeconds(2 * 86400)

    transactions_feature_table = FeatureTable(
        name="transactions",
        entities=["user_id"],
        features=[
            Feature("daily_transactions", ValueType.DOUBLE),
            Feature("total_transactions", ValueType.DOUBLE),
        ],
        batch_source=batch_source,
        max_age=max_age,
    )

    feast_client.apply(transactions_feature_table)

    transactions_df, customers_df = generate_data()
    feast_client.ingest(transactions_feature_table, transactions_df)

    feature_refs = ["transactions:daily_transactions"]

    # remove microseconds because job.get_start_time() does not contain microseconds
    job_submission_time = datetime.utcnow().replace(microsecond=0)
    job = feast_client.get_historical_features(feature_refs, customers_df)
    assert job.get_start_time() >= job_submission_time
    assert job.get_start_time() <= job_submission_time + timedelta(hours=1)

    output_dir = job.get_output_file_uri()

    # will both be None if not using Azure blob storage
    account_name, account_key = _get_azure_creds(feast_client)

    joined_df = read_parquet(
        output_dir, azure_account_name=account_name, azure_account_key=account_key
    )

    expected_joined_df = pd.DataFrame(
        {
            "event_timestamp": customers_df.event_timestamp.tolist(),
            "user_id": customers_df.user_id.tolist(),
            "transactions__daily_transactions": transactions_df.daily_transactions.tolist()
            + [None] * transactions_df.shape[0],
        }
    )

    assert_frame_equal(
        joined_df.sort_values(by=["user_id", "event_timestamp"]).reset_index(drop=True),
        expected_joined_df.sort_values(by=["user_id", "event_timestamp"]).reset_index(
            drop=True
        ),
    )

    job = tfrecord_feast_client.get_historical_features(feature_refs, customers_df)
    job.get_output_file_uri()
    assert job.get_status() == SparkJobStatus.COMPLETED

def test_streaming_ingestion(feast_client: Client, local_staging_path: str, kafka_server):
    entity = Entity(
        name="s2id",
        description="S2id",
        value_type=ValueType.INT64,
    )
    kafka_broker = f"{kafka_server[0]}:{kafka_server[1]}"
    topic_name = f"avro-{uuid.uuid4()}"

    feature_table = FeatureTable(
        name="drivers_stream",
        entities=["s2id"],
        features=[Feature("unique_drivers", ValueType.INT64)],
        batch_source=FileSource(
            event_timestamp_column="event_timestamp",
            created_timestamp_column="event_timestamp",
            file_format=ParquetFormat(),
            file_url=os.path.join(local_staging_path, "batch-storage"),
        ),
        stream_source=KafkaSource(
            "event_timestamp",
            "event_timestamp",
            kafka_broker,
            AvroFormat(avro_schema()),
            topic=topic_name,
        ),
    )

    feast_client.apply_entity(entity)
    feast_client.apply_feature_table(feature_table)

    job = feast_client.start_stream_to_online_ingestion(feature_table)

    wait_retry_backoff(
        lambda: (None, job.get_status() == SparkJobStatus.IN_PROGRESS), 60
    )
    wait_retry_backoff(
        lambda: (None, check_consumer_exist(kafka_broker, topic_name)), 60
    )

    try:
        original = generate_data()[["s2id", "unique_drivers", "event_timestamp"]]
        for record in original.to_dict("records"):
            record["event_timestamp"] = (
                record["event_timestamp"].to_pydatetime().replace(tzinfo=pytz.utc)
            )

            send_avro_record_to_kafka(
                topic_name,
                record,
                bootstrap_servers=kafka_broker,
                avro_schema_json=avro_schema(),
            )

        def get_online_features():
            features = feast_client.get_online_features(
                ["drivers_stream:unique_drivers"],
                entity_rows=[{"s2id": s2_id} for s2_id in original["s2id"].tolist()],
            ).to_dict()
            df = pd.DataFrame.from_dict(features)
            return df, not df["drivers_stream:unique_drivers"].isna().any()

        ingested = wait_retry_backoff(get_online_features, 60)
    finally:
        job.cancel()

    pd.testing.assert_frame_equal(
        ingested[["s2id", "drivers_stream:unique_drivers"]],
        original[["s2id", "unique_drivers"]].rename(
            columns={"unique_drivers": "drivers_stream:unique_drivers"}
        ),
    )

def test_validation_with_ge(feast_client: Client, kafka_server):
    entity = Entity(name="key", description="Key", value_type=ValueType.INT64)
    kafka_broker = f"{kafka_server[0]}:{kafka_server[1]}"
    topic_name = f"avro-{uuid.uuid4()}"

    feature_table = FeatureTable(
        name="validation_test",
        entities=["key"],
        features=[Feature("num", ValueType.INT64), Feature("set", ValueType.STRING)],
        batch_source=FileSource(
            event_timestamp_column="event_timestamp",
            file_format=ParquetFormat(),
            file_url="/dev/null",
        ),
        stream_source=KafkaSource(
            event_timestamp_column="event_timestamp",
            bootstrap_servers=kafka_broker,
            message_format=AvroFormat(avro_schema()),
            topic=topic_name,
        ),
    )
    feast_client.apply_entity(entity)
    feast_client.apply_feature_table(feature_table)

    train_data = generate_train_data()
    ge_ds = PandasDataset(train_data)
    ge_ds.expect_column_values_to_be_between("num", 0, 100)
    ge_ds.expect_column_values_to_be_in_set("set", ["a", "b", "c"])
    expectations = ge_ds.get_expectation_suite()

    udf = create_validation_udf("testUDF", expectations)
    apply_validation(feast_client, feature_table, udf, validation_window_secs=1)

    job = feast_client.start_stream_to_online_ingestion(feature_table)

    wait_retry_backoff(
        lambda: (None, job.get_status() == SparkJobStatus.IN_PROGRESS), 120
    )
    wait_retry_backoff(
        lambda: (None, check_consumer_exist(kafka_broker, topic_name)), 120
    )

    test_data = generate_test_data()
    ge_ds = PandasDataset(test_data)
    validation_result = ge_ds.validate(expectations, result_format="COMPLETE")
    invalid_idx = list(
        {
            idx
            for check in validation_result.results
            for idx in check.result["unexpected_index_list"]
        }
    )

    entity_rows = [{"key": key} for key in test_data["key"].tolist()]

    try:
        ingested = ingest_and_retrieve(
            feast_client,
            test_data,
            avro_schema_json=avro_schema(),
            topic_name=topic_name,
            kafka_broker=kafka_broker,
            entity_rows=entity_rows,
            feature_names=["validation_test:num", "validation_test:set"],
            expected_ingested_count=test_data.shape[0] - len(invalid_idx),
        )
    finally:
        job.cancel()

    test_data["num"] = test_data["num"].astype(np.float64)
    test_data["num"].iloc[invalid_idx] = np.nan
    test_data["set"].iloc[invalid_idx] = None

    pd.testing.assert_frame_equal(
        ingested[["key", "validation_test:num", "validation_test:set"]],
        test_data[["key", "num", "set"]].rename(
            columns={"num": "validation_test:num", "set": "validation_test:set"}
        ),
    )

def test_historical_features(feast_client: Client, local_staging_path: str):
    customer_entity = Entity(
        name="user_id", description="Customer", value_type=ValueType.INT64
    )
    feast_client.apply_entity(customer_entity)

    max_age = Duration()
    max_age.FromSeconds(2 * 86400)

    transactions_feature_table = FeatureTable(
        name="transactions",
        entities=["user_id"],
        features=[
            Feature("daily_transactions", ValueType.DOUBLE),
            Feature("total_transactions", ValueType.DOUBLE),
        ],
        batch_source=FileSource(
            "event_timestamp",
            "created_timestamp",
            ParquetFormat(),
            os.path.join(local_staging_path, "transactions"),
        ),
        max_age=max_age,
    )

    feast_client.apply_feature_table(transactions_feature_table)

    retrieval_date = (
        datetime.utcnow()
        .replace(hour=0, minute=0, second=0, microsecond=0)
        .replace(tzinfo=None)
    )
    retrieval_outside_max_age_date = retrieval_date + timedelta(1)
    event_date = retrieval_date - timedelta(2)
    creation_date = retrieval_date - timedelta(1)

    customers = [1001, 1002, 1003, 1004, 1005]
    daily_transactions = [np.random.rand() * 10 for _ in customers]
    total_transactions = [np.random.rand() * 100 for _ in customers]

    transactions_df = pd.DataFrame(
        {
            "event_timestamp": [event_date for _ in customers],
            "created_timestamp": [creation_date for _ in customers],
            "user_id": customers,
            "daily_transactions": daily_transactions,
            "total_transactions": total_transactions,
        }
    )

    feast_client.ingest(transactions_feature_table, transactions_df)

    feature_refs = ["transactions:daily_transactions"]

    customer_df = pd.DataFrame(
        {
            "event_timestamp": [retrieval_date for _ in customers]
            + [retrieval_outside_max_age_date for _ in customers],
            "user_id": customers + customers,
        }
    )

    job = feast_client.get_historical_features(feature_refs, customer_df)
    output_dir = job.get_output_file_uri()
    joined_df = read_parquet(output_dir)

    expected_joined_df = pd.DataFrame(
        {
            "event_timestamp": [retrieval_date for _ in customers]
            + [retrieval_outside_max_age_date for _ in customers],
            "user_id": customers + customers,
            "transactions__daily_transactions": daily_transactions
            + [None] * len(customers),
        }
    )

    assert_frame_equal(
        joined_df.sort_values(by=["user_id", "event_timestamp"]).reset_index(drop=True),
        expected_joined_df.sort_values(by=["user_id", "event_timestamp"]).reset_index(
            drop=True
        ),
    )

def test_streaming_ingestion(
    feast_client: Client, local_staging_path: str, kafka_server, pytestconfig
):
    entity = Entity(
        name="s2id",
        description="S2id",
        value_type=ValueType.INT64,
    )
    kafka_broker = f"{kafka_server[0]}:{kafka_server[1]}"
    topic_name = f"avro-{uuid.uuid4()}"

    feature_table = FeatureTable(
        name="drivers_stream",
        entities=["s2id"],
        features=[Feature("unique_drivers", ValueType.INT64)],
        batch_source=FileSource(
            event_timestamp_column="event_timestamp",
            created_timestamp_column="event_timestamp",
            file_format=ParquetFormat(),
            file_url=os.path.join(local_staging_path, "batch-storage"),
        ),
        stream_source=KafkaSource(
            event_timestamp_column="event_timestamp",
            bootstrap_servers=kafka_broker,
            message_format=AvroFormat(avro_schema()),
            topic=topic_name,
        ),
    )

    feast_client.apply(entity)
    feast_client.apply(feature_table)

    if not pytestconfig.getoption("scheduled_streaming_job"):
        job = feast_client.start_stream_to_online_ingestion(feature_table)
        assert job.get_feature_table() == feature_table.name
        wait_retry_backoff(
            lambda: (None, job.get_status() == SparkJobStatus.IN_PROGRESS), 120
        )
    else:
        job = None

    wait_retry_backoff(
        lambda: (None, check_consumer_exist(kafka_broker, topic_name)), 300
    )

    test_data = generate_data()[["s2id", "unique_drivers", "event_timestamp"]]

    try:
        ingested = ingest_and_retrieve(
            feast_client,
            test_data,
            avro_schema_json=avro_schema(),
            topic_name=topic_name,
            kafka_broker=kafka_broker,
            entity_rows=[{"s2id": s2_id} for s2_id in test_data["s2id"].tolist()],
            feature_names=["drivers_stream:unique_drivers"],
        )
    finally:
        if job:
            job.cancel()
        else:
            feast_client.delete_feature_table(feature_table.name)

    pd.testing.assert_frame_equal(
        ingested[["s2id", "drivers_stream:unique_drivers"]],
        test_data[["s2id", "unique_drivers"]].rename(
            columns={"unique_drivers": "drivers_stream:unique_drivers"}
        ),
    )

from google.protobuf.duration_pb2 import Duration

from feast import BigQuerySource, Entity, Feature, FeatureTable, ValueType

driver_locations_source = BigQuerySource(
    table_ref="rh_prod.ride_hailing_co.drivers",
    event_timestamp_column="event_timestamp",
    created_timestamp_column="created_timestamp",
)

driver = Entity(
    name="driver",  # The name is derived from this argument, not object name.
    value_type=ValueType.INT64,
    description="driver id",
)

driver_locations = FeatureTable(
    name="driver_locations",
    entities=["driver"],
    max_age=Duration(seconds=86400 * 1),
    features=[
        Feature(name="lat", dtype=ValueType.FLOAT),
        Feature(name="lon", dtype=ValueType.STRING),
    ],
    batch_source=driver_locations_source,
)
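
# A minimal registration sketch (an assumption, not part of the original example):
# it shows how the entity and feature table defined above would typically be
# registered through a Feast client, mirroring the `apply` calls used in the
# tests above. The core/serving URLs are placeholders for an existing deployment.
from feast import Client

client = Client(core_url="localhost:6565", serving_url="localhost:6566")
client.apply(driver)
client.apply(driver_locations)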