import os
from datetime import datetime, timedelta
from typing import Union

import numpy as np
import pandas as pd
from google.protobuf.duration_pb2 import Duration
from pandas.testing import assert_frame_equal

from feast import (
    BigQuerySource,
    Client,
    Entity,
    Feature,
    FeatureTable,
    FileSource,
    ValueType,
)
from feast.data_format import ParquetFormat
from feast.pyspark.abc import SparkJobStatus

# NOTE: the imports above are the ones these test variants appear to need;
# generate_data(), read_parquet(), and _get_azure_creds() are test-suite
# helpers defined outside this section (sketched further down).


def test_historical_features(
    feast_client: Client, batch_source: Union[BigQuerySource, FileSource]
):
    customer_entity = Entity(
        name="user_id", description="Customer", value_type=ValueType.INT64
    )
    feast_client.apply_entity(customer_entity)

    max_age = Duration()
    max_age.FromSeconds(2 * 86400)

    transactions_feature_table = FeatureTable(
        name="transactions",
        entities=["user_id"],
        features=[
            Feature("daily_transactions", ValueType.DOUBLE),
            Feature("total_transactions", ValueType.DOUBLE),
        ],
        batch_source=batch_source,
        max_age=max_age,
    )
    feast_client.apply_feature_table(transactions_feature_table)

    transactions_df, customers_df = generate_data()
    feast_client.ingest(transactions_feature_table, transactions_df)

    feature_refs = ["transactions:daily_transactions"]

    job = feast_client.get_historical_features(feature_refs, customers_df)
    output_dir = job.get_output_file_uri()
    joined_df = read_parquet(output_dir)

    expected_joined_df = pd.DataFrame(
        {
            "event_timestamp": customers_df.event_timestamp.tolist(),
            "user_id": customers_df.user_id.tolist(),
            "transactions__daily_transactions": transactions_df.daily_transactions.tolist()
            + [None] * transactions_df.shape[0],
        }
    )

    assert_frame_equal(
        joined_df.sort_values(by=["user_id", "event_timestamp"]).reset_index(
            drop=True
        ),
        expected_joined_df.sort_values(by=["user_id", "event_timestamp"]).reset_index(
            drop=True
        ),
    )
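# ---------------------------------------------------------------------------
# generate_data() is not shown in this section. Below is a minimal sketch of
# what it plausibly returns, reconstructed from the inline data generation in
# the last variant and from the expected join output above: customers_df holds
# two retrieval timestamps per user, and the later one falls outside the
# table's two-day max_age, so those rows join to None. The helper name is
# taken from the calls above, but this body is an assumption.
# ---------------------------------------------------------------------------
def generate_data():
    retrieval_date = datetime.utcnow().replace(
        hour=0, minute=0, second=0, microsecond=0
    )
    retrieval_outside_max_age_date = retrieval_date + timedelta(days=1)
    event_date = retrieval_date - timedelta(days=2)
    creation_date = retrieval_date - timedelta(days=1)

    customers = [1001, 1002, 1003, 1004, 1005]
    transactions_df = pd.DataFrame(
        {
            "event_timestamp": [event_date] * len(customers),
            "created_timestamp": [creation_date] * len(customers),
            "user_id": customers,
            "daily_transactions": np.random.rand(len(customers)) * 10,
            "total_transactions": np.random.rand(len(customers)) * 100,
        }
    )
    customers_df = pd.DataFrame(
        {
            "event_timestamp": [retrieval_date] * len(customers)
            + [retrieval_outside_max_age_date] * len(customers),
            "user_id": customers + customers,
        }
    )
    return transactions_df, customers_df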
def test_historical_features(
    feast_client: Client,
    tfrecord_feast_client: Client,
    batch_source: Union[BigQuerySource, FileSource],
):
    customer_entity = Entity(
        name="user_id", description="Customer", value_type=ValueType.INT64
    )
    feast_client.apply(customer_entity)

    max_age = Duration()
    max_age.FromSeconds(2 * 86400)

    transactions_feature_table = FeatureTable(
        name="transactions",
        entities=["user_id"],
        features=[
            Feature("daily_transactions", ValueType.DOUBLE),
            Feature("total_transactions", ValueType.DOUBLE),
        ],
        batch_source=batch_source,
        max_age=max_age,
    )
    feast_client.apply(transactions_feature_table)

    transactions_df, customers_df = generate_data()
    feast_client.ingest(transactions_feature_table, transactions_df)

    feature_refs = ["transactions:daily_transactions"]

    # Remove microseconds because job.get_start_time() does not contain microseconds.
    job_submission_time = datetime.utcnow().replace(microsecond=0)
    job = feast_client.get_historical_features(feature_refs, customers_df)
    assert job.get_start_time() >= job_submission_time
    assert job.get_start_time() <= job_submission_time + timedelta(hours=1)

    output_dir = job.get_output_file_uri()

    # Will both be None if not using Azure blob storage.
    account_name, account_key = _get_azure_creds(feast_client)

    joined_df = read_parquet(
        output_dir, azure_account_name=account_name, azure_account_key=account_key
    )

    expected_joined_df = pd.DataFrame(
        {
            "event_timestamp": customers_df.event_timestamp.tolist(),
            "user_id": customers_df.user_id.tolist(),
            "transactions__daily_transactions": transactions_df.daily_transactions.tolist()
            + [None] * transactions_df.shape[0],
        }
    )

    assert_frame_equal(
        joined_df.sort_values(by=["user_id", "event_timestamp"]).reset_index(
            drop=True
        ),
        expected_joined_df.sort_values(by=["user_id", "event_timestamp"]).reset_index(
            drop=True
        ),
    )

    job = tfrecord_feast_client.get_historical_features(feature_refs, customers_df)
    job.get_output_file_uri()
    assert job.get_status() == SparkJobStatus.COMPLETED
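# ---------------------------------------------------------------------------
# read_parquet() and _get_azure_creds() are test-suite helpers defined outside
# this section. The sketches below are assumptions: they presume the job's
# output URI is readable by pandas/pyarrow directly (a local path or an
# fsspec-supported scheme) and that Azure credentials, when the test targets
# blob storage, come from the environment. The real helpers may resolve
# filesystems and credentials differently.
# ---------------------------------------------------------------------------
def _get_azure_creds(feast_client: Client):
    # Returns (None, None) when the test is not running against Azure blob
    # storage; the environment variable names here are placeholders.
    return (
        os.environ.get("AZURE_STORAGE_ACCOUNT"),
        os.environ.get("AZURE_STORAGE_KEY"),
    )


def read_parquet(uri, azure_account_name=None, azure_account_key=None):
    storage_options = None
    if azure_account_name is not None and azure_account_key is not None:
        # adlfs-style credentials for wasbs:// or abfss:// URIs.
        storage_options = {
            "account_name": azure_account_name,
            "account_key": azure_account_key,
        }
    return pd.read_parquet(uri, storage_options=storage_options)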
def test_historical_features(feast_client: Client, local_staging_path: str):
    customer_entity = Entity(
        name="user_id", description="Customer", value_type=ValueType.INT64
    )
    feast_client.apply_entity(customer_entity)

    max_age = Duration()
    max_age.FromSeconds(2 * 86400)

    transactions_feature_table = FeatureTable(
        name="transactions",
        entities=["user_id"],
        features=[
            Feature("daily_transactions", ValueType.DOUBLE),
            Feature("total_transactions", ValueType.DOUBLE),
        ],
        # Positional args: event_timestamp_column, created_timestamp_column,
        # file_format, file_url.
        batch_source=FileSource(
            "event_timestamp",
            "created_timestamp",
            ParquetFormat(),
            os.path.join(local_staging_path, "transactions"),
        ),
        max_age=max_age,
    )
    feast_client.apply_feature_table(transactions_feature_table)

    retrieval_date = (
        datetime.utcnow()
        .replace(hour=0, minute=0, second=0, microsecond=0)
        .replace(tzinfo=None)
    )
    retrieval_outside_max_age_date = retrieval_date + timedelta(1)
    event_date = retrieval_date - timedelta(2)
    creation_date = retrieval_date - timedelta(1)

    customers = [1001, 1002, 1003, 1004, 1005]
    daily_transactions = [np.random.rand() * 10 for _ in customers]
    total_transactions = [np.random.rand() * 100 for _ in customers]

    transactions_df = pd.DataFrame(
        {
            "event_timestamp": [event_date for _ in customers],
            "created_timestamp": [creation_date for _ in customers],
            "user_id": customers,
            "daily_transactions": daily_transactions,
            "total_transactions": total_transactions,
        }
    )

    feast_client.ingest(transactions_feature_table, transactions_df)

    feature_refs = ["transactions:daily_transactions"]

    # Two retrieval rows per user: one within the two-day max_age window,
    # one a day beyond it.
    customer_df = pd.DataFrame(
        {
            "event_timestamp": [retrieval_date for _ in customers]
            + [retrieval_outside_max_age_date for _ in customers],
            "user_id": customers + customers,
        }
    )

    job = feast_client.get_historical_features(feature_refs, customer_df)
    output_dir = job.get_output_file_uri()
    joined_df = read_parquet(output_dir)

    # Rows outside max_age should have no feature value joined.
    expected_joined_df = pd.DataFrame(
        {
            "event_timestamp": [retrieval_date for _ in customers]
            + [retrieval_outside_max_age_date for _ in customers],
            "user_id": customers + customers,
            "transactions__daily_transactions": daily_transactions
            + [None] * len(customers),
        }
    )

    assert_frame_equal(
        joined_df.sort_values(by=["user_id", "event_timestamp"]).reset_index(
            drop=True
        ),
        expected_joined_df.sort_values(by=["user_id", "event_timestamp"]).reset_index(
            drop=True
        ),
    )
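# ---------------------------------------------------------------------------
# Hypothetical sketch of two of the pytest fixtures the variants above rely
# on. The real conftest wires feast_client to a deployed Feast Core plus a
# Spark job service, and also provides batch_source and tfrecord_feast_client;
# the address below is a placeholder, not the suite's actual configuration.
# ---------------------------------------------------------------------------
import tempfile

import pytest


@pytest.fixture
def local_staging_path() -> str:
    # Scratch directory for the FileSource batch source to write to.
    return tempfile.mkdtemp(prefix="feast_staging_")


@pytest.fixture
def feast_client() -> Client:
    # Assumption: Feast Core is reachable at this placeholder address.
    return Client(core_url="localhost:6565")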