def prep_bq_fs_and_fv(
    bq_source_type: str,
) -> Iterator[Tuple[FeatureStore, FeatureView]]:
    """Stage driver data in BigQuery and yield a GCP feature store over it.

    The frame returned by ``create_dataset()`` is loaded into a uniquely
    named table of the ``test_ingestion`` dataset, a ``BigQuerySource`` is
    built on top of it, and the resulting feature view plus the ``driver``
    entity are applied to a ``FeatureStore`` whose registry lives in a
    temporary directory.

    Args:
        bq_source_type: ``"table"`` makes the source reference the staged
            table directly, ``"query"`` makes it use a ``SELECT *`` over that
            table. For any other value both ``table_ref`` and ``query`` are
            passed as ``None``.

    Yields:
        A ``(feature_store, feature_view)`` pair. The store is torn down
        when the generator is resumed, closed, or has an exception thrown
        into it.
    """
    client = bigquery.Client()
    gcp_project = client.project
    bigquery_dataset = "test_ingestion"
    dataset = bigquery.Dataset(f"{gcp_project}.{bigquery_dataset}")
    client.create_dataset(dataset, exists_ok=True)
    # The staged table is never dropped in this function.
    # NOTE(review): presumably the dataset-level default expiration is what
    # reclaims it -- confirm.
    dataset.default_table_expiration_ms = (
        1000 * 60 * 60 * 24 * 14
    )  # 2 weeks in milliseconds
    client.update_dataset(dataset, ["default_table_expiration_ms"])

    df = create_dataset()

    job_config = bigquery.LoadJobConfig()
    # Nanosecond timestamp keeps table names distinct between runs.
    table_ref = f"{gcp_project}.{bigquery_dataset}.{bq_source_type}_correctness_{int(time.time_ns())}"
    query = f"SELECT * FROM `{table_ref}`"
    job = client.load_table_from_dataframe(df, table_ref, job_config=job_config)
    job.result()

    bigquery_source = BigQuerySource(
        table_ref=table_ref if bq_source_type == "table" else None,
        query=query if bq_source_type == "query" else None,
        event_timestamp_column="ts",
        created_timestamp_column="created_ts",
        date_partition_column="",
        field_mapping={"ts_1": "ts", "id": "driver_id"},
    )

    fv = driver_feature_view(bigquery_source)
    e = Entity(
        name="driver",
        description="id for driver",
        join_key="driver_id",
        value_type=ValueType.INT32,
    )
    with tempfile.TemporaryDirectory() as repo_dir_name:
        config = RepoConfig(
            registry=str(Path(repo_dir_name) / "registry.db"),
            project=f"test_bq_correctness_{str(uuid.uuid4()).replace('-', '')}",
            provider="gcp",
            online_store=DatastoreOnlineStoreConfig(namespace="integration_test"),
        )
        fs = FeatureStore(config=config)
        fs.apply([fv, e])

        # Guarantee teardown even if the consumer of the yielded store fails;
        # a bare ``yield`` would skip it when an exception is thrown in.
        try:
            yield fs, fv
        finally:
            fs.teardown()
def prep_redis_fs_and_fv() -> Iterator[Tuple[FeatureStore, FeatureView]]:
    """Write driver data to a parquet file and yield a Redis-backed store.

    The frame returned by ``create_dataset()`` is written to a parquet file
    inside a temporary directory, wrapped in a ``FileSource``, and applied
    (together with the ``driver`` entity) to a local-provider
    ``FeatureStore`` configured with a Redis online store at
    ``localhost:6379,db=0``.

    Yields:
        A ``(feature_store, feature_view)`` pair. The store is torn down and
        both temporary directories are removed once the generator finishes,
        including when an exception is thrown into it.
    """
    # A directory (rather than a NamedTemporaryFile that is closed and then
    # re-created by name) owns the parquet file, so the file is reliably
    # removed on exit and there is no window in which the name is free.
    with tempfile.TemporaryDirectory() as data_dir_name:
        parquet_path = str(Path(data_dir_name) / "driver_stats.parquet")
        df = create_dataset()
        df.to_parquet(parquet_path)
        file_source = FileSource(
            file_format=ParquetFormat(),
            path=f"file://{parquet_path}",
            event_timestamp_column="ts",
            created_timestamp_column="created_ts",
            date_partition_column="",
            field_mapping={"ts_1": "ts", "id": "driver_id"},
        )
        fv = driver_feature_view(file_source)
        e = Entity(
            name="driver",
            description="id for driver",
            join_key="driver_id",
            value_type=ValueType.INT32,
        )
        project = f"test_redis_correctness_{str(uuid.uuid4()).replace('-', '')}"
        print(f"Using project: {project}")
        with tempfile.TemporaryDirectory() as repo_dir_name:
            config = RepoConfig(
                registry=str(Path(repo_dir_name) / "registry.db"),
                project=project,
                provider="local",
                online_store=RedisOnlineStoreConfig(
                    type="redis",
                    redis_type=RedisType.redis,
                    connection_string="localhost:6379,db=0",
                ),
            )
            fs = FeatureStore(config=config)
            fs.apply([fv, e])

            # Guarantee teardown even if the consumer of the store fails.
            try:
                yield fs, fv
            finally:
                fs.teardown()
def prep_redshift_fs_and_fv(
    source_type: str,
) -> Iterator[Tuple[FeatureStore, FeatureView]]:
    """Upload driver data to Redshift and yield a local store reading from it.

    The frame returned by ``create_dataset()`` is uploaded to a uniquely
    named Redshift table, a ``RedshiftSource`` is built over it, and the
    resulting feature view plus the ``driver`` entity are applied to a
    local-provider ``FeatureStore`` with a SQLite online store and the
    Redshift offline store.

    Args:
        source_type: ``"table"`` makes the source reference the uploaded
            table directly, ``"query"`` makes it use a ``SELECT *`` over that
            table. For any other value both ``table`` and ``query`` are
            passed as ``None``.

    Yields:
        A ``(feature_store, feature_view)`` pair. On exit the store is torn
        down and the uploaded table is dropped, including when an exception
        is thrown into the generator.
    """
    client = aws_utils.get_redshift_data_client("us-west-2")
    s3 = aws_utils.get_s3_resource("us-west-2")

    df = create_dataset()

    # Timestamp plus a random suffix keeps table names distinct between runs.
    table_name = f"test_ingestion_{source_type}_correctness_{int(time.time_ns())}_{random.randint(1000, 9999)}"

    offline_store = RedshiftOfflineStoreConfig(
        cluster_id="feast-integration-tests",
        region="us-west-2",
        user="******",
        database="feast",
        s3_staging_location="s3://feast-integration-tests/redshift/tests/ingestion",
        iam_role="arn:aws:iam::402087665549:role/redshift_s3_access_role",
    )

    aws_utils.upload_df_to_redshift(
        client,
        offline_store.cluster_id,
        offline_store.database,
        offline_store.user,
        s3,
        f"{offline_store.s3_staging_location}/copy/{table_name}.parquet",
        offline_store.iam_role,
        table_name,
        df,
    )

    # From here on the table exists, so every exit path must drop it.
    try:
        redshift_source = RedshiftSource(
            table=table_name if source_type == "table" else None,
            query=f"SELECT * FROM {table_name}" if source_type == "query" else None,
            event_timestamp_column="ts",
            created_timestamp_column="created_ts",
            date_partition_column="",
            field_mapping={"ts_1": "ts", "id": "driver_id"},
        )

        fv = driver_feature_view(redshift_source)
        e = Entity(
            name="driver",
            description="id for driver",
            join_key="driver_id",
            value_type=ValueType.INT32,
        )
        with tempfile.TemporaryDirectory() as repo_dir_name, tempfile.TemporaryDirectory() as data_dir_name:
            config = RepoConfig(
                registry=str(Path(repo_dir_name) / "registry.db"),
                # NOTE(review): the prefix says "bq" although this is the
                # Redshift setup -- looks like a copy/paste; left unchanged.
                project=f"test_bq_correctness_{str(uuid.uuid4()).replace('-', '')}",
                provider="local",
                online_store=SqliteOnlineStoreConfig(
                    path=str(Path(data_dir_name) / "online_store.db")
                ),
                offline_store=offline_store,
            )
            fs = FeatureStore(config=config)
            fs.apply([fv, e])

            # Guarantee teardown even if the consumer of the store fails.
            try:
                yield fs, fv
            finally:
                fs.teardown()
    finally:
        # Clean up the uploaded Redshift table
        aws_utils.execute_redshift_statement(
            client,
            offline_store.cluster_id,
            offline_store.database,
            offline_store.user,
            f"DROP TABLE {table_name}",
        )
def teardown(repo_config: RepoConfig, repo_path: Path):
    """Tear down the feature store located at ``repo_path``.

    ``repo_config`` is accepted but not used: the store is constructed from
    ``repo_path`` alone, with ``config=None``.
    """
    # Cannot pass in both repo_path and repo_config to FeatureStore.
    FeatureStore(repo_path=repo_path, config=None).teardown()
def test_historical_features_from_bigquery_sources_containing_backfills(capsys):
    """Historical retrieval from BigQuery with duplicated / backfilled rows.

    Each of two drivers has two feature rows. For an entity timestamp two
    days after the current hour, the expected ``avg_daily_trips`` is the
    value from the row with the later ``event_timestamp`` (20 for driver
    1001, 40 for driver 1002) -- for driver 1002 that row carries the
    earlier ``created`` value.
    """
    current_hour = datetime.now().replace(microsecond=0, second=0, minute=0)
    next_day = current_hour + timedelta(days=1)
    lookup_time = current_hour + timedelta(days=2)

    entity_rows = [
        {"driver_id": 1001, "event_timestamp": lookup_time},
        {"driver_id": 1002, "event_timestamp": lookup_time},
    ]
    entity_dataframe = pd.DataFrame(data=entity_rows)

    feature_rows = [
        # Duplicated rows simple case
        {
            "driver_id": 1001,
            "avg_daily_trips": 10,
            "event_timestamp": current_hour,
            "created": next_day,
        },
        {
            "driver_id": 1001,
            "avg_daily_trips": 20,
            "event_timestamp": next_day,
            "created": next_day,
        },
        # Duplicated rows after a backfill
        {
            "driver_id": 1002,
            "avg_daily_trips": 30,
            "event_timestamp": current_hour,
            "created": next_day,
        },
        {
            "driver_id": 1002,
            "avg_daily_trips": 40,
            "event_timestamp": next_day,
            "created": current_hour,
        },
    ]
    driver_stats_df = pd.DataFrame(data=feature_rows)

    expected_rows = [
        {"driver_id": 1001, "event_timestamp": lookup_time, "avg_daily_trips": 20},
        {"driver_id": 1002, "event_timestamp": lookup_time, "avg_daily_trips": 40},
    ]
    expected_df = pd.DataFrame(data=expected_rows)

    bigquery_dataset = (
        f"test_hist_retrieval_{int(time.time_ns())}_{random.randint(1000, 9999)}"
    )

    with BigQueryDataSet(bigquery_dataset), TemporaryDirectory() as temp_dir:
        gcp_project = bigquery.Client().project

        # Entity Dataframe SQL query
        table_id = f"{bigquery_dataset}.orders"
        stage_orders_bigquery(entity_dataframe, table_id)
        entity_df_query = f"SELECT * FROM {gcp_project}.{table_id}"

        # Driver Feature View
        driver_table_id = f"{gcp_project}.{bigquery_dataset}.driver_hourly"
        stage_driver_hourly_stats_bigquery_source(driver_stats_df, driver_table_id)

        repo_config = RepoConfig(
            registry=os.path.join(temp_dir, "registry.db"),
            project="".join(
                random.choices(string.ascii_uppercase + string.digits, k=10)
            ),
            provider="gcp",
            offline_store=BigQueryOfflineStoreConfig(
                type="bigquery", dataset=bigquery_dataset
            ),
        )
        store = FeatureStore(config=repo_config)

        driver = Entity(
            name="driver", join_key="driver_id", value_type=ValueType.INT64
        )
        backfill_source = BigQuerySource(
            table_ref=driver_table_id,
            event_timestamp_column="event_timestamp",
            created_timestamp_column="created",
        )
        driver_fv = FeatureView(
            name="driver_stats",
            entities=["driver"],
            features=[Feature(name="avg_daily_trips", dtype=ValueType.INT32)],
            batch_source=backfill_source,
            ttl=None,
        )

        store.apply([driver, driver_fv])

        try:
            job_from_sql = store.get_historical_features(
                entity_df=entity_df_query,
                features=["driver_stats:avg_daily_trips"],
                full_feature_names=False,
            )

            # Time only the materialisation of the job into a DataFrame.
            start_time = datetime.utcnow()
            actual_df_from_sql_entities = job_from_sql.to_df()
            elapsed = datetime.utcnow() - start_time
            with capsys.disabled():
                print(f"\nTime to execute job_from_sql.to_df() = '{elapsed}'")

            assert sorted(expected_df.columns) == sorted(
                actual_df_from_sql_entities.columns
            )

            # Compare on the expected column order, with rows ordered by driver.
            wanted = expected_df.sort_values(by=["driver_id"]).reset_index(drop=True)
            got = (
                actual_df_from_sql_entities[expected_df.columns]
                .sort_values(by=["driver_id"])
                .reset_index(drop=True)
            )
            assert_frame_equal(wanted, got, check_dtype=False)
        finally:
            store.teardown()
def test_historical_features_from_redshift_sources(
    provider_type, infer_event_timestamp_col, capsys, full_feature_names
):
    """Historical retrieval from Redshift, via a SQL entity query and a DataFrame.

    Orders, driver stats and customer profiles are staged into Redshift
    tables sharing a unique prefix. The test then checks that:

    * retrieval with a SQL entity query matches ``get_expected_training_df``
      and that ``to_arrow()`` agrees with ``to_df()``;
    * renaming the ``customer_id`` join key (in SQL and in the DataFrame)
      raises ``FeastEntityDFMissingColumnsError``;
    * retrieval with a DataFrame entity matches the same expectation.

    ``provider_type`` must be ``"local"`` or ``"aws"``; anything else raises.
    """
    aws_region = "us-west-2"
    client = aws_utils.get_redshift_data_client(aws_region)
    s3 = aws_utils.get_s3_resource(aws_region)

    offline_store = RedshiftOfflineStoreConfig(
        cluster_id="feast-integration-tests",
        region=aws_region,
        user="******",
        database="feast",
        s3_staging_location="s3://feast-integration-tests/redshift/tests/ingestion",
        iam_role="arn:aws:iam::402087665549:role/redshift_s3_access_role",
    )

    def _stage(table, frame):
        # Build the upload context for ``frame``; it is entered later in the
        # ``with`` statement together with the other staged tables.
        return aws_utils.temporarily_upload_df_to_redshift(
            client,
            offline_store.cluster_id,
            offline_store.database,
            offline_store.user,
            s3,
            f"{offline_store.s3_staging_location}/copy/{table}.parquet",
            offline_store.iam_role,
            table,
            frame,
        )

    hour_start = datetime.now().replace(microsecond=0, second=0, minute=0)
    (
        customer_entities,
        driver_entities,
        end_date,
        orders_df,
        start_date,
    ) = generate_entities(hour_start, infer_event_timestamp_col)

    redshift_table_prefix = (
        f"test_hist_retrieval_{int(time.time_ns())}_{random.randint(1000, 9999)}"
    )

    # Stage orders_df to Redshift
    table_name = f"{redshift_table_prefix}_orders"
    entity_df_query = f"SELECT * FROM {table_name}"
    orders_context = _stage(table_name, orders_df)

    # Stage driver_df to Redshift
    driver_df = driver_data.create_driver_hourly_stats_df(
        driver_entities, start_date, end_date
    )
    driver_table_name = f"{redshift_table_prefix}_driver_hourly"
    driver_context = _stage(driver_table_name, driver_df)

    # Stage customer_df to Redshift
    customer_df = driver_data.create_customer_daily_profile_df(
        customer_entities, start_date, end_date
    )
    customer_table_name = f"{redshift_table_prefix}_customer_profile"
    customer_context = _stage(customer_table_name, customer_df)

    # The same feature references are requested by every retrieval below.
    feature_refs = (
        "driver_stats:conv_rate",
        "driver_stats:avg_daily_trips",
        "customer_profile:current_balance",
        "customer_profile:avg_passenger_count",
        "customer_profile:lifetime_trip_count",
    )

    with orders_context, driver_context, customer_context, TemporaryDirectory() as temp_dir:
        driver_source = RedshiftSource(
            table=driver_table_name,
            event_timestamp_column="event_timestamp",
            created_timestamp_column="created",
        )
        driver_fv = create_driver_hourly_stats_feature_view(driver_source)

        customer_source = RedshiftSource(
            table=customer_table_name,
            event_timestamp_column="event_timestamp",
            created_timestamp_column="created",
        )
        customer_fv = create_customer_daily_profile_feature_view(customer_source)

        driver = Entity(
            name="driver", join_key="driver_id", value_type=ValueType.INT64
        )
        customer = Entity(name="customer_id", value_type=ValueType.INT64)

        registry_path = os.path.join(temp_dir, "registry.db")
        if provider_type == "local":
            repo_config = RepoConfig(
                registry=registry_path,
                project="default",
                provider="local",
                online_store=SqliteOnlineStoreConfig(
                    path=os.path.join(temp_dir, "online_store.db"),
                ),
                offline_store=offline_store,
            )
        elif provider_type == "aws":
            repo_config = RepoConfig(
                registry=registry_path,
                project="".join(
                    random.choices(string.ascii_uppercase + string.digits, k=10)
                ),
                provider="aws",
                online_store=DynamoDBOnlineStoreConfig(region=aws_region),
                offline_store=offline_store,
            )
        else:
            raise Exception("Invalid provider used as part of test configuration")
        store = FeatureStore(config=repo_config)

        store.apply([driver, customer, driver_fv, customer_fv])

        try:
            if DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL in orders_df.columns:
                event_timestamp = DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL
            else:
                event_timestamp = "e_ts"
            sort_keys = [event_timestamp, "order_id", "driver_id", "customer_id"]

            def _ordered(frame):
                # Row order is normalised before every frame comparison.
                return frame.sort_values(by=sort_keys).reset_index(drop=True)

            expected_df = get_expected_training_df(
                customer_df,
                customer_fv,
                driver_df,
                driver_fv,
                orders_df,
                event_timestamp,
                full_feature_names,
            )

            # --- Entities supplied as a SQL query ---
            job_from_sql = store.get_historical_features(
                entity_df=entity_df_query,
                features=list(feature_refs),
                full_feature_names=full_feature_names,
            )

            start_time = datetime.utcnow()
            actual_df_from_sql_entities = job_from_sql.to_df()
            elapsed = datetime.utcnow() - start_time
            with capsys.disabled():
                print(f"\nTime to execute job_from_sql.to_df() = '{elapsed}'")

            assert sorted(expected_df.columns) == sorted(
                actual_df_from_sql_entities.columns
            )
            assert_frame_equal(
                _ordered(expected_df),
                _ordered(actual_df_from_sql_entities[expected_df.columns]),
                check_dtype=False,
            )

            table_from_sql_entities = job_from_sql.to_arrow()
            assert_frame_equal(
                _ordered(actual_df_from_sql_entities),
                _ordered(table_from_sql_entities.to_pandas()),
            )

            if infer_event_timestamp_col:
                timestamp_column = "e_ts"
            else:
                timestamp_column = DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL

            entity_df_query_with_invalid_join_key = (
                f"select order_id, driver_id, customer_id as customer, "
                f"order_is_success, {timestamp_column} FROM {table_name}"
            )
            # Rename the join key; this should now raise an error.
            invalid_sql_job = store.get_historical_features(
                entity_df=entity_df_query_with_invalid_join_key,
                features=list(feature_refs),
            )
            assertpy.assert_that(invalid_sql_job.to_df).raises(
                errors.FeastEntityDFMissingColumnsError
            ).when_called_with()

            # --- Entities supplied as a DataFrame ---
            job_from_df = store.get_historical_features(
                entity_df=orders_df,
                features=list(feature_refs),
                full_feature_names=full_feature_names,
            )

            # Rename the join key; this should now raise an error.
            orders_df_with_invalid_join_key = orders_df.rename(
                {"customer_id": "customer"}, axis="columns"
            )
            invalid_df_job = store.get_historical_features(
                entity_df=orders_df_with_invalid_join_key,
                features=list(feature_refs),
            )
            assertpy.assert_that(invalid_df_job.to_df).raises(
                errors.FeastEntityDFMissingColumnsError
            ).when_called_with()

            start_time = datetime.utcnow()
            actual_df_from_df_entities = job_from_df.to_df()
            elapsed = datetime.utcnow() - start_time
            with capsys.disabled():
                print(f"Time to execute job_from_df.to_df() = '{elapsed}'\n")

            assert sorted(expected_df.columns) == sorted(
                actual_df_from_df_entities.columns
            )
            assert_frame_equal(
                _ordered(expected_df),
                _ordered(actual_df_from_df_entities[expected_df.columns]),
                check_dtype=False,
            )

            table_from_df_entities = job_from_df.to_arrow()
            assert_frame_equal(
                _ordered(actual_df_from_df_entities),
                _ordered(table_from_df_entities.to_pandas()),
            )
        finally:
            store.teardown()
def test_historical_features_from_bigquery_sources(
    provider_type, infer_event_timestamp_col, capsys, full_feature_names
):
    """Historical retrieval from BigQuery, via a SQL entity query and a DataFrame.

    Orders, driver stats and customer profiles are staged into a uniquely
    named BigQuery dataset. The test then checks that:

    * retrieval with a SQL entity query matches ``get_expected_training_df``
      and that ``to_arrow()`` agrees with ``to_df()``;
    * renaming the ``customer_id`` join key (in SQL and in the DataFrame)
      raises ``FeastEntityDFMissingColumnsError``;
    * the generated query references the dataset configured on the offline
      store (``foo`` for ``gcp_custom_offline_config``);
    * retrieval with a DataFrame entity matches the same expectation.

    ``provider_type`` must be ``"local"``, ``"gcp"`` or
    ``"gcp_custom_offline_config"``; anything else raises.
    """
    start_date = datetime.now().replace(microsecond=0, second=0, minute=0)
    (
        customer_entities,
        driver_entities,
        end_date,
        orders_df,
        start_date,
    ) = generate_entities(start_date, infer_event_timestamp_col)

    bigquery_dataset = (
        f"test_hist_retrieval_{int(time.time_ns())}_{random.randint(1000, 9999)}"
    )

    # The same feature references are requested by every retrieval below.
    feature_refs = (
        "driver_stats:conv_rate",
        "driver_stats:avg_daily_trips",
        "customer_profile:current_balance",
        "customer_profile:avg_passenger_count",
        "customer_profile:lifetime_trip_count",
    )

    with BigQueryDataSet(bigquery_dataset), TemporaryDirectory() as temp_dir:
        gcp_project = bigquery.Client().project

        # Orders Query
        table_id = f"{bigquery_dataset}.orders"
        stage_orders_bigquery(orders_df, table_id)
        entity_df_query = f"SELECT * FROM {gcp_project}.{table_id}"

        # Driver Feature View
        driver_df = driver_data.create_driver_hourly_stats_df(
            driver_entities, start_date, end_date
        )
        driver_table_id = f"{gcp_project}.{bigquery_dataset}.driver_hourly"
        stage_driver_hourly_stats_bigquery_source(driver_df, driver_table_id)
        driver_source = BigQuerySource(
            table_ref=driver_table_id,
            event_timestamp_column="event_timestamp",
            created_timestamp_column="created",
        )
        driver_fv = create_driver_hourly_stats_feature_view(driver_source)

        # Customer Feature View
        customer_df = driver_data.create_customer_daily_profile_df(
            customer_entities, start_date, end_date
        )
        customer_table_id = f"{gcp_project}.{bigquery_dataset}.customer_profile"
        stage_customer_daily_profile_bigquery_source(customer_df, customer_table_id)
        customer_source = BigQuerySource(
            table_ref=customer_table_id,
            event_timestamp_column="event_timestamp",
            created_timestamp_column="created",
        )
        customer_fv = create_customer_daily_profile_feature_view(customer_source)

        driver = Entity(
            name="driver", join_key="driver_id", value_type=ValueType.INT64
        )
        customer = Entity(name="customer_id", value_type=ValueType.INT64)

        if provider_type == "local":
            store = FeatureStore(
                config=RepoConfig(
                    registry=os.path.join(temp_dir, "registry.db"),
                    project="default",
                    provider="local",
                    online_store=SqliteOnlineStoreConfig(
                        path=os.path.join(temp_dir, "online_store.db"),
                    ),
                    offline_store=BigQueryOfflineStoreConfig(
                        type="bigquery", dataset=bigquery_dataset
                    ),
                )
            )
        elif provider_type == "gcp":
            store = FeatureStore(
                config=RepoConfig(
                    registry=os.path.join(temp_dir, "registry.db"),
                    project="".join(
                        random.choices(string.ascii_uppercase + string.digits, k=10)
                    ),
                    provider="gcp",
                    offline_store=BigQueryOfflineStoreConfig(
                        type="bigquery", dataset=bigquery_dataset
                    ),
                )
            )
        elif provider_type == "gcp_custom_offline_config":
            store = FeatureStore(
                config=RepoConfig(
                    registry=os.path.join(temp_dir, "registry.db"),
                    project="".join(
                        random.choices(string.ascii_uppercase + string.digits, k=10)
                    ),
                    provider="gcp",
                    offline_store=BigQueryOfflineStoreConfig(
                        type="bigquery", dataset="foo"
                    ),
                )
            )
        else:
            raise Exception("Invalid provider used as part of test configuration")

        store.apply([driver, customer, driver_fv, customer_fv])

        try:
            event_timestamp = (
                DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL
                if DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL in orders_df.columns
                else "e_ts"
            )
            sort_keys = [event_timestamp, "order_id", "driver_id", "customer_id"]

            def _ordered(frame):
                # Row order is normalised before every frame comparison.
                return frame.sort_values(by=sort_keys).reset_index(drop=True)

            expected_df = get_expected_training_df(
                customer_df,
                customer_fv,
                driver_df,
                driver_fv,
                orders_df,
                event_timestamp,
                full_feature_names,
            )

            # --- Entities supplied as a SQL query ---
            job_from_sql = store.get_historical_features(
                entity_df=entity_df_query,
                features=list(feature_refs),
                full_feature_names=full_feature_names,
            )

            start_time = datetime.utcnow()
            actual_df_from_sql_entities = job_from_sql.to_df()
            end_time = datetime.utcnow()
            with capsys.disabled():
                print(
                    f"\nTime to execute job_from_sql.to_df() = '{(end_time - start_time)}'"
                )

            assert sorted(expected_df.columns) == sorted(
                actual_df_from_sql_entities.columns
            )
            assert_frame_equal(
                _ordered(expected_df),
                _ordered(actual_df_from_sql_entities[expected_df.columns]),
                check_dtype=False,
            )

            # to_df() and to_arrow() are materialised separately, so compare
            # them independent of row order (as the Redshift test does).
            table_from_sql_entities = job_from_sql.to_arrow()
            assert_frame_equal(
                _ordered(actual_df_from_sql_entities),
                _ordered(table_from_sql_entities.to_pandas()),
            )

            timestamp_column = (
                "e_ts"
                if infer_event_timestamp_col
                else DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL
            )

            entity_df_query_with_invalid_join_key = (
                f"select order_id, driver_id, customer_id as customer, "
                f"order_is_success, {timestamp_column}, FROM {gcp_project}.{table_id}"
            )
            # Rename the join key; this should now raise an error.
            assertpy.assert_that(store.get_historical_features).raises(
                errors.FeastEntityDFMissingColumnsError
            ).when_called_with(
                entity_df=entity_df_query_with_invalid_join_key,
                features=list(feature_refs),
            )

            # --- Entities supplied as a DataFrame ---
            job_from_df = store.get_historical_features(
                entity_df=orders_df,
                features=list(feature_refs),
                full_feature_names=full_feature_names,
            )

            # Rename the join key; this should now raise an error.
            orders_df_with_invalid_join_key = orders_df.rename(
                {"customer_id": "customer"}, axis="columns"
            )
            assertpy.assert_that(store.get_historical_features).raises(
                errors.FeastEntityDFMissingColumnsError
            ).when_called_with(
                entity_df=orders_df_with_invalid_join_key,
                features=list(feature_refs),
            )

            # Make sure that custom dataset name is being used from the offline_store config
            if provider_type == "gcp_custom_offline_config":
                assertpy.assert_that(job_from_df.query).contains("foo.feast_entity_df")
            else:
                assertpy.assert_that(job_from_df.query).contains(
                    f"{bigquery_dataset}.feast_entity_df"
                )

            start_time = datetime.utcnow()
            actual_df_from_df_entities = job_from_df.to_df()
            end_time = datetime.utcnow()
            with capsys.disabled():
                print(
                    f"Time to execute job_from_df.to_df() = '{(end_time - start_time)}'\n"
                )

            assert sorted(expected_df.columns) == sorted(
                actual_df_from_df_entities.columns
            )
            assert_frame_equal(
                _ordered(expected_df),
                _ordered(actual_df_from_df_entities[expected_df.columns]),
                check_dtype=False,
            )

            # Same order-independent comparison for the DataFrame-entity job.
            table_from_df_entities = job_from_df.to_arrow()
            assert_frame_equal(
                _ordered(actual_df_from_df_entities),
                _ordered(table_from_df_entities.to_pandas()),
            )
        finally:
            store.teardown()
def test_historical_features_from_parquet_sources(
    infer_event_timestamp_col, full_feature_names
):
    """Historical retrieval from parquet sources, by feature refs and by service.

    Driver and customer frames are staged as parquet sources in a temporary
    directory and applied to a local-provider store together with a feature
    service covering the same five features. The test checks that retrieval
    by explicit feature references and retrieval through the feature service
    both equal ``get_expected_training_df`` once rows are sorted and columns
    are put in alphabetical order.
    """
    start_date = datetime.now().replace(microsecond=0, second=0, minute=0)
    (
        customer_entities,
        driver_entities,
        end_date,
        orders_df,
        start_date,
    ) = generate_entities(start_date, infer_event_timestamp_col)

    with TemporaryDirectory() as temp_dir:
        driver_df = driver_data.create_driver_hourly_stats_df(
            driver_entities, start_date, end_date
        )
        driver_source = stage_driver_hourly_stats_parquet_source(temp_dir, driver_df)
        driver_fv = create_driver_hourly_stats_feature_view(driver_source)

        customer_df = driver_data.create_customer_daily_profile_df(
            customer_entities, start_date, end_date
        )
        customer_source = stage_customer_daily_profile_parquet_source(
            temp_dir, customer_df
        )
        customer_fv = create_customer_daily_profile_feature_view(customer_source)

        customer_fs = feature_service(
            "customer_feature_service",
            [
                customer_fv[
                    ["current_balance", "avg_passenger_count", "lifetime_trip_count"]
                ],
                driver_fv[["conv_rate", "avg_daily_trips"]],
            ],
        )
        print(f"Customer fs features: {customer_fs.features}")

        driver = Entity(
            name="driver", join_key="driver_id", value_type=ValueType.INT64
        )
        customer = Entity(name="customer_id", value_type=ValueType.INT64)

        store = FeatureStore(
            config=RepoConfig(
                registry=os.path.join(temp_dir, "registry.db"),
                project="default",
                provider="local",
                online_store=SqliteOnlineStoreConfig(
                    path=os.path.join(temp_dir, "online_store.db")
                ),
            )
        )

        store.apply([driver, customer, driver_fv, customer_fv, customer_fs])

        # Tear the store down even when an assertion below fails.
        try:
            job = store.get_historical_features(
                entity_df=orders_df,
                features=[
                    "driver_stats:conv_rate",
                    "driver_stats:avg_daily_trips",
                    "customer_profile:current_balance",
                    "customer_profile:avg_passenger_count",
                    "customer_profile:lifetime_trip_count",
                ],
                full_feature_names=full_feature_names,
            )

            actual_df = job.to_df()
            event_timestamp = (
                DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL
                if DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL in orders_df.columns
                else "e_ts"
            )
            sort_keys = [event_timestamp, "order_id", "driver_id", "customer_id"]

            expected_df = get_expected_training_df(
                customer_df,
                customer_fv,
                driver_df,
                driver_fv,
                orders_df,
                event_timestamp,
                full_feature_names=full_feature_names,
            )
            # sort_values/reset_index return new frames, so the result must be
            # assigned for the expected frame to actually be normalised.
            expected_df = expected_df.sort_values(by=sort_keys).reset_index(drop=True)
            expected_df = expected_df.reindex(sorted(expected_df.columns), axis=1)

            actual_df = actual_df.sort_values(by=sort_keys).reset_index(drop=True)
            actual_df = actual_df.reindex(sorted(actual_df.columns), axis=1)

            assert_frame_equal(
                expected_df,
                actual_df,
            )

            # The feature service must return the same data as the explicit refs.
            feature_service_job = store.get_historical_features(
                entity_df=orders_df,
                features=customer_fs,
                full_feature_names=full_feature_names,
            )
            feature_service_df = feature_service_job.to_df()
            feature_service_df = feature_service_df.sort_values(
                by=sort_keys
            ).reset_index(drop=True)
            feature_service_df = feature_service_df.reindex(
                sorted(feature_service_df.columns), axis=1
            )

            assert_frame_equal(expected_df, feature_service_df)
        finally:
            store.teardown()