def create_data_source(
    self,
    df: pd.DataFrame,
    destination_name: str,
    suffix: Optional[str] = None,
    event_timestamp_column="ts",
    created_timestamp_column="created_ts",
    field_mapping: Dict[str, str] = None,
) -> DataSource:
    destination_name = self.get_prefixed_table_name(destination_name)

    aws_utils.upload_df_to_redshift(
        self.client,
        self.offline_store_config.cluster_id,
        self.offline_store_config.database,
        self.offline_store_config.user,
        self.s3,
        f"{self.offline_store_config.s3_staging_location}/copy/{destination_name}.parquet",
        self.offline_store_config.iam_role,
        destination_name,
        df,
    )

    self.tables.append(destination_name)

    return RedshiftSource(
        table=destination_name,
        event_timestamp_column=event_timestamp_column,
        created_timestamp_column=created_timestamp_column,
        date_partition_column="",
        field_mapping=field_mapping or {"ts_1": "ts"},
    )
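
# A minimal sketch of the DataFrame shape the helper above expects, assuming the
# default field_mapping ({"ts_1": "ts"}): the staged table carries its event
# timestamp in a "ts_1" column, which the returned RedshiftSource re-maps to "ts".
# Column names other than "ts_1"/"created_ts" and all values are illustrative.
import pandas as pd
from datetime import datetime, timezone

example_df = pd.DataFrame(
    {
        "driver_id": [1001, 1002],
        "conv_rate": [0.25, 0.75],
        "ts_1": [
            datetime(2021, 4, 12, 10, tzinfo=timezone.utc),
            datetime(2021, 4, 12, 11, tzinfo=timezone.utc),
        ],
        "created_ts": [
            datetime(2021, 4, 12, 10, tzinfo=timezone.utc),
            datetime(2021, 4, 12, 11, tzinfo=timezone.utc),
        ],
    }
)
# creator.create_data_source(example_df, "driver_stats") would then stage the frame
# through the S3 staging location into Redshift and return a RedshiftSource for the
# prefixed table ("creator" is an assumed instance of the enclosing helper class).
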
def prep_redshift_fs_and_fv(
    source_type: str,
) -> Iterator[Tuple[FeatureStore, FeatureView]]:
    client = aws_utils.get_redshift_data_client("us-west-2")
    s3 = aws_utils.get_s3_resource("us-west-2")

    df = create_dataset()

    table_name = f"test_ingestion_{source_type}_correctness_{int(time.time_ns())}_{random.randint(1000, 9999)}"

    offline_store = RedshiftOfflineStoreConfig(
        cluster_id="feast-integration-tests",
        region="us-west-2",
        user="******",
        database="feast",
        s3_staging_location="s3://feast-integration-tests/redshift/tests/ingestion",
        iam_role="arn:aws:iam::402087665549:role/redshift_s3_access_role",
    )

    aws_utils.upload_df_to_redshift(
        client,
        offline_store.cluster_id,
        offline_store.database,
        offline_store.user,
        s3,
        f"{offline_store.s3_staging_location}/copy/{table_name}.parquet",
        offline_store.iam_role,
        table_name,
        df,
    )

    redshift_source = RedshiftSource(
        table=table_name if source_type == "table" else None,
        query=f"SELECT * FROM {table_name}" if source_type == "query" else None,
        event_timestamp_column="ts",
        created_timestamp_column="created_ts",
        date_partition_column="",
        field_mapping={"ts_1": "ts", "id": "driver_id"},
    )

    fv = driver_feature_view(redshift_source)
    e = Entity(
        name="driver",
        description="id for driver",
        join_key="driver_id",
        value_type=ValueType.INT32,
    )

    with tempfile.TemporaryDirectory() as repo_dir_name, tempfile.TemporaryDirectory() as data_dir_name:
        config = RepoConfig(
            registry=str(Path(repo_dir_name) / "registry.db"),
            project=f"test_bq_correctness_{str(uuid.uuid4()).replace('-', '')}",
            provider="local",
            online_store=SqliteOnlineStoreConfig(
                path=str(Path(data_dir_name) / "online_store.db")
            ),
            offline_store=offline_store,
        )
        fs = FeatureStore(config=config)
        fs.apply([fv, e])

        yield fs, fv

        fs.teardown()

    # Clean up the uploaded Redshift table
    aws_utils.execute_redshift_statement(
        client,
        offline_store.cluster_id,
        offline_store.database,
        offline_store.user,
        f"DROP TABLE {table_name}",
    )
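
# A sketch of how a test might consume the generator above as a pytest fixture.
# The fixture name and parametrization below are assumptions for illustration,
# not taken from the original test module.
import pytest

@pytest.fixture(params=["table", "query"])
def redshift_fs_and_fv(request):
    # Delegate setup and teardown to the generator: the inner yield hands
    # (FeatureStore, FeatureView) to the test, and resuming the generator after
    # the test runs fs.teardown() and drops the staged Redshift table.
    yield from prep_redshift_fs_and_fv(request.param)
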
name="driver_id", # The join key of an entity describes the storage level field/column on which # features can be looked up. The join key is also used to join feature # tables/views when building feature vectors join_key="driver_id", # The storage level type for an entity value_type=ValueType.INT64, ) # Indicates a data source from which feature values can be retrieved. Sources are queried when building training # datasets or materializing features into an online store. driver_stats_source = RedshiftSource( # The Redshift table where features can be found table="feast_driver_hourly_stats", # The event timestamp is used for point-in-time joins and for ensuring only # features within the TTL are returned event_timestamp_column="event_timestamp", # The (optional) created timestamp is used to ensure there are no duplicate # feature rows in the offline store or when building training datasets created_timestamp_column="created", ) # Feature views are a grouping based on how features are stored in either the # online or offline store. driver_stats_fv = FeatureView( # The unique name of this feature view. Two feature views in a single # project cannot have the same name name="driver_hourly_stats", # The list of entities specifies the keys required for joining or looking # up features from this feature view. The reference provided in this field # correspond to the name of a defined entity (or entities) entities=["driver_id"],
def test_historical_features_from_redshift_sources(
    provider_type, infer_event_timestamp_col, capsys, full_feature_names
):
    client = aws_utils.get_redshift_data_client("us-west-2")
    s3 = aws_utils.get_s3_resource("us-west-2")

    offline_store = RedshiftOfflineStoreConfig(
        cluster_id="feast-integration-tests",
        region="us-west-2",
        user="******",
        database="feast",
        s3_staging_location="s3://feast-integration-tests/redshift/tests/ingestion",
        iam_role="arn:aws:iam::402087665549:role/redshift_s3_access_role",
    )

    start_date = datetime.now().replace(microsecond=0, second=0, minute=0)
    (
        customer_entities,
        driver_entities,
        end_date,
        orders_df,
        start_date,
    ) = generate_entities(start_date, infer_event_timestamp_col)

    redshift_table_prefix = (
        f"test_hist_retrieval_{int(time.time_ns())}_{random.randint(1000, 9999)}"
    )

    # Stage orders_df to Redshift
    table_name = f"{redshift_table_prefix}_orders"
    entity_df_query = f"SELECT * FROM {table_name}"
    orders_context = aws_utils.temporarily_upload_df_to_redshift(
        client,
        offline_store.cluster_id,
        offline_store.database,
        offline_store.user,
        s3,
        f"{offline_store.s3_staging_location}/copy/{table_name}.parquet",
        offline_store.iam_role,
        table_name,
        orders_df,
    )

    # Stage driver_df to Redshift
    driver_df = driver_data.create_driver_hourly_stats_df(
        driver_entities, start_date, end_date
    )
    driver_table_name = f"{redshift_table_prefix}_driver_hourly"
    driver_context = aws_utils.temporarily_upload_df_to_redshift(
        client,
        offline_store.cluster_id,
        offline_store.database,
        offline_store.user,
        s3,
        f"{offline_store.s3_staging_location}/copy/{driver_table_name}.parquet",
        offline_store.iam_role,
        driver_table_name,
        driver_df,
    )

    # Stage customer_df to Redshift
    customer_df = driver_data.create_customer_daily_profile_df(
        customer_entities, start_date, end_date
    )
    customer_table_name = f"{redshift_table_prefix}_customer_profile"
    customer_context = aws_utils.temporarily_upload_df_to_redshift(
        client,
        offline_store.cluster_id,
        offline_store.database,
        offline_store.user,
        s3,
        f"{offline_store.s3_staging_location}/copy/{customer_table_name}.parquet",
        offline_store.iam_role,
        customer_table_name,
        customer_df,
    )

    with orders_context, driver_context, customer_context, TemporaryDirectory() as temp_dir:
        driver_source = RedshiftSource(
            table=driver_table_name,
            event_timestamp_column="event_timestamp",
            created_timestamp_column="created",
        )
        driver_fv = create_driver_hourly_stats_feature_view(driver_source)
        customer_source = RedshiftSource(
            table=customer_table_name,
            event_timestamp_column="event_timestamp",
            created_timestamp_column="created",
        )
        customer_fv = create_customer_daily_profile_feature_view(customer_source)

        driver = Entity(
            name="driver", join_key="driver_id", value_type=ValueType.INT64
        )
        customer = Entity(name="customer_id", value_type=ValueType.INT64)

        if provider_type == "local":
            store = FeatureStore(
                config=RepoConfig(
                    registry=os.path.join(temp_dir, "registry.db"),
                    project="default",
                    provider="local",
                    online_store=SqliteOnlineStoreConfig(
                        path=os.path.join(temp_dir, "online_store.db"),
                    ),
                    offline_store=offline_store,
                )
            )
        elif provider_type == "aws":
            store = FeatureStore(
                config=RepoConfig(
                    registry=os.path.join(temp_dir, "registry.db"),
                    project="".join(
                        random.choices(string.ascii_uppercase + string.digits, k=10)
                    ),
                    provider="aws",
                    online_store=DynamoDBOnlineStoreConfig(region="us-west-2"),
                    offline_store=offline_store,
                )
            )
        else:
            raise Exception("Invalid provider used as part of test configuration")

        store.apply([driver, customer, driver_fv, customer_fv])

        try:
            event_timestamp = (
                DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL
                if DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL in orders_df.columns
                else "e_ts"
            )
            expected_df = get_expected_training_df(
                customer_df,
                customer_fv,
                driver_df,
                driver_fv,
                orders_df,
                event_timestamp,
                full_feature_names,
            )

            job_from_sql = store.get_historical_features(
                entity_df=entity_df_query,
                features=[
                    "driver_stats:conv_rate",
                    "driver_stats:avg_daily_trips",
                    "customer_profile:current_balance",
                    "customer_profile:avg_passenger_count",
                    "customer_profile:lifetime_trip_count",
                ],
                full_feature_names=full_feature_names,
            )

            start_time = datetime.utcnow()
            actual_df_from_sql_entities = job_from_sql.to_df()
            end_time = datetime.utcnow()
            with capsys.disabled():
                print(
                    str(
                        f"\nTime to execute job_from_sql.to_df() = '{(end_time - start_time)}'"
                    )
                )

            assert sorted(expected_df.columns) == sorted(
                actual_df_from_sql_entities.columns
            )
            assert_frame_equal(
                expected_df.sort_values(
                    by=[event_timestamp, "order_id", "driver_id", "customer_id"]
                ).reset_index(drop=True),
                actual_df_from_sql_entities[expected_df.columns]
                .sort_values(
                    by=[event_timestamp, "order_id", "driver_id", "customer_id"]
                )
                .reset_index(drop=True),
                check_dtype=False,
            )

            table_from_sql_entities = job_from_sql.to_arrow()
            assert_frame_equal(
                actual_df_from_sql_entities.sort_values(
                    by=[event_timestamp, "order_id", "driver_id", "customer_id"]
                ).reset_index(drop=True),
                table_from_sql_entities.to_pandas()
                .sort_values(
                    by=[event_timestamp, "order_id", "driver_id", "customer_id"]
                )
                .reset_index(drop=True),
            )

            timestamp_column = (
                "e_ts"
                if infer_event_timestamp_col
                else DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL
            )

            entity_df_query_with_invalid_join_key = (
                f"select order_id, driver_id, customer_id as customer, "
                f"order_is_success, {timestamp_column} FROM {table_name}"
            )
            # Rename the join key; this should now raise an error.
            assertpy.assert_that(
                store.get_historical_features(
                    entity_df=entity_df_query_with_invalid_join_key,
                    features=[
                        "driver_stats:conv_rate",
                        "driver_stats:avg_daily_trips",
                        "customer_profile:current_balance",
                        "customer_profile:avg_passenger_count",
                        "customer_profile:lifetime_trip_count",
                    ],
                ).to_df
            ).raises(errors.FeastEntityDFMissingColumnsError).when_called_with()

            job_from_df = store.get_historical_features(
                entity_df=orders_df,
                features=[
                    "driver_stats:conv_rate",
                    "driver_stats:avg_daily_trips",
                    "customer_profile:current_balance",
                    "customer_profile:avg_passenger_count",
                    "customer_profile:lifetime_trip_count",
                ],
                full_feature_names=full_feature_names,
            )

            # Rename the join key; this should now raise an error.
            orders_df_with_invalid_join_key = orders_df.rename(
                {"customer_id": "customer"}, axis="columns"
            )
            assertpy.assert_that(
                store.get_historical_features(
                    entity_df=orders_df_with_invalid_join_key,
                    features=[
                        "driver_stats:conv_rate",
                        "driver_stats:avg_daily_trips",
                        "customer_profile:current_balance",
                        "customer_profile:avg_passenger_count",
                        "customer_profile:lifetime_trip_count",
                    ],
                ).to_df
            ).raises(errors.FeastEntityDFMissingColumnsError).when_called_with()

            start_time = datetime.utcnow()
            actual_df_from_df_entities = job_from_df.to_df()
            end_time = datetime.utcnow()
            with capsys.disabled():
                print(
                    str(
                        f"Time to execute job_from_df.to_df() = '{(end_time - start_time)}'\n"
                    )
                )

            assert sorted(expected_df.columns) == sorted(
                actual_df_from_df_entities.columns
            )
            assert_frame_equal(
                expected_df.sort_values(
                    by=[event_timestamp, "order_id", "driver_id", "customer_id"]
                ).reset_index(drop=True),
                actual_df_from_df_entities[expected_df.columns]
                .sort_values(
                    by=[event_timestamp, "order_id", "driver_id", "customer_id"]
                )
                .reset_index(drop=True),
                check_dtype=False,
            )

            table_from_df_entities = job_from_df.to_arrow()
            assert_frame_equal(
                actual_df_from_df_entities.sort_values(
                    by=[event_timestamp, "order_id", "driver_id", "customer_id"]
                ).reset_index(drop=True),
                table_from_df_entities.to_pandas()
                .sort_values(
                    by=[event_timestamp, "order_id", "driver_id", "customer_id"]
                )
                .reset_index(drop=True),
            )
        finally:
            store.teardown()
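
# A sketch of the parameter matrix that would drive the test above; the exact
# pytest decorators on the original test are assumed here, not copied from it.
import pytest

@pytest.mark.parametrize("provider_type", ["local", "aws"])
@pytest.mark.parametrize("infer_event_timestamp_col", [False, True])
@pytest.mark.parametrize("full_feature_names", [False, True])
def test_historical_features_from_redshift_sources_matrix(
    provider_type, infer_event_timestamp_col, capsys, full_feature_names
):
    # Thin wrapper that forwards each parameter combination; shown only to
    # illustrate how provider_type selects the SQLite vs. DynamoDB online store
    # and how the boolean flags vary timestamp-column inference and feature naming.
    test_historical_features_from_redshift_sources(
        provider_type, infer_event_timestamp_col, capsys, full_feature_names
    )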