def _upload_entity_df(
    entity_df: Union[pd.DataFrame, str],
    redshift_client,
    config: RepoConfig,
    s3_resource,
    table_name: str,
):
    if isinstance(entity_df, pd.DataFrame):
        # If the entity_df is a pandas dataframe, upload it to Redshift
        aws_utils.upload_df_to_redshift(
            redshift_client,
            config.offline_store.cluster_id,
            config.offline_store.database,
            config.offline_store.user,
            s3_resource,
            f"{config.offline_store.s3_staging_location}/entity_df/{table_name}.parquet",
            config.offline_store.iam_role,
            table_name,
            entity_df,
        )
    elif isinstance(entity_df, str):
        # If the entity_df is a string (SQL query), create a Redshift table out of it
        aws_utils.execute_redshift_statement(
            redshift_client,
            config.offline_store.cluster_id,
            config.offline_store.database,
            config.offline_store.user,
            f"CREATE TABLE {table_name} AS ({entity_df})",
        )
    else:
        raise InvalidEntityType(type(entity_df))
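
# A hedged usage sketch for _upload_entity_df. The RepoConfig values below
# (cluster, bucket, IAM role) are illustrative placeholders, not a working
# configuration; the RepoConfig construction mirrors the test fixture further down.
def _example_upload_entity_df():
    import pandas as pd

    config = RepoConfig(
        project="example",
        registry="registry.db",
        provider="local",
        offline_store=RedshiftOfflineStoreConfig(
            cluster_id="my-cluster",  # placeholder
            region="us-west-2",
            user="admin",
            database="feast",
            s3_staging_location="s3://my-bucket/staging",  # placeholder
            iam_role="arn:aws:iam::123456789012:role/redshift-s3",  # placeholder
        ),
    )
    redshift_client = aws_utils.get_redshift_data_client("us-west-2")
    s3_resource = aws_utils.get_s3_resource("us-west-2")

    # DataFrame input: staged to S3 as Parquet, then loaded into Redshift.
    entity_df = pd.DataFrame(
        {"driver_id": [1001], "event_timestamp": [pd.Timestamp.now(tz="UTC")]}
    )
    _upload_entity_df(entity_df, redshift_client, config, s3_resource, "entity_df_tmp")

    # SQL input: materialized server-side via CREATE TABLE ... AS.
    _upload_entity_df(
        "SELECT * FROM drivers", redshift_client, config, s3_resource, "entity_df_tmp_2"
    )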
def to_redshift(self, table_name: str) -> None:
    """
    Save dataset as a new Redshift table
    """
    if self.on_demand_feature_views is not None:
        # On demand feature view transformations run in pandas, so materialize
        # the transformed dataframe locally and upload it instead of templating SQL.
        transformed_df = self.to_df()
        aws_utils.upload_df_to_redshift(
            self._redshift_client,
            self._config.offline_store.cluster_id,
            self._config.offline_store.database,
            self._config.offline_store.user,
            self._s3_resource,
            f"{self._config.offline_store.s3_staging_location}/features_df/{table_name}.parquet",
            self._config.offline_store.iam_role,
            table_name,
            transformed_df,
        )
        return

    with self._query_generator() as query:
        query = f'CREATE TABLE "{table_name}" AS ({query});\n'
        if self._drop_columns is not None:
            for column in self._drop_columns:
                query += f"ALTER TABLE {table_name} DROP COLUMN {column};\n"

        aws_utils.execute_redshift_statement(
            self._redshift_client,
            self._config.offline_store.cluster_id,
            self._config.offline_store.database,
            self._config.offline_store.user,
            query,
        )
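
# A hedged sketch of how to_redshift() is typically reached: through the retrieval
# job returned by FeatureStore.get_historical_features(). The repo path, feature
# reference, and entity rows below are assumptions for illustration.
def _example_to_redshift():
    import pandas as pd
    from feast import FeatureStore

    store = FeatureStore(repo_path=".")  # assumes a Redshift-backed feature_store.yaml
    entity_df = pd.DataFrame(
        {
            "driver_id": [1001, 1002],
            "event_timestamp": [pd.Timestamp("2021-09-01", tz="UTC")] * 2,
        }
    )
    job = store.get_historical_features(
        entity_df=entity_df,
        features=["driver_hourly_stats:conv_rate"],  # assumed feature reference
    )
    # Persists the joined dataset server-side; with on demand feature views the
    # transformed dataframe is uploaded instead (see the branch above).
    job.to_redshift(table_name="driver_training_set")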
def bootstrap():
    # bootstrap() is called automatically by init_repo() during `feast init`
    import pathlib
    from datetime import datetime, timedelta

    from feast.driver_test_data import create_driver_hourly_stats_df

    end_date = datetime.now().replace(microsecond=0, second=0, minute=0)
    start_date = end_date - timedelta(days=15)

    driver_entities = [1001, 1002, 1003, 1004, 1005]
    driver_df = create_driver_hourly_stats_df(driver_entities, start_date, end_date)

    aws_region = click.prompt("AWS Region (e.g. us-west-2)")
    cluster_id = click.prompt("Redshift Cluster ID")
    database = click.prompt("Redshift Database Name")
    user = click.prompt("Redshift User Name")
    s3_staging_location = click.prompt("Redshift S3 Staging Location (s3://*)")
    iam_role = click.prompt("Redshift IAM Role for S3 (arn:aws:iam::*:role/*)")

    if click.confirm(
        "Should I upload example data to Redshift (overwriting 'feast_driver_hourly_stats' table)?",
        default=True,
    ):
        client = aws_utils.get_redshift_data_client(aws_region)
        s3 = aws_utils.get_s3_resource(aws_region)

        aws_utils.execute_redshift_statement(
            client,
            cluster_id,
            database,
            user,
            "DROP TABLE IF EXISTS feast_driver_hourly_stats",
        )

        aws_utils.upload_df_to_redshift(
            client,
            cluster_id,
            database,
            user,
            s3,
            f"{s3_staging_location}/data/feast_driver_hourly_stats.parquet",
            iam_role,
            "feast_driver_hourly_stats",
            driver_df,
        )

    repo_path = pathlib.Path(__file__).parent.absolute()
    config_file = repo_path / "feature_store.yaml"

    replace_str_in_file(config_file, "%AWS_REGION%", aws_region)
    replace_str_in_file(config_file, "%REDSHIFT_CLUSTER_ID%", cluster_id)
    replace_str_in_file(config_file, "%REDSHIFT_DATABASE%", database)
    replace_str_in_file(config_file, "%REDSHIFT_USER%", user)
    replace_str_in_file(config_file, "%REDSHIFT_S3_STAGING_LOCATION%", s3_staging_location)
    replace_str_in_file(config_file, "%REDSHIFT_IAM_ROLE%", iam_role)
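
# replace_str_in_file() is called above but not defined in this snippet; a minimal
# sketch of the in-place substitution it is assumed to perform (the real Feast
# helper may differ):
def replace_str_in_file(file_path, match_str, sub_str):
    # Read the whole file, substitute every occurrence, and write it back.
    with open(file_path, "r") as f:
        contents = f.read()
    contents = contents.replace(match_str, sub_str)
    with open(file_path, "w") as f:
        f.write(contents)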
def teardown(self):
    # Drop every table registered during the run; IF EXISTS keeps this idempotent.
    for table in self.tables:
        aws_utils.execute_redshift_statement(
            self.client,
            self.offline_store_config.cluster_id,
            self.offline_store_config.database,
            self.offline_store_config.user,
            f"DROP TABLE IF EXISTS {table}",
        )
def query_generator() -> Iterator[str]:
    table_name = offline_utils.get_temp_entity_table_name()

    # Upload the entity dataframe (or materialize the entity SQL) into a temporary
    # Redshift table and capture its schema
    entity_schema = _upload_entity_df_and_get_entity_schema(
        entity_df, redshift_client, config, s3_resource, table_name
    )

    # Infer the event timestamp column and validate that the entity_df contains
    # all the join keys the requested feature views expect
    entity_df_event_timestamp_col = offline_utils.infer_event_timestamp_from_entity_df(
        entity_schema
    )

    expected_join_keys = offline_utils.get_expected_join_keys(
        project, feature_views, registry
    )

    offline_utils.assert_expected_columns_in_entity_df(
        entity_schema, expected_join_keys, entity_df_event_timestamp_col
    )

    entity_df_event_timestamp_range = _get_entity_df_event_timestamp_range(
        entity_df,
        entity_df_event_timestamp_col,
        redshift_client,
        config,
        table_name,
    )

    # Build a query context containing all information required to template the Redshift SQL query
    query_context = offline_utils.get_feature_view_query_context(
        feature_refs,
        feature_views,
        registry,
        project,
        entity_df_event_timestamp_range,
    )

    # Generate the Redshift SQL query from the query context
    query = offline_utils.build_point_in_time_query(
        query_context,
        left_table_query_string=table_name,
        entity_df_event_timestamp_col=entity_df_event_timestamp_col,
        entity_df_columns=entity_schema.keys(),
        query_template=MULTIPLE_FEATURE_VIEW_POINT_IN_TIME_JOIN,
        full_feature_names=full_feature_names,
    )

    try:
        yield query
    finally:
        # Always clean up the uploaded Redshift table
        aws_utils.execute_redshift_statement(
            redshift_client,
            config.offline_store.cluster_id,
            config.offline_store.database,
            config.offline_store.user,
            f"DROP TABLE IF EXISTS {table_name}",
        )
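
# query_generator() is written to be wrapped in contextlib.contextmanager, so the
# temporary table outlives the `with` body and is dropped afterwards. Below is a
# self-contained sketch of the same yield-then-cleanup pattern (names here are
# illustrative, not from the Feast codebase):
import contextlib

@contextlib.contextmanager
def _temp_table_example(execute, table_name):
    try:
        # Hand the caller the table name while the table exists.
        yield table_name
    finally:
        # Runs even if the caller raises, mirroring the finally block above.
        execute(f"DROP TABLE IF EXISTS {table_name}")

# Usage:
#     with _temp_table_example(run_sql, "entity_df_tmp") as t:
#         run_sql(f"SELECT COUNT(*) FROM {t}")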
def to_redshift(self, table_name: str) -> None:
    """
    Save dataset as a new Redshift table
    """
    with self._query_generator() as query:
        query = f'CREATE TABLE "{table_name}" AS ({query});\n'
        if self._drop_columns is not None:
            for column in self._drop_columns:
                query += f"ALTER TABLE {table_name} DROP COLUMN {column};\n"

        aws_utils.execute_redshift_statement(
            self._redshift_client,
            self._config.offline_store.cluster_id,
            self._config.offline_store.database,
            self._config.offline_store.user,
            query,
        )
def _upload_entity_df_and_get_entity_schema(
    entity_df: Union[pd.DataFrame, str],
    redshift_client,
    config: RepoConfig,
    s3_resource,
    table_name: str,
) -> Dict[str, np.dtype]:
    if isinstance(entity_df, pd.DataFrame):
        # If the entity_df is a pandas dataframe, upload it to Redshift
        # and construct the schema from the original entity_df dataframe
        aws_utils.upload_df_to_redshift(
            redshift_client,
            config.offline_store.cluster_id,
            config.offline_store.database,
            config.offline_store.user,
            s3_resource,
            f"{config.offline_store.s3_staging_location}/entity_df/{table_name}.parquet",
            config.offline_store.iam_role,
            table_name,
            entity_df,
        )
        return dict(zip(entity_df.columns, entity_df.dtypes))
    elif isinstance(entity_df, str):
        # If the entity_df is a string (SQL query), create a Redshift table out of it,
        # fetch a single row (LIMIT 1) as a pandas dataframe, and derive the schema from that
        aws_utils.execute_redshift_statement(
            redshift_client,
            config.offline_store.cluster_id,
            config.offline_store.database,
            config.offline_store.user,
            f"CREATE TABLE {table_name} AS ({entity_df})",
        )
        limited_entity_df = RedshiftRetrievalJob(
            f"SELECT * FROM {table_name} LIMIT 1",
            redshift_client,
            s3_resource,
            config,
            full_feature_names=False,
            on_demand_feature_views=None,
        ).to_df()
        return dict(zip(limited_entity_df.columns, limited_entity_df.dtypes))
    else:
        raise InvalidEntityType(type(entity_df))
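
# A quick local illustration (no Redshift needed) of the schema the dataframe
# branch above returns: a dict mapping column names to numpy/pandas dtypes.
def _example_entity_schema():
    import pandas as pd

    df = pd.DataFrame(
        {
            "driver_id": [1001],
            "event_timestamp": pd.to_datetime(["2021-09-01"], utc=True),
        }
    )
    schema = dict(zip(df.columns, df.dtypes))
    # {'driver_id': dtype('int64'), 'event_timestamp': datetime64[ns, UTC]}
    return schema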
def get_table_column_names_and_types(
    self, config: RepoConfig
) -> Iterable[Tuple[str, str]]:
    """
    Returns the column names and Redshift types for this Redshift source.

    Args:
        config: A RepoConfig describing the feature repo
    """
    from botocore.exceptions import ClientError

    from feast.infra.offline_stores.redshift import RedshiftOfflineStoreConfig
    from feast.infra.utils import aws_utils

    assert isinstance(config.offline_store, RedshiftOfflineStoreConfig)

    client = aws_utils.get_redshift_data_client(config.offline_store.region)
    if self.table is not None:
        try:
            table = client.describe_table(
                ClusterIdentifier=config.offline_store.cluster_id,
                Database=(
                    self.database if self.database else config.offline_store.database
                ),
                DbUser=config.offline_store.user,
                Table=self.table,
                Schema=self.schema,
            )
        except ClientError as e:
            if e.response["Error"]["Code"] == "ValidationException":
                raise RedshiftCredentialsError() from e
            raise

        # The API returns valid JSON with an empty column list when the table doesn't exist
        if len(table["ColumnList"]) == 0:
            raise DataSourceNotFoundException(self.table)

        columns = table["ColumnList"]
    else:
        statement_id = aws_utils.execute_redshift_statement(
            client,
            config.offline_store.cluster_id,
            self.database if self.database else config.offline_store.database,
            config.offline_store.user,
            f"SELECT * FROM ({self.query}) LIMIT 1",
        )
        columns = aws_utils.get_redshift_statement_result(client, statement_id)[
            "ColumnMetadata"
        ]

    return [(column["name"], column["typeName"].upper()) for column in columns]
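
# A hedged usage sketch for get_table_column_names_and_types(); the table name is
# a placeholder, `config` must carry a RedshiftOfflineStoreConfig (e.g. the one
# built in the fixture below), and the import path matches recent Feast releases
# but may vary by version.
def _example_inspect_source_schema(config: RepoConfig):
    from feast import RedshiftSource

    source = RedshiftSource(
        table="feast_driver_hourly_stats",  # placeholder table
        event_timestamp_column="event_timestamp",
    )
    # Each pair is (column_name, REDSHIFT_TYPE_NAME), e.g. ("driver_id", "INT8").
    for name, type_name in source.get_table_column_names_and_types(config):
        print(name, type_name)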
def _get_entity_df_event_timestamp_range(
    entity_df: Union[pd.DataFrame, str],
    entity_df_event_timestamp_col: str,
    redshift_client,
    config: RepoConfig,
    table_name: str,
) -> Tuple[datetime, datetime]:
    if isinstance(entity_df, pd.DataFrame):
        # If the entity_df is a pandas dataframe, compute the range in memory,
        # coercing string timestamps to UTC datetimes first
        entity_df_event_timestamp = entity_df.loc[
            :, entity_df_event_timestamp_col
        ].infer_objects()
        if pd.api.types.is_string_dtype(entity_df_event_timestamp):
            entity_df_event_timestamp = pd.to_datetime(
                entity_df_event_timestamp, utc=True
            )
        entity_df_event_timestamp_range = (
            entity_df_event_timestamp.min(),
            entity_df_event_timestamp.max(),
        )
    elif isinstance(entity_df, str):
        # If the entity_df is a string (SQL query), determine the range
        # from the already-uploaded Redshift table
        statement_id = aws_utils.execute_redshift_statement(
            redshift_client,
            config.offline_store.cluster_id,
            config.offline_store.database,
            config.offline_store.user,
            f"SELECT MIN({entity_df_event_timestamp_col}) AS min, MAX({entity_df_event_timestamp_col}) AS max FROM {table_name}",
        )
        res = aws_utils.get_redshift_statement_result(redshift_client, statement_id)[
            "Records"
        ][0]
        entity_df_event_timestamp_range = (
            parser.parse(res[0]["stringValue"]),
            parser.parse(res[1]["stringValue"]),
        )
    else:
        raise InvalidEntityType(type(entity_df))

    return entity_df_event_timestamp_range
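
# A runnable local illustration of the dataframe branch above: string timestamps
# are coerced to UTC datetimes before the min/max range is taken.
def _example_timestamp_range():
    import pandas as pd

    df = pd.DataFrame({"event_timestamp": ["2021-09-01 00:00", "2021-09-03 12:00"]})
    ts = df.loc[:, "event_timestamp"].infer_objects()
    if pd.api.types.is_string_dtype(ts):
        ts = pd.to_datetime(ts, utc=True)
    return ts.min(), ts.max()  # the (start, end) range handed to query templating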
def prep_redshift_fs_and_fv(
    source_type: str,
) -> Iterator[Tuple[FeatureStore, FeatureView]]:
    client = aws_utils.get_redshift_data_client("us-west-2")
    s3 = aws_utils.get_s3_resource("us-west-2")

    df = create_dataset()

    table_name = f"test_ingestion_{source_type}_correctness_{int(time.time_ns())}_{random.randint(1000, 9999)}"

    offline_store = RedshiftOfflineStoreConfig(
        cluster_id="feast-integration-tests",
        region="us-west-2",
        user="******",
        database="feast",
        s3_staging_location="s3://feast-integration-tests/redshift/tests/ingestion",
        iam_role="arn:aws:iam::402087665549:role/redshift_s3_access_role",
    )

    aws_utils.upload_df_to_redshift(
        client,
        offline_store.cluster_id,
        offline_store.database,
        offline_store.user,
        s3,
        f"{offline_store.s3_staging_location}/copy/{table_name}.parquet",
        offline_store.iam_role,
        table_name,
        df,
    )

    # Depending on source_type, reference the uploaded data either directly as a
    # table or indirectly through a query
    redshift_source = RedshiftSource(
        table=table_name if source_type == "table" else None,
        query=f"SELECT * FROM {table_name}" if source_type == "query" else None,
        event_timestamp_column="ts",
        created_timestamp_column="created_ts",
        date_partition_column="",
        field_mapping={"ts_1": "ts", "id": "driver_id"},
    )

    fv = driver_feature_view(redshift_source)
    e = Entity(
        name="driver",
        description="id for driver",
        join_key="driver_id",
        value_type=ValueType.INT32,
    )

    with tempfile.TemporaryDirectory() as repo_dir_name, tempfile.TemporaryDirectory() as data_dir_name:
        config = RepoConfig(
            registry=str(Path(repo_dir_name) / "registry.db"),
            project=f"test_bq_correctness_{str(uuid.uuid4()).replace('-', '')}",
            provider="local",
            online_store=SqliteOnlineStoreConfig(
                path=str(Path(data_dir_name) / "online_store.db")
            ),
            offline_store=offline_store,
        )
        fs = FeatureStore(config=config)
        fs.apply([fv, e])

        yield fs, fv

        fs.teardown()

    # Clean up the uploaded Redshift table
    aws_utils.execute_redshift_statement(
        client,
        offline_store.cluster_id,
        offline_store.database,
        offline_store.user,
        f"DROP TABLE {table_name}",
    )
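
# prep_redshift_fs_and_fv() is generator-shaped so it can back a pytest fixture.
# A hedged sketch of how a test might consume it (fixture and test names below
# are illustrative, not from the Feast test suite):
import pytest

@pytest.fixture(params=["table", "query"])
def redshift_fs_and_fv(request):
    yield from prep_redshift_fs_and_fv(request.param)

def test_feature_view_applied(redshift_fs_and_fv):
    fs, fv = redshift_fs_and_fv
    assert fv.name in {view.name for view in fs.list_feature_views()}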