def create_table(table_name: str, schema_name: str, columns: dict, partitions: dict, path: str, session_helper: SessionHelper):
    """ Creates an external table named schema_name.table_name in AWS Redshift.

    If a table by that name already exists in the schema, this is a no-op:
    the function returns without creating anything.

    Args:
        table_name: name of the table to create
        schema_name: name of the schema in Redshift; must be external and already exist
        columns: maps column names to pandas dtypes, excluding partition columns
        partitions: like columns, but ONLY the partition columns
        path: path to the published contract in s3 (excluding partitions)
        session_helper: instance of s3parq.session_helper SessionHelper
    """
    _redshift_name_validator(table_name)
    redshift_columns = _datatype_mapper(columns)
    redshift_partitions = _datatype_mapper(partitions)
    with session_helper.db_session_scope() as scope:
        if_exists_query = f'SELECT EXISTS(SELECT schemaname, tablename FROM SVV_EXTERNAL_TABLES WHERE tablename=\'{table_name}\' AND schemaname=\'{schema_name}\');'
        table_exists = scope.execute(if_exists_query).first()[0]
        if table_exists:
            # Nothing to do — the external table is already registered.
            return
        # Partitioned and unpartitioned tables need slightly different DDL.
        if partitions:
            new_schema_query = (
                f'CREATE EXTERNAL TABLE {schema_name}.{table_name} {redshift_columns} \
                PARTITIONED BY {redshift_partitions} STORED AS PARQUET \
                LOCATION \'{path}\';')
        else:
            new_schema_query = (
                f'CREATE EXTERNAL TABLE {schema_name}.{table_name} {redshift_columns} \
                STORED AS PARQUET \
                LOCATION \'{path}\';')
        logger.info(f'Running query to create table: {new_schema_query}')
        scope.execute(new_schema_query)
def create_partitions(bucket: str, schema: str, table: str, filepath: str, session_helper: SessionHelper) -> None:
    ''' Generates and executes the SQL that registers a file's partitions on a table.

    NOTE(review): the original annotated this as `-> str` and documented a
    returned query string, but the function executes the query via the session
    scope and returns None — annotation and docstring corrected to match.

    Args:
        bucket (str): S3 bucket where data is stored
        schema (str): name of redshift schema (must already exist)
        table (str): name of table in schema. Must have partitions scoped out in `CREATE TABLE ...`
        filepath (str): path to data in S3 that will be queryable by its partitions
        session_helper (SessionHelper): a configured s3parq.session_helper.SessionHelper session

    Returns:
        None

    Example:
        Args:
            bucket = 'MyBucket'
            schema = 'MySchema'
            table = 'MyTable'
            filepath = 'path/to/data/apple=abcd/banana=1234/abcd1234.parquet'
        Executes:
            "ALTER TABLE MySchema.MyTable ADD IF NOT EXISTS PARTITION (apple='abcd' ,banana='1234') LOCATION 's3://MyBucket/path/to/data/apple=abcd';"
    '''
    partitions = _get_partitions_for_spectrum(filepath)
    formatted_partitions = _format_partition_strings_for_sql(partitions)
    path_to_data = _get_partition_location(filepath)
    with session_helper.db_session_scope() as scope:
        partitions_query = f"ALTER TABLE {schema}.{table} \
            ADD IF NOT EXISTS PARTITION ({', '.join(formatted_partitions)}) \
            LOCATION 's3://{bucket}/{path_to_data}';"
        logger.info(f'Running query to create: {partitions_query}')
        scope.execute(partitions_query)
def create_partitions(bucket: str, schema: str, table: str, filepath: str, session_helper: SessionHelper) -> None:
    ''' Executes the SQL that creates partitions on the given table for an individual file

    Args:
        bucket (str): S3 bucket where data is stored
        schema (str): name of redshift schema (must already exist)
        table (str): name of table in schema. Must have partitions scoped out in `CREATE TABLE ...`
        filepath (str): path to data in S3 that will be queryable by its partitions
            NOTE: This is to the single parquet file, including the partitions
        session_helper (SessionHelper): a configured s3parq.session_helper.SessionHelper session

    Returns:
        None

    Example:
        Args:
            bucket = 'MyBucket'
            schema = 'MySchema'
            table = 'MyTable'
            filepath = 'path/to/data/apple=abcd/banana=1234/abcd1234.parquet'
            session_helper = some_configured_session
    '''
    # Derive the partition key/value pairs and their parent location from the file path.
    raw_partitions = _get_partitions_for_spectrum(filepath)
    sql_ready_partitions = _format_partition_strings_for_sql(raw_partitions)
    partition_location = _get_partition_location(filepath)
    joined_partitions = ', '.join(sql_ready_partitions)
    with session_helper.db_session_scope() as scope:
        partitions_query = f"ALTER TABLE {schema}.{table} \
            ADD IF NOT EXISTS PARTITION ({joined_partitions}) \
            LOCATION 's3://{bucket}/{partition_location}';"
        logger.info(f'Running query to create: {partitions_query}')
        scope.execute(partitions_query)
def create_custom_table(table_name: str, schema_name: str, partitions: dict, path: str, custom_redshift_columns: dict, session_helper: SessionHelper) -> None:
    """ Creates an external table named schema_name.table_name in AWS Redshift,
    using the CUSTOM redshift column data types defined in custom_redshift_columns.

    If a table by that name already exists in the schema, this returns without
    creating anything.

    Args:
        table_name (str): name of the table to create
        schema_name (str): name of schema in redshift; Schema must be external and already exist!
        partitions (dict): Dict similar to columns, except ONLY with partition columns
        path (str): Path to published dataset in s3 (excluding partitions)
        custom_redshift_columns (dict): custom column data type definitions for
            redshift, formatted as column name (str) -> data type (str)
        session_helper (SessionHelper): Instance of Redshift s3parq.session_helper
    """
    logger.info("Running create_custom_table...")
    _redshift_name_validator(table_name)
    logger.info("Generating create columns sql statement with custom redshift columns...")
    logger.info("Generating create partitions sql statement with custom redshift columns...")
    # Split the custom column definitions into partition vs. non-partition DDL fragments.
    column_defs = []
    partition_defs = []
    for col_name, col_type in custom_redshift_columns.items():
        bucket = partition_defs if col_name in partitions else column_defs
        bucket.append(f'{col_name} {col_type}')
    redshift_columns = f"({', '.join(column_defs)})"
    redshift_partitions = f"({', '.join(partition_defs)})"
    with session_helper.db_session_scope() as scope:
        if_exists_query = f'SELECT EXISTS(SELECT schemaname, tablename FROM SVV_EXTERNAL_TABLES WHERE tablename=\'{table_name}\' AND schemaname=\'{schema_name}\');'
        table_exists = scope.execute(if_exists_query).first()[0]
        if table_exists:
            # Table already registered — nothing to do.
            return
        if partitions:
            new_schema_query = (
                f'CREATE EXTERNAL TABLE {schema_name}.{table_name} {redshift_columns} \
                PARTITIONED BY {redshift_partitions} STORED AS PARQUET \
                LOCATION \'{path}\';'
            )
        else:
            new_schema_query = (
                f'CREATE EXTERNAL TABLE {schema_name}.{table_name} {redshift_columns} \
                STORED AS PARQUET \
                LOCATION \'{path}\';'
            )
        logger.info(f'Running query to create table: {new_schema_query}')
        scope.execute(new_schema_query)
def create_schema(schema_name: str, db_name: str, iam_role: str, session_helper: SessionHelper):
    """ Creates an external schema in AWS Redshift using a given iam_role.

    The schema is named schema_name and belongs to the (existing) Redshift db
    db_name. iam_role is a link to an existing AWS IAM Role with Redshift
    Spectrum write permissions. No-op if the schema already exists
    (IF NOT EXISTS in the DDL).
    """
    _redshift_name_validator(schema_name, db_name)
    new_schema_query = f"CREATE EXTERNAL SCHEMA IF NOT EXISTS {schema_name} \
        FROM DATA CATALOG \
        database '{db_name}' \
        iam_role '{iam_role}';"
    with session_helper.db_session_scope() as scope:
        logger.info(f'Running query to create schema: {new_schema_query}')
        scope.execute(new_schema_query)
def create_schema(schema_name: str, db_name: str, iam_role: str, session_helper: SessionHelper) -> None:
    """ Creates a schema in AWS redshift using a given iam_role

    No-op if the schema already exists (IF NOT EXISTS in the DDL).

    Args:
        schema_name (str): Name of the schema to create in Redshift Spectrum
        db_name (str): (Existing) database that the schema should belong to
        iam_role (str): link to an existing AWS IAM Role with Redshift Spectrum write permissions
        session_helper (SessionHelper): Active and configured session_helper session to use
    """
    # Docstring fix: session_helper was documented as (str) but is a SessionHelper
    # instance, as the signature shows.
    _redshift_name_validator(schema_name, db_name)
    with session_helper.db_session_scope() as scope:
        new_schema_query = f"CREATE EXTERNAL SCHEMA IF NOT EXISTS {schema_name} \
            FROM DATA CATALOG \
            database '{db_name}' \
            iam_role '{iam_role}';"
        logger.info(f'Running query to create schema: {new_schema_query}')
        scope.execute(new_schema_query)
def create_schema(schema_name: str, session_helper: SessionHelper):
    """ Creates a plain (non-external) schema in Redshift if it does not already exist.

    Args:
        schema_name: name of the schema to create
        session_helper: active and configured session helper to execute with
    """
    # NOTE(review): siblings validate with _redshift_name_validator; this calls
    # schema_name_validator — confirm that helper exists in this module.
    schema_name_validator(schema_name)
    create_query = f'CREATE SCHEMA IF NOT EXISTS {schema_name}'
    with session_helper.db_session_scope() as scope:
        scope.execute(create_query)