Code example #1
File: publish_redshift.py  Project: schafrn/s3parq
def create_table(table_name: str, schema_name: str, columns: dict,
                 partitions: dict, path: str, session_helper: SessionHelper):
    """
    Creates a table in AWS redshift. The table will be named schema_name.table_name and belong to the (existing) Redshift db db_name.
    Args:
        table_name: name of created table. NOTE: THIS WILL ERROR IF table_name ALREADY EXISTS IN REDSHIFT
        schema_name: name of schema in redshift. Schema must be external and already exist!
        columns: Dictionary with keys corresponding to column names and values corresponding to pandas dtypes, excluding partition columns.
        partitions: Dict similar to columns, except ONLY with partition columns
        path: Path to published contract in s3 (excluding partitions)
        session_helper: Instance of Redshift s3parq.session_helper
    """
    _redshift_name_validator(table_name)
    redshift_columns = _datatype_mapper(columns)
    redshift_partitions = _datatype_mapper(partitions)
    with session_helper.db_session_scope() as scope:
        if_exists_query = f'SELECT EXISTS(SELECT schemaname, tablename FROM SVV_EXTERNAL_TABLES WHERE tablename=\'{table_name}\' AND schemaname=\'{schema_name}\');'
        table_exists = scope.execute(if_exists_query).first()[0]
        if table_exists:
            return

        if not partitions:
            new_schema_query = (
                f'CREATE EXTERNAL TABLE {schema_name}.{table_name} {redshift_columns} \
                STORED AS PARQUET \
                LOCATION \'{path}\';')
        else:
            new_schema_query = (
                f'CREATE EXTERNAL TABLE {schema_name}.{table_name} {redshift_columns} \
                PARTITIONED BY {redshift_partitions} STORED AS PARQUET \
                LOCATION \'{path}\';')
        logger.info(f'Running query to create table: {new_schema_query}')
        scope.execute(new_schema_query)
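
The call below is a minimal usage sketch, not code from the project: every name (table, schema, bucket, and my_session_helper) is a hypothetical placeholder, and my_session_helper stands for an already-configured s3parq SessionHelper.

# Hypothetical usage of create_table; all names are placeholders.
columns = {'product_id': 'int64', 'price': 'float64'}   # pandas dtypes, partition columns excluded
partitions = {'sale_date': 'datetime64'}                # ONLY the partition columns
create_table(
    table_name='sales',
    schema_name='my_external_schema',    # must already exist as an external schema
    columns=columns,
    partitions=partitions,
    path='s3://my-bucket/contracts/sales',
    session_helper=my_session_helper,    # already-configured SessionHelper
)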
Code example #2
File: publish_redshift.py  Project: schafrn/s3parq
def create_partitions(bucket: str, schema: str, table: str, filepath: str,
                      session_helper: SessionHelper) -> None:
    '''
    Generates and executes the SQL that adds a file's partitions to the given
    external table.
    Args:
        bucket (str): S3 bucket where the data is stored
        schema (str): name of the Redshift schema (must already exist)
        table (str): name of the table in the schema; must have its partitions
            declared in `CREATE TABLE ...`
        filepath (str): path to the data in S3 that will be queryable by its partitions
    Returns:
        None; the generated ALTER TABLE statement is executed, not returned
    Example:
        Args:
            bucket = 'MyBucket'
            schema = 'MySchema'
            table = 'MyTable'
            filepath = 'path/to/data/apple=abcd/banana=1234/abcd1234.parquet'
        Executes:
            "ALTER TABLE MySchema.MyTable ADD IF NOT EXISTS PARTITION (apple='abcd', banana='1234') LOCATION 's3://MyBucket/path/to/data/apple=abcd';"
    '''
    partitions = _get_partitions_for_spectrum(filepath)
    formatted_partitions = _format_partition_strings_for_sql(partitions)
    path_to_data = _get_partition_location(filepath)

    with session_helper.db_session_scope() as scope:
        partitions_query = f"ALTER TABLE {schema}.{table} \
            ADD IF NOT EXISTS PARTITION ({', '.join(formatted_partitions)}) \
            LOCATION 's3://{bucket}/{path_to_data}';"

        logger.info(f'Running query to create: {partitions_query}')
        scope.execute(partitions_query)
Code example #3
def create_partitions(bucket: str, schema: str, table: str, filepath: str, session_helper: SessionHelper) -> None:
    ''' Executes the SQL that creates partitions on the given table for an 
    individual file

    Args:
        bucket (str): S3 bucket where data is stored
        schema (str): name of redshift schema (must already exist)
        table (str): name of table in schema; must have its partitions declared in `CREATE TABLE ...`
        filepath (str): path to data in S3 that will be queryable by its partitions
            NOTE: This is to the single parquet file, including the partitions
        session_helper (SessionHelper): a configured s3parq.session_helper.SessionHelper session

    Returns:
        None

    Example:
        Args:
            bucket = 'MyBucket'
            schema = 'MySchema'
            table = 'MyTable'
            filepath = 'path/to/data/apple=abcd/banana=1234/abcd1234.parquet'
            session_helper = some_configured_session
    '''
    partitions = _get_partitions_for_spectrum(filepath)
    formatted_partitions = _format_partition_strings_for_sql(partitions)
    path_to_data = _get_partition_location(filepath)

    with session_helper.db_session_scope() as scope:
        partitions_query = f"ALTER TABLE {schema}.{table} \
            ADD IF NOT EXISTS PARTITION ({', '.join(formatted_partitions)}) \
            LOCATION 's3://{bucket}/{path_to_data}';"
        logger.info(f'Running query to create: {partitions_query}')
        scope.execute(partitions_query)
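
For either version of create_partitions above, a hedged usage sketch (bucket, schema, table, the file path, and my_session_helper are all hypothetical placeholders):

# Hypothetical usage of create_partitions; all names are placeholders.
create_partitions(
    bucket='my-bucket',
    schema='my_external_schema',
    table='sales',
    # path to one parquet file, including its partition directories
    filepath='contracts/sales/sale_date=2020-01-01/part-0000.parquet',
    session_helper=my_session_helper,
)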
Code example #4
def create_custom_table(table_name: str, schema_name: str, partitions: dict, path: str, custom_redshift_columns: dict, session_helper: SessionHelper) -> None:
    """ Creates a table in AWS redshift. The table will be named 
    schema_name.table_name and belong to the (existing) Redshift db db_name.
    The created table will use the CUSTOM redshift column data types defined 
    in custom_redshift_columns.

    Args:
        table_name (str): name of created table
            NOTE: THIS WILL ERROR IF table_name ALREADY EXISTS IN REDSHIFT
        schema_name (str): name of schema in redshift; Schema must be external 
            and already exist!
        partitions (dict): Dict similar to columns, except ONLY with partition columns
        path (str): Path to published dataset in s3 (excluding partitions)
        custom_redshift_columns (dict): 
            This dictionary contains custom column data type definitions for redshift.
            The params should be formatted as follows:
                - column name (str)
                - data type (str)
        session_helper (SessionHelper): Instance of Redshift s3parq.session_helper
    """

    logger.info("Running create_custom_table...")

    _redshift_name_validator(table_name)

    logger.info("Generating create columns sql statement with custom redshift columns...")
    logger.info("Generating create partitions sql statement with custom redshift columns...")
    redshift_columns_sql = ""
    redshift_partitions_sql = ""
    for k, v in custom_redshift_columns.items():
        if k in partitions:
            redshift_partitions_sql += f'{k} {v}, '
        else:
            redshift_columns_sql += f'{k} {v}, '
    redshift_columns = f"({redshift_columns_sql[:-2]})"  # Slice off the last space and comma
    redshift_partitions = f"({redshift_partitions_sql[:-2]})"  # Slice off the last space and comma

    with session_helper.db_session_scope() as scope:
        if_exists_query = f'SELECT EXISTS(SELECT schemaname, tablename FROM SVV_EXTERNAL_TABLES WHERE tablename=\'{table_name}\' AND schemaname=\'{schema_name}\');'
        table_exists = scope.execute(if_exists_query).first()[0]
        if table_exists:
            return

        if not partitions:
            new_schema_query = (
                f'CREATE EXTERNAL TABLE {schema_name}.{table_name} {redshift_columns} \
                STORED AS PARQUET \
                LOCATION \'{path}\';'
            )
        else:
            new_schema_query = (
                f'CREATE EXTERNAL TABLE {schema_name}.{table_name} {redshift_columns} \
                PARTITIONED BY {redshift_partitions} STORED AS PARQUET \
                LOCATION \'{path}\';'
            )
        logger.info(f'Running query to create table: {new_schema_query}')
        scope.execute(new_schema_query)
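
A minimal sketch of calling create_custom_table (the column names and Redshift types are illustrative assumptions; note that only the keys of partitions are consulted here, since the SQL types come from custom_redshift_columns):

# Hypothetical usage of create_custom_table; names and types are placeholders.
custom_redshift_columns = {
    'product_id': 'INTEGER',
    'price': 'DECIMAL(10,2)',
    'sale_date': 'VARCHAR(10)',    # also a partition column, see below
}
create_custom_table(
    table_name='sales_custom',
    schema_name='my_external_schema',
    partitions={'sale_date': 'VARCHAR(10)'},   # keys must match custom_redshift_columns
    path='s3://my-bucket/contracts/sales',
    custom_redshift_columns=custom_redshift_columns,
    session_helper=my_session_helper,
)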
Code example #5
File: publish_redshift.py  Project: schafrn/s3parq
def create_schema(schema_name: str, db_name: str, iam_role: str,
                  session_helper: SessionHelper):
    """Creates a schema in AWS redshift using a given iam_role. The schema is named schema_name and belongs to the (existing) Redshift db db_name.
        iam_role is a link to an existing AWS IAM Role with Redshift Spectrum write permissions."""
    _redshift_name_validator(schema_name, db_name)
    with session_helper.db_session_scope() as scope:
        new_schema_query = f"CREATE EXTERNAL SCHEMA IF NOT EXISTS {schema_name} \
                FROM DATA CATALOG \
                database '{db_name}' \
                iam_role '{iam_role}';"

        logger.info(f'Running query to create schema: {new_schema_query}')
        scope.execute(new_schema_query)
Code example #6
def create_schema(schema_name: str, db_name: str, iam_role: str, session_helper: SessionHelper) -> None:
    """ Creates a schema in AWS redshift using a given iam_role

    Args:
        schema_name (str): Name of the schema to create in Redshift Spectrum
        db_name (str): (Existing) database that the schema should belong to
        iam_role (str): ARN of an existing AWS IAM role with Redshift Spectrum
            write permissions
        session_helper (SessionHelper): active and configured SessionHelper session to use
    """
    _redshift_name_validator(schema_name, db_name)
    with session_helper.db_session_scope() as scope:
        new_schema_query = f"CREATE EXTERNAL SCHEMA IF NOT EXISTS {schema_name} \
                FROM DATA CATALOG \
                database '{db_name}' \
                iam_role '{iam_role}';"

        logger.info(f'Running query to create schema: {new_schema_query}')
        scope.execute(new_schema_query)
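
A usage sketch for create_schema (the schema, database, and role ARN are hypothetical placeholders):

# Hypothetical usage of create_schema; the ARN below is not a real role.
create_schema(
    schema_name='my_external_schema',
    db_name='my_catalog_database',     # existing database in the data catalog
    iam_role='arn:aws:iam::123456789012:role/my-spectrum-role',
    session_helper=my_session_helper,
)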
Code example #7
def create_schema(schema_name: str, session_helper: SessionHelper) -> None:
    """Creates a plain (non-external) schema in Redshift if it does not already exist."""
    schema_name_validator(schema_name)
    with session_helper.db_session_scope() as scope:
        scope.execute(f'CREATE SCHEMA IF NOT EXISTS {schema_name}')
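
Unlike examples #5 and #6, this variant issues a plain CREATE SCHEMA rather than CREATE EXTERNAL SCHEMA, so no database or IAM role is needed. A one-line usage sketch (the schema name is a placeholder):

# Hypothetical usage; creates an ordinary Redshift schema, not a Spectrum one.
create_schema('my_local_schema', my_session_helper)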