Example #1
def build_table_attributes(table_design: dict) -> List[str]:
    """
    Return the attributes from the table design so that they can be inserted into a SQL DDL statement.

    >>> build_table_attributes({})  # no-op
    []
    >>> build_table_attributes({"attributes": {"distribution": "even"}})
    ['DISTSTYLE EVEN']
    >>> build_table_attributes({"attributes": {"distribution": ["key"], "compound_sort": ["name"]}})
    ['DISTSTYLE KEY', 'DISTKEY ( "key" )', 'COMPOUND SORTKEY ( "name" )']
    """
    table_attributes = table_design.get("attributes", {})
    distribution = table_attributes.get("distribution", [])
    compound_sort = table_attributes.get("compound_sort", [])
    interleaved_sort = table_attributes.get("interleaved_sort", [])

    ddl_attributes = []
    # TODO Use for staging tables: ddl_attributes.append("BACKUP NO")
    if distribution:
        if isinstance(distribution, list):
            ddl_attributes.append("DISTSTYLE KEY")
            ddl_attributes.append("DISTKEY ( {} )".format(
                join_column_list(distribution)))
        elif distribution == "all":
            ddl_attributes.append("DISTSTYLE ALL")
        elif distribution == "even":
            ddl_attributes.append("DISTSTYLE EVEN")
    if compound_sort:
        ddl_attributes.append("COMPOUND SORTKEY ( {} )".format(
            join_column_list(compound_sort)))
    elif interleaved_sort:
        ddl_attributes.append("INTERLEAVED SORTKEY ( {} )".format(
            join_column_list(interleaved_sort)))
    return ddl_attributes
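Note: these examples call a join_column_list helper that is not included in the snippets. Judging from the doctest output above, which shows double-quoted identifiers joined by commas, a minimal sketch could look like this (the real helper in the source repository may differ):

def join_column_list(columns):
    # Quote each identifier and join with commas, e.g. '"name", "email"'.
    return ", ".join('"{}"'.format(column) for column in columns)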
Example #2
def insert_from_query(conn: connection,
                      table_name: TableName,
                      column_list: List[str],
                      query_stmt: str,
                      dry_run=False) -> None:
    """
    Load data into a table in the data warehouse using the INSERT INTO command.
    """
    stmt = """
        INSERT INTO {table} (
            {columns}
        )
        {query}
        """.format(table=table_name,
                   columns=join_column_list(column_list),
                   query=query_stmt)

    if dry_run:
        logger.info("Dry-run: Skipping inserting data into '%s' from query",
                    table_name.identifier)
        etl.db.skip_query(conn, stmt)
    else:
        logger.info("Inserting data into '%s' from query",
                    table_name.identifier)
        try:
            etl.db.execute(conn, stmt)
        except psycopg2.InternalError as exc:
            pgerror = exc.pgerror or ""  # pgerror may be None
            if "S3 Query Exception" in pgerror or "S3Query Exception" in pgerror:
                # If this error was caused by a table in S3 (see Redshift Spectrum),
                # then we might be able to try again.
                raise TransientETLError(exc) from exc
            else:
                logger.warning(
                    "SQL Error is not an S3 Query Exception, cannot retry: %s",
                    pgerror)
                raise
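The TransientETLError raised above signals that the failure came from Redshift Spectrum's S3 layer and may succeed if tried again. A hypothetical retry wrapper (not part of the original code) could build on that distinction:

import time

def insert_with_retries(conn, table_name, column_list, query_stmt,
                        max_attempts=3):
    # Retry only transient failures; any other error propagates at once.
    for attempt in range(1, max_attempts + 1):
        try:
            insert_from_query(conn, table_name, column_list, query_stmt)
            return
        except TransientETLError:
            if attempt == max_attempts:
                raise
            time.sleep(2 ** attempt)  # simple exponential backoff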
Example #3
def copy_using_manifest(conn: connection,
                        table_name: TableName,
                        column_list: List[str],
                        s3_uri: str,
                        aws_iam_role: str,
                        need_compupdate=False,
                        dry_run=False) -> None:
    credentials = "aws_iam_role={}".format(aws_iam_role)

    copy_stmt = """
        COPY {table} (
            {columns}
        )
        FROM %s
        CREDENTIALS %s MANIFEST
        DELIMITER ',' ESCAPE REMOVEQUOTES GZIP
        TIMEFORMAT AS 'auto' DATEFORMAT AS 'auto'
        TRUNCATECOLUMNS
        STATUPDATE OFF
        COMPUPDATE {compupdate}
        """.format(table=table_name,
                   columns=join_column_list(column_list),
                   compupdate="ON" if need_compupdate else "OFF")
    if dry_run:
        logger.info("Dry-run: Skipping copying data into '%s' using '%s'",
                    table_name.identifier, s3_uri)
        etl.db.skip_query(conn, copy_stmt, (s3_uri, credentials))
    else:
        logger.info("Copying data into '%s' using '%s'", table_name.identifier,
                    s3_uri)
        try:
            with log_load_error(conn):
                etl.db.execute(conn, copy_stmt, (s3_uri, credentials))
        except psycopg2.InternalError as exc:
            raise TransientETLError(exc) from exc
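Because of the MANIFEST clause, s3_uri must point to a manifest file rather than directly at the data: a JSON document listing the files to load. The write_manifest helper below is a hypothetical illustration of that format, not part of the original code:

import json

def write_manifest(file_urls):
    # Redshift expects {"entries": [{"url": ..., "mandatory": ...}, ...]}.
    manifest = {
        "entries": [{"url": url, "mandatory": True} for url in file_urls]
    }
    return json.dumps(manifest, indent=2)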
Example #4
def build_table_constraints(table_design: dict) -> List[str]:
    """
    Return the constraints from the table design so that they can be inserted into a SQL DDL statement.

    >>> build_table_constraints({})  # no-op
    []
    >>> build_table_constraints({"constraints": [{"primary_key": ["id"]}, {"unique": ["name", "email"]}]})
    ['PRIMARY KEY ( "id" )', 'UNIQUE ( "name", "email" )']
    """
    table_constraints = table_design.get("constraints", [])
    type_lookup = {
        "primary_key": "PRIMARY KEY",
        "surrogate_key": "PRIMARY KEY",
        "unique": "UNIQUE",
        "natural_key": "UNIQUE",
    }
    ddl_for_constraints = []
    for constraint in table_constraints:
        [[constraint_type, column_list]] = constraint.items()
        ddl_for_constraints.append("{} ( {} )".format(
            type_lookup[constraint_type], join_column_list(column_list)))
    return ddl_for_constraints
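Both builders return fragments meant to be spliced into a CREATE TABLE statement. As a hypothetical illustration of how they fit together (the column definitions are assumed to be built elsewhere from the table design):

def sketch_create_table(table_name, column_ddl, table_design):
    # Constraints go inside the parentheses with the columns;
    # attributes such as DISTSTYLE follow the closing parenthesis.
    inner = list(column_ddl) + build_table_constraints(table_design)
    attributes = build_table_attributes(table_design)
    return "CREATE TABLE {} (\n    {}\n)\n{}".format(
        table_name, ",\n    ".join(inner), "\n".join(attributes))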
Example #5
def copy_from_uri(conn: connection,
                  table_name: TableName,
                  column_list: List[str],
                  s3_uri: str,
                  aws_iam_role: str,
                  need_compupdate=False,
                  dry_run=False) -> None:
    """
    Load data into a table in the data warehouse using the COPY command.
    """
    credentials = "aws_iam_role={}".format(aws_iam_role)

    stmt = """
        COPY {table} (
            {columns}
        )
        FROM %s
        CREDENTIALS %s MANIFEST
        DELIMITER ',' ESCAPE REMOVEQUOTES GZIP
        TIMEFORMAT AS 'auto' DATEFORMAT AS 'auto'
        TRUNCATECOLUMNS
        STATUPDATE OFF
        COMPUPDATE {compupdate}
        """.format(table=table_name,
                   columns=join_column_list(column_list),
                   compupdate="ON" if need_compupdate else "OFF")
    if dry_run:
        logger.info("Dry-run: Skipping copying data into '%s' from '%s'",
                    table_name.identifier, s3_uri)
        etl.db.skip_query(conn, stmt, (s3_uri, credentials))
    else:
        logger.info("Copying data into '%s' from '%s'", table_name.identifier,
                    s3_uri)
        try:
            with log_load_error(conn):
                etl.db.execute(conn, stmt, (s3_uri, credentials))
            row_count = etl.db.query(conn, "SELECT pg_last_copy_count()")
            logger.info("Copied %d rows into '%s'", row_count[0][0],
                        table_name.identifier)
        except psycopg2.InternalError as exc:
            raise TransientETLError(exc) from exc
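A hypothetical invocation, with placeholder values for the connection, table, and IAM role ARN (dry_run=True logs the COPY and skips execution):

copy_from_uri(
    conn,
    table_name,
    ["id", "name", "email"],
    "s3://example-bucket/data/manifest",
    "arn:aws:iam::123456789012:role/redshift-copy",  # placeholder ARN
    dry_run=True,
)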