def build_table_attributes(table_design: dict) -> List[str]:
    """
    Return the attributes from the table design so that they can be inserted into a SQL DDL statement.

    >>> build_table_attributes({})  # no-op
    []
    >>> build_table_attributes({"attributes": {"distribution": "even"}})
    ['DISTSTYLE EVEN']
    >>> build_table_attributes({"attributes": {"distribution": ["key"], "compound_sort": ["name"]}})
    ['DISTSTYLE KEY', 'DISTKEY ( "key" )', 'COMPOUND SORTKEY ( "name" )']
    """
    table_attributes = table_design.get("attributes", {})
    distribution = table_attributes.get("distribution", [])
    compound_sort = table_attributes.get("compound_sort", [])
    interleaved_sort = table_attributes.get("interleaved_sort", [])

    ddl_attributes = []
    # TODO Use for staging tables: ddl_attributes.append("BACKUP NO")
    if distribution:
        if isinstance(distribution, list):
            ddl_attributes.append("DISTSTYLE KEY")
            ddl_attributes.append("DISTKEY ( {} )".format(join_column_list(distribution)))
        elif distribution == "all":
            ddl_attributes.append("DISTSTYLE ALL")
        elif distribution == "even":
            ddl_attributes.append("DISTSTYLE EVEN")
    if compound_sort:
        ddl_attributes.append("COMPOUND SORTKEY ( {} )".format(join_column_list(compound_sort)))
    elif interleaved_sort:
        ddl_attributes.append("INTERLEAVED SORTKEY ( {} )".format(join_column_list(interleaved_sort)))
    return ddl_attributes
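# join_column_list() is defined elsewhere in this module/package.  Based on the doctests above
# (each column name double-quoted, joined with ", "), a minimal sketch of its behavior could
# look like the helper below -- the name is hypothetical and the real implementation may differ:
def _sketch_join_column_list(columns: List[str]) -> str:
    """Illustrative only: render a column list the way the doctests above show it."""
    return ", ".join('"{}"'.format(column) for column in columns)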
def insert_from_query(conn: connection, table_name: TableName, column_list: List[str], query_stmt: str,
                      dry_run=False) -> None:
    """
    Load data into a table in the data warehouse using the INSERT INTO command.
    """
    stmt = """
        INSERT INTO {table} (
            {columns}
        )
        {query}
        """.format(table=table_name, columns=join_column_list(column_list), query=query_stmt)

    if dry_run:
        logger.info("Dry-run: Skipping inserting data into '%s' from query", table_name.identifier)
        etl.db.skip_query(conn, stmt)
    else:
        logger.info("Inserting data into '%s' from query", table_name.identifier)
        try:
            etl.db.execute(conn, stmt)
        except psycopg2.InternalError as exc:
            if "S3 Query Exception" in exc.pgerror or "S3Query Exception" in exc.pgerror:
                # If this error was caused by a table in S3 (see Redshift Spectrum), then we might be able to try again.
                raise TransientETLError(exc) from exc
            else:
                logger.warning("SQL Error is not S3 Query Exception, cannot retry: %s", exc.pgerror)
                raise
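# insert_from_query() converts an "S3 Query Exception" (typically from Redshift Spectrum) into a
# TransientETLError so that a caller can decide to retry.  The function below is a hedged sketch
# of such a caller -- the name and retry policy are illustrative, and a real retry would also
# have to roll back or re-open the failed transaction:
def _example_insert_with_retries(conn: connection, table_name: TableName, column_list: List[str],
                                 query_stmt: str, max_attempts: int = 3) -> None:
    """Illustrative only: retry insert_from_query on TransientETLError with a short backoff."""
    import time

    for attempt in range(1, max_attempts + 1):
        try:
            insert_from_query(conn, table_name, column_list, query_stmt)
            return
        except TransientETLError:
            if attempt == max_attempts:
                raise
            logger.warning("Transient error on attempt %d of %d, retrying", attempt, max_attempts)
            time.sleep(2 ** attempt)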
def copy_using_manifest(conn: connection, table_name: TableName, column_list: List[str], s3_uri: str,
                        aws_iam_role: str, need_compupdate=False, dry_run=False) -> None:
    """
    Load data into a table in the data warehouse using the COPY command with a manifest file.
    """
    credentials = "aws_iam_role={}".format(aws_iam_role)
    copy_stmt = """
        COPY {table} (
            {columns}
        )
        FROM %s
        CREDENTIALS %s MANIFEST
        DELIMITER ',' ESCAPE REMOVEQUOTES GZIP
        TIMEFORMAT AS 'auto'
        DATEFORMAT AS 'auto'
        TRUNCATECOLUMNS
        STATUPDATE OFF
        COMPUPDATE {compupdate}
        """.format(table=table_name, columns=join_column_list(column_list),
                   compupdate="ON" if need_compupdate else "OFF")

    if dry_run:
        logger.info("Dry-run: Skipping copying data into '%s' using '%s'", table_name.identifier, s3_uri)
        etl.db.skip_query(conn, copy_stmt, (s3_uri, credentials))
    else:
        logger.info("Copying data into '%s' using '%s'", table_name.identifier, s3_uri)
        try:
            with log_load_error(conn):
                etl.db.execute(conn, copy_stmt, (s3_uri, credentials))
        except psycopg2.InternalError as exc:
            raise TransientETLError(exc) from exc
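# With the MANIFEST option, s3_uri must point at a Redshift COPY manifest (a small JSON file
# listing the actual data files) rather than at the data files themselves.  A minimal sketch of
# building such a manifest body -- the helper name is hypothetical:
def _example_manifest_body(data_file_urls: List[str]) -> str:
    """Illustrative only: build the JSON body of a COPY manifest for the given S3 URLs."""
    import json

    entries = [{"url": url, "mandatory": True} for url in data_file_urls]
    return json.dumps({"entries": entries}, indent=4)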
def build_table_constraints(table_design: dict) -> List[str]:
    """
    Return the constraints from the table design so that they can be inserted into a SQL DDL statement.

    >>> build_table_constraints({})  # no-op
    []
    >>> build_table_constraints({"constraints": [{"primary_key": ["id"]}, {"unique": ["name", "email"]}]})
    ['PRIMARY KEY ( "id" )', 'UNIQUE ( "name", "email" )']
    """
    table_constraints = table_design.get("constraints", [])
    type_lookup = dict([("primary_key", "PRIMARY KEY"), ("surrogate_key", "PRIMARY KEY"),
                        ("unique", "UNIQUE"), ("natural_key", "UNIQUE")])
    ddl_for_constraints = []
    for constraint in table_constraints:
        [[constraint_type, column_list]] = constraint.items()
        ddl_for_constraints.append("{} ( {} )".format(type_lookup[constraint_type], join_column_list(column_list)))
    return ddl_for_constraints
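# The fragments returned by build_table_attributes() and build_table_constraints() are meant to be
# embedded into a CREATE TABLE statement.  A rough sketch of how they might be stitched together
# (the function name and the exact layout are illustrative, not this module's actual DDL builder):
def _example_assemble_create_table(table_name: TableName, column_ddl: List[str], table_design: dict) -> str:
    """Illustrative only: combine column, constraint, and attribute DDL into one statement."""
    body = ",\n    ".join(column_ddl + build_table_constraints(table_design))
    attributes = "\n".join(build_table_attributes(table_design))
    return "CREATE TABLE {} (\n    {}\n)\n{}".format(table_name, body, attributes)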
def copy_from_uri(conn: connection, table_name: TableName, column_list: List[str], s3_uri: str,
                  aws_iam_role: str, need_compupdate=False, dry_run=False) -> None:
    """
    Load data into a table in the data warehouse using the COPY command.
    """
    credentials = "aws_iam_role={}".format(aws_iam_role)
    stmt = """
        COPY {table} (
            {columns}
        )
        FROM %s
        CREDENTIALS %s MANIFEST
        DELIMITER ',' ESCAPE REMOVEQUOTES GZIP
        TIMEFORMAT AS 'auto'
        DATEFORMAT AS 'auto'
        TRUNCATECOLUMNS
        STATUPDATE OFF
        COMPUPDATE {compupdate}
        """.format(table=table_name, columns=join_column_list(column_list),
                   compupdate="ON" if need_compupdate else "OFF")

    if dry_run:
        logger.info("Dry-run: Skipping copying data into '%s' from '%s'", table_name.identifier, s3_uri)
        etl.db.skip_query(conn, stmt, (s3_uri, credentials))
    else:
        logger.info("Copying data into '%s' from '%s'", table_name.identifier, s3_uri)
        try:
            with log_load_error(conn):
                etl.db.execute(conn, stmt, (s3_uri, credentials))
            row_count = etl.db.query(conn, "SELECT pg_last_copy_count()")
            logger.info("Copied %d rows into '%s'", row_count[0][0], table_name.identifier)
        except psycopg2.InternalError as exc:
            raise TransientETLError(exc) from exc
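# A hedged usage sketch for copy_from_uri(): the S3 location and IAM role below are placeholders
# and the wrapper name is hypothetical.  Calling it with dry_run=True takes the skip_query path
# above instead of executing the COPY:
def _example_load_table(conn: connection, table_name: TableName, dry_run: bool = True) -> None:
    """Illustrative only: invoke copy_from_uri with placeholder arguments."""
    copy_from_uri(
        conn,
        table_name,
        column_list=["order_id", "customer_id", "ordered_at"],
        s3_uri="s3://example-bucket/data/www/orders.manifest",
        aws_iam_role="arn:aws:iam::123456789012:role/example-redshift-copy",
        dry_run=dry_run,
    )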