def create_load_redshift_runner():
    """Parse the command line and load the given S3 paths into Redshift.

    Steps performed:

    1. Parse the command-line arguments and print them.
    2. Build a ``Table`` from ``--table_definition``.
    3. Create the table when the existence check says it is missing;
       otherwise compare the definition's column names with the columns
       reported for the existing table.
    4. Execute the load query and an explicit ``COMMIT``.

    When the load fails, the rows returned by the error-retrieval query are
    written to stderr before the original exception is re-raised.

    The cursor and the connection are closed on every exit path.

    Raises:
        Exception: if the existing table's columns differ from the
            definition's columns.
        Exception: whatever the load or commit raised, re-raised unchanged.
    """
    # NOTE(review): this file contains another definition with the same
    # name; at import time the later one replaces the earlier one.
    parser = argparse.ArgumentParser()
    parser.add_argument('--table_definition', dest='table_definition',
                        required=True)
    parser.add_argument('--max_error', dest='max_error', default=0, type=int)
    parser.add_argument('--replace_invalid_char', dest='replace_invalid_char',
                        default=None)
    parser.add_argument('--no_escape', action='store_true', default=False)
    parser.add_argument('--gzip', action='store_true', default=False)
    parser.add_argument('--command_options', dest='command_options',
                        default=None)
    parser.add_argument('--s3_input_paths', dest='input_paths', nargs='+')
    # NOTE(review): this option is parsed but never read below, and because
    # it has no ``store_true`` action any supplied value (even "False") is a
    # truthy string. Left as-is to keep the command line compatible.
    parser.add_argument('--force_drop_table', dest='force_drop_table',
                        default=False)
    script_arguments = parser.parse_args()
    # Single-argument call form prints the same on Python 2 and Python 3.
    print(script_arguments)

    table = Table(SqlStatement(script_arguments.table_definition))
    connection = redshift_connection(
        cursor_factory=psycopg2.extras.RealDictCursor)
    try:
        table_not_exists = pdsql.read_sql(
            table.check_not_exists_script().sql(), connection).loc[0][0]

        cursor = connection.cursor()
        try:
            if table_not_exists:
                cursor.execute(table.create_script().sql())
            else:
                # Column names taken from the definition passed on the
                # command line, normalised to lower case and sorted.
                columns = sorted(
                    [column.column_name.lower()
                     for column in table.columns()])
                # Presumably also a sorted list of lower-case names;
                # verify against get_redshift_table_colunms.
                redshift_table_columns = get_redshift_table_colunms(
                    table, cursor)

                if columns != redshift_table_columns:
                    error_string = (
                        "Table schema mismatch: {table}\n"
                        "Columns for new table: {columns}\n"
                        "Columns for existing table: {redshift_table_columns}"
                    ).format(
                        table=table.full_name,
                        columns=", ".join(columns),
                        redshift_table_columns=", ".join(
                            redshift_table_columns))
                    raise Exception(error_string)

            # Load data into redshift
            load_query = load_redshift(table,
                                       script_arguments.input_paths,
                                       script_arguments.max_error,
                                       script_arguments.replace_invalid_char,
                                       script_arguments.no_escape,
                                       script_arguments.gzip,
                                       script_arguments.command_options)
            try:
                cursor.execute(load_query)
                cursor.execute('COMMIT')
            except Exception:
                # Dump the rows of the error-retrieval query so the failure
                # can be diagnosed from the process's stderr.
                error_query = create_error_retrieval_query(
                    script_arguments.input_paths)
                cursor.execute(error_query)
                separator = "-" * 50 + "\n"

                stderr.write(
                    "Error while loading data into redshift \n\n{}".format(
                        separator))

                for item in cursor.fetchall():
                    for key in item:
                        stderr.write("{}: {}\n".format(
                            key, str(item[key]).strip()))
                    stderr.write(separator)
                # Bare raise keeps the original traceback (``raise error``
                # would reset it on Python 2).
                raise
        finally:
            cursor.close()
    finally:
        connection.close()
def create_load_redshift_runner():
    """Parse the command line and load the given S3 paths into Redshift.

    Steps performed:

    1. Parse the command-line arguments and print them.
    2. Build a ``Table`` from ``--table_definition``.
    3. Create the table when the existence check says it is missing;
       otherwise compare the definition's column names with the columns
       reported for the existing table.
    4. Execute the load query and an explicit ``COMMIT``.

    When the load fails, the rows returned by the error-retrieval query are
    written to stderr before the original exception is re-raised.

    The cursor and the connection are closed on every exit path.

    Raises:
        Exception: if the existing table's columns differ from the
            definition's columns.
        Exception: whatever the load or commit raised, re-raised unchanged.
    """
    # NOTE(review): this file contains another definition with the same
    # name; at import time the later one replaces the earlier one.
    parser = argparse.ArgumentParser()
    parser.add_argument('--table_definition', dest='table_definition',
                        required=True)
    parser.add_argument('--max_error', dest='max_error', default=0, type=int)
    parser.add_argument('--replace_invalid_char', dest='replace_invalid_char',
                        default=None)
    parser.add_argument('--no_escape', action='store_true', default=False)
    parser.add_argument('--gzip', action='store_true', default=False)
    parser.add_argument('--command_options', dest='command_options',
                        default=None)
    parser.add_argument('--s3_input_paths', dest='input_paths', nargs='+')
    # NOTE(review): this option is parsed but never read below, and because
    # it has no ``store_true`` action any supplied value (even "False") is a
    # truthy string. Left as-is to keep the command line compatible.
    parser.add_argument('--force_drop_table', dest='force_drop_table',
                        default=False)
    script_arguments = parser.parse_args()
    # Single-argument call form prints the same on Python 2 and Python 3.
    print(script_arguments)

    table = Table(SqlStatement(script_arguments.table_definition))
    connection = redshift_connection(
        cursor_factory=psycopg2.extras.RealDictCursor)
    try:
        table_not_exists = pdsql.read_sql(
            table.check_not_exists_script().sql(), connection).loc[0][0]

        cursor = connection.cursor()
        try:
            if table_not_exists:
                cursor.execute(table.create_script().sql())
            else:
                # Column names taken from the definition passed on the
                # command line, normalised to lower case and sorted.
                columns = sorted(
                    [column.column_name.lower()
                     for column in table.columns()])
                # Presumably also a sorted list of lower-case names;
                # verify against get_redshift_table_colunms.
                redshift_table_columns = get_redshift_table_colunms(
                    table, cursor)

                if columns != redshift_table_columns:
                    # ``columns`` describes the table being requested (the
                    # new definition) and ``redshift_table_columns`` the one
                    # already present; the labels must say so.
                    error_string = (
                        "Table schema mismatch: {table}\n"
                        "Columns for new table: {columns}\n"
                        "Columns for existing table: {redshift_table_columns}"
                    ).format(
                        table=table.full_name,
                        columns=", ".join(columns),
                        redshift_table_columns=", ".join(
                            redshift_table_columns))
                    raise Exception(error_string)

            # Load data into redshift
            load_query = load_redshift(
                table,
                script_arguments.input_paths,
                script_arguments.max_error,
                script_arguments.replace_invalid_char,
                script_arguments.no_escape,
                script_arguments.gzip,
                script_arguments.command_options)
            try:
                cursor.execute(load_query)
                cursor.execute('COMMIT')
            except Exception:
                # Dump the rows of the error-retrieval query so the failure
                # can be diagnosed from the process's stderr.
                error_query = create_error_retrieval_query(
                    script_arguments.input_paths)
                cursor.execute(error_query)
                separator = "-" * 50 + "\n"

                stderr.write(
                    "Error while loading data into redshift \n\n{}".format(
                        separator))

                for item in cursor.fetchall():
                    for key in item:
                        stderr.write("{}: {}\n".format(
                            key, str(item[key]).strip()))
                    stderr.write(separator)
                # Bare raise keeps the original traceback (``raise error``
                # would reset it on Python 2).
                raise
        finally:
            cursor.close()
    finally:
        connection.close()