def main(): """Main Function """ parser = argparse.ArgumentParser() parser.add_argument('--table_definition', dest='table_definition', required=True) parser.add_argument('--max_error', dest='max_error', default=0, type=int) parser.add_argument('--replace_invalid_char', dest='replace_invalid_char', default=None) parser.add_argument('--no_escape', action='store_true', default=False) parser.add_argument('--gzip', action='store_true', default=False) parser.add_argument('--command_options', dest='command_options', default=None) parser.add_argument('--s3_input_paths', dest='input_paths', nargs='+') args = parser.parse_args() print args table = Table(SqlStatement(args.table_definition)) connection = redshift_connection() table_not_exists = pdsql.read_sql(table.check_not_exists_script().sql(), connection).loc[0][0] cursor = connection.cursor() # Create table in redshift, this is safe due to the if exists condition if table_not_exists: cursor.execute(table.create_script().sql()) # Load data into redshift load_query = load_redshift(table, args.input_paths, args.max_error, args.replace_invalid_char, args.no_escape, args.gzip, args.command_options) cursor.execute(load_query) cursor.execute('COMMIT') cursor.close() connection.close()
def main(): """Main function """ parser = argparse.ArgumentParser() parser.add_argument('--table', dest='table', required=True) parser.add_argument('--sns_topic_arn', dest='sns_topic_arn', default=None) parser.add_argument('--test_name', dest='test_name', default="Check Primary Key") parser.add_argument('--log_to_s3', action='store_true', default=False) parser.add_argument('--path_suffix', dest='path_suffix', default=None) args = parser.parse_args() connection = redshift_connection() table = Table(SqlScript(args.table)) result = pdsql.read_sql(table.select_duplicates_script().sql(), connection) check = PrimaryKeyCheck(len(result), name=args.test_name, sns_topic_arn=args.sns_topic_arn) check.publish(args.log_to_s3, table=table.full_name, path_suffix=args.path_suffix) connection.close()
def sql_runner():
    """Main function"""
    parser = argparse.ArgumentParser()
    parser.add_argument('--table_definition', dest='table_definition',
                        required=True)
    parser.add_argument('--sql', dest='sql', required=True)
    parser.add_argument('--analyze', action='store_true', default=False)
    parser.add_argument('--non_transactional', action='store_true',
                        default=False)
    args, sql_arguments = parser.parse_known_args()
    print args, sql_arguments

    sql_query = args.sql
    # The SQL itself may live in S3; fetch it if an s3:// URI was passed
    if sql_query.startswith('s3://'):
        sql_query = S3File(s3_path=S3Path(uri=args.sql)).text

    table = Table(SqlStatement(args.table_definition))
    connection = redshift_connection()
    # Enable autocommit for non-transactional sql execution
    if args.non_transactional:
        connection.autocommit = True
    else:
        # The connection sets autocommit to True by default, but the SQL
        # runner needs it off so that all statements run in one transaction.
        connection.autocommit = False

    table_not_exists = pdsql.read_sql(table.check_not_exists_script().sql(),
                                      connection).loc[0][0]
    cursor = connection.cursor()
    # Create table in redshift, this is safe due to the if exists condition
    if table_not_exists:
        cursor.execute(table.create_script().sql())

    # Load data into redshift with the upsert query.
    # If there are sql_arguments, pass them along with the query;
    # otherwise omit them to avoid having to escape every % as %%.
    if len(sql_arguments) >= 1:
        print cursor.mogrify(sql_query, tuple(sql_arguments))
        cursor.execute(sql_query, tuple(sql_arguments))
    else:
        print sql_query
        cursor.execute(sql_query)
    cursor.execute('COMMIT')

    # Analyze the table
    if args.analyze:
        cursor.execute(table.analyze_script().sql())

    cursor.close()
    connection.close()
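
# How the trailing CLI tokens become query parameters: parse_known_args()
# hands back the unrecognized arguments, which the runner binds to %s
# placeholders via psycopg2's execute/mogrify. A standalone illustration;
# the SQL and the date value are made up.
def _demo_sql_arguments():
    parser = argparse.ArgumentParser()
    parser.add_argument('--sql', required=True)
    args, extra = parser.parse_known_args(
        ['--sql', 'DELETE FROM events WHERE ds = %s', '2014-01-01'])
    # extra == ['2014-01-01']; the runner passes it as tuple(sql_arguments)
    # so psycopg2 binds it to the %s placeholder server-side.
    assert extra == ['2014-01-01']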
def main(): """Main Function """ parser = argparse.ArgumentParser() parser.add_argument('--table_definition', dest='table_definition', required=True) parser.add_argument('--sql', dest='sql', required=True) parser.add_argument('--analyze', action='store_true', default=False) parser.add_argument('--non_transactional', action='store_true', default=False) args, sql_arguments = parser.parse_known_args() print args, sql_arguments table = Table(SqlStatement(args.table_definition)) connection = redshift_connection() # Enable autocommit for non transactional sql execution if args.non_transactional: connection.autocommit = True table_not_exists = pdsql.read_sql(table.check_not_exists_script().sql(), connection).loc[0][0] cursor = connection.cursor() # Create table in redshift, this is safe due to the if exists condition if table_not_exists: cursor.execute(table.create_script().sql()) # Load data into redshift with upsert query # If there are sql_arguments, place them along with the query # Otherwise, don't include them to avoid having to use %% everytime if len(sql_arguments) >= 1: print cursor.mogrify(args.sql, tuple(sql_arguments)) cursor.execute(args.sql, tuple(sql_arguments)) else: print args.sql cursor.execute(args.sql) cursor.execute('COMMIT') # Analyze the table if args.analyze: cursor.execute(table.analyze_script().sql()) cursor.close() connection.close()
def create_load_redshift_runner():
    """Create the table if needed, validate its schema, and load the data"""
    parser = argparse.ArgumentParser()
    parser.add_argument('--table_definition', dest='table_definition',
                        required=True)
    parser.add_argument('--max_error', dest='max_error', default=0, type=int)
    parser.add_argument('--replace_invalid_char', dest='replace_invalid_char',
                        default=None)
    parser.add_argument('--no_escape', action='store_true', default=False)
    parser.add_argument('--gzip', action='store_true', default=False)
    parser.add_argument('--command_options', dest='command_options',
                        default=None)
    parser.add_argument('--s3_input_paths', dest='input_paths', nargs='+')
    # Accepted as a boolean flag for interface compatibility; unused below
    parser.add_argument('--force_drop_table', dest='force_drop_table',
                        action='store_true', default=False)
    script_arguments = parser.parse_args()
    print script_arguments

    table = Table(SqlStatement(script_arguments.table_definition))
    connection = redshift_connection(
        cursor_factory=psycopg2.extras.RealDictCursor)
    table_not_exists = pdsql.read_sql(table.check_not_exists_script().sql(),
                                      connection).loc[0][0]

    cursor = connection.cursor()
    # Create table in redshift, this is safe due to the if exists condition
    if table_not_exists:
        cursor.execute(table.create_script().sql())
    else:
        # Guard against loading into a table whose schema has drifted
        # from the given table definition
        columns = sorted(
            [column.column_name.lower() for column in table.columns()])
        redshift_table_columns = get_redshift_table_colunms(table, cursor)
        if columns != redshift_table_columns:
            error_string = (
                "Table schema mismatch: {table}\n"
                "Columns for new table: {columns}\n"
                "Columns for existing table: {redshift_table_columns}").format(
                    table=table.full_name,
                    columns=", ".join(columns),
                    redshift_table_columns=", ".join(redshift_table_columns))
            raise Exception(error_string)

    # Load data into redshift
    load_query = load_redshift(table, script_arguments.input_paths,
                               script_arguments.max_error,
                               script_arguments.replace_invalid_char,
                               script_arguments.no_escape,
                               script_arguments.gzip,
                               script_arguments.command_options)
    try:
        cursor.execute(load_query)
        cursor.execute('COMMIT')
    except Exception as error:
        # Surface the per-row load errors from Redshift before re-raising
        error_query = create_error_retrieval_query(
            script_arguments.input_paths)
        cursor.execute(error_query)
        separator = "-" * 50 + "\n"
        stderr.write(
            "Error while loading data into redshift \n\n{}".format(separator))
        for item in cursor.fetchall():
            for key in item:
                stderr.write("{}: {}\n".format(key, str(item[key]).strip()))
            stderr.write(separator)
        raise error

    cursor.close()
    connection.close()
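
# A sketch of the error-retrieval query, assuming create_error_retrieval_query()
# reads Redshift's STL_LOAD_ERRORS system table; the helper's real output may
# differ, and the function name here is hypothetical.
def error_retrieval_query_sketch(input_paths):
    """Fetch the most recent load errors for the given S3 input paths."""
    path_filter = " OR ".join(
        "filename LIKE '%s%%'" % path for path in input_paths)
    return ("SELECT filename, line_number, colname, err_reason, raw_line "
            "FROM stl_load_errors WHERE {filter} "
            "ORDER BY starttime DESC LIMIT 20").format(filter=path_filter)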