Example #1
0
def create_load_redshift_runner():
    """Parse the script arguments and load the input paths into Redshift.

    Steps:
      1. Parse the command-line options (``--table_definition`` is
         required).
      2. Create the table when the existence check reports it missing;
         otherwise compare the sorted, lower-cased column names of the
         supplied definition with the columns reported for the existing
         table.
      3. Execute the load query and commit.

    When the load fails, every row returned by the error-retrieval query is
    written to stderr and the original exception is re-raised.  The cursor
    and the connection are closed on every exit path.

    Raises:
        Exception: if the defined columns differ from the existing table's
            columns; any exception raised by the load is propagated as-is.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--table_definition',
                        dest='table_definition',
                        required=True)
    parser.add_argument('--max_error', dest='max_error', default=0, type=int)
    parser.add_argument('--replace_invalid_char',
                        dest='replace_invalid_char',
                        default=None)
    parser.add_argument('--no_escape', action='store_true', default=False)
    parser.add_argument('--gzip', action='store_true', default=False)
    parser.add_argument('--command_options',
                        dest='command_options',
                        default=None)
    parser.add_argument('--s3_input_paths', dest='input_paths', nargs='+')
    # NOTE(review): --force_drop_table is parsed but never read below, and
    # with no type/action any supplied value arrives as a string -- confirm
    # the intended behaviour before relying on this flag.
    parser.add_argument('--force_drop_table',
                        dest='force_drop_table',
                        default=False)
    script_arguments = parser.parse_args()
    # Parenthesised so the line is valid on both Python 2 and Python 3.
    print(script_arguments)

    table = Table(SqlStatement(script_arguments.table_definition))
    connection = redshift_connection(
        cursor_factory=psycopg2.extras.RealDictCursor)
    try:
        # First cell of the first row of the existence-check result.
        table_not_exists = pdsql.read_sql(
            table.check_not_exists_script().sql(), connection).loc[0][0]

        cursor = connection.cursor()
        try:
            # Create the table only when the check reports it missing.
            if table_not_exists:
                cursor.execute(table.create_script().sql())
            else:
                columns = sorted(
                    [column.column_name.lower()
                     for column in table.columns()])
                # presumably returned sorted and lower-cased so that it is
                # comparable with `columns` -- verify against the helper.
                redshift_table_columns = get_redshift_table_colunms(
                    table, cursor)

                if columns != redshift_table_columns:
                    error_string = (
                        "Table schema mismatch: {table}\n"
                        "Columns for new table: {columns}\n"
                        "Columns for existing table: "
                        "{redshift_table_columns}").format(
                            table=table.full_name,
                            columns=", ".join(columns),
                            redshift_table_columns=", ".join(
                                redshift_table_columns))
                    raise Exception(error_string)

            # Load data into redshift
            load_query = load_redshift(table, script_arguments.input_paths,
                                       script_arguments.max_error,
                                       script_arguments.replace_invalid_char,
                                       script_arguments.no_escape,
                                       script_arguments.gzip,
                                       script_arguments.command_options)
            try:
                cursor.execute(load_query)
                cursor.execute('COMMIT')
            except Exception:
                # NOTE(review): if the session is not in autocommit mode the
                # failed load may leave the transaction aborted, in which
                # case this query would fail too -- confirm how
                # redshift_connection configures the connection.
                error_query = create_error_retrieval_query(
                    script_arguments.input_paths)
                cursor.execute(error_query)
                separator = "-" * 50 + "\n"

                stderr.write(
                    "Error while loading data into redshift \n\n{}".format(
                        separator))

                # One "key: value" line per column, one separator per row.
                for item in cursor.fetchall():
                    for key in item:
                        stderr.write(
                            "{}: {}\n".format(key, str(item[key]).strip()))
                    stderr.write(separator)
                # Bare raise keeps the original traceback.
                raise
        finally:
            cursor.close()
    finally:
        connection.close()
def create_load_redshift_runner():
    """Parse the script arguments and load the input paths into Redshift.

    Steps:
      1. Parse the command-line options (``--table_definition`` is
         required).
      2. Create the table when the existence check reports it missing;
         otherwise compare the sorted, lower-cased column names of the
         supplied definition with the columns reported for the existing
         table.
      3. Execute the load query and commit.

    When the load fails, every row returned by the error-retrieval query is
    written to stderr and the original exception is re-raised.  The cursor
    and the connection are closed on every exit path.

    Raises:
        Exception: if the defined columns differ from the existing table's
            columns; any exception raised by the load is propagated as-is.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--table_definition', dest='table_definition',
                        required=True)
    parser.add_argument('--max_error', dest='max_error', default=0, type=int)
    parser.add_argument('--replace_invalid_char', dest='replace_invalid_char',
                        default=None)
    parser.add_argument('--no_escape', action='store_true', default=False)
    parser.add_argument('--gzip', action='store_true', default=False)
    parser.add_argument('--command_options', dest='command_options',
                        default=None)
    parser.add_argument('--s3_input_paths', dest='input_paths', nargs='+')
    # NOTE(review): --force_drop_table is parsed but never read below, and
    # with no type/action any supplied value arrives as a string -- confirm
    # the intended behaviour before relying on this flag.
    parser.add_argument('--force_drop_table', dest='force_drop_table',
                        default=False)
    script_arguments = parser.parse_args()
    # Parenthesised so the line is valid on both Python 2 and Python 3.
    print(script_arguments)

    table = Table(SqlStatement(script_arguments.table_definition))
    connection = redshift_connection(
        cursor_factory=psycopg2.extras.RealDictCursor)
    try:
        # First cell of the first row of the existence-check result.
        table_not_exists = pdsql.read_sql(
            table.check_not_exists_script().sql(), connection).loc[0][0]

        cursor = connection.cursor()
        try:
            # Create the table only when the check reports it missing.
            if table_not_exists:
                cursor.execute(table.create_script().sql())
            else:
                # `columns` describes the NEW definition passed on the
                # command line; `redshift_table_columns` is read for the
                # EXISTING table, so the labels below must follow that.
                columns = sorted(
                    [column.column_name.lower()
                     for column in table.columns()])
                # presumably returned sorted and lower-cased so that it is
                # comparable with `columns` -- verify against the helper.
                redshift_table_columns = get_redshift_table_colunms(
                    table, cursor)

                if columns != redshift_table_columns:
                    error_string = (
                        "Table schema mismatch: {table}\n"
                        "Columns for new table: {columns}\n"
                        "Columns for existing table: "
                        "{redshift_table_columns}").format(
                            table=table.full_name,
                            columns=", ".join(columns),
                            redshift_table_columns=", ".join(
                                redshift_table_columns))
                    raise Exception(error_string)

            # Load data into redshift
            load_query = load_redshift(
                table, script_arguments.input_paths,
                script_arguments.max_error,
                script_arguments.replace_invalid_char,
                script_arguments.no_escape,
                script_arguments.gzip, script_arguments.command_options)
            try:
                cursor.execute(load_query)
                cursor.execute('COMMIT')
            except Exception:
                # NOTE(review): if the session is not in autocommit mode the
                # failed load may leave the transaction aborted, in which
                # case this query would fail too -- confirm how
                # redshift_connection configures the connection.
                error_query = create_error_retrieval_query(
                    script_arguments.input_paths)
                cursor.execute(error_query)
                separator = "-" * 50 + "\n"

                stderr.write(
                    "Error while loading data into redshift \n\n{}".format(
                        separator))

                # One "key: value" line per column, one separator per row.
                for item in cursor.fetchall():
                    for key in item:
                        stderr.write(
                            "{}: {}\n".format(key, str(item[key]).strip()))
                    stderr.write(separator)
                # Bare raise keeps the original traceback.
                raise
        finally:
            cursor.close()
    finally:
        connection.close()