Exemple #1
0
def main():
    """Main Function
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--table_definition', dest='table_definition',
                        required=True)
    parser.add_argument('--max_error', dest='max_error', default=0, type=int)
    parser.add_argument('--replace_invalid_char', dest='replace_invalid_char',
                        default=None)
    parser.add_argument('--no_escape', action='store_true', default=False)
    parser.add_argument('--gzip', action='store_true', default=False)
    parser.add_argument('--command_options', dest='command_options', default=None)
    parser.add_argument('--s3_input_paths', dest='input_paths', nargs='+')
    args = parser.parse_args()
    print args

    table = Table(SqlStatement(args.table_definition))
    connection = redshift_connection()
    table_not_exists = pdsql.read_sql(table.check_not_exists_script().sql(),
                                      connection).loc[0][0]

    cursor = connection.cursor()
    # Create table in redshift, this is safe due to the if exists condition
    if table_not_exists:
        cursor.execute(table.create_script().sql())

    # Load data into redshift
    load_query = load_redshift(table, args.input_paths, args.max_error,
                               args.replace_invalid_char, args.no_escape,
                               args.gzip, args.command_options)
    cursor.execute(load_query)
    cursor.execute('COMMIT')
    cursor.close()
    connection.close()
def main():
    """Main function
    """
    parser = argparse.ArgumentParser()

    parser.add_argument('--table', dest='table', required=True)
    parser.add_argument('--sns_topic_arn', dest='sns_topic_arn', default=None)
    parser.add_argument('--test_name',
                        dest='test_name',
                        default="Check Primary Key")
    parser.add_argument('--log_to_s3', action='store_true', default=False)
    parser.add_argument('--path_suffix', dest='path_suffix', default=None)

    args = parser.parse_args()

    connection = redshift_connection()
    table = Table(SqlScript(args.table))
    result = pdsql.read_sql(table.select_duplicates_script().sql(), connection)
    check = PrimaryKeyCheck(len(result),
                            name=args.test_name,
                            sns_topic_arn=args.sns_topic_arn)
    check.publish(args.log_to_s3,
                  table=table.full_name,
                  path_suffix=args.path_suffix)
    connection.close()
Exemple #3
0
def sql_runner():
    """Main Function
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--table_definition', dest='table_definition',
                        required=True)
    parser.add_argument('--sql', dest='sql', required=True)
    parser.add_argument('--analyze', action='store_true', default=False)
    parser.add_argument('--non_transactional', action='store_true',
                        default=False)

    args, sql_arguments = parser.parse_known_args()
    print args, sql_arguments

    sql_query = args.sql
    if sql_query.startswith('s3://'):
        sql_query = S3File(s3_path=S3Path(uri=args.sql)).text

    table = Table(SqlStatement(args.table_definition))
    connection = redshift_connection()
    # Enable autocommit for non transactional sql execution
    if args.non_transactional:
        connection.autocommit = True
    else:
        # connection by default sets autocommit to True, but for
        # the SQL runner, it should be False to put all SQLs into one
        # transaction.
        connection.autocommit = False

    table_not_exists = pdsql.read_sql(table.check_not_exists_script().sql(),
                                      connection).loc[0][0]

    cursor = connection.cursor()
    # Create table in redshift, this is safe due to the if exists condition
    if table_not_exists:
        cursor.execute(table.create_script().sql())

    # Load data into redshift with upsert query
    # If there are sql_arguments, place them along with the query
    # Otherwise, don't include them to avoid having to use %% everytime
    if len(sql_arguments) >= 1:
        print cursor.mogrify(sql_query, tuple(sql_arguments))
        cursor.execute(sql_query, tuple(sql_arguments))
    else:
        print sql_query
        cursor.execute(sql_query)
    cursor.execute('COMMIT')

    # Analyze the table
    if args.analyze:
        cursor.execute(table.analyze_script().sql())

    cursor.close()
    connection.close()
Exemple #4
0
def main():
    """Main Function
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--table_definition',
                        dest='table_definition',
                        required=True)
    parser.add_argument('--sql', dest='sql', required=True)
    parser.add_argument('--analyze', action='store_true', default=False)
    parser.add_argument('--non_transactional',
                        action='store_true',
                        default=False)

    args, sql_arguments = parser.parse_known_args()
    print args, sql_arguments

    table = Table(SqlStatement(args.table_definition))
    connection = redshift_connection()
    # Enable autocommit for non transactional sql execution
    if args.non_transactional:
        connection.autocommit = True

    table_not_exists = pdsql.read_sql(table.check_not_exists_script().sql(),
                                      connection).loc[0][0]

    cursor = connection.cursor()
    # Create table in redshift, this is safe due to the if exists condition
    if table_not_exists:
        cursor.execute(table.create_script().sql())

    # Load data into redshift with upsert query
    # If there are sql_arguments, place them along with the query
    # Otherwise, don't include them to avoid having to use %% everytime
    if len(sql_arguments) >= 1:
        print cursor.mogrify(args.sql, tuple(sql_arguments))
        cursor.execute(args.sql, tuple(sql_arguments))
    else:
        print args.sql
        cursor.execute(args.sql)
    cursor.execute('COMMIT')

    # Analyze the table
    if args.analyze:
        cursor.execute(table.analyze_script().sql())

    cursor.close()
    connection.close()
Exemple #5
0
def create_load_redshift_runner():
    parser = argparse.ArgumentParser()
    parser.add_argument('--table_definition',
                        dest='table_definition',
                        required=True)
    parser.add_argument('--max_error', dest='max_error', default=0, type=int)
    parser.add_argument('--replace_invalid_char',
                        dest='replace_invalid_char',
                        default=None)
    parser.add_argument('--no_escape', action='store_true', default=False)
    parser.add_argument('--gzip', action='store_true', default=False)
    parser.add_argument('--command_options',
                        dest='command_options',
                        default=None)
    parser.add_argument('--s3_input_paths', dest='input_paths', nargs='+')
    parser.add_argument('--force_drop_table',
                        dest='force_drop_table',
                        default=False)
    script_arguments = parser.parse_args()
    print script_arguments

    table = Table(SqlStatement(script_arguments.table_definition))
    connection = redshift_connection(
        cursor_factory=psycopg2.extras.RealDictCursor)
    table_not_exists = pdsql.read_sql(table.check_not_exists_script().sql(),
                                      connection).loc[0][0]

    cursor = connection.cursor()
    # Create table in redshift, this is safe due to the if exists condition
    if table_not_exists:
        cursor.execute(table.create_script().sql())
    else:
        columns = sorted(
            [column.column_name.lower() for column in table.columns()])
        redshift_table_columns = get_redshift_table_colunms(table, cursor)

        if columns != redshift_table_columns:
            error_string = (
                "Table schema mismatch: {table}\n"
                "Columns for new table: {columns}\n"
                "Columns for existing table: {redshift_table_columns}").format(
                    table=table.full_name,
                    columns=", ".join(columns),
                    redshift_table_columns=", ".join(redshift_table_columns))
            raise Exception(error_string)

    # Load data into redshift
    load_query = load_redshift(table, script_arguments.input_paths,
                               script_arguments.max_error,
                               script_arguments.replace_invalid_char,
                               script_arguments.no_escape,
                               script_arguments.gzip,
                               script_arguments.command_options)
    try:
        cursor.execute(load_query)
        cursor.execute('COMMIT')
    except Exception as error:
        error_query = create_error_retrieval_query(
            script_arguments.input_paths)
        cursor.execute(error_query)
        separator = "-" * 50 + "\n"

        stderr.write(
            "Error while loading data into redshift \n\n{}".format(separator))

        for item in cursor.fetchall():
            for key in item:
                stderr.write("{}: {}\n".format(key, str(item[key]).strip()))
            stderr.write(separator)
        raise error

    cursor.close()
    connection.close()