def run_commands(conn, commands, cw=None, cluster_name=None, suppress_errors=False):
    for idx, c in enumerate(commands, start=1):
        if c is not None:
            comment('[%s] Running %s out of %s commands: %s' % (str(os.getpid()), idx, len(commands), c))
            try:
                cursor = conn.cursor()
                cursor.execute(c)
                comment('Success.')
            except Exception:
                # cowardly bail on errors
                conn.rollback()
                print(traceback.format_exc())
                if not suppress_errors:
                    raise

            # emit a cloudwatch metric for the statement
            if cw is not None and cluster_name is not None:
                dimensions = [
                    {'Name': 'ClusterIdentifier', 'Value': cluster_name}
                ]
                if c.lower().startswith('analyze'):
                    metric_name = 'AnalyzeTable'
                elif c.lower().startswith('vacuum'):
                    metric_name = 'VacuumTable'
                else:
                    # skip to the next statement - not exporting anything about these statements to cloudwatch
                    continue

                aws_utils.put_metric(cw, 'Redshift', metric_name, dimensions, None, 1, 'Count')
                if debug:
                    comment("Emitted CloudWatch metric for statement")

    return True
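
A minimal usage sketch for the function above (not from the original source): psycopg2, boto3, and every connection detail below are illustrative assumptions.

# Usage sketch: run two maintenance statements and emit a CloudWatch metric for each.
import boto3
import psycopg2

conn = psycopg2.connect(host='my-cluster.example.us-east-1.redshift.amazonaws.com',
                        port=5439, dbname='dev', user='master', password='...')
cw = boto3.client('cloudwatch', region_name='us-east-1')

run_commands(conn,
             ['analyze my_schema."my_table";', 'vacuum my_schema."my_table";'],
             cw=cw, cluster_name='my-cluster', suppress_errors=True)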
Example #2
def run_commands(conn, commands, cw=None, cluster_name=None, suppress_errors=False):
    for idx, c in enumerate(commands, start=1):
        if c is not None:
            comment('[%s] Running %s out of %s commands: %s' % (str(os.getpid()), idx, len(commands), c))
            try:
                cursor = conn.cursor()
                cursor.execute(c)
                comment('Success.')
            except Exception:
                # cowardly bail on errors
                conn.rollback()
                if not suppress_errors:
                    print(traceback.format_exc())
                    raise

            # emit a cloudwatch metric for the statement
            if cw is not None and cluster_name is not None:
                dimensions = [
                    {'Name': 'ClusterIdentifier', 'Value': cluster_name}
                ]
                if c.lower().startswith('analyze'):
                    metric_name = 'AnalyzeTable'
                elif c.lower().startswith('vacuum'):
                    metric_name = 'VacuumTable'
                else:
                    # skip to the next statement - not exporting anything about these statements to cloudwatch
                    continue

                aws_utils.put_metric(cw, 'Redshift', metric_name, dimensions, None, 1, 'Count')
                if debug:
                    comment("Emitted CloudWatch metric for statement")

    return True
Example #3
def analyze(table_info):
    table_name = table_info[0]
    dist_style = table_info[3]
    owner = table_info[4]
    table_comment = table_info[5]

    # get the count of columns that have raw encoding applied
    table_unoptimised = False
    count_unoptimised = 0
    encodings_modified = False
    output = get_count_raw_columns(table_name)

    if output is None:
        print("Unable to determine potential RAW column encoding for %s" %
              table_name)
        return ERROR
    else:
        for row in output:
            if row[0] > 0:
                table_unoptimised = True
                count_unoptimised += row[0]

    if not table_unoptimised and not force:
        comment("Table %s does not require encoding optimisation" % table_name)
        return OK
    else:
        comment("Table %s contains %s unoptimised columns" %
                (table_name, count_unoptimised))
        if force:
            comment("Using Force Override Option")

        statement = 'analyze compression %s."%s"' % (schema_name, table_name)

        if comprows is not None:
            statement = statement + (" comprows %s" % int(comprows))

        try:
            if debug:
                comment(statement)

            comment("Analyzing Table '%s'" % (table_name, ))

            # run the analyze in a loop, because it could be locked by another process modifying rows and get a timeout
            analyze_compression_result = None
            analyze_retry = 10
            attempt_count = 0
            last_exception = None
            while attempt_count < analyze_retry and analyze_compression_result is None:
                try:
                    analyze_compression_result = execute_query(statement)
                except KeyboardInterrupt:
                    # To handle Ctrl-C from user
                    cleanup(conn)
                    return TERMINATED_BY_USER
                except Exception as e:
                    print(e)
                    attempt_count += 1
                    last_exception = e

                    # Exponential Backoff
                    time.sleep(2**attempt_count * RETRY_TIMEOUT)

            if analyze_compression_result is None:
                if last_exception is not None:
                    print("Unable to analyze %s due to Exception %s" %
                          (table_name, last_exception.message))
                else:
                    print("Unknown Error")
                return ERROR

            if target_schema == schema_name:
                target_table = '%s_$mig' % table_name
            else:
                target_table = table_name

            create_table = 'begin;\nlock table %s."%s";\ncreate table %s."%s"(' % (
                schema_name,
                table_name,
                target_schema,
                target_table,
            )

            # query the table column definition
            descr = get_table_desc(table_name)

            encode_columns = []
            statements = []
            sortkeys = {}
            has_zindex_sortkeys = False
            has_identity = False
            non_identity_columns = []
            fks = []
            table_distkey = None
            table_sortkeys = []
            new_sortkey_arr = [t.strip() for t in new_sort_keys.split(',')
                               ] if new_sort_keys is not None else []

            # count of suggested optimizations
            count_optimized = 0
            # process each item given back by the analyze request
            for row in analyze_compression_result:
                if debug:
                    comment("Analyzed Compression Row State: %s" % str(row))
                col = row[1]

                # compare the previous encoding to the new encoding
                new_encoding = row[2]
                old_encoding = descr[col][2]
                old_encoding = 'raw' if old_encoding == 'none' else old_encoding
                if new_encoding != old_encoding:
                    encodings_modified = True
                    count_optimized += 1

                    if debug:
                        comment(
                            "Column %s will be modified from %s encoding to %s encoding"
                            % (col, old_encoding, new_encoding))

                # fix datatypes from the description type to the create type
                col_type = descr[col][1]

                # check whether varchars columns are too wide
                if analyze_col_width and "character varying" in col_type:
                    curr_col_length = int(re.search(r'\d+', col_type).group())
                    if curr_col_length > 255:
                        col_len_statement = 'select /* computing max column length */ max(len(%s)) from %s."%s"' % (
                            descr[col][0], schema_name, table_name)
                        try:
                            if debug:
                                comment(col_len_statement)

                            comment(
                                "Analyzing max length of character column '%s' for table '%s.%s' "
                                % (col, schema_name, table_name))

                            # run the analyze in a loop, because it could be locked by another process modifying rows and get a timeout
                            col_len_result = None
                            col_len_retry = 10
                            col_len_attempt_count = 0
                            col_len_last_exception = None
                            while col_len_attempt_count < col_len_retry and col_len_result is None:
                                try:
                                    col_len_result = execute_query(
                                        col_len_statement)
                                except KeyboardInterrupt:
                                    # To handle Ctrl-C from user
                                    cleanup(conn)
                                    return TERMINATED_BY_USER
                                except Exception as e:
                                    print(e)
                                    col_len_attempt_count += 1
                                    col_len_last_exception = e

                                    # Exponential Backoff
                                    time.sleep(2**col_len_attempt_count *
                                               RETRY_TIMEOUT)

                            if col_len_result is None:
                                if col_len_last_exception is not None:
                                    print(
                                        "Unable to determine length of %s for table %s due to Exception %s"
                                        % (col, table_name,
                                           col_len_last_exception))
                                else:
                                    print("Unknown Error")
                                return ERROR

                            if debug:
                                comment(
                                    "Max width of character column '%s' for table '%s.%s' is %d. Current width is %d."
                                    % (descr[col][0], schema_name, table_name,
                                       col_len_result[0][0], curr_col_length))

                            if col_len_result[0][0] < curr_col_length:
                                col_type = re.sub(str(curr_col_length),
                                                  str(col_len_result[0][0]),
                                                  col_type)
                                encodings_modified = True

                        except Exception as e:
                            print('Exception %s during analysis of %s' %
                                  (e, table_name))
                            print(traceback.format_exc())
                            return ERROR

                col_type = col_type.replace('character varying',
                                            'varchar').replace(
                                                'without time zone', '')

                # check whether number columns are too wide
                if analyze_col_width and "int" in col_type:
                    col_len_statement = 'select max(%s) from %s."%s"' % (
                        descr[col][0], schema_name, table_name)
                    try:
                        if debug:
                            comment(col_len_statement)

                        comment(
                            "Analyzing max column '%s' for table '%s.%s' " %
                            (col, schema_name, table_name))

                        # run the analyze in a loop, because it could be locked by another process modifying rows and get a timeout
                        col_len_result = None
                        col_len_retry = 10
                        col_len_attempt_count = 0
                        col_len_last_exception = None
                        while col_len_attempt_count < col_len_retry and col_len_result is None:
                            try:
                                col_len_result = execute_query(
                                    col_len_statement)
                            except KeyboardInterrupt:
                                # To handle Ctrl-C from user
                                cleanup(conn)
                                return TERMINATED_BY_USER
                            except Exception as e:
                                print(e)
                                col_len_attempt_count += 1
                                col_len_last_exception = e

                                # Exponential Backoff
                                time.sleep(2**col_len_attempt_count *
                                           RETRY_TIMEOUT)

                        if col_len_result is None:
                            if col_len_last_exception is not None:
                                print(
                                    "Unable to determine length of %s for table %s due to Exception %s"
                                    % (col, table_name, col_len_last_exception))
                            else:
                                print("Unknown Error")
                            return ERROR

                        if debug:
                            max_val = col_len_result[0][0] if col_len_result else 'not defined'
                            comment(
                                "Max of column '%s' for table '%s.%s' is %s. Current column type is %s."
                                % (descr[col][0], schema_name, table_name,
                                   max_val, col_type))

                        # Test to see if the largest value fits in a smallint (2 bytes)
                        if col_len_result[0][0] <= int(math.pow(2, 15) - 1) and col_type != "smallint":
                            col_type = "smallint"
                            encodings_modified = True

                        # Test to see if the largest value fits in an integer (4 bytes)
                        elif col_len_result[0][0] <= int(math.pow(2, 31) - 1) and col_type != "integer":
                            col_type = "integer"
                            encodings_modified = True

                    except Exception as e:
                        print('Exception %s during analysis of %s' %
                              (e, table_name))
                        print(traceback.format_exc())
                        return ERROR

                # link in the existing distribution key, or set the new one
                row_distkey = descr[col][3]
                if table_name is not None and new_dist_key is not None:
                    if col == new_dist_key:
                        distkey = 'DISTKEY'
                        dist_style = 'KEY'
                        table_distkey = col
                    else:
                        distkey = ''
                else:
                    if str(row_distkey).upper()[0] == 'T':
                        distkey = 'DISTKEY'
                        dist_style = 'KEY'
                        table_distkey = col
                    else:
                        distkey = ''

                # link in the existing sort keys, or set the new ones
                row_sortkey = descr[col][4]
                if table_name is not None and len(new_sortkey_arr) > 0:
                    if col in new_sortkey_arr:
                        sortkeys[new_sortkey_arr.index(col) + 1] = col
                        table_sortkeys.append(col)
                else:
                    if row_sortkey != 0:
                        # add the absolute ordering of the sortkey to the list of all sortkeys
                        sortkeys[abs(row_sortkey)] = col
                        table_sortkeys.append(col)

                        if row_sortkey < 0:
                            has_zindex_sortkeys = True

                # don't compress first sort key
                if abs(row_sortkey) == 1:
                    compression = 'RAW'
                else:
                    compression = row[2]

                # extract null/not null setting
                col_null = descr[col][5]

                if str(col_null).upper() == 'TRUE':
                    col_null = 'NOT NULL'
                else:
                    col_null = ''

                # get default or identity syntax for this column
                default_or_identity = descr[col][6]
                if default_or_identity:
                    ident_data = get_identity(default_or_identity)
                    if ident_data is None:
                        default_value = 'default %s' % default_or_identity
                        non_identity_columns.append(col)
                    else:
                        default_value = 'identity (%s, %s)' % ident_data
                        has_identity = True
                else:
                    default_value = ''
                    non_identity_columns.append(col)

                # add the formatted column specification
                encode_columns.extend([
                    '"%s" %s %s %s encode %s %s' %
                    (col, col_type, default_value, col_null, compression,
                     distkey)
                ])

            # abort if a new distkey was set but we couldn't find it in the set of all columns
            if new_dist_key is not None and table_distkey is None:
                msg = "Column '%s' not found when setting new Table Distribution Key" % new_dist_key
                comment(msg)
                raise Exception(msg)

            # abort if new sortkeys were set but we couldn't find them in the set of all columns
            if new_sort_keys is not None and len(table_sortkeys) != len(
                    new_sortkey_arr):
                if debug:
                    comment("Reqested Sort Keys: %s" % new_sort_keys)
                    comment("Resolved Sort Keys: %s" % table_sortkeys)
                msg = "Column resolution of sortkeys '%s' not found when setting new Table Sort Keys" % new_sort_keys
                comment(msg)
                raise Exception(msg)

            # if this table's encodings have not changed, then don't do a modification, unless force options is set
            if (not force) and (not encodings_modified):
                comment(
                    "Column Encoding resulted in an identical table - no changes will be made"
                )
            else:
                comment("Column Encoding will be modified for %s.%s" %
                        (schema_name, table_name))

                # add all the column encoding statements on to the create table statement, suppressing the leading comma on the first one
                for i, s in enumerate(encode_columns):
                    create_table += '\n%s%s' % ('' if i == 0 else ',', s)

                create_table = create_table + '\n)\n'

                # add diststyle all if needed
                if dist_style == 'ALL':
                    create_table = create_table + 'diststyle all\n'

                # add sort key as a table block to accommodate multiple columns
                if len(sortkeys) > 0:
                    if debug:
                        comment("Adding Sortkeys: %s" % sortkeys)
                    sortkey = '%sSORTKEY(' % ('INTERLEAVED '
                                              if has_zindex_sortkeys else '')

                    for i in range(1, len(sortkeys) + 1):
                        sortkey = sortkey + sortkeys[i]

                        if i != len(sortkeys):
                            sortkey = sortkey + ','
                        else:
                            sortkey = sortkey + ')\n'
                    create_table = create_table + (' %s ' % sortkey)

                create_table = create_table + ';'

                # run the create table statement
                statements.extend([create_table])

                # get the primary key statement
                statements.extend([
                    get_primary_key(schema_name, target_schema, table_name,
                                    target_table)
                ])

                # set the table owner
                statements.extend([
                    'alter table %s."%s" owner to %s;' %
                    (target_schema, target_table, owner)
                ])

                if table_comment is not None:
                    statements.extend([
                        'comment on table %s."%s" is \'%s\';' %
                        (target_schema, target_table, table_comment)
                    ])

                # insert the old data into the new table
                # if we have identity column(s), we can't insert data from them, so do selective insert
                if has_identity:
                    source_columns = ', '.join(non_identity_columns)
                    mig_columns = '(' + source_columns + ')'
                else:
                    source_columns = '*'
                    mig_columns = ''

                insert = 'insert into %s."%s" %s select %s from %s."%s"' % (
                    target_schema, target_table, mig_columns, source_columns,
                    schema_name, table_name)
                if len(table_sortkeys) > 0:
                    insert = "%s order by %s;" % (insert,
                                                  ",".join(table_sortkeys))

                statements.extend([insert])

                # analyze the new table
                analyze = 'analyze %s."%s";' % (target_schema, target_table)
                statements.extend([analyze])

                if target_schema == schema_name:
                    # rename the old table to _$old or drop
                    if drop_old_data:
                        drop = 'drop table %s."%s" cascade;' % (target_schema,
                                                                table_name)
                    else:
                        # the alter table statement for the current data will use the first 104 characters of the original table name, the current datetime as YYYYMMDD and a 10 digit random string
                        drop = 'alter table %s."%s" rename to "%s_%s_%s_$old";' % (
                            target_schema, table_name, table_name[0:104],
                            datetime.date.today().strftime("%Y%m%d"),
                            shortuuid.ShortUUID().random(length=10))

                    statements.extend([drop])

                    # rename the migrate table to the old table name
                    rename = 'alter table %s."%s" rename to "%s";' % (
                        target_schema, target_table, table_name)
                    statements.extend([rename])

                # add foreign keys
                fks = get_foreign_keys(schema_name, target_schema, table_name)

                statements.extend(['commit;'])

                if do_execute:
                    if not run_commands(get_pg_conn(), statements):
                        if not ignore_errors:
                            if debug:
                                print("Error running statements: %s" %
                                      (str(statements), ))
                            return ERROR

                    # emit a cloudwatch metric for the table
                    if cw is not None:
                        dimensions = [{
                            'Name': 'ClusterIdentifier',
                            'Value': db_host.split('.')[0]
                        }, {
                            'Name': 'TableName',
                            'Value': table_name
                        }]
                        aws_utils.put_metric(cw, 'Redshift',
                                             'ColumnEncodingModification',
                                             dimensions, None, 1, 'Count')
                        if debug:
                            comment(
                                "Emitted Cloudwatch Metric for Column Encoded table"
                            )
                else:
                    comment("No encoding modifications run for %s.%s" %
                            (schema_name, table_name))
        except Exception as e:
            print('Exception %s during analysis of %s' %
                  (e, table_name))
            print(traceback.format_exc())
            return ERROR

        print_statements(statements)

        return (OK, fks, encodings_modified)
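
The retry loop around execute_query recurs with the same shape throughout these examples; a hedged sketch of that pattern as a standalone helper (retry_query is a name introduced here, and it assumes the module-level execute_query and RETRY_TIMEOUT used above):

import time

def retry_query(statement, retries=10, base_timeout=1):
    # Sketch of the backoff loop above: retry a statement that may hit a
    # lock timeout, sleeping 2**attempt * base_timeout between attempts.
    last_exception = None
    for attempt in range(1, retries + 1):
        try:
            return execute_query(statement)
        except Exception as e:
            last_exception = e
            time.sleep(2 ** attempt * base_timeout)
    raise last_exception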
Example #4
def analyze(table_info):
    schema_name = table_info[0]
    table_name = table_info[1]
    dist_style = table_info[4]
    owner = table_info[5]
    if len(table_info) > 6:
        table_comment = table_info[6]
    else:
        table_comment = None

    # get the count of columns that have raw encoding applied
    table_unoptimised = False
    count_unoptimised = 0
    encodings_modified = False
    output = get_count_raw_columns(schema_name, table_name)

    if output is None:
        print("Unable to determine potential RAW column encoding for %s" % table_name)
        return ERROR
    else:
        for row in output:
            if row[0] > 0:
                table_unoptimised = True
                count_unoptimised += row[0]

    if not table_unoptimised and not force:
        comment("Table %s.%s does not require encoding optimisation" % (schema_name, table_name))
        return OK
    else:
        comment("Table %s.%s contains %s unoptimised columns" % (schema_name, table_name, count_unoptimised))
        if force:
            comment("Using Force Override Option")

        statement = 'analyze compression %s."%s"' % (schema_name, table_name)

        if comprows is not None:
            statement = statement + (" comprows %s" % int(comprows))

        try:
            if debug:
                comment(statement)

            comment("Analyzing Table '%s.%s'" % (schema_name, table_name,))

            # run the analyze in a loop, because it could be locked by another process modifying rows and get a timeout
            analyze_compression_result = None
            analyze_retry = 10
            attempt_count = 0
            last_exception = None
            while attempt_count < analyze_retry and analyze_compression_result is None:
                try:
                    analyze_compression_result = execute_query(statement)
                    # Commit, otherwise analyze compression keeps an exclusive lock until a commit arrives, which can be very long
                    execute_query('commit;')
                except KeyboardInterrupt:
                    # To handle Ctrl-C from user
                    cleanup(get_pg_conn())
                    return TERMINATED_BY_USER
                except Exception as e:
                    execute_query('rollback;')
                    print(e)
                    attempt_count += 1
                    last_exception = e

                    # Exponential Backoff
                    time.sleep(2 ** attempt_count * RETRY_TIMEOUT)

            if analyze_compression_result is None:
                if last_exception is not None:
                    print("Unable to analyze %s due to Exception %s" % (table_name, last_exception.message))
                else:
                    print("Unknown Error")
                return ERROR

            if target_schema is None:
                set_target_schema = schema_name
            else:
                set_target_schema = target_schema

            if set_target_schema == schema_name:
                target_table = '%s_$mig' % table_name
            else:
                target_table = table_name

            create_table = 'begin;\nlock table %s."%s";\ncreate table %s."%s"(' % (
                schema_name, table_name, set_target_schema, target_table,)

            # query the table column definition
            descr = get_table_desc(schema_name, table_name)

            encode_columns = []
            statements = []
            sortkeys = {}
            has_zindex_sortkeys = False
            has_identity = False
            non_identity_columns = []
            fks = []
            table_distkey = None
            table_sortkeys = []
            new_sortkey_arr = [t.strip() for t in new_sort_keys.split(',')] if new_sort_keys is not None else []

            # count of suggested optimizations
            count_optimized = 0
            # process each item given back by the analyze request
            for row in analyze_compression_result:
                if debug:
                    comment("Analyzed Compression Row State: %s" % str(row))
                col = row[1]
                row_sortkey = descr[col][4]

                # compare the previous encoding to the new encoding
                # don't use new encoding for first sortkey
                new_encoding = row[2] if abs(row_sortkey) != 1 else 'raw'
                old_encoding = descr[col][2]
                old_encoding = 'raw' if old_encoding == 'none' else old_encoding
                if new_encoding != old_encoding:
                    encodings_modified = True
                    count_optimized += 1

                    if debug:
                        comment("Column %s will be modified from %s encoding to %s encoding" % (
                            col, old_encoding, new_encoding))

                # fix datatypes from the description type to the create type
                col_type = descr[col][1].replace('character varying', 'varchar').replace('without time zone', '')

                # check whether columns are too wide
                if analyze_col_width and ("varchar" in col_type or "int" in col_type):
                    new_col_type = reduce_column_length(col_type, descr[col][0], table_name)
                    if new_col_type != col_type:
                        col_type = new_col_type
                        encodings_modified = True

                # link in the existing distribution key, or set the new one
                row_distkey = descr[col][3]
                if table_name is not None and new_dist_key is not None:
                    if col == new_dist_key:
                        distkey = 'DISTKEY'
                        dist_style = 'KEY'
                        table_distkey = col
                    else:
                        distkey = ''
                else:
                    if str(row_distkey).upper()[0] == 'T':
                        distkey = 'DISTKEY'
                        dist_style = 'KEY'
                        table_distkey = col
                    else:
                        distkey = ''

                # link in the existing sort keys, or set the new ones
                if table_name is not None and len(new_sortkey_arr) > 0:
                    if col in new_sortkey_arr:
                        sortkeys[new_sortkey_arr.index(col) + 1] = col
                        table_sortkeys.append(col)
                else:
                    if row_sortkey != 0:
                        # add the absolute ordering of the sortkey to the list of all sortkeys
                        sortkeys[abs(row_sortkey)] = col
                        table_sortkeys.append(col)

                        if row_sortkey < 0:
                            has_zindex_sortkeys = True

                # don't compress first sort key column. This will be set on the basis of the existing sort key not
                # being modified, or on the assignment of the new first sortkey
                if (abs(row_sortkey) == 1 and len(new_sortkey_arr) == 0) or (
                        col in table_sortkeys and table_sortkeys.index(col) == 0):
                    compression = 'RAW'
                else:
                    compression = row[2]

                # extract null/not null setting
                col_null = descr[col][5]

                if str(col_null).upper() == 'TRUE':
                    col_null = 'NOT NULL'
                else:
                    col_null = ''

                # get default or identity syntax for this column
                default_or_identity = descr[col][6]
                if default_or_identity:
                    ident_data = get_identity(default_or_identity)
                    if ident_data is None:
                        default_value = 'default %s' % default_or_identity
                        non_identity_columns.append('"%s"' % col)
                    else:
                        default_value = 'identity (%s, %s)' % ident_data
                        has_identity = True
                else:
                    default_value = ''
                    non_identity_columns.append('"%s"' % col)

                # add the formatted column specification
                encode_columns.extend(['"%s" %s %s %s encode %s %s'
                                       % (col, col_type, default_value, col_null, compression, distkey)])

            # abort if a new distkey was set but we couldn't find it in the set of all columns
            if new_dist_key is not None and table_distkey is None:
                msg = "Column '%s' not found when setting new Table Distribution Key" % new_dist_key
                comment(msg)
                raise Exception(msg)

            # abort if new sortkeys were set but we couldn't find them in the set of all columns
            if new_sort_keys is not None and len(table_sortkeys) != len(new_sortkey_arr):
                if debug:
                    comment("Requested Sort Keys: %s" % new_sortkey_arr)
                    comment("Resolved Sort Keys: %s" % table_sortkeys)
                msg = "Column resolution of sortkeys '%s' not found when setting new Table Sort Keys" % new_sortkey_arr
                comment(msg)
                raise Exception(msg)

            # if this table's encodings have not changed, then don't do a modification, unless force options is set
            if (not force) and (not encodings_modified):
                comment("Column Encoding resulted in an identical table - no changes will be made")
            else:
                comment("Column Encoding will be modified for %s.%s" % (schema_name, table_name))

                # add all the column encoding statements on to the create table statement, suppressing the leading
                # comma on the first one
                for i, s in enumerate(encode_columns):
                    create_table += '\n%s%s' % ('' if i == 0 else ',', s)

                create_table = create_table + '\n)\n'

                # add diststyle all if needed
                if dist_style == 'ALL':
                    create_table = create_table + 'diststyle all\n'

                # add sort key as a table block to accommodate multiple columns
                if len(sortkeys) > 0:
                    if debug:
                        comment("Adding Sortkeys: %s" % sortkeys)
                    sortkey = '%sSORTKEY(' % ('INTERLEAVED ' if has_zindex_sortkeys else '')

                    for i in range(1, len(sortkeys) + 1):
                        sortkey = sortkey + sortkeys[i]

                        if i != len(sortkeys):
                            sortkey = sortkey + ','
                        else:
                            sortkey = sortkey + ')\n'
                    create_table = create_table + (' %s ' % sortkey)

                create_table = create_table + ';'

                # run the create table statement
                statements.extend([create_table])

                # get the primary key statement
                statements.extend([get_primary_key(schema_name, set_target_schema, table_name, target_table)])

                # set the table owner
                statements.extend(['alter table %s."%s" owner to %s;' % (set_target_schema, target_table, owner)])

                if table_comment is not None:
                    statements.extend(
                        ['comment on table %s."%s" is \'%s\';' % (set_target_schema, target_table, table_comment)])

                # insert the old data into the new table
                # if we have identity column(s), we can't insert data from them, so do selective insert
                if has_identity:
                    source_columns = ', '.join(non_identity_columns)
                    mig_columns = '(' + source_columns + ')'
                else:
                    source_columns = '*'
                    mig_columns = ''

                insert = 'insert into %s."%s" %s select %s from %s."%s"' % (set_target_schema,
                                                                            target_table,
                                                                            mig_columns,
                                                                            source_columns,
                                                                            schema_name,
                                                                            table_name)
                if len(table_sortkeys) > 0:
                    insert = "%s order by \"%s\";" % (insert, ",".join(table_sortkeys).replace(',', '\",\"'))
                else:
                    insert = "%s;" % (insert)

                statements.extend([insert])

                # analyze the new table
                analyze = 'analyze %s."%s";' % (set_target_schema, target_table)
                statements.extend([analyze])

                if set_target_schema == schema_name:
                    # rename the old table to _$old or drop
                    if drop_old_data:
                        drop = 'drop table %s."%s" cascade;' % (set_target_schema, table_name)
                    else:
                        # the alter table statement for the current data will use the first 104 characters of the
                        # original table name, the current datetime as YYYYMMDD and a 10 digit random string
                        drop = 'alter table %s."%s" rename to "%s_%s_%s_$old";' % (
                            set_target_schema, table_name, table_name[0:104], datetime.date.today().strftime("%Y%m%d"),
                            shortuuid.ShortUUID().random(length=10))

                    statements.extend([drop])

                    # rename the migrate table to the old table name
                    rename = 'alter table %s."%s" rename to "%s";' % (set_target_schema, target_table, table_name)
                    statements.extend([rename])

                # add foreign keys
                fks = get_foreign_keys(schema_name, set_target_schema, table_name)

                # add grants back
                grants = get_grants(schema_name, table_name, db_user)
                if grants is not None:
                    statements.extend(grants)

                statements.extend(['commit;'])

                if do_execute:
                    if not run_commands(get_pg_conn(), statements):
                        if not ignore_errors:
                            if debug:
                                print("Error running statements: %s" % (str(statements),))
                            return ERROR

                    # emit a cloudwatch metric for the table
                    if cw is not None:
                        dimensions = [
                            {'Name': 'ClusterIdentifier', 'Value': db_host.split('.')[0]},
                            {'Name': 'TableName', 'Value': table_name}
                        ]
                        aws_utils.put_metric(cw, 'Redshift', 'ColumnEncodingModification', dimensions, None, 1, 'Count')
                        if debug:
                            comment("Emitted Cloudwatch Metric for Column Encoded table")
                else:
                    comment("No encoding modifications run for %s.%s" % (schema_name, table_name))
        except Exception as e:
            print('Exception %s during analysis of %s' % (e, table_name))
            print(traceback.format_exc())
            return ERROR

        print_statements(statements)

        return (OK, fks, encodings_modified)
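
Examples #4 and #5 call reduce_column_length, which is not shown on this page. A plausible sketch reconstructed from the inlined width-analysis logic in Example #3 (the helper's name and exact behaviour are assumptions, and it relies on the module-level schema_name and execute_query like its callers):

import re

def reduce_column_length(col_type, col_name, table_name):
    # Sketch only: narrow a varchar to its observed max length, or an int
    # type to the smallest integer type that holds the observed max value.
    if 'varchar' in col_type:
        result = execute_query('select max(len("%s")) from %s."%s"'
                               % (col_name, schema_name, table_name))
    else:
        result = execute_query('select max("%s") from %s."%s"'
                               % (col_name, schema_name, table_name))
    max_val = result[0][0]
    if max_val is None:
        # empty table - nothing to learn, keep the current type
        return col_type
    if 'varchar' in col_type:
        curr_len = int(re.search(r'\d+', col_type).group())
        if max_val < curr_len:
            return 'varchar(%s)' % max_val
    elif max_val <= 2 ** 15 - 1 and col_type != 'smallint':
        return 'smallint'
    elif max_val <= 2 ** 31 - 1 and col_type != 'integer':
        return 'integer'
    return col_type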
Example #5
def analyze(table_info):
    schema_name = table_info[0]
    table_name = table_info[1]
    dist_style = table_info[4]
    owner = table_info[5]
    if len(table_info) > 6:
        table_comment = table_info[6]
    else:
        table_comment = None

    # get the count of columns that have raw encoding applied
    table_unoptimised = False
    count_unoptimised = 0
    encodings_modified = False
    output = get_count_raw_columns(schema_name, table_name)

    if output is None:
        print("Unable to determine potential RAW column encoding for %s" %
              table_name)
        return ERROR
    else:
        for row in output:
            if row[0] > 0:
                table_unoptimised = True
                count_unoptimised += row[0]

    if not table_unoptimised and not force:
        comment("Table %s.%s does not require encoding optimisation" %
                (schema_name, table_name))
        return OK
    else:
        comment("Table %s.%s contains %s unoptimised columns" %
                (schema_name, table_name, count_unoptimised))
        if force:
            comment("Using Force Override Option")

        statement = 'analyze compression %s."%s"' % (schema_name, table_name)

        if comprows is not None:
            statement = statement + (" comprows %s" % int(comprows))

        try:
            if debug:
                comment(statement)

            comment("Analyzing Table '%s.%s'" % (
                schema_name,
                table_name,
            ))

            # run the analyze in a loop, because it could be locked by another process modifying rows and get a timeout
            analyze_compression_result = None
            analyze_retry = 10
            attempt_count = 0
            last_exception = None
            while attempt_count < analyze_retry and analyze_compression_result is None:
                try:
                    analyze_compression_result = execute_query(statement)
                    # Commit, otherwise analyze compression keeps an exclusive lock until a commit arrives, which can be very long
                    execute_query('commit;')
                except KeyboardInterrupt:
                    # To handle Ctrl-C from user
                    cleanup(get_pg_conn())
                    return TERMINATED_BY_USER
                except Exception as e:
                    execute_query('rollback;')
                    print(e)
                    attempt_count += 1
                    last_exception = e

                    # Exponential Backoff
                    time.sleep(2**attempt_count * RETRY_TIMEOUT)

            if analyze_compression_result is None:
                if last_exception is not None:
                    print("Unable to analyze %s due to Exception %s" %
                          (table_name, last_exception.message))
                else:
                    print("Unknown Error")
                return ERROR

            if target_schema is None:
                set_target_schema = schema_name
            else:
                set_target_schema = target_schema

            if set_target_schema == schema_name:
                target_table = '%s_$mig' % table_name
            else:
                target_table = table_name

            create_table = 'begin;\nlock table %s."%s";\ncreate table %s."%s"(' % (
                schema_name,
                table_name,
                set_target_schema,
                target_table,
            )

            # query the table column definition
            descr = get_table_desc(schema_name, table_name)

            encode_columns = []
            statements = []
            sortkeys = {}
            has_zindex_sortkeys = False
            has_identity = False
            non_identity_columns = []
            fks = []
            table_distkey = None
            table_sortkeys = []
            new_sortkey_arr = [t.strip() for t in new_sort_keys.split(',')
                               ] if new_sort_keys is not None else []

            # count of suggested optimizations
            count_optimized = 0
            # process each item given back by the analyze request
            for row in analyze_compression_result:
                if debug:
                    comment("Analyzed Compression Row State: %s" % str(row))
                col = row[1]
                row_sortkey = descr[col][4]

                # compare the previous encoding to the new encoding
                # don't use new encoding for first sortkey
                datatype = descr[col][1]

                # use az64 encoding if supported. ANALYZE COMPRESSION does not yet suggest it, but AWS recommends it
                # see https://docs.amazonaws.cn/en_us/redshift/latest/dg/c_Compression_encodings.html for supported types
                az64_types = ['smallint', 'integer', 'bigint', 'decimal', 'date',
                              'timestamp without time zone', 'timestamp with time zone']
                new_encoding = 'az64' if datatype in az64_types else row[2]
                new_encoding = 'raw' if abs(row_sortkey) == 1 else new_encoding
                old_encoding = descr[col][2]
                old_encoding = 'raw' if old_encoding == 'none' else old_encoding
                if new_encoding != old_encoding:
                    encodings_modified = True
                    count_optimized += 1

                # fix datatypes from the description type to the create type
                col_type = descr[col][1].replace('character varying',
                                                 'varchar').replace(
                                                     'without time zone', '')

                # check whether columns are too wide
                if analyze_col_width and ("varchar" in col_type
                                          or "int" in col_type):
                    new_col_type = reduce_column_length(
                        col_type, descr[col][0], table_name)
                    if new_col_type != col_type:
                        col_type = new_col_type
                        encodings_modified = True

                # link in the existing distribution key, or set the new one
                row_distkey = descr[col][3]
                if table_name is not None and new_dist_key is not None:
                    if col == new_dist_key:
                        distkey = 'DISTKEY'
                        dist_style = 'KEY'
                        table_distkey = col
                    else:
                        distkey = ''
                else:
                    if str(row_distkey).upper()[0] == 'T':
                        distkey = 'DISTKEY'
                        dist_style = 'KEY'
                        table_distkey = col
                    else:
                        distkey = ''

                # link in the existing sort keys, or set the new ones
                if table_name is not None and len(new_sortkey_arr) > 0:
                    if col in new_sortkey_arr:
                        sortkeys[new_sortkey_arr.index(col) + 1] = col
                        table_sortkeys.append(col)
                else:
                    if row_sortkey != 0:
                        # add the absolute ordering of the sortkey to the list of all sortkeys
                        sortkeys[abs(row_sortkey)] = col
                        table_sortkeys.append(col)

                        if row_sortkey < 0:
                            has_zindex_sortkeys = True

                # don't compress first sort key column. This will be set on the basis of the existing sort key not
                # being modified, or on the assignment of the new first sortkey
                if (abs(row_sortkey) == 1 and len(new_sortkey_arr)
                        == 0) or (col in table_sortkeys
                                  and table_sortkeys.index(col) == 0):
                    compression = 'RAW'
                else:
                    compression = new_encoding

                # extract null/not null setting
                col_null = descr[col][5]

                if str(col_null).upper() == 'TRUE':
                    col_null = 'NOT NULL'
                else:
                    col_null = ''

                # get default or identity syntax for this column
                default_or_identity = descr[col][6]
                if default_or_identity:
                    ident_data = get_identity(default_or_identity)
                    if ident_data is None:
                        default_value = 'default %s' % default_or_identity
                        non_identity_columns.append('"%s"' % col)
                    else:
                        default_value = 'identity (%s, %s)' % ident_data
                        has_identity = True
                else:
                    default_value = ''
                    non_identity_columns.append('"%s"' % col)

                if debug:
                    comment("Column %s will be encoded as %s (previous %s)" %
                            (col, compression, old_encoding))

                # add the formatted column specification
                encode_columns.extend([
                    '"%s" %s %s %s encode %s %s' %
                    (col, col_type, default_value, col_null, compression,
                     distkey)
                ])

            # abort if a new distkey was set but we couldn't find it in the set of all columns
            if new_dist_key is not None and table_distkey is None:
                msg = "Column '%s' not found when setting new Table Distribution Key" % new_dist_key
                comment(msg)
                raise Exception(msg)

            # abort if new sortkeys were set but we couldn't find them in the set of all columns
            if new_sort_keys is not None and len(table_sortkeys) != len(
                    new_sortkey_arr):
                if debug:
                    comment("Requested Sort Keys: %s" % new_sortkey_arr)
                    comment("Resolved Sort Keys: %s" % table_sortkeys)
                msg = "Column resolution of sortkeys '%s' not found when setting new Table Sort Keys" % new_sortkey_arr
                comment(msg)
                raise Exception(msg)

            # if this table's encodings have not changed, then don't do a modification, unless force options is set
            if (not force) and (not encodings_modified):
                comment(
                    "Column Encoding resulted in an identical table - no changes will be made"
                )
            else:
                comment("Column Encoding will be modified for %s.%s" %
                        (schema_name, table_name))

                # add all the column encoding statements on to the create table statement, suppressing the leading
                # comma on the first one
                for i, s in enumerate(encode_columns):
                    create_table += '\n%s%s' % ('' if i == 0 else ',', s)

                create_table = create_table + '\n)\n'

                # add diststyle all if needed
                if dist_style == 'ALL':
                    create_table = create_table + 'diststyle all\n'

                # add sort key as a table block to accommodate multiple columns
                if len(sortkeys) > 0:
                    if debug:
                        comment("Adding Sortkeys: %s" % sortkeys)
                    sortkey = '%sSORTKEY(' % ('INTERLEAVED '
                                              if has_zindex_sortkeys else '')

                    for i in range(1, len(sortkeys) + 1):
                        sortkey = sortkey + sortkeys[i]

                        if i != len(sortkeys):
                            sortkey = sortkey + ','
                        else:
                            sortkey = sortkey + ')\n'
                    create_table = create_table + (' %s ' % sortkey)

                create_table = create_table + ';'

                # run the create table statement
                statements.extend([create_table])

                # get the primary key statement
                statements.extend([
                    get_primary_key(schema_name, set_target_schema, table_name,
                                    target_table)
                ])

                # set the table owner
                statements.extend([
                    'alter table %s."%s" owner to %s;' %
                    (set_target_schema, target_table, owner)
                ])

                if table_comment is not None:
                    statements.extend([
                        'comment on table %s."%s" is \'%s\';' %
                        (set_target_schema, target_table, table_comment)
                    ])

                # insert the old data into the new table
                # if we have identity column(s), we can't insert data from them, so do selective insert
                if has_identity:
                    source_columns = ', '.join(non_identity_columns)
                    mig_columns = '(' + source_columns + ')'
                else:
                    source_columns = '*'
                    mig_columns = ''

                insert = 'insert into %s."%s" %s select %s from %s."%s"' % (
                    set_target_schema, target_table, mig_columns,
                    source_columns, schema_name, table_name)
                if len(table_sortkeys) > 0:
                    insert = '%s order by "%s";' % (
                        insert, '", "'.join(table_sortkeys))
                else:
                    insert = "%s;" % (insert)

                statements.extend([insert])

                # analyze the new table
                analyze = 'analyze %s."%s";' % (set_target_schema,
                                                target_table)
                statements.extend([analyze])

                if set_target_schema == schema_name:
                    # rename the old table to _$old or drop
                    if drop_old_data:
                        drop = 'drop table %s."%s" cascade;' % (
                            set_target_schema, table_name)
                    else:
                        # the alter table statement for the current data will use the first 104 characters of the
                        # original table name, the current datetime as YYYYMMDD and a 10 digit random string
                        drop = 'alter table %s."%s" rename to "%s_%s_%s_$old";' % (
                            set_target_schema, table_name, table_name[0:104],
                            datetime.date.today().strftime("%Y%m%d"),
                            shortuuid.ShortUUID().random(length=10))

                    statements.extend([drop])

                    # rename the migrate table to the old table name
                    rename = 'alter table %s."%s" rename to "%s";' % (
                        set_target_schema, target_table, table_name)
                    statements.extend([rename])

                # add foreign keys
                fks = get_foreign_keys(schema_name, set_target_schema,
                                       table_name)

                # add grants back
                grants = get_grants(schema_name, table_name, db_user)
                if grants is not None:
                    statements.extend(grants)

                statements.extend(['commit;'])

                if do_execute:
                    if not run_commands(get_pg_conn(), statements):
                        if not ignore_errors:
                            if debug:
                                print("Error running statements: %s" %
                                      (str(statements), ))
                            return ERROR

                    # emit a cloudwatch metric for the table
                    if cw is not None:
                        dimensions = [{
                            'Name': 'ClusterIdentifier',
                            'Value': db_host.split('.')[0]
                        }, {
                            'Name': 'TableName',
                            'Value': table_name
                        }]
                        aws_utils.put_metric(cw, 'Redshift',
                                             'ColumnEncodingModification',
                                             dimensions, None, 1, 'Count')
                        if debug:
                            comment(
                                "Emitted Cloudwatch Metric for Column Encoded table"
                            )
                else:
                    comment("No encoding modifications run for %s.%s" %
                            (schema_name, table_name))
        except Exception as e:
            print('Exception %s during analysis of %s' % (e, table_name))
            print(traceback.format_exc())
            return ERROR

        print_statements(statements)

        return (OK, fks, encodings_modified)
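
Example #5's az64 override at the top of the row loop, combined with the first-sortkey rule, reduces to a small predicate; a sketch (choose_encoding is a name introduced here, not part of the original utility):

AZ64_TYPES = frozenset(['smallint', 'integer', 'bigint', 'decimal', 'date',
                        'timestamp without time zone',
                        'timestamp with time zone'])

def choose_encoding(datatype, suggested_encoding, is_first_sortkey):
    # Sketch of the selection logic in Example #5: never compress the first
    # sort key column, and prefer az64 wherever the type supports it.
    if is_first_sortkey:
        return 'raw'
    if datatype in AZ64_TYPES:
        return 'az64'
    return suggested_encoding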