def update_database_schema(psql, db, ddate, s3_logdir, schema_file, logstream):
    """ Check the new schema against what exists in the database:
    1. create new tables if missing
    2. compare table definitions
    3. add new columns

    Args:
    psql -- handle to talk to redshift
    db -- redshift database containing table
    ddate -- the date string of the data to be copied formatted YYYY/MM/DD
    s3_logdir -- path to location of tables in PSV format
    schema_file -- the name of the schema file with the create table command
    logstream -- a PipelineStreamLogger

    Return: None
    """
    # TODO: db.yaml as SPOT
    fname = schema_file.replace('.sql', '.yaml')
    yaml_dict = load_from_file(fname)
    rs_log_schema = RedShiftLogSchema(safe_load(yaml_dict))
    err_tbl_name, err_tbl = rs_log_schema.get_error_table()
    rs_log_schema.table_add(err_tbl_name, err_tbl)
    tables = rs_log_schema.tables()

    # create tables if missing from schema
    create_tuples = get_table_creates(schema_file, logstream)
    create_tables(psql, db, create_tuples)

    # check for schema changes
    for table in tables.keys():
        tmp_tbl_name = "tmp_{0}".format(table)
        namespaced_tmp_table = get_namespaced_tablename(tmp_tbl_name)

        # create temp table
        create_table_cmd = mk_create_table_sql_cmd(namespaced_tmp_table,
                                                   tables[table])
        psql.run_sql(create_table_cmd, db, create_table_cmd)

        try:
            # fetch table definitions
            cur_tbl_def = get_table_def(psql, db, table)
            tmp_tbl_def = get_table_def(psql, db, tmp_tbl_name)
            compare_table_defs(psql, db, table, cur_tbl_def, tmp_tbl_def)

            tbl_tuple = (join(s3_logdir, ddate, table), tmp_tbl_name)
            to_add = tmp_tbl_def[len(cur_tbl_def):]
            defaults = get_column_defaults(tables[table])
            add_columns(psql, db, ddate, table, to_add, tbl_tuple,
                        defaults, logstream)
        finally:
            if tmp_tbl_name != table:
                delete_table_cmd = 'drop table {0}'.format(namespaced_tmp_table)
                psql.run_sql(delete_table_cmd, db, delete_table_cmd)

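# Illustrative sketch (not called by the pipeline): how update_database_schema
# detects new columns.  get_table_def returns column rows in ordinal order, so
# once compare_table_defs has verified the shared prefix matches, any columns
# the temporary table has beyond len(cur_tbl_def) are the ones to add.  The
# tuples below are simplified stand-ins for real table-definition rows.
def _example_new_column_slice():
    cur_tbl_def = [('id', 'integer'), ('ts', 'timestamp')]
    tmp_tbl_def = [('id', 'integer'), ('ts', 'timestamp'),
                   ('referrer', 'varchar(256)')]
    to_add = tmp_tbl_def[len(cur_tbl_def):]
    assert to_add == [('referrer', 'varchar(256)')]
    return to_add
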
def analyze_tables(psql, db, tables, schemaname=DEFAULT_NAMESPACE):
    num_failures = 0
    for tbl_name in tables:
        tbl_name = get_namespaced_tablename(tbl_name, schemaname)
        try:
            analyze_table(psql, db, tbl_name)
        except Exception:
            num_failures += 1
    if num_failures:
        raise RuntimeError(
            'failed to analyze {0} tables, see log'.format(num_failures)
        )

def rs_check_table_rows(psql, db, tables, schemaname):
    """ Make sure that at least one row is present in each table
    """
    template = "select count(*) from {0}"
    for table in tables:
        namespaced_table = get_namespaced_tablename(table,
                                                    schemaname=schemaname)
        query = template.format(namespaced_table)
        result = psql.run_sql(query, db, query, output=True)
        if result is False:
            raise ValueError(
                "Error occurred, see {0} scribe log".format(psql.log_stream)
            )
        if result[0][0] <= 0:
            raise ValueError("{0}: has zero rows".format(namespaced_table))

def get_min_max_date(psql, db_name, table, column):
    """ Determine the oldest and freshest data in a table

    Args:
    psql -- handle to talk to redshift
    db_name -- redshift database containing table
    table -- table name
    column -- timestamp column name found via get_timestamp_column_name

    Return: min_date, max_date
    """
    query = QUERY_GET_MIN_MAX_DATE.format(
        get_namespaced_tablename(table), column
    )
    result = psql.run_sql(query, db_name, "get min,max date", output=True)
    return result[0][0], result[0][1]

def delete_old_data(psql, db_name, table, ttl_days):
    """ Delete data older than TTL.
    Round down min_date, max_date to 00:00:00 UTC for cutoff calculation.

    Args:
    psql -- handle to talk to redshift
    db_name -- redshift database containing table
    table -- table name
    ttl_days -- max TTL of data in a table

    Return: number of rows deleted
    """
    cname = get_timestamp_column_name(psql, db_name, table)
    if cname is None:
        return 0

    dt_min_date, dt_max_date = get_min_max_date(psql, db_name, table, cname)
    if dt_min_date is None or dt_max_date is None:
        return 0

    # cutoff is always YYYY-MM-DD 00:00:00
    dt_min = datetime(dt_min_date.year, dt_min_date.month, dt_min_date.day)
    dt_max = datetime(dt_max_date.year, dt_max_date.month, dt_max_date.day)
    num_days = (dt_max - dt_min).days
    num_deleted = 0
    if ttl_days is not None and num_days > ttl_days:
        dt_new_min_date = dt_min + timedelta(days=num_days - ttl_days)
        new_min_date = datetime.strftime(dt_new_min_date, "%Y-%m-%d %H:%M:%S")
        query = QUERY_DELETE_ROWS_BY_DATE.format(
            get_namespaced_tablename(table), cname
        )
        params = {'new_min_date': new_min_date}
        result = psql.run_sql_ex(query, db_name, "delete rows", params=params)
        if result is not False:
            match = re.search(r"^DELETE\s+(?P<num_deleted>\d+)$",
                              result.get('status', ''))
            num_deleted = int(match.group('num_deleted')) if match else 0
            if num_deleted <= 0:
                raise ValueError("nothing to delete for {0}".format(table))
    return num_deleted

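# Illustrative sketch (not called by the pipeline) of the two pieces of
# delete_old_data above: the midnight-rounded TTL cutoff, and parsing the
# "DELETE <n>" status string.  The dates and the status value are made up.
def _example_ttl_cutoff_and_delete_status():
    from datetime import datetime, timedelta
    import re

    dt_min = datetime(2014, 1, 1)   # oldest row, rounded down to midnight
    dt_max = datetime(2014, 1, 10)  # newest row, rounded down to midnight
    ttl_days = 7
    num_days = (dt_max - dt_min).days              # 9 days of data on disk
    new_min = dt_min + timedelta(days=num_days - ttl_days)
    assert new_min == datetime(2014, 1, 3)         # rows before this cutoff go

    match = re.search(r"^DELETE\s+(?P<num_deleted>\d+)$", "DELETE 1234")
    assert int(match.group('num_deleted')) == 1234
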
def copy_table(psql_helper, db_name, ddate, log_tuple, ttl_days, logstream):
    """ Copy one day of logs from S3 into its Redshift table, deleting
    rows older than the TTL and compacting the table before the load.
    """
    s3_log, rs_table = log_tuple
    namespaced_table_name = get_namespaced_tablename(rs_table)
    table_start = time.time()
    extra_msg = "from s3 log: {0}".format(s3_log)
    logstream.write_msg('starting', extra_msg=extra_msg)

    # about to load a new day, remove the oldest one
    rows_deleted = None
    if ttl_days is not None:
        rows_deleted = \
            delete_old_data(psql_helper, db_name, rs_table, ttl_days - 1)
    if rows_deleted:
        logstream.write_msg('delete_ok',
                            extra_msg="{0} rows".format(rows_deleted))

    # Try to reclaim disk space.  If not needed, it will be fast.
    # Calling here and not in the 'if rows_deleted' code to prevent a
    # scenario where rows were deleted but compact failed.  Then on retry
    # there will be nothing to delete, but since space is not reclaimed
    # there may not be enough for a new load, resulting in failure forever.
    if ttl_days is not None:
        compact_table(psql_helper, db_name, namespaced_table_name)

    delimiter = read_string('redshift_column_delimiter')
    delimiter = delimiter.decode("string_escape")
    if delimiter not in string.printable:
        delimiter = '\\' + oct(ord(delimiter))
    copy_sql = LOAD % (namespaced_table_name, s3_log, delimiter)
    result = psql_helper.run_sql(
        copy_sql, db_name, " copying from " + s3_log,
        s3_needed=True,
        time_est_secs=read_int('pipeline.load_step.copy_time_est_secs')
    )
    if result is not False:
        logstream.write_msg('complete', job_start_secs=table_start,
                            extra_msg=extra_msg)
    return result

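# Illustrative sketch (not called by the pipeline) of the delimiter handling
# in copy_table above.  It assumes Python 2, since str.decode("string_escape")
# does not exist on Python 3: an escaped config value such as "\x01" is turned
# back into the raw byte, and a non-printable delimiter is rewritten as an
# octal escape for the COPY statement.  The config value here is made up.
def _example_copy_delimiter():
    import string
    delimiter = "\\x01".decode("string_escape")   # raw SOH byte, '\x01'
    if delimiter not in string.printable:
        delimiter = '\\' + oct(ord(delimiter))    # oct(1) == '01' on Python 2
    return delimiter                              # backslash + octal digits
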
def get_create_commands(input_file, add_error_table=True):
    """ get_create_commands takes an input file and reads the create table
    sql commands from it.

    Args:
    input_file -- the full path of the input file
    add_error_table -- boolean flag to add error table to schema

    Returns:
    a list of (table_name, command) tuples where table_name is the name of
    the table to be created and command is the SQL command for creating it.
    Important because we don't want to create a table that already exists
    """
    # in regex \S is all non-whitespace and \s is whitespace only
    table_regex = re.compile(r'[\s]*(?P<tablename>[\S]+[\s]*)\(')
    command = load_from_file(input_file)
    if input_file[-5:] == ".yaml":
        rs_log_schema = RedShiftLogSchema(safe_load(command))
        if add_error_table:
            err_tbl_name, err_tbl = rs_log_schema.get_error_table()
            rs_log_schema.table_add(err_tbl_name, err_tbl)
        command = tables_to_sql(rs_log_schema.tables())

    commands = command.split('CREATE TABLE')
    table_create_tuples = []
    for cmd in commands[1:]:
        match = table_regex.search(cmd)
        if match is None:
            table_name = None
        else:
            table_name = match.group('tablename')
            table_to_create = get_namespaced_tablename(table_name)
            cmd = cmd.replace(table_name, table_to_create, 1)
        table_create_tuples.append((table_name, "create table " + cmd))
    return table_create_tuples

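# Illustrative sketch (not called by the pipeline) of how get_create_commands
# above extracts table names: the SQL is split on 'CREATE TABLE' and the regex
# captures everything up to the opening parenthesis of the column list.  The
# fragment below is a made-up example of what follows one 'CREATE TABLE'.
def _example_table_name_regex():
    import re
    table_regex = re.compile(r'[\s]*(?P<tablename>[\S]+[\s]*)\(')
    fragment = " search_events (\n    time int,\n    user_id bigint\n);"
    match = table_regex.search(fragment)
    assert match.group('tablename').strip() == 'search_events'
    return match.group('tablename')
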
def add_columns(psql, db, ddate, table, to_add, tbl_tuple, defaults, logstream):
    """ Add new columns to an existing table.
    Copy data into a temp table to detect encoding.

    Args:
    psql -- handle to talk to redshift
    db -- redshift database containing table
    ddate -- the date string of the data to be copied formatted YYYY/MM/DD
    table -- table name where to add columns -- not namespaced
    to_add -- list of columns to add
    tbl_tuple -- a tuple containing the path to the table in PSV format and
                 a Redshift temporary table where to load data
    defaults -- map from column name to column default value
    logstream -- a PipelineStreamLogger

    Return: None
    """
    _, tmp_tbl_name = tbl_tuple
    for row in to_add:
        if row[PgTableDef.SortKey] or row[PgTableDef.DistKey]:
            raise ValueError(
                "{0}: {1} new column is a sortkey or distkey".format(
                    table, row[PgTableDef.Column]
                )
            )

    if to_add:
        # copy data into tmp_tbl_name in order to detect encoding
        copy_table(psql, db, ddate, tbl_tuple, MAX_TTL_DAYS, logstream)
        tmp_tbl_def = get_table_def(psql, db, tmp_tbl_name)
        for row in tmp_tbl_def[len(tmp_tbl_def) - len(to_add):]:
            encoding = row[PgTableDef.Encoding]
            query = QUERY_ADD_COLUMN.format(
                get_namespaced_tablename(table),
                row[PgTableDef.Column],
                row[PgTableDef.Type],
                "raw" if encoding == "none" else encoding,
                "not null" if row[PgTableDef.NotNull] else "null",
                defaults[row[PgTableDef.Column]]
            )
            psql.run_sql(query, db, query)

def compact_tables(psql, db, tables, schemaname=DEFAULT_NAMESPACE):
    for tbl_name in tables:
        tbl_name = get_namespaced_tablename(tbl_name, schemaname)
        compact_table(psql, db, tbl_name, run_now=True)

def copy_tables(psql_helper, status_helper, db_name, ddate, log_tuples,
                ttl_days, logstream):
    """ copy_tables takes a list of input (log, table) pairs and copies each
    input log to its corresponding table

    Args:
    psql_helper -- a RedshiftPostgres object to help perform the copy
    status_helper -- an object handle to interact with the status table
    db_name -- the name of the db to which we're copying
    ddate -- the date string of the data to be copied formatted YYYY/MM/DD
    log_tuples -- a list of (log, table) pairs
    ttl_days -- how many days to retain loaded data
    logstream -- a PipelineStreamLogger

    Returns: None
    """
    start = time.time()
    yaml_versions = get_yaml_table_versions(pipeline_yaml_schema_file_path())
    status_helper.update_status(db_name, ddate, yaml_versions, "running")
    err_tbl_name, _ = RedShiftLogSchema().get_error_table()
    for log_tuple in log_tuples:
        result = False
        error_msg = None
        try:
            result = copy_table(psql_helper, db_name, ddate, log_tuple,
                                ttl_days, logstream)
        except KeyboardInterrupt:
            result = None
            raise
        except Exception:
            exc_type, exc_value, exc_tb = sys.exc_info()
            error_msg = "{0}".format({
                'crash_tb': ''.join(traceback.format_tb(exc_tb)),
                'crash_exc': traceback.format_exception_only(
                    exc_type, exc_value
                )[0].strip()
            })
            # ignore copy error if error table does not exist
            s3_log, rs_table = log_tuple
            if rs_table == err_tbl_name and \
                    exc_value.args[0].find('The specified S3 prefix') != -1 and \
                    exc_value.args[0].find('does not exist') != -1:
                result = None
        finally:
            if result is False:
                _, rs_table = log_tuple
                if error_msg is None:
                    error_msg = "failed copy {0} for date: {1}".format(
                        get_namespaced_tablename(rs_table), ddate
                    )
                status_helper.update_status(
                    db_name, ddate, yaml_versions, "error",
                    start_time_secs=start, error_msg=error_msg
                )
                handle_error(error_msg, logstream)
    status_helper.update_status(
        db_name, ddate, yaml_versions, "complete", start_time_secs=start
    )

def test_get_namespaced_tablename_config(input_config, expected_out):
    filepath = os.path.join('tests', 'common', input_config)
    YamlConfiguration(filepath)
    output_under_test = get_namespaced_tablename("table_name_blah")
    assert output_under_test == expected_out

def test_get_namespaced_tablename_arg(input_schema, expected_out):
    output_under_test = get_namespaced_tablename(
        "table_name_blah", schemaname=input_schema
    )
    assert output_under_test == expected_out