def mapper_init(self):
    """mrjob initialization. If you override this method, call 'super'
    to invoke the base implementation.
    """
    yaml_data = load_from_file(self.options.extractions)
    schema = RedShiftLogSchema(yaml.load(yaml_data))
    self.schema = schema
    self.table_name_to_columns = dict(
        (table_name, [Column.create_from_table(table, column)
                      for column in table['columns']])
        for table_name, table in schema.tables().iteritems()
    )
    self.table_name_to_table = dict(
        (table_name,
         Table.create(table, columns=self.table_name_to_columns[table_name]))
        for table_name, table in schema.tables().iteritems()
    )
    self.table_name_to_output_order = dict(
        (table_name, [column.name for column in columns if not column.is_noop])
        for table_name, columns in self.table_name_to_columns.iteritems()
    )
    self.redshift_export = RedshiftExportProtocol(
        delimiter=self.options.column_delimiter
    )
    error_table_name, error_table = self.schema.get_error_table()
    self.error_tbl_name = error_table_name
    self.error_tbl_output_order = [
        c['log_key'] for c in error_table['columns']
    ]
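# A toy illustration of the output-order filtering built above. FakeColumn
# is a hypothetical stand-in for the objects Column.create_from_table
# returns; only the two attributes used by mapper_init are assumed.
class FakeColumn(object):
    def __init__(self, name, is_noop=False):
        self.name = name
        self.is_noop = is_noop

columns = [FakeColumn('id'), FakeColumn('raw_blob', is_noop=True)]
print [c.name for c in columns if not c.is_noop]  # -> ['id']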
def update_database_schema(psql, db, ddate, s3_logdir, schema_file, logstream):
    """ Check the new schema against what already exists in the database:
    1. create new tables if missing
    2. compare table definitions
    3. add new columns

    Args:
    psql -- handle to talk to redshift
    db -- redshift database containing the tables
    ddate -- the date string of the data to be copied, formatted YYYY/MM/DD
    s3_logdir -- path to location of tables in PSV format
    schema_file -- the name of the schema file with the create table command
    logstream -- a PipelineStreamLogger

    Return: None
    """
    # TODO: db.yaml as SPOT
    fname = schema_file.replace('.sql', '.yaml')
    yaml_dict = load_from_file(fname)
    rs_log_schema = RedShiftLogSchema(safe_load(yaml_dict))
    err_tbl_name, err_tbl = rs_log_schema.get_error_table()
    rs_log_schema.table_add(err_tbl_name, err_tbl)
    tables = rs_log_schema.tables()

    # create tables if missing from schema
    create_tuples = get_table_creates(schema_file, logstream)
    create_tables(psql, db, create_tuples)

    # check for schema changes
    for table in tables.keys():
        tmp_tbl_name = "tmp_{0}".format(table)
        namespaced_tmp_table = get_namespaced_tablename(tmp_tbl_name)

        # create temp table
        create_table_cmd = mk_create_table_sql_cmd(
            namespaced_tmp_table, tables[table]
        )
        psql.run_sql(create_table_cmd, db, create_table_cmd)
        try:
            # fetch table definitions
            cur_tbl_def = get_table_def(psql, db, table)
            tmp_tbl_def = get_table_def(psql, db, tmp_tbl_name)
            compare_table_defs(psql, db, table, cur_tbl_def, tmp_tbl_def)
            tbl_tuple = (join(s3_logdir, ddate, table), tmp_tbl_name)
            to_add = tmp_tbl_def[len(cur_tbl_def):]
            defaults = get_column_defaults(tables[table])
            add_columns(psql, db, ddate, table, to_add, tbl_tuple,
                        defaults, logstream)
        finally:
            if tmp_tbl_name != table:
                delete_table_cmd = 'drop table {0}'.format(namespaced_tmp_table)
                psql.run_sql(delete_table_cmd, db, delete_table_cmd)
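# A minimal sketch of the column-diff step above: the new schema's columns
# are assumed to extend the current ones, so the columns to add are simply
# the tail of the temp-table definition. The tuples below are made up.
cur_tbl_def = [('id', 'integer'), ('name', 'varchar(64)')]
tmp_tbl_def = [('id', 'integer'), ('name', 'varchar(64)'), ('ts', 'timestamp')]
to_add = tmp_tbl_def[len(cur_tbl_def):]
print to_add  # -> [('ts', 'timestamp')]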
def create_schema(file_path):
    yaml_data = load_from_file(file_path)
    schema = RedShiftLogSchema(yaml.load(yaml_data))
    name_to_columns = dict(
        (name, [Column.create_from_table(table, column)
                for column in table['columns']])
        for name, table in schema.tables().iteritems()
    )
    for __, columns in name_to_columns.iteritems():
        assert columns
    name_to_table = dict(
        (name, Table.create(table, columns=name_to_columns[name]))
        for name, table in schema.tables().iteritems()
    )
    assert name_to_table
def mk_table(db, table_name, src='', src_type='dict', sortkeys=None,
             flattenNestedKeys=True, add_source_filename=False):
    """Create a table definition from db

    table_name: see RedshiftSchema.table_create
    src: see RedshiftSchema.table_create
    src_type: see RedshiftSchema.table_create
    sortkeys: see RedshiftSchema.table_create
    flattenNestedKeys: if True, rewrite column names of the form
        'location.bounds.max' to 'location_bounds_max'
    add_source_filename: see RedshiftSchema.table_create
    """
    rs_schema = RedShiftLogSchema()
    rs_schema.table_create(
        table_name, src, src_type, sortkeys, add_source_filename
    )
    columns = []
    # sort for consistent output
    for key in sorted(db.keys()):
        val = db[key]
        log_key = key
        sql_attr = RedShiftSchemaMaker.type_to_sqlattr(
            val['type'], val['max_len']
        )
        if sql_attr is None:
            continue
        is_json = val.get('is_json', False)
        is_foreign = val.get('is_foreign', False)

        # Nested log_keys are a pain in Redshift,
        # e.g. select results."location.bounds";
        # replace '.' with '_' instead.
        name = re.sub(r'\.', '_', key) if flattenNestedKeys is True else key
        if val.get('is_mandatory', False) is True:
            sql_attr += ' not null'
        if is_foreign:
            rs_schema.column_add(
                table_name, name, sql_attr, log_key, is_json, is_foreign
            )
        else:
            columns.append(
                [table_name, name, sql_attr, log_key, is_json, False]
            )
    for args in columns:
        rs_schema.column_add(*args)
    return rs_schema.tables()
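# The flattening rule above, shown standalone, plus a hedged usage sketch.
# The shape of `db` is inferred from the lookups in mk_table (each value
# needs 'type' and 'max_len'; the remaining keys default to False). The
# field names are hypothetical.
import re

print re.sub(r'\.', '_', 'location.bounds.max')  # -> location_bounds_max

sample_db = {
    'biz_id': {'type': 'int', 'max_len': None, 'is_mandatory': True},
    'location.bounds.max': {'type': 'str', 'max_len': 32},
}
# tables = mk_table(sample_db, 'biz_reviews', src='reviews.log')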
def get_create_commands(input_file, add_error_table=True):
    """ get_create_commands takes an input file and reads the create table
    sql commands from it.

    Args:
    input_file -- the full path of the input file
    add_error_table -- boolean flag to add the error table to the schema

    Returns:
    a list of (table_name, command) tuples, where command is a SQL command
    for creating the table and table_name is the name of the table to be
    created. Important because we don't want to create a table that
    already exists
    """
    # in regex \S is all non-whitespace and \s is whitespace only
    table_regex = re.compile(r'[\s]*(?P<tablename>[\S]+[\s]*)\(')
    command = load_from_file(input_file)
    if input_file[-5:] == ".yaml":
        rs_log_schema = RedShiftLogSchema(safe_load(command))
        if add_error_table:
            err_tbl_name, err_tbl = rs_log_schema.get_error_table()
            rs_log_schema.table_add(err_tbl_name, err_tbl)
        command = tables_to_sql(rs_log_schema.tables())
    commands = command.split('CREATE TABLE')
    table_create_tuples = []
    for cmd in commands[1:]:
        match = table_regex.search(cmd)
        if match is None:
            table_name = None
        else:
            table_name = match.group('tablename')
            table_to_create = get_namespaced_tablename(table_name)
            cmd = cmd.replace(table_name, table_to_create, 1)
        table_create_tuples.append((table_name, "create table " + cmd))
    return table_create_tuples
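# Self-contained demo of the CREATE TABLE parsing above. The sample SQL is
# made up and get_namespaced_tablename is left out. Note the captured group
# includes trailing whitespace when a space precedes '(' (e.g. 'bar (').
import re

table_regex = re.compile(r'[\s]*(?P<tablename>[\S]+[\s]*)\(')
sql = "CREATE TABLE bar( id integer );CREATE TABLE baz( ts timestamp );"
for cmd in sql.split('CREATE TABLE')[1:]:
    print table_regex.search(cmd).group('tablename')  # -> bar, then baz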
def rs_check_schema(rs_mgmt, args):
    yaml_data = load_from_file(args.schema)
    tables = RedShiftLogSchema(safe_load(yaml_data)).tables()
    db = read_string('pipeline.redshift_database')
    log_stream = read_string('pipeline.load_step.s3_to_redshift_stream')
    pipe_strm_lgr = PipelineStreamLogger(
        log_stream,
        True,
        'rs_check_schema'
    )
    psql = RedshiftPostgres(pipe_strm_lgr, args.credentials, run_local=True)
    rs_check_table_def(psql, db, tables, args.redshift_schema)
    rs_check_table_rows(psql, db, tables, args.redshift_schema)
def main():
    schema = RedShiftLogSchema(safe_load(sys.stdin))
    sql_str = tables_to_sql(schema.tables())
    sys.stdout.write(sql_str)
def analyze_tables(psql, db, tables, schemaname=DEFAULT_NAMESPACE):
    num_failures = 0
    for tbl_name in tables:
        tbl_name = get_namespaced_tablename(tbl_name, schemaname)
        try:
            analyze_table(psql, db, tbl_name)
        except Exception:
            num_failures += 1
    if num_failures:
        raise RuntimeError(
            'failed to analyze {0} tables, see log'.format(num_failures)
        )


if __name__ == "__main__":
    args = get_cmd_line_args()
    run_local = args.run_local
    merge_configs(args.config)
    db = read_string('pipeline.redshift_database')
    log_stream = read_string('pipeline.load_step.s3_to_redshift_stream')
    logstream = PipelineStreamLogger(log_stream, run_local, 'redshift_maint')
    psql = RedshiftPostgres(logstream, args.credentials, run_local=run_local)

    yaml = load_from_file(args.schema)
    schema = RedShiftLogSchema(safe_load(yaml))
    if args.compact:
        compact_tables(psql, db, schema.tables(), args.redshift_schema)
    analyze_tables(psql, db, schema.tables(), args.redshift_schema)
def copy_tables(psql_helper, status_helper, db_name, ddate,
                log_tuples, ttl_days, logstream):
    """ copy_tables takes a list of (input log, table) pairs and copies
    each input log to its corresponding input table

    Args:
    psql_helper -- a RedshiftPostgres object to help perform the copy
    status_helper -- an object handle to interact with the status table
    db_name -- the name of the db to which we're copying
    ddate -- the date string of the data to be copied, formatted YYYY/MM/DD
    log_tuples -- a list of (log, table) pairs
    ttl_days -- how many days to retain loaded data
    logstream -- a PipelineStreamLogger

    Returns: None
    """
    start = time.time()
    yaml_versions = get_yaml_table_versions(pipeline_yaml_schema_file_path())
    status_helper.update_status(db_name, ddate, yaml_versions, "running")
    err_tbl_name, _ = RedShiftLogSchema().get_error_table()
    for log_tuple in log_tuples:
        result = False
        error_msg = None
        try:
            result = copy_table(psql_helper, db_name, ddate, log_tuple,
                                ttl_days, logstream)
        except KeyboardInterrupt:
            result = None
            raise
        except Exception:
            exc_type, exc_value, exc_tb = sys.exc_info()
            error_msg = "{0}".format({
                'crash_tb': ''.join(traceback.format_tb(exc_tb)),
                'crash_exc': traceback.format_exception_only(
                    exc_type, exc_value
                )[0].strip()
            })
            # ignore copy error if error table does not exist
            s3_log, rs_table = log_tuple
            if rs_table == err_tbl_name and \
                    exc_value.args[0].find('The specified S3 prefix') != -1 and \
                    exc_value.args[0].find('does not exist') != -1:
                result = None
        finally:
            if result is False:
                _, rs_table = log_tuple
                if error_msg is None:
                    error_msg = "failed copy {0} for date: {1}".format(
                        get_namespaced_tablename(rs_table), ddate
                    )
                status_helper.update_status(
                    db_name, ddate, yaml_versions, "error",
                    start_time_secs=start, error_msg=error_msg
                )
                handle_error(error_msg, logstream)
    status_helper.update_status(db_name, ddate, yaml_versions,
                                "complete", start_time_secs=start)
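# Shape of log_tuples as consumed above; the bucket, paths, and table
# names here are hypothetical.
log_tuples = [
    ('s3://bucket/2014/01/15/biz_reviews', 'biz_reviews'),
    ('s3://bucket/2014/01/15/error_table', 'error_table'),
]
# copy_tables(psql_helper, status_helper, 'prod_db', '2014/01/15',
#             log_tuples, ttl_days=30, logstream=logstream)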
def get_cmd_line_args():
    parser = argparse.ArgumentParser(
        description="""
This is a tool to manipulate Redshift schema
without manually editing the yaml definition.
The program takes its input from standard input.

Example Usage:

cat schema/db.yaml | %(prog)s table_create -s '' --source-type DICT
""",
        formatter_class=argparse.RawTextHelpFormatter,
    )
    subparsers = parser.add_subparsers()
    set_table_create_parser(subparsers)
    set_table_delete_parser(subparsers)
    set_column_add_parser(subparsers)
    set_column_remove_parser(subparsers)
    set_sortkeys_add_parser(subparsers)
    args = parser.parse_args()
    return args


if __name__ == "__main__":
    args = get_cmd_line_args()
    schema = RedShiftLogSchema(yaml.load(sys.stdin.read()) or dict())
    version = schema.version_get()
    args.func(schema, args)
    schema.version_set(version + 1)
    sys.stdout.write(RedShiftLogSchema.header())
    sys.stdout.write(yaml.dump(schema.schema(), default_flow_style=False))
Usage: cat schema/db.yaml | sherlock/tools/table_rename.py \
    --suffix '_test_5'
"""
import argparse
import sys

import yaml

from sherlock.common.redshift_schema import RedShiftLogSchema


def create_new_yaml(suffix, input_file=sys.stdin):
    schema = RedShiftLogSchema(yaml.load(input_file))
    for table_name in schema.tables().keys():
        schema.table_rename(
            table_name,
            '{original_name}{suffix}'.format(
                original_name=table_name,
                suffix=suffix
            )
        )
    return yaml.dump(schema.schema(), default_flow_style=False)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Rename table with suffix')
    parser.add_argument('--suffix')
    args = parser.parse_args()
    print RedShiftLogSchema.header()
    print create_new_yaml(args.suffix)
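# Quick check of the suffixing applied by create_new_yaml
# ('biz_reviews' is a made-up table name):
print '{original_name}{suffix}'.format(
    original_name='biz_reviews', suffix='_test_5')  # -> biz_reviews_test_5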
"--add-source-filename", action='store_true', help="[%(default)s] column indicating source of each table row") parser.add_argument( "--merge-with-schema", metavar='PATH_TO_SCHEMA_FILE', help="path to schema file into which new table is merged") args = parser.parse_args() args.foreign.sort() return args if __name__ == "__main__": args = get_cmd_line_args() schema = RedShiftLogSchema() version = 0 if args.merge_with_schema: with open(args.merge_with_schema, 'r') as yaml_file: schema = RedShiftLogSchema(yaml.safe_load(yaml_file)) if schema.get_table(args.table) is not None: schema.table_delete(args.table) version = schema.version_get() sm = RedShiftSchemaMaker(exclude=args.exclude, prune=args.prune) for line in fileinput.input('-'): process_row(sm, simplejson.loads(line), args) table = sm.mk_table(sm.schema, args.table,
"--add-source-filename", action='store_true', help="[%(default)s] column indicating source of each table row" ) parser.add_argument( "--merge-with-schema", metavar='PATH_TO_SCHEMA_FILE', help="path to schema file into which new table is merged" ) args = parser.parse_args() args.foreign.sort() return args if __name__ == "__main__": args = get_cmd_line_args() schema = RedShiftLogSchema() version = 0 if args.merge_with_schema: with open(args.merge_with_schema, 'r') as yaml_file: schema = RedShiftLogSchema(yaml.safe_load(yaml_file)) if schema.get_table(args.table) is not None: schema.table_delete(args.table) version = schema.version_get() sm = RedShiftSchemaMaker(exclude=args.exclude, prune=args.prune) for line in fileinput.input('-'): process_row(sm, simplejson.loads(line), args) table = sm.mk_table(sm.schema, args.table, args.source, args.source_type, add_source_filename=args.add_source_filename)