def test_load_from_file_with_no_local_file():
    file_path = 'http://foo'
    expected_exception_args = (2, 'No such file or directory')
    try:
        load_from_file(file_path)
        assert 0
    except Exception as e:
        assert e.args == expected_exception_args

def mapper_init(self):
    """
    mrjob initialization.

    Should you decide to override, you should call 'super' to invoke
    this method.
    """
    yaml_data = load_from_file(self.options.extractions)
    schema = RedShiftLogSchema(yaml.load(yaml_data))
    self.schema = schema

    self.table_name_to_columns = dict(
        (table_name, [
            Column.create_from_table(table, column)
            for column in table['columns']
        ])
        for table_name, table in schema.tables().iteritems()
    )
    self.table_name_to_table = dict(
        (table_name,
         Table.create(table, columns=self.table_name_to_columns[table_name]))
        for table_name, table in schema.tables().iteritems()
    )
    self.table_name_to_output_order = dict(
        (table_name,
         [column.name for column in columns if not column.is_noop])
        for table_name, columns in self.table_name_to_columns.iteritems()
    )
    self.redshift_export = RedshiftExportProtocol(
        delimiter=self.options.column_delimiter
    )

    error_table_name, error_table = self.schema.get_error_table()
    self.error_tbl_name = error_table_name
    self.error_tbl_output_order = [
        c['log_key'] for c in error_table['columns']
    ]

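# For orientation, a hypothetical sketch of the extractions yaml consumed by
# mapper_init above. Only the structure actually read here is assumed: a
# 'tables' mapping whose entries carry a 'columns' list, with error-table
# columns keyed by 'log_key'. Table and column names, and any additional
# column attributes, are invented for illustration.
EXAMPLE_EXTRACTIONS_YAML = """
tables:
  search:
    version: 1
    columns:
      - log_key: user_id
      - log_key: query
version: 1
"""
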
def get_yaml_table_versions(yaml_file):
    """
    get_yaml_table_versions puts together a string of table versions
    for a particular yaml file

    Args:
    yaml_file -- the file with table versions specified

    Returns:
    a string of the form "<tablename>: <versionnumber>", ...
    """
    yaml_data = load_from_file(yaml_file)
    yaml_dict = yaml.load(yaml_data)

    version_list = []
    if 'version' in yaml_dict:
        for table_name in yaml_dict['tables'].keys():
            version_list.append(str(table_name))
        version_list.sort()
        version_list.append(str(yaml_dict['version']))
    else:
        for table_name in yaml_dict.keys():
            version_list.append("{0}: {1}".format(
                table_name, yaml_dict[table_name]['version']))
        version_list.sort()
    return " ".join(version_list)

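# A hypothetical illustration of the two yaml shapes handled above; table
# names and version numbers are invented.
#
# Schema-level version (top-level 'version' key): sorted table names followed
# by the schema version.
NEW_STYLE = """
tables:
  ad_click: {columns: []}
  search: {columns: []}
version: 3
"""
# get_yaml_table_versions on a file with this content -> "ad_click search 3"

# Per-table versions (no top-level 'version' key): sorted
# "<tablename>: <version>" pairs.
OLD_STYLE = """
ad_click: {version: 1}
search: {version: 2}
"""
# get_yaml_table_versions on a file with this content -> "ad_click: 1 search: 2"
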
def test_load_from_file_with_s3_file():
    file_path = 's3://blah-foo-us-west-2/foo'
    file_content = 'stuff'
    with patch('sherlock.common.util.load_from_s3_file',
               autospec=True) as mock_load:
        mock_load.return_value = file_content
        result = load_from_file(file_path)
        assert result == file_content

def update_database_schema(psql, db, ddate, s3_logdir, schema_file, logstream):
    """
    Check the new schema against what exists in the database:
    1. create new tables if missing
    2. compare table definitions
    3. add new columns

    Args:
    psql -- handle to talk to redshift
    db -- redshift database containing table
    ddate -- the date string of the data to be copied formatted YYYY/MM/DD
    s3_logdir -- path to location of tables in PSV format
    schema_file -- the name of the schema file with the create table command
    logstream -- a PipelineStreamLogger

    Return:
    None
    """
    # TODO: db.yaml as SPOT
    fname = schema_file.replace('.sql', '.yaml')
    yaml_data = load_from_file(fname)
    rs_log_schema = RedShiftLogSchema(safe_load(yaml_data))
    err_tbl_name, err_tbl = rs_log_schema.get_error_table()
    rs_log_schema.table_add(err_tbl_name, err_tbl)
    tables = rs_log_schema.tables()

    # create tables if missing for schema
    create_tuples = get_table_creates(schema_file, logstream)
    create_tables(psql, db, create_tuples)

    # check for schema changes
    for table in tables.keys():
        tmp_tbl_name = "tmp_{0}".format(table)
        namespaced_tmp_table = get_namespaced_tablename(tmp_tbl_name)

        # create temp tables
        create_table_cmd = mk_create_table_sql_cmd(namespaced_tmp_table,
                                                   tables[table])
        psql.run_sql(create_table_cmd, db, create_table_cmd)

        try:
            # fetch table definitions and compare them
            cur_tbl_def = get_table_def(psql, db, table)
            tmp_tbl_def = get_table_def(psql, db, tmp_tbl_name)
            compare_table_defs(psql, db, table, cur_tbl_def, tmp_tbl_def)

            tbl_tuple = (join(s3_logdir, ddate, table), tmp_tbl_name)
            # new columns are expected to be appended after the existing ones
            to_add = tmp_tbl_def[len(cur_tbl_def):]
            defaults = get_column_defaults(tables[table])
            add_columns(psql, db, ddate, table, to_add, tbl_tuple,
                        defaults, logstream)
        finally:
            if tmp_tbl_name != table:
                delete_table_cmd = 'drop table {0}'.format(namespaced_tmp_table)
                psql.run_sql(delete_table_cmd, db, delete_table_cmd)

def test_load_from_file_with_local_file():
    file_path = './foo'
    file_content = 'stuff'
    with patch('__builtin__.open',
               mock_open(read_data=file_content),
               create=True) as m:
        result = load_from_file(file_path)
        m.assert_called_once_with(file_path, 'r')
        m().read.assert_called_once_with()
        assert result == file_content

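# Taken together, the three tests above pin down load_from_file: local paths
# are read with open(path, 'r'), 's3://' paths are delegated to
# load_from_s3_file, and anything open() cannot find (e.g. 'http://foo')
# surfaces IOError (2, 'No such file or directory'). A minimal sketch
# consistent with those tests; the real implementation in
# sherlock.common.util may differ.
def load_from_file_sketch(file_path):
    if file_path.startswith('s3://'):
        return load_from_s3_file(file_path)
    with open(file_path, 'r') as f:
        return f.read()
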
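# To make the column-diff step in update_database_schema concrete: the live
# table's definition is expected to be a prefix of the new (temp) table's
# definition, so slicing off that prefix leaves only the appended columns.
# Column names and types are invented, and the exact row format returned by
# get_table_def is not shown here; plain tuples are used purely for
# illustration.
cur_tbl_def = [('user_id', 'integer'), ('query', 'varchar(256)')]
tmp_tbl_def = [('user_id', 'integer'), ('query', 'varchar(256)'),
               ('locale', 'varchar(16)'), ('is_bot', 'boolean')]
to_add = tmp_tbl_def[len(cur_tbl_def):]
assert to_add == [('locale', 'varchar(16)'), ('is_bot', 'boolean')]
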
def create_schema(file_path):
    yaml_data = load_from_file(file_path)
    schema = RedShiftLogSchema(yaml.load(yaml_data))
    name_to_columns = dict(
        (name, [
            Column.create_from_table(table, column)
            for column in table['columns']
        ])
        for name, table in schema.tables().iteritems()
    )
    for __, columns in name_to_columns.iteritems():
        assert columns
    name_to_table = dict(
        (name, Table.create(table, columns=name_to_columns[name]))
        for name, table in schema.tables().iteritems()
    )
    assert name_to_table

def rs_check_schema(rs_mgmt, args):
    yaml_data = load_from_file(args.schema)
    tables = RedShiftLogSchema(safe_load(yaml_data)).tables()
    db = read_string('pipeline.redshift_database')
    log_stream = read_string('pipeline.load_step.s3_to_redshift_stream')
    pipe_strm_lgr = PipelineStreamLogger(
        log_stream,
        True,
        'rs_check_schema'
    )
    psql = RedshiftPostgres(pipe_strm_lgr, args.credentials, run_local=True)
    rs_check_table_def(psql, db, tables, args.redshift_schema)
    rs_check_table_rows(psql, db, tables, args.redshift_schema)

def get_create_commands(input_file, add_error_table=True):
    """
    get_create_commands takes an input file and reads the create table
    sql command from it.

    Args:
    input_file -- the full path of the input file
    add_error_table -- boolean flag to add error table to schema

    Returns:
    a list of (table_name, command) tuples where the command is a SQL
    command for creating the table, and the table_name is the name of
    the table to be created. Important because we don't want to create
    a table that already exists
    """
    # in regex \S is all non-whitespace and \s is whitespace only
    table_regex = re.compile(r'[\s]*(?P<tablename>[\S]+[\s]*)\(')

    command = load_from_file(input_file)
    if input_file[-5:] == ".yaml":
        rs_log_schema = RedShiftLogSchema(safe_load(command))
        if add_error_table:
            err_tbl_name, err_tbl = rs_log_schema.get_error_table()
            rs_log_schema.table_add(err_tbl_name, err_tbl)
        command = tables_to_sql(rs_log_schema.tables())

    commands = command.split('CREATE TABLE')
    table_create_tuples = []
    for cmd in commands[1:]:
        match = table_regex.search(cmd)
        if match is None:
            table_name = None
        else:
            table_name = match.group('tablename')
            table_to_create = get_namespaced_tablename(table_name)
            cmd = cmd.replace(table_name, table_to_create, 1)
        table_create_tuples.append((table_name, "create table " + cmd))
    return table_create_tuples

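# A hypothetical call to get_create_commands; the schema path and table name
# are invented. Each returned tuple pairs the original table name with its
# namespaced CREATE TABLE statement.
creates = get_create_commands('/path/to/schema.yaml', add_error_table=True)
for table_name, create_sql in creates:
    # e.g. table_name == 'search'; create_sql begins with
    # 'create table <namespaced table name> (...'
    print(table_name, create_sql[:60])
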
def analyze_tables(psql, db, tables, schemaname=DEFAULT_NAMESPACE):
    num_failures = 0
    for tbl_name in tables:
        tbl_name = get_namespaced_tablename(tbl_name, schemaname)
        try:
            analyze_table(psql, db, tbl_name)
        except Exception:
            num_failures += 1
    if num_failures:
        raise RuntimeError(
            'failed to analyze {0} tables, see log'.format(num_failures)
        )


if __name__ == "__main__":
    args = get_cmd_line_args()
    run_local = args.run_local
    merge_configs(args.config)
    db = read_string('pipeline.redshift_database')
    log_stream = read_string('pipeline.load_step.s3_to_redshift_stream')
    logstream = PipelineStreamLogger(log_stream, run_local, 'redshift_maint')
    psql = RedshiftPostgres(logstream, args.credentials, run_local=run_local)

    yaml_data = load_from_file(args.schema)
    schema = RedShiftLogSchema(safe_load(yaml_data))
    if args.compact:
        compact_tables(psql, db, schema.tables(), args.redshift_schema)
    analyze_tables(psql, db, schema.tables(), args.redshift_schema)