Example #1
def test_load_from_file_with_no_local_file():
    file_path = 'http://foo'
    expected_exception_args = (2, 'No such file or directory')
    try:
        load_from_file(file_path)
        assert 0
    except Exception as e:
        assert e.args == expected_exception_args
Example #2
    def mapper_init(self):
        """ mrjob initialization.
        If you decide to override this method, call 'super' to invoke it.
        """
        yaml_data = load_from_file(self.options.extractions)
        schema = RedShiftLogSchema(yaml.load(yaml_data))
        self.schema = schema

        self.table_name_to_columns = dict((table_name, [
            Column.create_from_table(table, column)
            for column in table['columns']
        ]) for table_name, table in schema.tables().iteritems())
        self.table_name_to_table = dict(
            (table_name,
             Table.create(table,
                          columns=self.table_name_to_columns[table_name]))
            for table_name, table in schema.tables().iteritems())
        self.table_name_to_output_order = dict(
            (table_name,
             [column.name for column in columns if not column.is_noop])
            for table_name, columns in self.table_name_to_columns.iteritems())
        self.redshift_export = RedshiftExportProtocol(
            delimiter=self.options.column_delimiter)

        error_table_name, error_table = self.schema.get_error_table()
        self.error_tbl_name = error_table_name
        self.error_tbl_output_order = [
            c['log_key'] for c in error_table['columns']
        ]
Example #3
    def mapper_init(self):
        """ mrjob initialization.
        If you decide to override this method, call 'super' to invoke it.
        """
        yaml_data = load_from_file(self.options.extractions)
        schema = RedShiftLogSchema(yaml.load(yaml_data))
        self.schema = schema

        self.table_name_to_columns = dict(
            (table_name,
                [Column.create_from_table(table, column)
                    for column in table['columns']])
            for table_name, table in schema.tables().iteritems()
        )
        self.table_name_to_table = dict(
            (table_name,
                Table.create(table,
                             columns=self.table_name_to_columns[table_name]))
            for table_name, table in schema.tables().iteritems()
        )
        self.table_name_to_output_order = dict(
            (table_name,
                [column.name for column in columns if not column.is_noop])
            for table_name, columns in self.table_name_to_columns.iteritems()
        )
        self.redshift_export = RedshiftExportProtocol(
            delimiter=self.options.column_delimiter
        )

        error_table_name, error_table = self.schema.get_error_table()
        self.error_tbl_name = error_table_name
        self.error_tbl_output_order = [c['log_key'] for c in error_table['columns']]
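Both mapper_init variants above drop no-op columns when computing the per-table output order. A tiny self-contained illustration of that filter, using a namedtuple as a stand-in for the real Column objects (the column names and is_noop values below are made up):

from collections import namedtuple

# Stand-in for the Column objects produced by Column.create_from_table;
# only the two attributes the output-order filter relies on are modelled.
FakeColumn = namedtuple('FakeColumn', ['name', 'is_noop'])

columns = [
    FakeColumn(name='id', is_noop=False),
    FakeColumn(name='raw_line', is_noop=True),   # no-op columns are skipped
    FakeColumn(name='ts', is_noop=False),
]
output_order = [column.name for column in columns if not column.is_noop]
assert output_order == ['id', 'ts']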
Example #4
def get_yaml_table_versions(yaml_file):
    """
    get_yaml_table_versions puts together a string of table versions for a
    particular yaml file

    Args:
    yaml_file -- the file with table versions specified

    Returns:
    a space-separated string of the form "<tablename>: <versionnumber> ..."
    """
    yaml_data = load_from_file(yaml_file)
    yaml_dict = yaml.load(yaml_data)

    version_list = []
    if 'version' in yaml_dict:
        for table_name in yaml_dict['tables'].keys():
            version_list.append(str(table_name))
        version_list.sort()
        version_list.append(str(yaml_dict['version']))
    else:
        for table_name in yaml_dict.keys():
            version_list.append("{0}: {1}".format(
                table_name, yaml_dict[table_name]['version']))
        version_list.sort()

    return " ".join(version_list)
Example #5
def get_yaml_table_versions(yaml_file):
    """
    get_yaml_table_versions puts together a string of table versions for a
    particular yaml file

    Args:
    yaml_file -- the file with table versions specified

    Returns:
    a space-separated string of the form "<tablename>: <versionnumber> ..."
    """
    yaml_data = load_from_file(yaml_file)
    yaml_dict = yaml.load(yaml_data)

    version_list = []
    if 'version' in yaml_dict:
        for table_name in yaml_dict['tables'].keys():
            version_list.append(str(table_name))
        version_list.sort()
        version_list.append(str(yaml_dict['version']))
    else:
        for table_name in yaml_dict.keys():
            version_list.append(
                "{0}: {1}".format(table_name, yaml_dict[table_name]['version'])
            )
        version_list.sort()

    return " ".join(version_list)
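get_yaml_table_versions handles two schema layouts: a newer one with a top-level 'version' plus a 'tables' mapping, and an older one where every top-level key is a table name carrying its own 'version'. A hypothetical illustration of both layouts as the dicts yaml.load would return (table names, versions, and the empty 'columns' lists are made up):

# Newer layout: one schema-wide version and a 'tables' mapping.
# For this dict the function would return "ad_click search 3".
schema_with_global_version = {
    'version': 3,
    'tables': {
        'ad_click': {'columns': []},
        'search': {'columns': []},
    },
}

# Older layout: each table carries its own version.
# For this dict the function would return "ad_click: 2 search: 1".
schema_with_per_table_versions = {
    'ad_click': {'version': 2, 'columns': []},
    'search': {'version': 1, 'columns': []},
}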
Example #6
def test_load_from_file_with_s3_file():
    file_path = 's3://blah-foo-us-west-2/foo'
    file_content = 'stuff'

    with patch('sherlock.common.util.load_from_s3_file',
               autospec=True) as mock_load:
        mock_load.return_value = file_content
        result = load_from_file(file_path)
        assert result == file_content
Example #7
def update_database_schema(psql, db, ddate, s3_logdir, schema_file, logstream):
    """
    Check the new schema against what exists in the database:
        1.  create new tables if missing
        2.  compare table definitions
        3.  add new columns

    Args:
    psql -- handle to talk to redshift
    db -- redshift database containing table
    ddate -- the date string of the data to be copied formatted YYYY/MM/DD
    s3_logdir -- path to location of tables in PSV format
    schema_file -- the name of the schema file with the create table command
    logstream -- a PipelineStreamLogger

    Return: None
    """
    # TODO: db.yaml as SPOT
    fname = schema_file.replace('.sql', '.yaml')
    yaml_dict = load_from_file(fname)
    rs_log_schema = RedShiftLogSchema(safe_load(yaml_dict))
    err_tbl_name, err_tbl = rs_log_schema.get_error_table()
    rs_log_schema.table_add(err_tbl_name, err_tbl)
    tables = rs_log_schema.tables()

    # create tables if missing for schema
    create_tuples = get_table_creates(schema_file, logstream)
    create_tables(psql, db, create_tuples)

    # check for schema changes
    for table in tables.keys():
        tmp_tbl_name = "tmp_{0}".format(table)
        namespaced_tmp_table = get_namespaced_tablename(tmp_tbl_name)

        # create temp tables
        create_table_cmd = mk_create_table_sql_cmd(namespaced_tmp_table,
                                                   tables[table])
        psql.run_sql(create_table_cmd, db, create_table_cmd)

        try:
            # fetch table definition
            cur_tbl_def = get_table_def(psql, db, table)
            tmp_tbl_def = get_table_def(psql, db, tmp_tbl_name)
            compare_table_defs(psql, db, table, cur_tbl_def, tmp_tbl_def)

            tbl_tuple = (join(s3_logdir, ddate, table), tmp_tbl_name)
            to_add = tmp_tbl_def[len(cur_tbl_def):]
            defaults = get_column_defaults(tables[table])
            add_columns(psql, db, ddate, table, to_add, tbl_tuple, defaults,
                        logstream)
        finally:
            if tmp_tbl_name != table:
                delete_table_cmd = 'drop table {0}'.format(
                    namespaced_tmp_table)
                psql.run_sql(delete_table_cmd, db, delete_table_cmd)
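update_database_schema relies on new columns only ever being appended: the temporary table is created from the new schema, its definition is compared with the live table's, and the slice past the live definition's length is exactly the set of columns to add. A toy illustration of that slice (the tuple shape here is illustrative; the real get_table_def return format is not shown in these examples):

# Live table as currently defined in Redshift (illustrative shape).
cur_tbl_def = [('id', 'integer'), ('ts', 'timestamp')]
# The same table as defined by the new schema; new columns appear at the end.
tmp_tbl_def = [('id', 'integer'), ('ts', 'timestamp'), ('url', 'varchar(255)')]

to_add = tmp_tbl_def[len(cur_tbl_def):]
assert to_add == [('url', 'varchar(255)')]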
Example #8
def test_load_from_file_with_local_file():
    file_path = './foo'
    file_content = 'stuff'

    with patch('__builtin__.open',
               mock_open(read_data=file_content),
               create=True) as m:
        result = load_from_file(file_path)
    m.assert_called_once_with(file_path, 'r')
    m().read.assert_called_once_with()
    assert result == file_content
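Taken together, the load_from_file tests (missing local file, S3 path, local file) pin down the dispatch behaviour being exercised: s3:// paths are handed to sherlock.common.util.load_from_s3_file, and any other path, including 'http://foo', is read from the local filesystem. A minimal sketch consistent with those tests; this is a reconstruction, and the actual implementation in sherlock.common.util may differ:

def load_from_file(file_path):
    # Hypothetical reconstruction inferred from the tests in these examples,
    # not the real sherlock.common.util code.
    if file_path.startswith('s3://'):
        # load_from_s3_file is the S3 reader patched out in the S3 test.
        return load_from_s3_file(file_path)
    # Every other path is treated as local, which is why the 'http://foo'
    # test ends in IOError(2, 'No such file or directory').
    with open(file_path, 'r') as handle:
        return handle.read()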
Example #9
def update_database_schema(psql, db, ddate, s3_logdir, schema_file, logstream):
    """
    Check the new schema against what exists in the database:
        1.  create new tables if missing
        2.  compare table definitions
        3.  add new columns

    Args:
    psql -- handle to talk to redshift
    db -- redshift database containing table
    ddate -- the date string of the data to be copied formatted YYYY/MM/DD
    s3_logdir -- path to location of tables in PSV format
    schema_file -- the name of the schema file with the create table command
    logstream -- a PipelineStreamLogger

    Return: None
    """
    # TODO: db.yaml as SPOT
    fname = schema_file.replace('.sql', '.yaml')
    yaml_dict = load_from_file(fname)
    rs_log_schema = RedShiftLogSchema(safe_load(yaml_dict))
    err_tbl_name, err_tbl = rs_log_schema.get_error_table()
    rs_log_schema.table_add(err_tbl_name, err_tbl)
    tables = rs_log_schema.tables()

    # create tables if missing for schema
    create_tuples = get_table_creates(schema_file, logstream)
    create_tables(psql, db, create_tuples)

    # check for schema changes
    for table in tables.keys():
        tmp_tbl_name = "tmp_{0}".format(table)
        namespaced_tmp_table = get_namespaced_tablename(tmp_tbl_name)

        # create temp tables
        create_table_cmd = mk_create_table_sql_cmd(namespaced_tmp_table, tables[table])
        psql.run_sql(create_table_cmd, db, create_table_cmd)

        try:
            # fetch table definition
            cur_tbl_def = get_table_def(psql, db, table)
            tmp_tbl_def = get_table_def(psql, db, tmp_tbl_name)
            compare_table_defs(psql, db, table, cur_tbl_def, tmp_tbl_def)

            tbl_tuple = (join(s3_logdir, ddate, table), tmp_tbl_name)
            to_add = tmp_tbl_def[len(cur_tbl_def):]
            defaults = get_column_defaults(tables[table])
            add_columns(psql, db, ddate, table, to_add,
                        tbl_tuple, defaults, logstream)
        finally:
            if tmp_tbl_name != table:
                delete_table_cmd = 'drop table {0}'.format(namespaced_tmp_table)
                psql.run_sql(delete_table_cmd, db, delete_table_cmd)
Example #10
def create_schema(file_path):
    yaml_data = load_from_file(file_path)
    schema = RedShiftLogSchema(yaml.load(yaml_data))

    name_to_columns = dict((name, [
        Column.create_from_table(table, column) for column in table['columns']
    ]) for name, table in schema.tables().iteritems())
    for __, columns in name_to_columns.iteritems():
        assert columns
    name_to_table = dict(
        (name, Table.create(table, columns=name_to_columns[name]))
        for name, table in schema.tables().iteritems())
    assert name_to_table
Example #11
def rs_check_schema(rs_mgmt, args):
    yaml_data = load_from_file(args.schema)
    tables = RedShiftLogSchema(safe_load(yaml_data)).tables()

    db = read_string('pipeline.redshift_database')
    log_stream = read_string('pipeline.load_step.s3_to_redshift_stream')
    pipe_strm_lgr = PipelineStreamLogger(
        log_stream,
        True,
        'rs_check_schema'
    )
    psql = RedshiftPostgres(pipe_strm_lgr, args.credentials, run_local=True)
    rs_check_table_def(psql, db, tables, args.redshift_schema)
    rs_check_table_rows(psql, db, tables, args.redshift_schema)
Example #12
def create_schema(file_path):
    yaml_data = load_from_file(file_path)
    schema = RedShiftLogSchema(yaml.load(yaml_data))

    name_to_columns = dict((name, [Column.create_from_table(table, column)
                                   for column in table['columns']])
                           for name, table
                           in schema.tables().iteritems())
    for __, columns in name_to_columns.iteritems():
        assert columns
    name_to_table = dict((name,
                          Table.create(table,
                                       columns=name_to_columns[name]))
                         for name, table in schema.tables().iteritems())
    assert name_to_table
Example #13
def get_create_commands(input_file, add_error_table=True):
    """
    get_create_commands takes an input file and reads the CREATE TABLE SQL
    commands from it.

    Args:
    input_file -- the full path of the input file
    add_error_table -- boolean flag to add error table to schema

    Returns:
    a list of (table_name, command) tuples, where table_name is the name of
    the table to be created and command is the SQL command that creates it.
    The table name matters because we don't want to create a table that
    already exists.
    """

    # in regex \S is all non-whitespace and \s is whitespace only
    table_regex = re.compile(r'[\s]*(?P<tablename>[\S]+[\s]*)\(')
    command = load_from_file(input_file)

    if input_file[-5:] == ".yaml":
        rs_log_schema = RedShiftLogSchema(safe_load(command))
        if add_error_table:
            err_tbl_name, err_tbl = rs_log_schema.get_error_table()
            rs_log_schema.table_add(err_tbl_name, err_tbl)
        command = tables_to_sql(rs_log_schema.tables())

    commands = command.split('CREATE TABLE')
    table_create_tuples = []
    for cmd in commands[1:]:
        match = table_regex.search(cmd)
        if match is None:
            table_name = None
        else:
            table_name = match.group('tablename')
            table_to_create = get_namespaced_tablename(table_name)
            cmd = cmd.replace(table_name, table_to_create, 1)
        table_create_tuples.append((table_name, "create table " + cmd))
    return table_create_tuples
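table_regex captures the first non-whitespace run in front of the opening parenthesis of each fragment produced by splitting on 'CREATE TABLE'. A small self-contained check of that behaviour (the SQL fragment is made up); note that any whitespace between the table name and '(' ends up inside the captured group:

import re

table_regex = re.compile(r'[\s]*(?P<tablename>[\S]+[\s]*)\(')

# A fragment as it would look after splitting a schema on 'CREATE TABLE'.
fragment = ' ad_click (id integer, ts timestamp)'
match = table_regex.search(fragment)
assert match.group('tablename') == 'ad_click '  # trailing space is captured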
Example #14

def analyze_tables(psql, db, tables, schemaname=DEFAULT_NAMESPACE):
    num_failures = 0
    for tbl_name in tables:
        tbl_name = get_namespaced_tablename(tbl_name, schemaname)
        try:
            analyze_table(psql, db, tbl_name)
        except:
            num_failures += 1
    if num_failures:
        raise RuntimeError(
            'failed to analyze {0} tables, see log'.format(num_failures))


if __name__ == "__main__":
    args = get_cmd_line_args()
    run_local = args.run_local
    merge_configs(args.config)
    db = read_string('pipeline.redshift_database')
    log_stream = read_string('pipeline.load_step.s3_to_redshift_stream')
    logstream = PipelineStreamLogger(log_stream, run_local, 'redshift_maint')
    psql = RedshiftPostgres(logstream, args.credentials, run_local=run_local)

    yaml = load_from_file(args.schema)
    schema = RedShiftLogSchema(safe_load(yaml))

    if args.compact:
        compact_tables(psql, db, schema.tables(), args.redshift_schema)
    analyze_tables(psql, db, schema.tables(), args.redshift_schema)
Example #15
def analyze_tables(psql, db, tables, schemaname=DEFAULT_NAMESPACE):
    num_failures = 0
    for tbl_name in tables:
        tbl_name = get_namespaced_tablename(tbl_name, schemaname)
        try:
            analyze_table(psql, db, tbl_name)
        except:
            num_failures += 1
    if num_failures:
        raise RuntimeError(
            'failed to analyze {0} tables, see log'.format(num_failures)
        )


if __name__ == "__main__":
    args = get_cmd_line_args()
    run_local = args.run_local
    merge_configs(args.config)
    db = read_string('pipeline.redshift_database')
    log_stream = read_string('pipeline.load_step.s3_to_redshift_stream')
    logstream = PipelineStreamLogger(log_stream, run_local, 'redshift_maint')
    psql = RedshiftPostgres(logstream, args.credentials, run_local=run_local)

    yaml = load_from_file(args.schema)
    schema = RedShiftLogSchema(safe_load(yaml))

    if args.compact:
        compact_tables(psql, db, schema.tables(), args.redshift_schema)
    analyze_tables(psql, db, schema.tables(), args.redshift_schema)