Example #1
    def mapper_init(self):
        """ mrjob initialization.
        Should you decide to override, you should call 'super' to invoke
        this method.
        """
        yaml_data = load_from_file(self.options.extractions)
        schema = RedShiftLogSchema(yaml.load(yaml_data))
        self.schema = schema

        self.table_name_to_columns = dict(
            (table_name,
                [Column.create_from_table(table, column)
                    for column in table['columns']])
            for table_name, table in schema.tables().iteritems()
        )
        self.table_name_to_table = dict(
            (table_name,
                Table.create(table,
                             columns=self.table_name_to_columns[table_name]))
            for table_name, table in schema.tables().iteritems()
        )
        self.table_name_to_output_order = dict(
            (table_name,
                [column.name for column in columns if not column.is_noop])
            for table_name, columns in self.table_name_to_columns.iteritems()
        )
        self.redshift_export = RedshiftExportProtocol(
            delimiter=self.options.column_delimiter
        )

        error_table_name, error_table = self.schema.get_error_table()
        self.error_tbl_name = error_table_name
        self.error_tbl_output_order = [c['log_key'] for c in error_table['columns']]
Example #2
    def mapper_init(self):
        """ mrjob initialization.
        Should you decide to override, you should call 'super' to invoke
        this method.
        """
        yaml_data = load_from_file(self.options.extractions)
        schema = RedShiftLogSchema(yaml.load(yaml_data))
        self.schema = schema

        self.table_name_to_columns = dict((table_name, [
            Column.create_from_table(table, column)
            for column in table['columns']
        ]) for table_name, table in schema.tables().iteritems())
        self.table_name_to_table = dict(
            (table_name,
             Table.create(table,
                          columns=self.table_name_to_columns[table_name]))
            for table_name, table in schema.tables().iteritems())
        self.table_name_to_output_order = dict(
            (table_name,
             [column.name for column in columns if not column.is_noop])
            for table_name, columns in self.table_name_to_columns.iteritems())
        self.redshift_export = RedshiftExportProtocol(
            delimiter=self.options.column_delimiter)

        error_table_name, error_table = self.schema.get_error_table()
        self.error_tbl_name = error_table_name
        self.error_tbl_output_order = [
            c['log_key'] for c in error_table['columns']
        ]
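The layout of the extractions YAML that mapper_init loads is not shown in these snippets. As a rough sketch only, based on the keys referenced above (table['columns'], the error table's 'log_key') and on the arguments RedShiftLogSchema.column_add takes in the mk_table examples further down, a single table entry might deserialize to a dict like the one below; the field names and nesting are assumptions, not the library's documented format.

# Hypothetical sketch only: the real db.yaml layout may differ.
example_tables = {
    'search_requests': {               # table name -> table definition
        'src': 'search_log',           # assumed; mirrors table_create(src=...)
        'src_type': 'dict',
        'sortkeys': ['time'],
        'add_source_filename': False,
        'columns': [
            # field names inferred from column_add(table, name, sql_attr,
            # log_key, is_json, is_foreign) in the mk_table examples
            {'name': 'location_bounds_max',
             'sql_attr': 'varchar(64)',
             'log_key': 'location.bounds.max',
             'is_json': False,
             'is_foreign': False},
        ],
    },
}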
Example #3
def create_new_yaml(suffix, input_file=sys.stdin):
    """Rename every table in the schema by appending suffix and return
    the updated YAML as a string."""
    schema = RedShiftLogSchema(yaml.load(input_file))
    for table_name in schema.tables().keys():
        schema.table_rename(
            table_name, '{original_name}{suffix}'.format(
                original_name=table_name,
                suffix=suffix
            )
        )
    return yaml.dump(schema.schema(), default_flow_style=False)
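For reference, a minimal sketch of calling create_new_yaml directly rather than through the CLI wrapper shown in the later table_rename examples; the schema path and suffix are illustrative.

# Minimal usage sketch; 'schema/db.yaml' and the suffix are illustrative.
with open('schema/db.yaml') as schema_file:
    renamed_yaml = create_new_yaml('_test_5', input_file=schema_file)
print(renamed_yaml)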
Example #4
def update_database_schema(psql, db, ddate, s3_logdir, schema_file, logstream):
    """
    Check the new schema against what exists in the database:
        1.  create new tables if missing
        2.  compare table definitions
        3.  add new columns

    Args:
    psql -- handle to talk to redshift
    db -- redshift database containing table
    ddate -- the date string of the data to be copied formatted YYYY/MM/DD
    s3_logdir -- path to location of tables in PSV format
    schema_file -- the name of the schema file with the create table command
    logstream -- a PipelineStreamLogger

    Return: None
    """
    # TODO: db.yaml as SPOT
    fname = schema_file.replace('.sql', '.yaml')
    yaml_dict = load_from_file(fname)
    rs_log_schema = RedShiftLogSchema(safe_load(yaml_dict))
    err_tbl_name, err_tbl = rs_log_schema.get_error_table()
    rs_log_schema.table_add(err_tbl_name, err_tbl)
    tables = rs_log_schema.tables()

    # create tables if missing for schema
    create_tuples = get_table_creates(schema_file, logstream)
    create_tables(psql, db, create_tuples)

    # check for schema changes
    for table in tables.keys():
        tmp_tbl_name = "tmp_{0}".format(table)
        namespaced_tmp_table = get_namespaced_tablename(tmp_tbl_name)

        # create temp tables
        create_table_cmd = mk_create_table_sql_cmd(namespaced_tmp_table, tables[table])
        psql.run_sql(create_table_cmd, db, create_table_cmd)

        try:
            # fetch table definition
            cur_tbl_def = get_table_def(psql, db, table)
            tmp_tbl_def = get_table_def(psql, db, tmp_tbl_name)
            compare_table_defs(psql, db, table, cur_tbl_def, tmp_tbl_def)

            tbl_tuple = (join(s3_logdir, ddate, table), tmp_tbl_name)
            to_add = tmp_tbl_def[len(cur_tbl_def):]
            defaults = get_column_defaults(tables[table])
            add_columns(psql, db, ddate, table, to_add,
                        tbl_tuple, defaults, logstream)
        finally:
            if tmp_tbl_name != table:
                delete_table_cmd = 'drop table {0}'.format(namespaced_tmp_table)
                psql.run_sql(delete_table_cmd, db, delete_table_cmd)
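A hedged sketch of how update_database_schema might be invoked, borrowing the psql/logstream wiring from the rs_check_schema and maintenance examples further down; the date, S3 directory, and credentials path are illustrative only.

# Sketch only: wiring copied from the rs_check_schema example below;
# ddate, the s3 directory, and credentials_file are illustrative values.
db = read_string('pipeline.redshift_database')
log_stream = read_string('pipeline.load_step.s3_to_redshift_stream')
logstream = PipelineStreamLogger(log_stream, True, 'schema_update')
psql = RedshiftPostgres(logstream, credentials_file, run_local=True)
update_database_schema(psql, db, '2014/01/01', 's3://bucket/logs',
                       'schema/db.sql', logstream)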
Example #5
def create_schema(file_path):
    yaml_data = load_from_file(file_path)
    schema = RedShiftLogSchema(yaml.load(yaml_data))

    name_to_columns = dict((name, [
        Column.create_from_table(table, column) for column in table['columns']
    ]) for name, table in schema.tables().iteritems())
    for __, columns in name_to_columns.iteritems():
        assert columns
    name_to_table = dict(
        (name, Table.create(table, columns=name_to_columns[name]))
        for name, table in schema.tables().iteritems())
    assert name_to_table
Example #6
def create_schema(file_path):
    yaml_data = load_from_file(file_path)
    schema = RedShiftLogSchema(yaml.load(yaml_data))

    name_to_columns = dict((name, [Column.create_from_table(table, column)
                                   for column in table['columns']])
                           for name, table
                           in schema.tables().iteritems())
    for __, columns in name_to_columns.iteritems():
        assert columns
    name_to_table = dict((name,
                          Table.create(table,
                                       columns=name_to_columns[name]))
                         for name, table in schema.tables().iteritems())
    assert name_to_table
Example #7
    def mk_table(db,
                 table_name,
                 src='',
                 src_type='dict',
                 sortkeys=None,
                 flattenNestedKeys=True,
                 add_source_filename=False):
        """Create a table definition from db
        table_name: see RedshiftSchema.table_create
        src:        see RedshiftSchema.table_create
        src_type:   see RedshiftSchema.table_create
        sortkeys:   see RedshiftSchema.table_create
        flattenNestedKeys: if False, keep column names of the form
            'location.bounds.max'; if True, rewrite them as
            'location_bounds_max' instead.
        add_source_filename:  see RedshiftSchema.table_create
        """
        rs_schema = RedShiftLogSchema()
        rs_schema.table_create(table_name, src, src_type, sortkeys,
                               add_source_filename)
        columns = []

        # sort for consistent output
        for key in sorted(db.keys()):
            val = db[key]
            log_key = key
            sql_attr = RedShiftSchemaMaker.type_to_sqlattr(
                val['type'], val['max_len'])
            if sql_attr is None:
                continue

            is_json = val.get('is_json', False)
            is_foreign = val.get('is_foreign', False)
            # Nested log_keys are a pain in Redshift,
            # e.g. select results."location.bounds",
            # so replace '.' with '_' instead.
            name = re.sub(r'\.', '_', key) if flattenNestedKeys is True else key

            if val.get('is_mandatory', False) is True:
                sql_attr += ' not null'

            if is_foreign:
                rs_schema.column_add(table_name, name, sql_attr, log_key,
                                     is_json, is_foreign)
            else:
                columns.append(
                    [table_name, name, sql_attr, log_key, is_json, False])
        for args in columns:
            rs_schema.column_add(*args)
        return rs_schema.tables()
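The db argument is a mapping from log key to per-column metadata; the keys read by the loop above are 'type', 'max_len', 'is_json', 'is_foreign' and 'is_mandatory'. A hedged illustration follows; the concrete types and lengths are made up, and calling mk_table unbound assumes it is a static helper (the later schema-maker examples call it as sm.mk_table(sm.schema, ...)).

# Illustrative input only; keys mirror what the loop above reads, and the
# values must be something type_to_sqlattr understands.
db = {
    'user_id': {'type': 'str', 'max_len': 32, 'is_mandatory': True},
    'location.bounds.max': {'type': 'float', 'max_len': None},
    'request_id': {'type': 'str', 'max_len': 36, 'is_foreign': True},
}
tables = RedShiftSchemaMaker.mk_table(db, 'search_requests',
                                      src='search_log', src_type='dict')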
Example #8
    def mk_table(db, table_name, src='', src_type='dict', sortkeys=None,
                 flattenNestedKeys=True, add_source_filename=False):
        """Create a table definition from db
        table_name: see RedshiftSchema.table_create
        src:        see RedshiftSchema.table_create
        src_type:   see RedshiftSchema.table_create
        sortkeys:   see RedshiftSchema.table_create
        flattenNestedKeys: if False, keep column names of the form
            'location.bounds.max'; if True, rewrite them as
            'location_bounds_max' instead.
        add_source_filename:  see RedshiftSchema.table_create
        """
        rs_schema = RedShiftLogSchema()
        rs_schema.table_create(
            table_name,
            src,
            src_type,
            sortkeys,
            add_source_filename
        )
        columns = []

        # sort for consistent output
        for key in sorted(db.keys()):
            val = db[key]
            log_key = key
            sql_attr = RedShiftSchemaMaker.type_to_sqlattr(
                val['type'],
                val['max_len']
            )
            if sql_attr is None:
                continue

            is_json = val.get('is_json', False)
            is_foreign = val.get('is_foreign', False)
            # Nested log_keys are a pain in Redshift,
            # e.g. select results."location.bounds",
            # so replace '.' with '_' instead.
            name = re.sub(r'\.', '_', key) if flattenNestedKeys is True else key

            if val.get('is_mandatory', False) is True:
                sql_attr += ' not null'

            if is_foreign:
                rs_schema.column_add(
                    table_name, name, sql_attr, log_key, is_json, is_foreign
                )
            else:
                columns.append(
                    [table_name, name, sql_attr, log_key, is_json, False]
                )
        for args in columns:
            rs_schema.column_add(*args)
        return rs_schema.tables()
Example #9
def get_create_commands(input_file, add_error_table=True):
    """
    get_create_commands takes an input file and reads the CREATE TABLE SQL
    commands from it.

    Args:
    input_file -- the full path of the input file
    add_error_table -- boolean flag to add error table to schema

    Returns:
    a list of (table_name, command) tuples, where table_name is the name of
    the table to be created and command is the SQL command that creates it.
    The table name matters because we don't want to create a table that
    already exists.
    """

    # in regex \S is all non-whitespace and \s is whitespace only
    table_regex = re.compile(r'[\s]*(?P<tablename>[\S]+[\s]*)\(')
    command = load_from_file(input_file)

    if input_file[-5:] == ".yaml":
        rs_log_schema = RedShiftLogSchema(safe_load(command))
        if add_error_table:
            err_tbl_name, err_tbl = rs_log_schema.get_error_table()
            rs_log_schema.table_add(err_tbl_name, err_tbl)
        command = tables_to_sql(rs_log_schema.tables())

    commands = command.split('CREATE TABLE')
    table_create_tuples = []
    for cmd in commands[1:]:
        match = table_regex.search(cmd)
        if match is None:
            table_name = None
        else:
            table_name = match.group('tablename')
            table_to_create = get_namespaced_tablename(table_name)
            cmd = cmd.replace(table_name, table_to_create, 1)
        table_create_tuples.append((table_name, "create table " + cmd))
    return table_create_tuples
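A short usage sketch for get_create_commands; the path is illustrative, and these (table_name, command) tuples are presumably what the create_tables helper in update_database_schema consumes.

# Sketch: generate namespaced CREATE TABLE commands from a schema file.
# 'schema/db.yaml' is an illustrative path.
for table_name, create_cmd in get_create_commands('schema/db.yaml'):
    print(table_name)
    print(create_cmd)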
Example #10
def rs_check_schema(rs_mgmt, args):
    yaml_data = load_from_file(args.schema)
    tables = RedShiftLogSchema(safe_load(yaml_data)).tables()

    db = read_string('pipeline.redshift_database')
    log_stream = read_string('pipeline.load_step.s3_to_redshift_stream')
    pipe_strm_lgr = PipelineStreamLogger(
        log_stream,
        True,
        'rs_check_schema'
    )
    psql = RedshiftPostgres(pipe_strm_lgr, args.credentials, run_local=True)
    rs_check_table_def(psql, db, tables, args.redshift_schema)
    rs_check_table_rows(psql, db, tables, args.redshift_schema)
Example #11
def create_new_yaml(suffix, input_file=sys.stdin):

    schema = RedShiftLogSchema(yaml.load(input_file))
    for table_name in schema.tables().keys():
        schema.table_rename(
            table_name,
            '{original_name}{suffix}'.format(original_name=table_name,
                                             suffix=suffix))
    return yaml.dump(schema.schema(), default_flow_style=False)
Example #12
def update_database_schema(psql, db, ddate, s3_logdir, schema_file, logstream):
    """
    Check the new schema against what exists in the database:
        1.  create new tables if missing
        2.  compare table definitions
        3.  add new columns

    Args:
    psql -- handle to talk to redshift
    db -- redshift database containing table
    ddate -- the date string of the data to be copied formatted YYYY/MM/DD
    s3_logdir -- path to location of tables in PSV format
    schema_file -- the name of the schema file with the create table command
    logstream -- a PipelineStreamLogger

    Return: None
    """
    # TODO: db.yaml as SPOT
    fname = schema_file.replace('.sql', '.yaml')
    yaml_dict = load_from_file(fname)
    rs_log_schema = RedShiftLogSchema(safe_load(yaml_dict))
    err_tbl_name, err_tbl = rs_log_schema.get_error_table()
    rs_log_schema.table_add(err_tbl_name, err_tbl)
    tables = rs_log_schema.tables()

    # create tables if missing for schema
    create_tuples = get_table_creates(schema_file, logstream)
    create_tables(psql, db, create_tuples)

    # check for schema changes
    for table in tables.keys():
        tmp_tbl_name = "tmp_{0}".format(table)
        namespaced_tmp_table = get_namespaced_tablename(tmp_tbl_name)

        # create temp tables
        create_table_cmd = mk_create_table_sql_cmd(namespaced_tmp_table,
                                                   tables[table])
        psql.run_sql(create_table_cmd, db, create_table_cmd)

        try:
            # fetch table definition
            cur_tbl_def = get_table_def(psql, db, table)
            tmp_tbl_def = get_table_def(psql, db, tmp_tbl_name)
            compare_table_defs(psql, db, table, cur_tbl_def, tmp_tbl_def)

            tbl_tuple = (join(s3_logdir, ddate, table), tmp_tbl_name)
            to_add = tmp_tbl_def[len(cur_tbl_def):]
            defaults = get_column_defaults(tables[table])
            add_columns(psql, db, ddate, table, to_add, tbl_tuple, defaults,
                        logstream)
        finally:
            if tmp_tbl_name != table:
                delete_table_cmd = 'drop table {0}'.format(
                    namespaced_tmp_table)
                psql.run_sql(delete_table_cmd, db, delete_table_cmd)
Example #13
def get_create_commands(input_file, add_error_table=True):
    """
    get_create_commands takes an input file and reads the CREATE TABLE SQL
    commands from it.

    Args:
    input_file -- the full path of the input file
    add_error_table -- boolean flag to add error table to schema

    Returns:
    a list of (table_name, command) tuples, where table_name is the name of
    the table to be created and command is the SQL command that creates it.
    The table name matters because we don't want to create a table that
    already exists.
    """

    # in regex \S is all non-whitespace and \s is whitespace only
    table_regex = re.compile(r'[\s]*(?P<tablename>[\S]+[\s]*)\(')
    command = load_from_file(input_file)

    if input_file[-5:] == ".yaml":
        rs_log_schema = RedShiftLogSchema(safe_load(command))
        if add_error_table:
            err_tbl_name, err_tbl = rs_log_schema.get_error_table()
            rs_log_schema.table_add(err_tbl_name, err_tbl)
        command = tables_to_sql(rs_log_schema.tables())

    commands = command.split('CREATE TABLE')
    table_create_tuples = []
    for cmd in commands[1:]:
        match = table_regex.search(cmd)
        if match is None:
            table_name = None
        else:
            table_name = match.group('tablename')
            table_to_create = get_namespaced_tablename(table_name)
            cmd = cmd.replace(table_name, table_to_create, 1)
        table_create_tuples.append((table_name, "create table " + cmd))
    return table_create_tuples
Example #14
def main():
    schema = RedShiftLogSchema(safe_load(sys.stdin))
    sql_str = tables_to_sql(schema.tables())
    sys.stdout.write(sql_str)
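main() converts a schema read from stdin; the same conversion from a file, as a small sketch (the path is illustrative):

# Sketch: file-based equivalent of main(); the path is illustrative.
with open('schema/db.yaml') as schema_file:
    schema = RedShiftLogSchema(safe_load(schema_file))
sys.stdout.write(tables_to_sql(schema.tables()))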
Example #15
def analyze_tables(psql, db, tables, schemaname=DEFAULT_NAMESPACE):
    num_failures = 0
    for tbl_name in tables:
        tbl_name = get_namespaced_tablename(tbl_name, schemaname)
        try:
            analyze_table(psql, db, tbl_name)
        except Exception:
            # count the failure but keep analyzing the remaining tables
            num_failures += 1
    if num_failures:
        raise RuntimeError(
            'failed to analyze {0} tables, see log'.format(num_failures)
        )


if __name__ == "__main__":
    args = get_cmd_line_args()
    run_local = args.run_local
    merge_configs(args.config)
    db = read_string('pipeline.redshift_database')
    log_stream = read_string('pipeline.load_step.s3_to_redshift_stream')
    logstream = PipelineStreamLogger(log_stream, run_local, 'redshift_maint')
    psql = RedshiftPostgres(logstream, args.credentials, run_local=run_local)

    yaml = load_from_file(args.schema)
    schema = RedShiftLogSchema(safe_load(yaml))

    if args.compact:
        compact_tables(psql, db, schema.tables(), args.redshift_schema)
    analyze_tables(psql, db, schema.tables(), args.redshift_schema)
Example #16
def copy_tables(psql_helper, status_helper, db_name, ddate, log_tuples,
                ttl_days, logstream):
    """
    copy_tables takes a list of input log, table pairs and copies each
    input log to its corresponding input table

    Args:
    psql_helper -- a RedshiftPostgres object to help perform the copy
    status_helper -- An object handle to interact with status table
    db_name -- the name of the db to which we're copying
    ddate -- the date string of the data to be copied formatted YYYY/MM/DD
    log_tuples -- a list of (log, table) pairs
    ttl_days -- how many days to retain loaded data
    logstream -- a PipelineStreamLogger

    Returns:
    None
    """
    start = time.time()
    yaml_versions = get_yaml_table_versions(pipeline_yaml_schema_file_path())
    status_helper.update_status(db_name, ddate, yaml_versions, "running")
    err_tbl_name, _ = RedShiftLogSchema().get_error_table()
    for log_tuple in log_tuples:
        result = False
        error_msg = None
        try:
            result = copy_table(psql_helper, db_name, ddate, log_tuple,
                                ttl_days, logstream)
        except KeyboardInterrupt:
            result = None
            raise
        except Exception:
            exc_type, exc_value, exc_tb = sys.exc_info()
            error_msg = "{0}".format({
                'crash_tb':
                ''.join(traceback.format_tb(exc_tb)),
                'crash_exc':
                traceback.format_exception_only(exc_type,
                                                exc_value)[0].strip()
            })

            # ignore copy error if error table does not exist
            s3_log, rs_table = log_tuple
            if rs_table == err_tbl_name and \
               exc_value.args[0].find('The specified S3 prefix') != -1 and \
               exc_value.args[0].find('does not exist') != -1:
                result = None
        finally:
            if result is False:
                _, rs_table = log_tuple
                if error_msg is None:
                    error_msg = "failed copy {0} for date: {1}".format(
                        get_namespaced_tablename(rs_table), ddate)
                status_helper.update_status(db_name,
                                            ddate,
                                            yaml_versions,
                                            "error",
                                            start_time_secs=start,
                                            error_msg=error_msg)
                handle_error(error_msg, logstream)
    status_helper.update_status(db_name,
                                ddate,
                                yaml_versions,
                                "complete",
                                start_time_secs=start)
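The shape of log_tuples is only implied by the unpacking above (s3_log, rs_table = log_tuple) and by tbl_tuple in update_database_schema; a hedged illustration, with psql_helper, status_helper and logstream assumed to be constructed as in the other examples and all paths and names made up:

# Illustrative only: each tuple pairs an S3 log location with the
# Redshift table it should be copied into.
log_tuples = [
    ('s3://bucket/logs/2014/01/01/search_requests', 'search_requests'),
    ('s3://bucket/logs/2014/01/01/error_table', 'error_table'),
]
copy_tables(psql_helper, status_helper, 'logdb', '2014/01/01',
            log_tuples, ttl_days=30, logstream=logstream)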
Example #17
    parser = argparse.ArgumentParser(
        description="""
This is a tool to manipulate the Redshift schema without manually editing the
yaml definition. The program takes its input from standard input.

Example Usage:

    cat schema/db.yaml | %(prog)s table_create -s '' --source-type DICT
        """,
        formatter_class=argparse.RawTextHelpFormatter,
    )

    subparsers = parser.add_subparsers()
    set_table_create_parser(subparsers)
    set_table_delete_parser(subparsers)
    set_column_add_parser(subparsers)
    set_column_remove_parser(subparsers)
    set_sortkeys_add_parser(subparsers)
    args = parser.parse_args()
    return args


if __name__ == "__main__":
    args = get_cmd_line_args()
    schema = RedShiftLogSchema(yaml.load(sys.stdin.read()) or dict())
    version = schema.version_get()
    args.func(schema, args)
    schema.version_set(version + 1)
    sys.stdout.write(RedShiftLogSchema.header())
    sys.stdout.write(yaml.dump(schema.schema(), default_flow_style=False))
Example #18
            --suffix '_test_5'
"""


import argparse
import sys
import yaml

from sherlock.common.redshift_schema import RedShiftLogSchema


def create_new_yaml(suffix, input_file=sys.stdin):

    schema = RedShiftLogSchema(yaml.load(input_file))
    for table_name in schema.tables().keys():
        schema.table_rename(
            table_name, '{original_name}{suffix}'.format(
                original_name=table_name,
                suffix=suffix
            )
        )
    return yaml.dump(schema.schema(), default_flow_style=False)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Rename table with suffix')
    parser.add_argument('--suffix')
    args = parser.parse_args()
    print RedShiftLogSchema.header()
    print create_new_yaml(args.suffix)
Example #19
def analyze_tables(psql, db, tables, schemaname=DEFAULT_NAMESPACE):
    num_failures = 0
    for tbl_name in tables:
        tbl_name = get_namespaced_tablename(tbl_name, schemaname)
        try:
            analyze_table(psql, db, tbl_name)
        except Exception:
            # count the failure but keep analyzing the remaining tables
            num_failures += 1
    if num_failures:
        raise RuntimeError(
            'failed to analyze {0} tables, see log'.format(num_failures))


if __name__ == "__main__":
    args = get_cmd_line_args()
    run_local = args.run_local
    merge_configs(args.config)
    db = read_string('pipeline.redshift_database')
    log_stream = read_string('pipeline.load_step.s3_to_redshift_stream')
    logstream = PipelineStreamLogger(log_stream, run_local, 'redshift_maint')
    psql = RedshiftPostgres(logstream, args.credentials, run_local=run_local)

    yaml = load_from_file(args.schema)
    schema = RedShiftLogSchema(safe_load(yaml))

    if args.compact:
        compact_tables(psql, db, schema.tables(), args.redshift_schema)
    analyze_tables(psql, db, schema.tables(), args.redshift_schema)
Example #20
def main():
    schema = RedShiftLogSchema(safe_load(sys.stdin))
    sql_str = tables_to_sql(schema.tables())
    sys.stdout.write(sql_str)
Example #21
Usage:

    cat schema/db.yaml  | sherlock/tools/table_rename.py \
            --suffix '_test_5'
"""

import argparse
import sys
import yaml

from sherlock.common.redshift_schema import RedShiftLogSchema


def create_new_yaml(suffix, input_file=sys.stdin):

    schema = RedShiftLogSchema(yaml.load(input_file))
    for table_name in schema.tables().keys():
        schema.table_rename(
            table_name,
            '{original_name}{suffix}'.format(original_name=table_name,
                                             suffix=suffix))
    return yaml.dump(schema.schema(), default_flow_style=False)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Rename table with suffix')
    parser.add_argument('--suffix')
    args = parser.parse_args()
    print RedShiftLogSchema.header()
    print create_new_yaml(args.suffix)
Example #22
        "--add-source-filename",
        action='store_true',
        help="[%(default)s] column indicating source of each table row")
    parser.add_argument(
        "--merge-with-schema",
        metavar='PATH_TO_SCHEMA_FILE',
        help="path to schema file into which new table is merged")
    args = parser.parse_args()
    args.foreign.sort()
    return args


if __name__ == "__main__":
    args = get_cmd_line_args()

    schema = RedShiftLogSchema()
    version = 0

    if args.merge_with_schema:
        with open(args.merge_with_schema, 'r') as yaml_file:
            schema = RedShiftLogSchema(yaml.safe_load(yaml_file))
            if schema.get_table(args.table) is not None:
                schema.table_delete(args.table)
            version = schema.version_get()

    sm = RedShiftSchemaMaker(exclude=args.exclude, prune=args.prune)
    for line in fileinput.input('-'):
        process_row(sm, simplejson.loads(line), args)

    table = sm.mk_table(sm.schema,
                        args.table,
Example #23
        "--add-source-filename", action='store_true',
        help="[%(default)s] column indicating source of each table row"
    )
    parser.add_argument(
        "--merge-with-schema", metavar='PATH_TO_SCHEMA_FILE',
        help="path to schema file into which new table is merged"
    )
    args = parser.parse_args()
    args.foreign.sort()
    return args


if __name__ == "__main__":
    args = get_cmd_line_args()

    schema = RedShiftLogSchema()
    version = 0

    if args.merge_with_schema:
        with open(args.merge_with_schema, 'r') as yaml_file:
            schema = RedShiftLogSchema(yaml.safe_load(yaml_file))
            if schema.get_table(args.table) is not None:
                schema.table_delete(args.table)
            version = schema.version_get()

    sm = RedShiftSchemaMaker(exclude=args.exclude, prune=args.prune)
    for line in fileinput.input('-'):
        process_row(sm, simplejson.loads(line), args)

    table = sm.mk_table(sm.schema, args.table, args.source, args.source_type,
                        add_source_filename=args.add_source_filename)
Example #24
    parser = argparse.ArgumentParser(
        description="""
This is a tool to manipulate the Redshift schema without manually editing the
yaml definition. The program takes its input from standard input.

Example Usage:

    cat schema/db.yaml | %(prog)s table_create -s '' --source-type DICT
        """,
        formatter_class=argparse.RawTextHelpFormatter,
    )

    subparsers = parser.add_subparsers()
    set_table_create_parser(subparsers)
    set_table_delete_parser(subparsers)
    set_column_add_parser(subparsers)
    set_column_remove_parser(subparsers)
    set_sortkeys_add_parser(subparsers)
    args = parser.parse_args()
    return args


if __name__ == "__main__":
    args = get_cmd_line_args()
    schema = RedShiftLogSchema(yaml.load(sys.stdin.read()) or dict())
    version = schema.version_get()
    args.func(schema, args)
    schema.version_set(version + 1)
    sys.stdout.write(RedShiftLogSchema.header())
    sys.stdout.write(yaml.dump(schema.schema(), default_flow_style=False))