def update_database_schema(psql, db, ddate, s3_logdir, schema_file, logstream):
    """ Check the new schema against what exists in the database:
    1. create new tables if missing
    2. compare table definitions
    3. add new columns

    Args:
    psql -- handle to talk to redshift
    db -- redshift database containing table
    ddate -- the date string of the data to be copied formatted YYYY/MM/DD
    s3_logdir -- path to location of tables in PSV format
    schema_file -- the name of the schema file with the create table command
    logstream -- a PipelineStreamLogger

    Return: None
    """
    # TODO: db.yaml as SPOT
    fname = schema_file.replace('.sql', '.yaml')
    yaml_dict = load_from_file(fname)
    rs_log_schema = RedShiftLogSchema(safe_load(yaml_dict))
    err_tbl_name, err_tbl = rs_log_schema.get_error_table()
    rs_log_schema.table_add(err_tbl_name, err_tbl)
    tables = rs_log_schema.tables()

    # create tables if missing from schema
    create_tuples = get_table_creates(schema_file, logstream)
    create_tables(psql, db, create_tuples)

    # check for schema changes
    for table in tables.keys():
        tmp_tbl_name = "tmp_{0}".format(table)
        namespaced_tmp_table = get_namespaced_tablename(tmp_tbl_name)

        # create temp table
        create_table_cmd = mk_create_table_sql_cmd(namespaced_tmp_table,
                                                   tables[table])
        psql.run_sql(create_table_cmd, db, create_table_cmd)

        try:
            # fetch table definitions
            cur_tbl_def = get_table_def(psql, db, table)
            tmp_tbl_def = get_table_def(psql, db, tmp_tbl_name)
            compare_table_defs(psql, db, table, cur_tbl_def, tmp_tbl_def)

            tbl_tuple = (join(s3_logdir, ddate, table), tmp_tbl_name)
            to_add = tmp_tbl_def[len(cur_tbl_def):]
            defaults = get_column_defaults(tables[table])
            add_columns(psql, db, ddate, table, to_add, tbl_tuple,
                        defaults, logstream)
        finally:
            if tmp_tbl_name != table:
                delete_table_cmd = 'drop table {0}'.format(namespaced_tmp_table)
                psql.run_sql(delete_table_cmd, db, delete_table_cmd)

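# Illustrative sketch (not called by the pipeline): how update_database_schema
# detects new columns.  get_table_def returns column rows in ordinal order, so
# once compare_table_defs has verified the shared prefix matches, any columns
# the temporary table has beyond len(cur_tbl_def) are the ones to add.  The
# tuples below are simplified stand-ins for real table-definition rows.
def _example_new_column_slice():
    cur_tbl_def = [('id', 'integer'), ('ts', 'timestamp')]
    tmp_tbl_def = [('id', 'integer'), ('ts', 'timestamp'),
                   ('referrer', 'varchar(256)')]
    to_add = tmp_tbl_def[len(cur_tbl_def):]
    assert to_add == [('referrer', 'varchar(256)')]
    return to_add
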
def analyze_tables(psql, db, tables, schemaname=DEFAULT_NAMESPACE):
    num_failures = 0
    for tbl_name in tables:
        tbl_name = get_namespaced_tablename(tbl_name, schemaname)
        try:
            analyze_table(psql, db, tbl_name)
        except Exception:
            num_failures += 1
    if num_failures:
        raise RuntimeError(
            'failed to analyze {0} tables, see log'.format(num_failures)
        )

def rs_check_table_rows(psql, db, tables, schemaname):
    """ Make sure that at least one row is present in each table
    """
    template = "select count(*) from {0}"
    for table in tables:
        namespaced_table = get_namespaced_tablename(table,
                                                    schemaname=schemaname)
        query = template.format(namespaced_table)
        result = psql.run_sql(query, db, query, output=True)
        if result is False:
            raise ValueError(
                "Error occurred, see {0} scribe log".format(psql.log_stream)
            )
        if result[0][0] <= 0:
            raise ValueError("{0}: has zero rows".format(namespaced_table))

def get_min_max_date(psql, db_name, table, column):
    """ Determine the oldest and freshest data in a table

    Args:
    psql -- handle to talk to redshift
    db_name -- redshift database containing table
    table -- table name
    column -- timestamp column name found via get_timestamp_column_name

    Return: min_date, max_date
    """
    query = QUERY_GET_MIN_MAX_DATE.format(
        get_namespaced_tablename(table), column
    )
    result = psql.run_sql(query, db_name, "get min,max date", output=True)
    return result[0][0], result[0][1]

def delete_old_data(psql, db_name, table, ttl_days):
    """ Delete data older than TTL.
    Round down min_date, max_date to 00:00:00 UTC for cutoff calculation.

    Args:
    psql -- handle to talk to redshift
    db_name -- redshift database containing table
    table -- table name
    ttl_days -- max TTL of data in a table

    Return: number of rows deleted
    """
    cname = get_timestamp_column_name(psql, db_name, table)
    if cname is None:
        return 0

    dt_min_date, dt_max_date = get_min_max_date(psql, db_name, table, cname)
    if dt_min_date is None or dt_max_date is None:
        return 0

    # cutoff is always YYYY-MM-DD 00:00:00
    dt_min = datetime(dt_min_date.year, dt_min_date.month, dt_min_date.day)
    dt_max = datetime(dt_max_date.year, dt_max_date.month, dt_max_date.day)
    num_days = (dt_max - dt_min).days
    num_deleted = 0
    if ttl_days is not None and num_days > ttl_days:
        dt_new_min_date = dt_min + timedelta(days=num_days - ttl_days)
        new_min_date = datetime.strftime(dt_new_min_date, "%Y-%m-%d %H:%M:%S")
        query = QUERY_DELETE_ROWS_BY_DATE.format(
            get_namespaced_tablename(table), cname
        )
        params = {'new_min_date': new_min_date}
        result = psql.run_sql_ex(query, db_name, "delete rows", params=params)
        if result is not False:
            match = re.search(r"^DELETE\s+(?P<num_deleted>\d+)$",
                              result.get('status', ''))
            num_deleted = int(match.group('num_deleted')) if match else 0
            if num_deleted <= 0:
                raise ValueError("nothing to delete for {0}".format(table))
    return num_deleted

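# Illustrative sketch (not called by the pipeline) of the two pieces of
# delete_old_data above: the midnight-rounded TTL cutoff, and parsing the
# "DELETE <n>" status string.  The dates and the status value are made up.
def _example_ttl_cutoff_and_delete_status():
    from datetime import datetime, timedelta
    import re

    dt_min = datetime(2014, 1, 1)   # oldest row, rounded down to midnight
    dt_max = datetime(2014, 1, 10)  # newest row, rounded down to midnight
    ttl_days = 7
    num_days = (dt_max - dt_min).days              # 9 days of data on disk
    new_min = dt_min + timedelta(days=num_days - ttl_days)
    assert new_min == datetime(2014, 1, 3)         # rows before this cutoff go

    match = re.search(r"^DELETE\s+(?P<num_deleted>\d+)$", "DELETE 1234")
    assert int(match.group('num_deleted')) == 1234
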
def copy_table(psql_helper, db_name, ddate, log_tuple, ttl_days, logstream):
    """ Copy one day of logs from S3 into its Redshift table, deleting
    rows older than the TTL and compacting the table before the load.
    """
    s3_log, rs_table = log_tuple
    namespaced_table_name = get_namespaced_tablename(rs_table)
    table_start = time.time()
    extra_msg = "from s3 log: {0}".format(s3_log)
    logstream.write_msg('starting', extra_msg=extra_msg)

    # about to load a new day, remove the oldest one
    rows_deleted = None
    if ttl_days is not None:
        rows_deleted = \
            delete_old_data(psql_helper, db_name, rs_table, ttl_days - 1)
    if rows_deleted:
        logstream.write_msg('delete_ok',
                            extra_msg="{0} rows".format(rows_deleted))

    # Try to reclaim disk space.  If not needed, it will be fast.
    # Calling here and not in the 'if rows_deleted' code to prevent a
    # scenario where rows were deleted but compact failed.  Then on retry
    # there will be nothing to delete, but since space is not reclaimed
    # there may not be enough for a new load, resulting in failure forever.
    if ttl_days is not None:
        compact_table(psql_helper, db_name, namespaced_table_name)

    delimiter = read_string('redshift_column_delimiter')
    delimiter = delimiter.decode("string_escape")
    if delimiter not in string.printable:
        delimiter = '\\' + oct(ord(delimiter))
    copy_sql = LOAD % (namespaced_table_name, s3_log, delimiter)
    result = psql_helper.run_sql(
        copy_sql, db_name, " copying from " + s3_log,
        s3_needed=True,
        time_est_secs=read_int('pipeline.load_step.copy_time_est_secs')
    )
    if result is not False:
        logstream.write_msg('complete', job_start_secs=table_start,
                            extra_msg=extra_msg)
    return result

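# Illustrative sketch (not called by the pipeline) of the delimiter handling
# in copy_table above.  It assumes Python 2, since str.decode("string_escape")
# does not exist on Python 3: an escaped config value such as "\x01" is turned
# back into the raw byte, and a non-printable delimiter is rewritten as an
# octal escape for the COPY statement.  The config value here is made up.
def _example_copy_delimiter():
    import string
    delimiter = "\\x01".decode("string_escape")   # raw SOH byte, '\x01'
    if delimiter not in string.printable:
        delimiter = '\\' + oct(ord(delimiter))    # oct(1) == '01' on Python 2
    return delimiter                              # backslash + octal digits
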
def get_create_commands(input_file, add_error_table=True):
    """ get_create_commands takes an input file and reads the create table
    sql commands from it.

    Args:
    input_file -- the full path of the input file
    add_error_table -- boolean flag to add error table to schema

    Returns:
    a list of (table_name, command) tuples where table_name is the name of
    the table to be created and command is the SQL command for creating it.
    Important because we don't want to create a table that already exists
    """
    # in regex \S is all non-whitespace and \s is whitespace only
    table_regex = re.compile(r'[\s]*(?P<tablename>[\S]+[\s]*)\(')
    command = load_from_file(input_file)
    if input_file[-5:] == ".yaml":
        rs_log_schema = RedShiftLogSchema(safe_load(command))
        if add_error_table:
            err_tbl_name, err_tbl = rs_log_schema.get_error_table()
            rs_log_schema.table_add(err_tbl_name, err_tbl)
        command = tables_to_sql(rs_log_schema.tables())

    commands = command.split('CREATE TABLE')
    table_create_tuples = []
    for cmd in commands[1:]:
        match = table_regex.search(cmd)
        if match is None:
            table_name = None
        else:
            table_name = match.group('tablename')
            table_to_create = get_namespaced_tablename(table_name)
            cmd = cmd.replace(table_name, table_to_create, 1)
        table_create_tuples.append((table_name, "create table " + cmd))
    return table_create_tuples

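# Illustrative sketch (not called by the pipeline) of how get_create_commands
# above extracts table names: the SQL is split on 'CREATE TABLE' and the regex
# captures everything up to the opening parenthesis of the column list.  The
# fragment below is a made-up example of what follows one 'CREATE TABLE'.
def _example_table_name_regex():
    import re
    table_regex = re.compile(r'[\s]*(?P<tablename>[\S]+[\s]*)\(')
    fragment = " search_events (\n    time int,\n    user_id bigint\n);"
    match = table_regex.search(fragment)
    assert match.group('tablename').strip() == 'search_events'
    return match.group('tablename')
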
def add_columns(psql, db, ddate, table, to_add, tbl_tuple, defaults, logstream):
    """ Add new columns to an existing table.
    Copy data into a temp table to detect encoding.

    Args:
    psql -- handle to talk to redshift
    db -- redshift database containing table
    ddate -- the date string of the data to be copied formatted YYYY/MM/DD
    table -- table name where to add columns -- not namespaced
    to_add -- list of columns to add
    tbl_tuple -- a tuple containing the path to the table in PSV format and
                 a Redshift temporary table where to load data
    defaults -- map from column name to column default value
    logstream -- a PipelineStreamLogger

    Return: None
    """
    _, tmp_tbl_name = tbl_tuple
    for row in to_add:
        if row[PgTableDef.SortKey] or row[PgTableDef.DistKey]:
            raise ValueError(
                "{0}: {1} new column is a sortkey or distkey".format(
                    table, row[PgTableDef.Column]
                )
            )

    if to_add:
        # copy data into tmp_tbl_name in order to detect encoding
        copy_table(psql, db, ddate, tbl_tuple, MAX_TTL_DAYS, logstream)
        tmp_tbl_def = get_table_def(psql, db, tmp_tbl_name)
        for row in tmp_tbl_def[len(tmp_tbl_def) - len(to_add):]:
            encoding = row[PgTableDef.Encoding]
            query = QUERY_ADD_COLUMN.format(
                get_namespaced_tablename(table),
                row[PgTableDef.Column],
                row[PgTableDef.Type],
                "raw" if encoding == "none" else encoding,
                "not null" if row[PgTableDef.NotNull] else "null",
                defaults[row[PgTableDef.Column]]
            )
            psql.run_sql(query, db, query)

def compact_tables(psql, db, tables, schemaname=DEFAULT_NAMESPACE):
    for tbl_name in tables:
        tbl_name = get_namespaced_tablename(tbl_name, schemaname)
        compact_table(psql, db, tbl_name, run_now=True)

def copy_tables(psql_helper, status_helper, db_name, ddate, log_tuples,
                ttl_days, logstream):
    """ copy_tables takes a list of input (log, table) pairs and copies each
    input log to its corresponding table

    Args:
    psql_helper -- a RedshiftPostgres object to help perform the copy
    status_helper -- an object handle to interact with the status table
    db_name -- the name of the db to which we're copying
    ddate -- the date string of the data to be copied formatted YYYY/MM/DD
    log_tuples -- a list of (log, table) pairs
    ttl_days -- how many days to retain loaded data
    logstream -- a PipelineStreamLogger

    Returns: None
    """
    start = time.time()
    yaml_versions = get_yaml_table_versions(pipeline_yaml_schema_file_path())
    status_helper.update_status(db_name, ddate, yaml_versions, "running")
    err_tbl_name, _ = RedShiftLogSchema().get_error_table()
    for log_tuple in log_tuples:
        result = False
        error_msg = None
        try:
            result = copy_table(psql_helper, db_name, ddate, log_tuple,
                                ttl_days, logstream)
        except KeyboardInterrupt:
            result = None
            raise
        except Exception:
            exc_type, exc_value, exc_tb = sys.exc_info()
            error_msg = "{0}".format({
                'crash_tb': ''.join(traceback.format_tb(exc_tb)),
                'crash_exc': traceback.format_exception_only(
                    exc_type, exc_value
                )[0].strip()
            })
            # ignore copy error if error table does not exist
            s3_log, rs_table = log_tuple
            if rs_table == err_tbl_name and \
                    exc_value.args[0].find('The specified S3 prefix') != -1 and \
                    exc_value.args[0].find('does not exist') != -1:
                result = None
        finally:
            if result is False:
                _, rs_table = log_tuple
                if error_msg is None:
                    error_msg = "failed copy {0} for date: {1}".format(
                        get_namespaced_tablename(rs_table), ddate
                    )
                status_helper.update_status(
                    db_name, ddate, yaml_versions, "error",
                    start_time_secs=start, error_msg=error_msg
                )
                handle_error(error_msg, logstream)
    status_helper.update_status(
        db_name, ddate, yaml_versions, "complete", start_time_secs=start
    )

def test_get_namespaced_tablename_config(input_config, expected_out):
    filepath = os.path.join('tests', 'common', input_config)
    YamlConfiguration(filepath)
    output_under_test = get_namespaced_tablename("table_name_blah")
    assert output_under_test == expected_out

def test_get_namespaced_tablename_arg(input_schema, expected_out):
    output_under_test = get_namespaced_tablename(
        "table_name_blah", schemaname=input_schema
    )
    assert output_under_test == expected_out