Example #1
def test_pipeline_json():
    logger = PipelineStreamLogger(None, False, 'test')
    now = time.time()
    js = logger._pipeline_json('complete',
                               error_msg='error!',
                               extra_msg='extra',
                               job_start_secs=now)
    json_dict = simplejson.loads(js)
    assert json_dict['msg']['status'] == 'complete'
    assert json_dict['msg']['additional_info'] == 'extra'
    # if the above lines of code take longer than 10 seconds something's wrong
    assert json_dict['msg']['job_time'] < 10
    assert json_dict['tag'] == 'test'
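The assertions above pin down the rough shape of the JSON payload that _pipeline_json emits. A minimal sketch of that envelope, inferred only from the keys the test checks; any field or value not asserted in the test (including where error_msg ends up, and the sample job_time) is an assumption:

# Approximate payload shape implied by the assertions above; only the keys
# asserted in the test are known, everything else here is an assumption.
expected_shape = {
    "tag": "test",                    # tag passed to PipelineStreamLogger
    "msg": {
        "status": "complete",         # first positional argument
        "additional_info": "extra",   # extra_msg keyword argument
        "job_time": 0.01,             # seconds elapsed since job_start_secs (illustrative value)
        # error_msg='error!' is also passed in, but the test does not assert
        # which key it lands under, so that field is left out of this sketch.
    },
}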
Example #2
def _get_logger(run_local, tag):
    try:
        return PipelineStreamLogger(staticconf.read_string("log_stream_name"),
                                    run_local, tag)
    except:
        logger.write_msg("Error creating a pipeline stream logger!")
        return logger  # Return existing logger instance in case of errors
Example #3
def s3_to_psv_main(args):

    mrjob = read_string('pipeline.et_step.mrjob')
    stream_name = read_string('pipeline.et_step.s3_to_s3_stream')
    DATABASE = read_string('pipeline.redshift_database')

    LOG_STREAM = PipelineStreamLogger(
        stream_name,
        args.run_local,
        mrjob,
        input_date=args.date
    )

    day_to_run = setup_dates_to_check(args.date, args.run_local, LOG_STREAM)

    try:
        if not args.run_local:
            setup_private(args.private)
        # Pick the status-table backend based on args: DynamoDB, or Redshift via a psql handle
        if args.skip_progress_in_redshift:
            status_table = DynamoDbStatusTable(
                LOG_STREAM, run_local=args.run_local
            )
        else:
            status_table = RedshiftStatusTable(
                RedshiftPostgres(
                    LOG_STREAM, args.private, run_local=args.run_local
                )
            )
        load_msg = __load_data_from_s3(
            status_table,
            read_list('pipeline.et_step.s3_prefixes'),
            day_to_run,
            mrjob,
            args.run_local,
            DATABASE,
            LOG_STREAM,
            force_et=args.force_et
        )
        LOG_STREAM.write_msg("complete", extra_msg=load_msg)

    finally:
        clear_env(args.run_local)
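The attributes read from args above (run_local, private, date, skip_progress_in_redshift, force_et) hint at the command-line parser that feeds s3_to_psv_main. A hedged sketch of such a parser, assuming argparse; the flag spellings, defaults, and help text are illustrative, not the project's real CLI:

import argparse

def build_arg_parser():
    # Hypothetical parser covering only the attributes the example reads;
    # option names and defaults are assumptions, not the real get_cmd_line_args.
    parser = argparse.ArgumentParser(description="S3 to PSV ET step")
    parser.add_argument('--date', default=None,
                        help='run the ET step for this date')
    parser.add_argument('--run-local', dest='run_local', action='store_true',
                        help='run against local resources instead of AWS')
    parser.add_argument('--private', default=None,
                        help='path to the credentials consumed by setup_private')
    parser.add_argument('--skip-progress-in-redshift',
                        dest='skip_progress_in_redshift', action='store_true',
                        help='track progress in DynamoDB instead of Redshift')
    parser.add_argument('--force-et', dest='force_et', action='store_true',
                        help='re-run the ET step even if already marked done')
    return parser

# Usage sketch: s3_to_psv_main(build_arg_parser().parse_args())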
Example #4
def s3_to_psv_main(args):

    mrjob = read_string('pipeline.et_step.mrjob')
    stream_name = read_string('pipeline.et_step.s3_to_s3_stream')
    DATABASE = read_string('pipeline.redshift_database')

    LOG_STREAM = PipelineStreamLogger(stream_name,
                                      args.run_local,
                                      mrjob,
                                      input_date=args.date)

    day_to_run = setup_dates_to_check(args.date, args.run_local, LOG_STREAM)

    try:
        if not args.run_local:
            setup_private(args.private)
        # Pick the status-table backend based on args: DynamoDB, or Redshift via a psql handle
        if args.skip_progress_in_redshift:
            status_table = DynamoDbStatusTable(LOG_STREAM,
                                               run_local=args.run_local)
        else:
            status_table = RedshiftStatusTable(
                RedshiftPostgres(LOG_STREAM,
                                 args.private,
                                 run_local=args.run_local))
        load_msg = __load_data_from_s3(
            status_table,
            read_list('pipeline.et_step.s3_prefixes'),
            day_to_run,
            mrjob,
            args.run_local,
            DATABASE,
            LOG_STREAM,
            force_et=args.force_et)
        LOG_STREAM.write_msg("complete", extra_msg=load_msg)

    finally:
        clear_env(args.run_local)
Example #5
def rs_check_schema(rs_mgmt, args):
    yaml_data = load_from_file(args.schema)
    tables = RedShiftLogSchema(safe_load(yaml_data)).tables()

    db = read_string('pipeline.redshift_database')
    log_stream = read_string('pipeline.load_step.s3_to_redshift_stream')
    pipe_strm_lgr = PipelineStreamLogger(
        log_stream,
        True,
        'rs_check_schema'
    )
    psql = RedshiftPostgres(pipe_strm_lgr, args.credentials, run_local=True)
    rs_check_table_def(psql, db, tables, args.redshift_schema)
    rs_check_table_rows(psql, db, tables, args.redshift_schema)
Example #6
def test_handle_error(input_value):
    test_logger = PipelineStreamLogger("test_stream", False, "test_tag")
    with pytest.raises(Exception):
        handle_error(input_value, test_logger)
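The input_value argument is supplied by a fixture or parametrization that is not shown in these examples. A minimal sketch of one way to drive it, assuming pytest.mark.parametrize with purely illustrative values:

import pytest

# Illustrative only: the real inputs for test_handle_error are defined
# elsewhere; these values simply demonstrate the parametrize pattern.
@pytest.mark.parametrize('input_value', [None, '', -1])
def test_handle_error_parametrized(input_value):
    test_logger = PipelineStreamLogger("test_stream", False, "test_tag")
    with pytest.raises(Exception):
        handle_error(input_value, test_logger)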
Example #7
def s3_to_redshift_main(args):

    db = read_string('pipeline.redshift_database')
    s3_log_prefix = read_string('pipeline.s3_output_prefix').format(
        logname=os.environ.get('LOGNAME', 'unknown'))

    # setup logging
    stream_name = read_string('pipeline.load_step.s3_to_redshift_stream')
    LOG_STREAM = PipelineStreamLogger(stream_name,
                                      args.run_local,
                                      's3_to_redshift',
                                      job_name='load')

    # handle to redshift db
    loader_psql = RedshiftPostgres(LOG_STREAM,
                                   args.private,
                                   run_local=args.run_local)

    if args.skip_progress_in_redshift:
        status_table = DynamoDbStatusTable(LOG_STREAM,
                                           run_local=args.run_local)
    else:
        status_table = RedshiftStatusTable(loader_psql)

    create_tuples = get_table_creates(args.db_file, LOG_STREAM)

    data_candidates = dates_from_rs_status(
        status_table,
        db,
        LOG_STREAM,
        args.retry_errors,
        args.date,
    )
    if data_candidates:
        try:
            update_database_schema(loader_psql, db, data_candidates[0],
                                   s3_log_prefix, args.db_file, LOG_STREAM)
        except Exception as e:
            status_table.update_status(db,
                                       data_candidates[0],
                                       get_yaml_table_versions(
                                           pipeline_yaml_schema_file_path()),
                                       "error",
                                       start_time_secs=time.time(),
                                       error_msg=repr(e))
            raise
    elif args.date is not None:
        raise IOError("{0} data is either already loaded \
or has not yet completed ET step".format(args.date))

    logs_to_copy = []
    for input_date in data_candidates:
        LOG_STREAM = PipelineStreamLogger(stream_name,
                                          args.run_local,
                                          's3_to_redshift',
                                          job_name='load',
                                          input_date=input_date)
        logs_to_copy = [(join(s3_log_prefix, input_date, table), table)
                        for (table, _) in create_tuples]
        copy_tables(loader_psql, status_table, db, input_date, logs_to_copy,
                    args.ttl_days, LOG_STREAM)
Example #8

def analyze_tables(psql, db, tables, schemaname=DEFAULT_NAMESPACE):
    num_failures = 0
    for tbl_name in tables:
        tbl_name = get_namespaced_tablename(tbl_name, schemaname)
        try:
            analyze_table(psql, db, tbl_name)
        except:
            num_failures += 1
    if num_failures:
        raise RuntimeError(
            'failed to analyze {0} tables, see log'.format(num_failures))


if __name__ == "__main__":
    args = get_cmd_line_args()
    run_local = args.run_local
    merge_configs(args.config)
    db = read_string('pipeline.redshift_database')
    log_stream = read_string('pipeline.load_step.s3_to_redshift_stream')
    logstream = PipelineStreamLogger(log_stream, run_local, 'redshift_maint')
    psql = RedshiftPostgres(logstream, args.credentials, run_local=run_local)

    yaml = load_from_file(args.schema)
    schema = RedShiftLogSchema(safe_load(yaml))

    if args.compact:
        compact_tables(psql, db, schema.tables(), args.redshift_schema)
    analyze_tables(psql, db, schema.tables(), args.redshift_schema)
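Taken together, the calls above suggest a constructor of the form PipelineStreamLogger(stream_name, run_local, tag, job_name=..., input_date=...) and a write_msg(status, extra_msg=...) method. A hedged sketch of a thin helper built only on those observed calls; the helper name is hypothetical and the real signatures may accept more parameters than shown here:

def make_et_logger(run_local, tag, input_date=None):
    # Hypothetical convenience wrapper based only on the constructor calls
    # shown above; parameter names beyond those calls are assumptions.
    stream_name = read_string('pipeline.et_step.s3_to_s3_stream')
    return PipelineStreamLogger(stream_name, run_local, tag,
                                input_date=input_date)

# Usage sketch (mirrors Example #3):
# log_stream = make_et_logger(args.run_local, mrjob, input_date=args.date)
# log_stream.write_msg("complete", extra_msg=load_msg)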