def dates_from_rs_status(status_helper, db, logstream, retry_on_err,
                         single_date=None):
    """Return the run of dates whose ET step is complete but whose load
    step has not started, stopping at any gap in the daily sequence or at
    an error we are not retrying.

    Args:
    status_helper -- a wrapper around a backing store to aid in CRUD
    db -- is the database we query
    logstream -- a PipelineStreamLogger
    retry_on_err -- a boolean, True if we're retrying on errors
    single_date -- date string of the form YYYY-MM-DD if we're only
        looking for one

    Returns:
    a list of dates to catch up on formatted as strings YYYY/MM/DD
    """
    table_versions = get_yaml_table_versions(pipeline_yaml_schema_file_path())

    if single_date is None:
        # scan a configurable lookback window ending now (UTC)
        lookback_days = read_int('pipeline.load_step.days_to_check') + 1
        window_start = datetime.utcnow() - timedelta(days=lookback_days)
        rows = status_helper.query_et_complete_jobs(
            db, table_versions, window_start)
    else:
        normalized = get_formatted_date(single_date)
        if normalized is None:
            handle_error("bad input date: {0}".format(single_date), logstream)
        window_start = datetime.strptime(normalized, "%Y/%m/%d")
        rows = status_helper.query_et_complete_job(
            db, table_versions, normalized)

    if rows is False:
        handle_error(
            "query for complete et job failed, version={0}, date={1}".format(
                table_versions,
                normalized if single_date is not None else window_start),
            logstream)

    ready_dates = []
    prev_date = (window_start - timedelta(days=1)).strftime("%Y/%m/%d")
    for day, load_status in rows:
        # stop at the first hole in the day-by-day sequence
        if not one_day_greater(day, prev_date):
            break
        # an unretried error blocks this date and everything after it
        if load_status == 'error' and not retry_on_err:
            break
        if load_status is None or load_status == 'error':
            ready_dates.append(day)
        prev_date = day

    logstream.write_msg(
        status='running',
        extra_msg="candidates dates for load: {0}".format(ready_dates))
    return ready_dates
def dates_from_rs_status(status_helper, db, logstream, retry_on_err,
                         single_date=None):
    """Collect dates that finished the ET step but have not been loaded,
    walking forward day by day and halting on a gap or a blocking error.

    Args:
    status_helper -- a wrapper around a backing store to aid in CRUD
    db -- is the database we query
    logstream -- a PipelineStreamLogger
    retry_on_err -- a boolean, True if we're retrying on errors
    single_date -- date string of the form YYYY-MM-DD if we're only
        looking for one

    Returns:
    a list of dates to catch up on formatted as strings YYYY/MM/DD
    """
    versions = get_yaml_table_versions(pipeline_yaml_schema_file_path())
    if single_date is not None:
        data_date = get_formatted_date(single_date)
        if data_date is None:
            handle_error("bad input date: {0}".format(single_date), logstream)
        start_dt = datetime.strptime(data_date, "%Y/%m/%d")
        complete = status_helper.query_et_complete_job(db, versions, data_date)
        queried_for = data_date
    else:
        back = read_int('pipeline.load_step.days_to_check') + 1
        start_dt = datetime.utcnow() - timedelta(days=back)
        complete = status_helper.query_et_complete_jobs(db, versions, start_dt)
        queried_for = start_dt

    if complete is False:
        handle_error(
            "query for complete et job failed, version={0}, date={1}".format(
                versions, queried_for),
            logstream)

    picked = []
    # seed with the day before the window so the first row can match
    previous = (start_dt - timedelta(days=1)).strftime("%Y/%m/%d")
    for current, status in complete:
        if not one_day_greater(current, previous):
            break
        if status is None:
            picked.append(current)
        elif status == 'error':
            if not retry_on_err:
                break
            picked.append(current)
        previous = current

    logstream.write_msg(
        status='running',
        extra_msg="candidates dates for load: {0}".format(picked))
    return picked
def s3_to_redshift_main(args):
    """Drive the load step: find dates whose ET output is ready in S3 and
    COPY each one into Redshift, tracking progress in a status table.

    Args:
    args -- parsed CLI namespace; reads run_local, private, db_file,
            skip_progress_in_redshift, retry_errors, date, ttl_days

    Raises:
    IOError -- when a specific date was requested but it is either already
               loaded or its ET step has not completed
    """
    db = read_string('pipeline.redshift_database')
    s3_log_prefix = read_string('pipeline.s3_output_prefix').format(
        logname=os.environ.get('LOGNAME', 'unknown')
    )

    # setup logging
    stream_name = read_string('pipeline.load_step.s3_to_redshift_stream')
    log_stream = PipelineStreamLogger(
        stream_name, args.run_local, 's3_to_redshift', job_name='load'
    )

    # handle to redshift db
    loader_psql = RedshiftPostgres(
        log_stream, args.private, run_local=args.run_local
    )

    # progress is tracked either in DynamoDB or in Redshift itself
    if args.skip_progress_in_redshift:
        status_table = DynamoDbStatusTable(
            log_stream, run_local=args.run_local
        )
    else:
        status_table = RedshiftStatusTable(loader_psql)

    create_tuples = get_table_creates(args.db_file, log_stream)
    data_candidates = dates_from_rs_status(
        status_table, db, log_stream, args.retry_errors, args.date,
    )
    if data_candidates:
        # apply any schema changes before loading the oldest candidate
        try:
            update_database_schema(
                loader_psql, db, data_candidates[0],
                s3_log_prefix, args.db_file, log_stream
            )
        except Exception as e:
            # record the failure against the date we were about to load,
            # then propagate
            status_table.update_status(
                db, data_candidates[0],
                get_yaml_table_versions(pipeline_yaml_schema_file_path()),
                "error", start_time_secs=time.time(), error_msg=repr(e)
            )
            raise
    elif args.date is not None:
        # fixed: the original built this message with a backslash line
        # continuation inside the literal, which embedded raw source
        # indentation into the user-facing error text
        raise IOError(
            "{0} data is either already loaded "
            "or has not yet completed ET step".format(args.date)
        )

    for input_date in data_candidates:
        # fresh per-date logger so messages carry the date being loaded
        log_stream = PipelineStreamLogger(
            stream_name, args.run_local, 's3_to_redshift',
            job_name='load', input_date=input_date
        )
        logs_to_copy = [
            (join(s3_log_prefix, input_date, table), table)
            for (table, _) in create_tuples
        ]
        copy_tables(loader_psql, status_table, db, input_date,
                    logs_to_copy, args.ttl_days, log_stream)
def copy_tables(psql_helper, status_helper, db_name, ddate, log_tuples,
                ttl_days, logstream):
    """ copy_tables takes a list of input log, table pairs and copies each
    input log to its corresponding input table

    The per-date status row is set to "running" first, "error" on the first
    failed copy (via handle_error), and "complete" only after every tuple
    has been processed.

    Args:
    psql_helper -- a RedshiftPostgres object to help perform the copy
    status_helper -- An object handle to interact with status table
    db_name -- the name of the db to which we're copying
    ddate -- the date string of the data to be copied formatted YYYY/MM/DD
    log_tuples -- a list of (log, table) pairs
    ttl_days -- how many days to retain loaded data
    logstream -- a PipelineStreamLogger

    Returns:
    ---
    """
    start = time.time()
    yaml_versions = get_yaml_table_versions(pipeline_yaml_schema_file_path())
    status_helper.update_status(db_name, ddate, yaml_versions, "running")
    err_tbl_name, _ = RedShiftLogSchema().get_error_table()
    for log_tuple in log_tuples:
        # result is tri-state: False = failed, None = skip error handling
        # (interrupt or ignorable missing-prefix case), truthy = success
        result = False
        error_msg = None
        try:
            result = copy_table(psql_helper, db_name, ddate, log_tuple,
                                ttl_days, logstream)
        except KeyboardInterrupt:
            # manual interrupt: don't record an error status, just stop
            result = None
            raise
        except Exception:
            exc_type, exc_value, exc_tb = sys.exc_info()
            # capture traceback + exception text for the status table
            error_msg = "{0}".format({
                'crash_tb': ''.join(traceback.format_tb(exc_tb)),
                'crash_exc': traceback.format_exception_only(
                    exc_type, exc_value
                )[0].strip()
            })
            # ignore copy error if error table does not exist
            # NOTE(review): assumes exc_value.args[0] is a string
            # (e.g. a psql error message) — verify against copy_table
            s3_log, rs_table = log_tuple
            if rs_table == err_tbl_name and \
                    exc_value.args[0].find('The specified S3 prefix') != -1 and \
                    exc_value.args[0].find('does not exist') != -1:
                result = None
        finally:
            if result is False:
                _, rs_table = log_tuple
                if error_msg is None:
                    # copy_table returned False without raising
                    error_msg = "failed copy {0} for date: {1}".format(
                        get_namespaced_tablename(rs_table), ddate
                    )
                status_helper.update_status(
                    db_name, ddate, yaml_versions, "error",
                    start_time_secs=start, error_msg=error_msg
                )
                # presumably raises / aborts the run — TODO confirm
                handle_error(error_msg, logstream)
    status_helper.update_status(db_name, ddate, yaml_versions, "complete",
                                start_time_secs=start)
def test_new_get_yaml_table_version():
    """New-style schema yaml yields a space-joined version string."""
    result = get_yaml_table_versions('tests/common/test_db_new.yaml')
    assert result == "aaaa bbbb cccc 2"
def test_old_get_yaml_table_version():
    """Old-style schema yaml yields 'name: version' pairs."""
    result = get_yaml_table_versions('tests/common/test_db_old.yaml')
    assert result == "aaaa: 1 bbbb: 2 cccc: 3"
def s3_to_redshift_main(args):
    """Entry point for the load step: determine which dates are ready and
    COPY each date's S3 output into Redshift.

    Args:
    args -- parsed CLI namespace; reads run_local, private, db_file,
            skip_progress_in_redshift, retry_errors, date, ttl_days
    """
    # target database and the S3 prefix where ET output lives
    db = read_string('pipeline.redshift_database')
    s3_log_prefix = read_string('pipeline.s3_output_prefix').format(
        logname=os.environ.get('LOGNAME', 'unknown'))
    # setup logging
    stream_name = read_string('pipeline.load_step.s3_to_redshift_stream')
    LOG_STREAM = PipelineStreamLogger(stream_name,
                                      args.run_local,
                                      's3_to_redshift',
                                      job_name='load')
    # handle to redshift db
    loader_psql = RedshiftPostgres(LOG_STREAM, args.private,
                                   run_local=args.run_local)

    # progress is tracked either in DynamoDB or in Redshift itself
    if args.skip_progress_in_redshift:
        status_table = DynamoDbStatusTable(LOG_STREAM,
                                           run_local=args.run_local)
    else:
        status_table = RedshiftStatusTable(loader_psql)

    create_tuples = get_table_creates(args.db_file, LOG_STREAM)
    data_candidates = dates_from_rs_status(
        status_table,
        db,
        LOG_STREAM,
        args.retry_errors,
        args.date,
    )
    if data_candidates:
        # apply any schema changes before loading the oldest candidate
        try:
            update_database_schema(loader_psql,
                                   db,
                                   data_candidates[0],
                                   s3_log_prefix,
                                   args.db_file,
                                   LOG_STREAM)
        except Exception as e:
            # record the failure against the date being loaded, then re-raise
            status_table.update_status(db,
                                       data_candidates[0],
                                       get_yaml_table_versions(
                                           pipeline_yaml_schema_file_path()),
                                       "error",
                                       start_time_secs=time.time(),
                                       error_msg=repr(e))
            raise
    elif args.date is not None:
        raise IOError("{0} data is either already loaded \
or has not yet completed ET step".format(args.date))
    logs_to_copy = []
    for input_date in data_candidates:
        # fresh per-date logger so messages carry the date being loaded
        LOG_STREAM = PipelineStreamLogger(stream_name,
                                          args.run_local,
                                          's3_to_redshift',
                                          job_name='load',
                                          input_date=input_date)
        logs_to_copy = [(join(s3_log_prefix, input_date, table), table)
                        for (table, _) in create_tuples]
        copy_tables(loader_psql, status_table, db, input_date,
                    logs_to_copy, args.ttl_days, LOG_STREAM)
def copy_tables(psql_helper, status_helper, db_name, ddate, log_tuples,
                ttl_days, logstream):
    """ copy_tables takes a list of input log, table pairs and copies each
    input log to its corresponding input table

    Status transitions: "running" on entry, "error" on the first failing
    copy (then handle_error is invoked), "complete" after all tuples.

    Args:
    psql_helper -- a RedshiftPostgres object to help perform the copy
    status_helper -- An object handle to interact with status table
    db_name -- the name of the db to which we're copying
    ddate -- the date string of the data to be copied formatted YYYY/MM/DD
    log_tuples -- a list of (log, table) pairs
    ttl_days -- how many days to retain loaded data
    logstream -- a PipelineStreamLogger

    Returns:
    ---
    """
    start = time.time()
    yaml_versions = get_yaml_table_versions(pipeline_yaml_schema_file_path())
    status_helper.update_status(db_name, ddate, yaml_versions, "running")
    err_tbl_name, _ = RedShiftLogSchema().get_error_table()
    for log_tuple in log_tuples:
        # result tri-state: False = failure, None = skip error handling
        # (interrupt or tolerated missing error-table prefix), truthy = ok
        result = False
        error_msg = None
        try:
            result = copy_table(psql_helper, db_name, ddate, log_tuple,
                                ttl_days, logstream)
        except KeyboardInterrupt:
            # manual interrupt: don't write an error status, just stop
            result = None
            raise
        except Exception:
            exc_type, exc_value, exc_tb = sys.exc_info()
            # capture traceback + exception text for the status table
            error_msg = "{0}".format({
                'crash_tb': ''.join(traceback.format_tb(exc_tb)),
                'crash_exc': traceback.format_exception_only(exc_type,
                                                             exc_value)[0].strip()
            })
            # ignore copy error if error table does not exist
            # NOTE(review): assumes exc_value.args[0] is a message string —
            # verify against what copy_table raises
            s3_log, rs_table = log_tuple
            if rs_table == err_tbl_name and \
                    exc_value.args[0].find('The specified S3 prefix') != -1 and \
                    exc_value.args[0].find('does not exist') != -1:
                result = None
        finally:
            if result is False:
                _, rs_table = log_tuple
                if error_msg is None:
                    # copy_table returned False without raising
                    error_msg = "failed copy {0} for date: {1}".format(
                        get_namespaced_tablename(rs_table), ddate)
                status_helper.update_status(db_name, ddate, yaml_versions,
                                            "error", start_time_secs=start,
                                            error_msg=error_msg)
                # presumably raises / aborts the run — TODO confirm
                handle_error(error_msg, logstream)
    status_helper.update_status(db_name, ddate, yaml_versions,
                                "complete", start_time_secs=start)
def __load_data_from_s3(status_helper, prefixes, date_with_slashes,
                        mrjob_path, local, db_name, logstream,
                        force_et=False):
    """Run the ET (mrjob) step for one date, using the first prefix where
    input data exists, and skip work that is already started or done.

    Args:
    status_helper -- An object handle to interact with status table
    prefixes -- a list of s3 prefixes for input data
    date_with_slashes -- a date string of the form 'YYYY/MM/DD'
    mrjob_path -- module.entry_point of the job to extract and transform
        the data
    local -- True if we're running locally (i.e., devc), False for an aws
        instance
    db_name -- database whose status table records progress
    logstream -- a PipelineStreamLogger
    force_et -- forwarded to get_next_dir_to_load

    Returns:
    ---
    """
    started_at = time.time()
    conditions = {
        'table_versions': get_yaml_table_versions(
            pipeline_yaml_schema_file_path()),
        'data_date': date_with_slashes,
    }

    # skip if another run already claimed this date/version
    if status_helper.et_started(conditions, db_name):
        logstream.write_msg("complete",
                            extra_msg="skipping: et_step already started")
        return

    input_prefix = get_next_dir_to_load(prefixes, date_with_slashes, local,
                                        logstream, force_et)
    if not input_prefix:
        err_msg = "no prefix available date={0}, prefixes={1}".format(
            date_with_slashes, prefixes)
        logstream.write_msg("error", error_msg=err_msg)
        status_helper.log_status_result(conditions, 0, db_name,
                                        failed=True, err_msg=err_msg)
        raise Exception(err_msg)

    # check if mrjob is already done
    logstream.write_msg("running", extra_msg="{0} {1} {2}".format(
        get_s3_output_user_prefix(), date_with_slashes, local))
    if data_available(get_s3_output_user_prefix(), date_with_slashes, local,
                      done_file_name='_SUCCESS'):
        logstream.write_msg("complete",
                            extra_msg="skipping: et_step already done")
        return

    mrjob_args = create_emr_args(date_with_slashes,
                                 read_int('pipeline.et_step.cores'),
                                 input_prefix, local)
    status_helper.insert_et(conditions, db_name)
    logstream.write_msg("running", extra_msg=mrjob_args)
    ok, err_reason = __run_mr_job(mrjob_path, mrjob_args, logstream)
    elapsed = time.time() - started_at
    status_helper.log_status_result(conditions, elapsed, db_name,
                                    failed=not ok, err_msg=err_reason)
    if not ok:
        raise Exception(err_reason)
def __load_data_from_s3(status_helper, prefixes, date_with_slashes,
                        mrjob_path, local, db_name, logstream,
                        force_et=False):
    """ load_data_from_s3 iterates over prefixes and loads data for a
    particular date for the first prefix where the data exists.  It also
    checks whether data has already been loaded for a date and if so,
    skips the load

    Args:
    status_helper -- An object handle to interact with status table
    prefixes -- a list of s3 prefixes for input data
    date_with_slashes -- a date string of the form 'YYYY/MM/DD'
    mrjob_path -- module.entry_point of the job to extract and \
        transform the data
    local -- True if we're running locally (i.e., devc) False
        for aws instance
    db_name -- database whose status table records progress
    logstream -- a PipelineStreamLogger
    force_et -- forwarded to get_next_dir_to_load

    Returns:
    ---
    """
    start_time = time.time()
    table_versions = get_yaml_table_versions(pipeline_yaml_schema_file_path())
    # conditions identify this run in the status table
    conditions = {
        'table_versions': table_versions,
        'data_date': date_with_slashes
    }
    # skip if another run already claimed this date/version
    if status_helper.et_started(conditions, db_name):
        logstream.write_msg("complete",
                            extra_msg="skipping: et_step already started")
        return
    prefix_for_this_data = get_next_dir_to_load(prefixes, date_with_slashes,
                                                local, logstream, force_et)
    if not prefix_for_this_data:
        # no input data anywhere: record a failed run and abort
        jobtime = 0
        err_msg = "no prefix available date={0}, prefixes={1}".format(
            date_with_slashes, prefixes)
        logstream.write_msg("error", error_msg=err_msg)
        status_helper.log_status_result(conditions, jobtime, db_name,
                                        failed=True, err_msg=err_msg)
        raise Exception(err_msg)
    # check if mrjob is already done
    data_we_check = "{0} {1} {2}".format(get_s3_output_user_prefix(),
                                         date_with_slashes, local)
    logstream.write_msg("running", extra_msg=data_we_check)
    if data_available(get_s3_output_user_prefix(), date_with_slashes, local,
                      done_file_name='_SUCCESS'):
        logstream.write_msg("complete",
                            extra_msg="skipping: et_step already done")
        return
    # NOTE(review): this jobtime is overwritten below and never read
    jobtime = time.time()
    mrjob_args = create_emr_args(date_with_slashes,
                                 read_int('pipeline.et_step.cores'),
                                 prefix_for_this_data, local)
    status_helper.insert_et(conditions, db_name)
    logstream.write_msg("running", extra_msg=mrjob_args)
    result, err_reason = __run_mr_job(mrjob_path, mrjob_args, logstream)
    failed = not result
    # elapsed wall time of the whole attempt, recorded with the outcome
    jobtime = time.time() - start_time
    status_helper.log_status_result(conditions, jobtime, db_name,
                                    failed=failed, err_msg=err_reason)
    if failed:
        raise Exception(err_reason)
    return