def dates_from_rs_status(status_helper, db, logstream, retry_on_err,
                         single_date=None):
    """Return the run of dates whose ET step is complete but whose load
    step has not started, stopping at any gap in the daily sequence or at
    an error we are not retrying.

    Args:
    status_helper -- a wrapper around a backing store to aid in CRUD
    db -- is the database we query
    logstream -- a PipelineStreamLogger
    retry_on_err -- a boolean, True if we're retrying on errors
    single_date -- date string of the form YYYY-MM-DD if we're only
        looking for one

    Returns:
    a list of dates to catch up on formatted as strings YYYY/MM/DD
    """
    table_versions = get_yaml_table_versions(pipeline_yaml_schema_file_path())

    if single_date is None:
        # scan a configurable lookback window ending now (UTC)
        lookback_days = read_int('pipeline.load_step.days_to_check') + 1
        window_start = datetime.utcnow() - timedelta(days=lookback_days)
        rows = status_helper.query_et_complete_jobs(
            db, table_versions, window_start)
    else:
        normalized = get_formatted_date(single_date)
        if normalized is None:
            handle_error("bad input date: {0}".format(single_date), logstream)
        window_start = datetime.strptime(normalized, "%Y/%m/%d")
        rows = status_helper.query_et_complete_job(
            db, table_versions, normalized)

    if rows is False:
        handle_error(
            "query for complete et job failed, version={0}, date={1}".format(
                table_versions,
                normalized if single_date is not None else window_start),
            logstream)

    ready_dates = []
    prev_date = (window_start - timedelta(days=1)).strftime("%Y/%m/%d")
    for day, load_status in rows:
        # stop at the first hole in the day-by-day sequence
        if not one_day_greater(day, prev_date):
            break
        # an unretried error blocks this date and everything after it
        if load_status == 'error' and not retry_on_err:
            break
        if load_status is None or load_status == 'error':
            ready_dates.append(day)
        prev_date = day

    logstream.write_msg(
        status='running',
        extra_msg="candidates dates for load: {0}".format(ready_dates))
    return ready_dates
def dates_from_rs_status(status_helper, db, logstream, retry_on_err,
                         single_date=None):
    """Collect dates that finished the ET step but have not been loaded,
    walking forward day by day and halting on a gap or a blocking error.

    Args:
    status_helper -- a wrapper around a backing store to aid in CRUD
    db -- is the database we query
    logstream -- a PipelineStreamLogger
    retry_on_err -- a boolean, True if we're retrying on errors
    single_date -- date string of the form YYYY-MM-DD if we're only
        looking for one

    Returns:
    a list of dates to catch up on formatted as strings YYYY/MM/DD
    """
    versions = get_yaml_table_versions(pipeline_yaml_schema_file_path())
    if single_date is not None:
        data_date = get_formatted_date(single_date)
        if data_date is None:
            handle_error("bad input date: {0}".format(single_date), logstream)
        start_dt = datetime.strptime(data_date, "%Y/%m/%d")
        complete = status_helper.query_et_complete_job(db, versions, data_date)
        queried_for = data_date
    else:
        back = read_int('pipeline.load_step.days_to_check') + 1
        start_dt = datetime.utcnow() - timedelta(days=back)
        complete = status_helper.query_et_complete_jobs(db, versions, start_dt)
        queried_for = start_dt

    if complete is False:
        handle_error(
            "query for complete et job failed, version={0}, date={1}".format(
                versions, queried_for),
            logstream)

    picked = []
    # seed with the day before the window so the first row can match
    previous = (start_dt - timedelta(days=1)).strftime("%Y/%m/%d")
    for current, status in complete:
        if not one_day_greater(current, previous):
            break
        if status is None:
            picked.append(current)
        elif status == 'error':
            if not retry_on_err:
                break
            picked.append(current)
        previous = current

    logstream.write_msg(
        status='running',
        extra_msg="candidates dates for load: {0}".format(picked))
    return picked
def s3_to_redshift_main(args):
    """Drive the load step: find dates whose ET output is ready in S3 and
    COPY each one into Redshift, tracking progress in a status table.

    Args:
    args -- parsed CLI namespace; reads run_local, private, db_file,
            skip_progress_in_redshift, retry_errors, date, ttl_days

    Raises:
    IOError -- when a specific date was requested but it is either already
               loaded or its ET step has not completed
    """
    db = read_string('pipeline.redshift_database')
    s3_log_prefix = read_string('pipeline.s3_output_prefix').format(
        logname=os.environ.get('LOGNAME', 'unknown')
    )

    # setup logging
    stream_name = read_string('pipeline.load_step.s3_to_redshift_stream')
    log_stream = PipelineStreamLogger(
        stream_name, args.run_local, 's3_to_redshift', job_name='load'
    )

    # handle to redshift db
    loader_psql = RedshiftPostgres(
        log_stream, args.private, run_local=args.run_local
    )

    # progress is tracked either in DynamoDB or in Redshift itself
    if args.skip_progress_in_redshift:
        status_table = DynamoDbStatusTable(
            log_stream, run_local=args.run_local
        )
    else:
        status_table = RedshiftStatusTable(loader_psql)

    create_tuples = get_table_creates(args.db_file, log_stream)
    data_candidates = dates_from_rs_status(
        status_table, db, log_stream, args.retry_errors, args.date,
    )
    if data_candidates:
        # apply any schema changes before loading the oldest candidate
        try:
            update_database_schema(
                loader_psql, db, data_candidates[0],
                s3_log_prefix, args.db_file, log_stream
            )
        except Exception as e:
            # record the failure against the date we were about to load,
            # then propagate
            status_table.update_status(
                db, data_candidates[0],
                get_yaml_table_versions(pipeline_yaml_schema_file_path()),
                "error", start_time_secs=time.time(), error_msg=repr(e)
            )
            raise
    elif args.date is not None:
        # fixed: the original built this message with a backslash line
        # continuation inside the literal, which embedded raw source
        # indentation into the user-facing error text
        raise IOError(
            "{0} data is either already loaded "
            "or has not yet completed ET step".format(args.date)
        )

    for input_date in data_candidates:
        # fresh per-date logger so messages carry the date being loaded
        log_stream = PipelineStreamLogger(
            stream_name, args.run_local, 's3_to_redshift',
            job_name='load', input_date=input_date
        )
        logs_to_copy = [
            (join(s3_log_prefix, input_date, table), table)
            for (table, _) in create_tuples
        ]
        copy_tables(loader_psql, status_table, db, input_date,
                    logs_to_copy, args.ttl_days, log_stream)
def copy_tables(psql_helper, status_helper, db_name, ddate, log_tuples,
                ttl_days, logstream):
    """ copy_tables takes a list of input log, table pairs and copies each
    input log to its corresponding input table

    The per-date status row is set to "running" first, "error" on the first
    failed copy (via handle_error), and "complete" only after every tuple
    has been processed.

    Args:
    psql_helper -- a RedshiftPostgres object to help perform the copy
    status_helper -- An object handle to interact with status table
    db_name -- the name of the db to which we're copying
    ddate -- the date string of the data to be copied formatted YYYY/MM/DD
    log_tuples -- a list of (log, table) pairs
    ttl_days -- how many days to retain loaded data
    logstream -- a PipelineStreamLogger

    Returns:
    ---
    """
    start = time.time()
    yaml_versions = get_yaml_table_versions(pipeline_yaml_schema_file_path())
    status_helper.update_status(db_name, ddate, yaml_versions, "running")
    err_tbl_name, _ = RedShiftLogSchema().get_error_table()
    for log_tuple in log_tuples:
        # result is tri-state: False = failed, None = skip error handling
        # (interrupt or ignorable missing-prefix case), truthy = success
        result = False
        error_msg = None
        try:
            result = copy_table(psql_helper, db_name, ddate, log_tuple,
                                ttl_days, logstream)
        except KeyboardInterrupt:
            # manual interrupt: don't record an error status, just stop
            result = None
            raise
        except Exception:
            exc_type, exc_value, exc_tb = sys.exc_info()
            # capture traceback + exception text for the status table
            error_msg = "{0}".format({
                'crash_tb': ''.join(traceback.format_tb(exc_tb)),
                'crash_exc': traceback.format_exception_only(
                    exc_type, exc_value
                )[0].strip()
            })
            # ignore copy error if error table does not exist
            # NOTE(review): assumes exc_value.args[0] is a string
            # (e.g. a psql error message) — verify against copy_table
            s3_log, rs_table = log_tuple
            if rs_table == err_tbl_name and \
                    exc_value.args[0].find('The specified S3 prefix') != -1 and \
                    exc_value.args[0].find('does not exist') != -1:
                result = None
        finally:
            if result is False:
                _, rs_table = log_tuple
                if error_msg is None:
                    # copy_table returned False without raising
                    error_msg = "failed copy {0} for date: {1}".format(
                        get_namespaced_tablename(rs_table), ddate
                    )
                status_helper.update_status(
                    db_name, ddate, yaml_versions, "error",
                    start_time_secs=start, error_msg=error_msg
                )
                # presumably raises / aborts the run — TODO confirm
                handle_error(error_msg, logstream)
    status_helper.update_status(db_name, ddate, yaml_versions, "complete",
                                start_time_secs=start)
def test_new_get_yaml_table_version():
    """New-style schema yaml yields a space-joined version string."""
    result = get_yaml_table_versions('tests/common/test_db_new.yaml')
    assert result == "aaaa bbbb cccc 2"
def test_old_get_yaml_table_version():
    """Old-style schema yaml yields 'name: version' pairs."""
    result = get_yaml_table_versions('tests/common/test_db_old.yaml')
    assert result == "aaaa: 1 bbbb: 2 cccc: 3"
def s3_to_redshift_main(args):
    """Entry point for the load step: determine which dates are ready and
    COPY each date's S3 output into Redshift.

    Args:
    args -- parsed CLI namespace; reads run_local, private, db_file,
            skip_progress_in_redshift, retry_errors, date, ttl_days
    """
    # target database and the S3 prefix where ET output lives
    db = read_string('pipeline.redshift_database')
    s3_log_prefix = read_string('pipeline.s3_output_prefix').format(
        logname=os.environ.get('LOGNAME', 'unknown'))
    # setup logging
    stream_name = read_string('pipeline.load_step.s3_to_redshift_stream')
    LOG_STREAM = PipelineStreamLogger(stream_name,
                                      args.run_local,
                                      's3_to_redshift',
                                      job_name='load')
    # handle to redshift db
    loader_psql = RedshiftPostgres(LOG_STREAM, args.private,
                                   run_local=args.run_local)

    # progress is tracked either in DynamoDB or in Redshift itself
    if args.skip_progress_in_redshift:
        status_table = DynamoDbStatusTable(LOG_STREAM,
                                           run_local=args.run_local)
    else:
        status_table = RedshiftStatusTable(loader_psql)

    create_tuples = get_table_creates(args.db_file, LOG_STREAM)
    data_candidates = dates_from_rs_status(
        status_table,
        db,
        LOG_STREAM,
        args.retry_errors,
        args.date,
    )
    if data_candidates:
        # apply any schema changes before loading the oldest candidate
        try:
            update_database_schema(loader_psql,
                                   db,
                                   data_candidates[0],
                                   s3_log_prefix,
                                   args.db_file,
                                   LOG_STREAM)
        except Exception as e:
            # record the failure against the date being loaded, then re-raise
            status_table.update_status(db,
                                       data_candidates[0],
                                       get_yaml_table_versions(
                                           pipeline_yaml_schema_file_path()),
                                       "error",
                                       start_time_secs=time.time(),
                                       error_msg=repr(e))
            raise
    elif args.date is not None:
        raise IOError("{0} data is either already loaded \
or has not yet completed ET step".format(args.date))
    logs_to_copy = []
    for input_date in data_candidates:
        # fresh per-date logger so messages carry the date being loaded
        LOG_STREAM = PipelineStreamLogger(stream_name,
                                          args.run_local,
                                          's3_to_redshift',
                                          job_name='load',
                                          input_date=input_date)
        logs_to_copy = [(join(s3_log_prefix, input_date, table), table)
                        for (table, _) in create_tuples]
        copy_tables(loader_psql, status_table, db, input_date,
                    logs_to_copy, args.ttl_days, LOG_STREAM)
def copy_tables(psql_helper, status_helper, db_name, ddate, log_tuples,
                ttl_days, logstream):
    """ copy_tables takes a list of input log, table pairs and copies each
    input log to its corresponding input table

    Status transitions: "running" on entry, "error" on the first failing
    copy (then handle_error is invoked), "complete" after all tuples.

    Args:
    psql_helper -- a RedshiftPostgres object to help perform the copy
    status_helper -- An object handle to interact with status table
    db_name -- the name of the db to which we're copying
    ddate -- the date string of the data to be copied formatted YYYY/MM/DD
    log_tuples -- a list of (log, table) pairs
    ttl_days -- how many days to retain loaded data
    logstream -- a PipelineStreamLogger

    Returns:
    ---
    """
    start = time.time()
    yaml_versions = get_yaml_table_versions(pipeline_yaml_schema_file_path())
    status_helper.update_status(db_name, ddate, yaml_versions, "running")
    err_tbl_name, _ = RedShiftLogSchema().get_error_table()
    for log_tuple in log_tuples:
        # result tri-state: False = failure, None = skip error handling
        # (interrupt or tolerated missing error-table prefix), truthy = ok
        result = False
        error_msg = None
        try:
            result = copy_table(psql_helper, db_name, ddate, log_tuple,
                                ttl_days, logstream)
        except KeyboardInterrupt:
            # manual interrupt: don't write an error status, just stop
            result = None
            raise
        except Exception:
            exc_type, exc_value, exc_tb = sys.exc_info()
            # capture traceback + exception text for the status table
            error_msg = "{0}".format({
                'crash_tb': ''.join(traceback.format_tb(exc_tb)),
                'crash_exc': traceback.format_exception_only(exc_type,
                                                             exc_value)[0].strip()
            })
            # ignore copy error if error table does not exist
            # NOTE(review): assumes exc_value.args[0] is a message string —
            # verify against what copy_table raises
            s3_log, rs_table = log_tuple
            if rs_table == err_tbl_name and \
                    exc_value.args[0].find('The specified S3 prefix') != -1 and \
                    exc_value.args[0].find('does not exist') != -1:
                result = None
        finally:
            if result is False:
                _, rs_table = log_tuple
                if error_msg is None:
                    # copy_table returned False without raising
                    error_msg = "failed copy {0} for date: {1}".format(
                        get_namespaced_tablename(rs_table), ddate)
                status_helper.update_status(db_name, ddate, yaml_versions,
                                            "error", start_time_secs=start,
                                            error_msg=error_msg)
                # presumably raises / aborts the run — TODO confirm
                handle_error(error_msg, logstream)
    status_helper.update_status(db_name, ddate, yaml_versions,
                                "complete", start_time_secs=start)
def __load_data_from_s3(status_helper, prefixes, date_with_slashes,
                        mrjob_path, local, db_name, logstream,
                        force_et=False):
    """Run the ET (mrjob) step for one date, using the first prefix where
    input data exists, and skip work that is already started or done.

    Args:
    status_helper -- An object handle to interact with status table
    prefixes -- a list of s3 prefixes for input data
    date_with_slashes -- a date string of the form 'YYYY/MM/DD'
    mrjob_path -- module.entry_point of the job to extract and transform
        the data
    local -- True if we're running locally (i.e., devc), False for an aws
        instance
    db_name -- database whose status table records progress
    logstream -- a PipelineStreamLogger
    force_et -- forwarded to get_next_dir_to_load

    Returns:
    ---
    """
    started_at = time.time()
    conditions = {
        'table_versions': get_yaml_table_versions(
            pipeline_yaml_schema_file_path()),
        'data_date': date_with_slashes,
    }

    # skip if another run already claimed this date/version
    if status_helper.et_started(conditions, db_name):
        logstream.write_msg("complete",
                            extra_msg="skipping: et_step already started")
        return

    input_prefix = get_next_dir_to_load(prefixes, date_with_slashes, local,
                                        logstream, force_et)
    if not input_prefix:
        err_msg = "no prefix available date={0}, prefixes={1}".format(
            date_with_slashes, prefixes)
        logstream.write_msg("error", error_msg=err_msg)
        status_helper.log_status_result(conditions, 0, db_name,
                                        failed=True, err_msg=err_msg)
        raise Exception(err_msg)

    # check if mrjob is already done
    logstream.write_msg("running", extra_msg="{0} {1} {2}".format(
        get_s3_output_user_prefix(), date_with_slashes, local))
    if data_available(get_s3_output_user_prefix(), date_with_slashes, local,
                      done_file_name='_SUCCESS'):
        logstream.write_msg("complete",
                            extra_msg="skipping: et_step already done")
        return

    mrjob_args = create_emr_args(date_with_slashes,
                                 read_int('pipeline.et_step.cores'),
                                 input_prefix, local)
    status_helper.insert_et(conditions, db_name)
    logstream.write_msg("running", extra_msg=mrjob_args)
    ok, err_reason = __run_mr_job(mrjob_path, mrjob_args, logstream)
    elapsed = time.time() - started_at
    status_helper.log_status_result(conditions, elapsed, db_name,
                                    failed=not ok, err_msg=err_reason)
    if not ok:
        raise Exception(err_reason)
def __load_data_from_s3(status_helper, prefixes, date_with_slashes,
                        mrjob_path, local, db_name, logstream,
                        force_et=False):
    """ load_data_from_s3 iterates over prefixes and loads data for a
    particular date for the first prefix where the data exists.  It also
    checks whether data has already been loaded for a date and if so,
    skips the load

    Args:
    status_helper -- An object handle to interact with status table
    prefixes -- a list of s3 prefixes for input data
    date_with_slashes -- a date string of the form 'YYYY/MM/DD'
    mrjob_path -- module.entry_point of the job to extract and \
        transform the data
    local -- True if we're running locally (i.e., devc) False
        for aws instance
    db_name -- database whose status table records progress
    logstream -- a PipelineStreamLogger
    force_et -- forwarded to get_next_dir_to_load

    Returns:
    ---
    """
    start_time = time.time()
    table_versions = get_yaml_table_versions(pipeline_yaml_schema_file_path())
    # conditions identify this run in the status table
    conditions = {
        'table_versions': table_versions,
        'data_date': date_with_slashes
    }
    # skip if another run already claimed this date/version
    if status_helper.et_started(conditions, db_name):
        logstream.write_msg("complete",
                            extra_msg="skipping: et_step already started")
        return
    prefix_for_this_data = get_next_dir_to_load(prefixes, date_with_slashes,
                                                local, logstream, force_et)
    if not prefix_for_this_data:
        # no input data anywhere: record a failed run and abort
        jobtime = 0
        err_msg = "no prefix available date={0}, prefixes={1}".format(
            date_with_slashes, prefixes)
        logstream.write_msg("error", error_msg=err_msg)
        status_helper.log_status_result(conditions, jobtime, db_name,
                                        failed=True, err_msg=err_msg)
        raise Exception(err_msg)
    # check if mrjob is already done
    data_we_check = "{0} {1} {2}".format(get_s3_output_user_prefix(),
                                         date_with_slashes, local)
    logstream.write_msg("running", extra_msg=data_we_check)
    if data_available(get_s3_output_user_prefix(), date_with_slashes, local,
                      done_file_name='_SUCCESS'):
        logstream.write_msg("complete",
                            extra_msg="skipping: et_step already done")
        return
    # NOTE(review): this jobtime is overwritten below and never read
    jobtime = time.time()
    mrjob_args = create_emr_args(date_with_slashes,
                                 read_int('pipeline.et_step.cores'),
                                 prefix_for_this_data, local)
    status_helper.insert_et(conditions, db_name)
    logstream.write_msg("running", extra_msg=mrjob_args)
    result, err_reason = __run_mr_job(mrjob_path, mrjob_args, logstream)
    failed = not result
    # elapsed wall time of the whole attempt, recorded with the outcome
    jobtime = time.time() - start_time
    status_helper.log_status_result(conditions, jobtime, db_name,
                                    failed=failed, err_msg=err_reason)
    if failed:
        raise Exception(err_reason)
    return