def run(self):
    """Create the Redshift student schema and its staging counterpart.

    Returns:
        True when both schemas exist (found or created); False on failure.
    """
    # f-string prefixes removed from literals with no placeholders (lint F541).
    app.logger.info('Starting student schema creation job...')
    app.logger.info('Executing SQL...')
    # Primary student schema.
    resolved_ddl = resolve_sql_template('create_student_schema.template.sql')
    if redshift.execute_ddl_script(resolved_ddl):
        app.logger.info(f"Schema '{app.config['REDSHIFT_SCHEMA_STUDENT']}' found or created.")
    else:
        app.logger.error('Student schema creation failed.')
        return False
    # Staging schema: same DDL template, pointed at the '<schema>_staging' name.
    resolved_ddl_staging = resolve_sql_template(
        'create_student_schema.template.sql',
        redshift_schema_student=app.config['REDSHIFT_SCHEMA_STUDENT'] + '_staging',
    )
    if redshift.execute_ddl_script(resolved_ddl_staging):
        app.logger.info(f"Schema '{app.config['REDSHIFT_SCHEMA_STUDENT']}_staging' found or created.")
    else:
        app.logger.error('Student staging schema creation failed.')
        return False
    return True
def run(self):
    """Generate intermediate Redshift tables, then refresh the SIS RDS indexes."""
    app.logger.info('Starting intermediate table generation job...')
    # EDL views replace the SIS schema as the source when the feature flag is on.
    if app.config['FEATURE_FLAG_EDL_SIS_VIEWS']:
        sis_source_schema = app.config['REDSHIFT_SCHEMA_EDL']
    else:
        sis_source_schema = app.config['REDSHIFT_SCHEMA_SIS']
    ddl_redshift = resolve_sql_template(
        'create_intermediate_schema.template.sql',
        current_term_id=current_term_id(),
        redshift_schema_sis=sis_source_schema,
    )
    if not redshift.execute_ddl_script(ddl_redshift):
        raise BackgroundJobError('Intermediate table creation job failed.')
    app.logger.info('Redshift tables generated.')
    ddl_rds = resolve_sql_template('update_rds_indexes_sis.template.sql')
    if not rds.execute(ddl_rds):
        raise BackgroundJobError('Failed to update RDS indexes for intermediate schema.')
    app.logger.info('RDS indexes updated.')
    return 'Intermediate table generation job completed.'
def run(self):
    """Generate Caliper analytics tables from the latest Canvas Caliper 'explode' extracts.

    Steps: locate the most recent S3 extract batch (falling back one day if
    today's has not landed), mount it as an external Redshift table,
    sanity-check its event timestamps against previously transformed data,
    then run the analytics-generation DDL.

    Raises:
        BackgroundJobError: on missing extracts, DDL failure, an empty
            timestamp query result, or an out-of-tolerance timestamp gap.
    """
    app.logger.info('Start generating canvas caliper analytics')
    redshift_schema_caliper_analytics = app.config['REDSHIFT_SCHEMA_CALIPER']
    redshift_schema_lrs_external = app.config['REDSHIFT_SCHEMA_LRS']
    canvas_caliper_explode_table = 'canvas_caliper_explode'

    # Because the Caliper incrementals are provided by a Glue job running on a different schedule, the latest batch
    # may have been delivered before last midnight UTC.
    s3_caliper_daily_path = get_s3_daily_canvas_caliper_explode_path()
    if not s3.get_keys_with_prefix(s3_caliper_daily_path):
        s3_caliper_daily_path = get_s3_daily_canvas_caliper_explode_path(datetime.now() - timedelta(days=1))
        if not s3.get_keys_with_prefix(s3_caliper_daily_path):
            raise BackgroundJobError('No timely S3 Caliper extracts found')
        else:
            app.logger.info('Falling back S3 Caliper extracts for yesterday')
    s3_caliper_daily_url = s3.build_s3_url(s3_caliper_daily_path)

    # Mount the extract as an external table; drop the external schema first so
    # the CREATE statements in the template start clean.
    resolved_ddl_caliper_explode = resolve_sql_template(
        'create_lrs_canvas_explode_table.template.sql',
        canvas_caliper_explode_table=canvas_caliper_explode_table,
        loch_s3_caliper_explode_url=s3_caliper_daily_url,
    )
    redshift.drop_external_schema(redshift_schema_lrs_external)
    if redshift.execute_ddl_script(resolved_ddl_caliper_explode):
        app.logger.info('Caliper explode schema and table successfully created.')
    else:
        raise BackgroundJobError('Caliper explode schema and table creation failed.')

    # Sanity-check event times from the latest Caliper batch against previously transformed event times.
    def datetime_from_query(query):
        # Run a single-row timestamp query; abort the whole job if no data comes back.
        response = redshift.fetch(query)
        timestamp = response and response[0] and response[0].get('timestamp')
        if not timestamp:
            raise BackgroundJobError(f'Timestamp query failed to return data for comparison; aborting job: {query}')
        if isinstance(timestamp, str):
            # Redshift may return the value as an ISO-8601 string; normalize to datetime.
            timestamp = datetime.strptime(timestamp, '%Y-%m-%dT%H:%M:%S.%fZ')
        return timestamp

    earliest_untransformed = datetime_from_query(
        f'SELECT MIN(timestamp) AS timestamp FROM {redshift_schema_lrs_external}.{canvas_caliper_explode_table}',
    )
    latest_transformed = datetime_from_query(
        f'SELECT MAX(timestamp) AS timestamp FROM {redshift_schema_caliper_analytics}.canvas_caliper_user_requests',
    )
    if not earliest_untransformed or not latest_transformed:
        return False

    # Config supplies (lower, upper) bounds, in seconds, on the allowed gap between
    # the newest transformed event and the oldest untransformed event.
    timestamp_diff = (earliest_untransformed - latest_transformed).total_seconds()
    lower_bound_tolerance, upper_bound_tolerance = app.config['LOCH_CANVAS_CALIPER_TIMESTAMP_DISCREPANCY_TOLERANCE']
    if timestamp_diff < lower_bound_tolerance or timestamp_diff > upper_bound_tolerance:
        raise BackgroundJobError(
            f'Unexpected difference between Caliper timestamps: latest transformed {latest_transformed}, '
            f'earliest untransformed {earliest_untransformed}',
        )

    resolved_ddl_caliper_analytics = resolve_sql_template('generate_caliper_analytics.template.sql')
    if redshift.execute_ddl_script(resolved_ddl_caliper_analytics):
        return 'Caliper analytics tables successfully created.'
    else:
        raise BackgroundJobError('Caliper analytics tables creation failed.')
def verify_migration(self, incremental_url, incremental_schema):
    """Confirm that the LRS statement count matches rows landed at incremental_url."""
    redshift.drop_external_schema(incremental_schema)
    ddl = resolve_sql_template(
        'create_lrs_statements_table.template.sql',
        redshift_schema_lrs_external=incremental_schema,
        loch_s3_lrs_statements_path=incremental_url,
    )
    if not redshift.execute_ddl_script(ddl):
        raise BackgroundJobError(f"LRS incremental schema '{incremental_schema}' creation failed.")
    app.logger.info(f"LRS incremental schema '{incremental_schema}' created.")
    rows = redshift.fetch(f'select count(*) from {incremental_schema}.statements')
    if not rows:
        raise BackgroundJobError(f"Failed to verify LRS incremental schema '{incremental_schema}'.")
    redshift_statement_count = rows[0].get('count')
    if redshift_statement_count != self.lrs_statement_count:
        raise BackgroundJobError(
            f'Discrepancy between LRS ({self.lrs_statement_count} statements)'
            f' and {incremental_url} ({redshift_statement_count} statements).'
        )
    app.logger.info(f'Verified {redshift_statement_count} rows migrated from LRS to {incremental_url}.')
def student_tables(app):
    """Use Postgres to mock the Redshift student schemas on local test runs."""
    from nessie.externals import rds, redshift
    from nessie.lib.util import resolve_sql_template_string, resolve_sql_template

    rds.execute(resolve_sql_template('create_rds_indexes.template.sql'))
    fixture_path = f"{app.config['BASE_DIR']}/fixtures"
    with open(f'{fixture_path}/students.sql', 'r') as sql_file:
        student_sql = sql_file.read()
    fixture_keys = (
        'sis_api_drops_and_midterms_11667051_2178',
        'sis_degree_progress_11667051',
        'sis_student_api_11667051',
        'sis_student_api_2345678901',
    )
    params = {}
    for key in fixture_keys:
        with open(f'{fixture_path}/{key}.json', 'r') as f:
            feed = f.read()
        if key.startswith('sis_student_api'):
            # Unwrap the raw API envelope down to the single student record.
            feed = json.dumps(json.loads(feed)['apiResponse']['response']['any']['students'][0])
        params[key] = feed
    redshift.execute(resolve_sql_template_string(student_sql), params=params)
    yield
    # Teardown: drop the mocked schemas from both local databases.
    for schema in ('asc_test', 'coe_test', 'student_test'):
        rds.execute(f'DROP SCHEMA {schema} CASCADE')
        redshift.execute(f'DROP SCHEMA {schema} CASCADE')
def verify_unloaded_count(self, url):
    """Check that the statement count unloaded to S3 matches the LRS count."""
    # Strip the final path component; the external table points at the containing prefix.
    url = path.split(url)[0]
    schema = app.config['REDSHIFT_SCHEMA_LRS']
    ddl = resolve_sql_template(
        'create_lrs_statements_unloaded_table.template.sql',
        redshift_schema_lrs_external=schema,
        loch_s3_lrs_statements_unloaded_path=url,
    )
    if not redshift.execute_ddl_script(ddl):
        raise BackgroundJobError(f"Failed to create statements_unloaded table in schema '{schema}'.")
    app.logger.info(f"statements_unloaded table created in schema '{schema}'.")
    rows = redshift.fetch(f'select count(*) from {schema}.statements_unloaded')
    if not rows:
        raise BackgroundJobError('Failed to get unloaded statement count.')
    unloaded_statement_count = rows[0].get('count')
    if unloaded_statement_count != self.lrs_statement_count:
        raise BackgroundJobError(
            f'Discrepancy between LRS ({self.lrs_statement_count} statements)'
            f' and {url} ({unloaded_statement_count} statements).')
    app.logger.info(f'Verified {unloaded_statement_count} unloaded from LRS to {url}.')
def create_rds_indexes(self):
    """Build RDS indexes for the advisor schema."""
    ddl = resolve_sql_template('index_advisors.template.sql')
    if not rds.execute(ddl):
        raise BackgroundJobError('Failed to create RDS indexes for advisor schema.')
    app.logger.info('Created RDS indexes for advisor schema.')
def index_appointment_advisors(self):
    """Create RDS indexes over SIS appointment advisors."""
    ddl = resolve_sql_template('index_sis_appointment_advisors.template.sql')
    if not rds.execute(ddl):
        raise BackgroundJobError('Failed to index appointment advisors.')
    app.logger.info('Indexed appointment advisors.')
def run(self):
    """Mount the latest Canvas data dump as an external Redshift schema.

    Falls back to yesterday's dump when today's has not yet arrived; raises
    BackgroundJobError if neither day has data or the DDL fails.
    """
    app.logger.info('Starting Canvas schema creation job...')
    canvas_path = get_s3_canvas_daily_path()
    if not s3.get_keys_with_prefix(canvas_path):
        canvas_path = get_s3_canvas_daily_path(datetime.now() - timedelta(days=1))
        if not s3.get_keys_with_prefix(canvas_path):
            raise BackgroundJobError('No timely Canvas data found, aborting')
        app.logger.info("Falling back to yesterday's Canvas data")
    external_schema = app.config['REDSHIFT_SCHEMA_CANVAS']
    s3_prefix = 's3://' + app.config['LOCH_S3_BUCKET'] + '/'
    redshift.drop_external_schema(external_schema)
    resolved_ddl = resolve_sql_template(
        'create_canvas_schema.template.sql',
        loch_s3_canvas_data_path_today=s3_prefix + canvas_path,
        loch_s3_canvas_data_path_current_term=s3_prefix + berkeley.s3_canvas_data_path_current_term(),
    )
    if not redshift.execute_ddl_script(resolved_ddl):
        raise BackgroundJobError('Canvas schema creation job failed.')
    verify_external_schema(external_schema, resolved_ddl)
    return 'Canvas schema creation job completed.'
def create_advising_note_authors(self):
    """Populate the advising note authors table in RDS."""
    ddl = resolve_sql_template('create_advising_note_authors.template.sql')
    if not rds.execute(ddl):
        raise BackgroundJobError('Failed to create advising note authors.')
    app.logger.info('Created advising note authors.')
def create_schema(self):
    """Aggregate E&I advising-note JSON files from S3 and create the external schema.

    Flattens the per-file JSON into two schema-friendly aggregates (notes and
    topics), uploads them, then runs the schema-creation DDL.

    Raises:
        BackgroundJobError: if the uploads or the DDL script fail.
    """
    base_s3_key = app.config['LOCH_S3_E_I_DATA_PATH']
    external_schema = app.config['REDSHIFT_SCHEMA_E_I_ADVISING_NOTES']
    redshift.drop_external_schema(external_schema)
    # Flatten E&I-sourced JSON files into two schema-friendly JSON files.
    notes = []
    for key in s3.get_keys_with_prefix(base_s3_key):
        if key.endswith('.json'):
            notes_json = s3.get_object_json(key)
            if notes_json and 'notes' in notes_json:
                notes += notes_json['notes']
    # Extract topics exactly once per note, after all notes are collected.
    # Extracting inside the collection loop would re-walk previously
    # accumulated notes on every file and duplicate their topics.
    topics = []
    for note in notes:
        topics += _extract_topics(note)
    if s3.upload_json(obj=notes, s3_key=f'{base_s3_key}/aggregated_notes/data.json') \
            and s3.upload_json(obj=topics, s3_key=f'{base_s3_key}/aggregated_topics/data.json'):
        # Create schema
        app.logger.info('Executing SQL...')
        resolved_ddl = resolve_sql_template('create_e_i_advising_notes_schema.template.sql')
        if redshift.execute_ddl_script(resolved_ddl):
            verify_external_schema(external_schema, resolved_ddl)
        else:
            raise BackgroundJobError('E&I Advising Notes schema creation job failed.')
    else:
        raise BackgroundJobError('Failed to upload aggregated E&I advising notes and topics.')
def index_advising_notes(self):
    """Create RDS indexes over advising notes."""
    ddl = resolve_sql_template('index_advising_notes.template.sql')
    if not rds.execute(ddl):
        raise BackgroundJobError('Failed to index advising notes.')
    app.logger.info('Indexed advising notes.')
def verify_post_transform_statement_count(self, url):
    """Verify that the transformed statement count at url matches the pre-transform count."""
    schema = app.config['REDSHIFT_SCHEMA_LRS']
    ddl = resolve_sql_template(
        'create_lrs_canvas_explode_table.template.sql',
        redshift_schema_lrs_external=schema,
        canvas_caliper_explode_table='caliper_statements_explode_transient',
        loch_s3_caliper_explode_url=url,
    )
    if not redshift.execute_ddl_script(ddl):
        raise BackgroundJobError(f"Failed to create caliper_statements_explode_transient table in schema '{schema}'.")
    app.logger.info(f"caliper_statements_explode_transient table created in schema '{schema}'.")
    rows = redshift.fetch(f'select count(*) from {schema}.caliper_statements_explode_transient')
    if not rows:
        raise BackgroundJobError(f"Failed to verify caliper_statements_explode_transient table in schema '{schema}'.")
    exploded_statement_count = rows[0].get('count')
    if exploded_statement_count != self.pre_transform_statement_count:
        raise BackgroundJobError(
            f'Discrepancy between pre-transform statement count ({self.pre_transform_statement_count} statements)'
            f' and transformed statements at {url} ({exploded_statement_count} statements).')
    app.logger.info(f'Verified {exploded_statement_count} transformed statements migrated to {url}.')
def run(self):
    """Run the BOAC analytics creation DDL.

    Returns:
        A completion message on success; False on failure.
    """
    # f-string prefixes removed from literals with no placeholders (lint F541).
    app.logger.info('Starting BOAC analytics job...')
    resolved_ddl = resolve_sql_template('create_boac_schema.template.sql')
    if redshift.execute_ddl_script(resolved_ddl):
        return 'BOAC analytics creation job completed.'
    else:
        app.logger.error('BOAC analytics creation job failed.')
        return False
def create_internal_schema(self, external_schema, daily_path):
    """Load incremental SIS advising notes and build the internal schema."""
    bucket = app.config['LOCH_S3_PROTECTED_BUCKET']
    notes_url = f's3://{bucket}/{daily_path}/advising-notes'
    ddl = resolve_sql_template(
        'create_sis_advising_notes_schema.template.sql',
        loch_s3_sis_notes_path_today=notes_url,
    )
    if not redshift.execute_ddl_script(ddl):
        raise BackgroundJobError('SIS Advising Notes schema creation job failed to load incremental data and create internal schema.')
    verify_external_schema(external_schema, ddl)
def create_schema(self):
    """Create the Redshift EDL SIS schema and its staging counterpart.

    Raises:
        BackgroundJobError: if creation of either schema fails.
    """
    app.logger.info('Executing SQL...')
    template_sql = 'create_edl_schema.template.sql'
    resolved_ddl = resolve_sql_template(template_sql)
    if not redshift.execute_ddl_script(resolved_ddl):
        raise BackgroundJobError('EDL SIS schema creation job failed.')
    # Create staging schema from the same template, pointed at '<schema>_staging'.
    resolved_ddl_staging = resolve_sql_template(
        template_sql,
        redshift_schema_edl=f'{self.internal_schema}_staging',
    )
    if redshift.execute_ddl_script(resolved_ddl_staging):
        app.logger.info(f"Schema '{self.internal_schema}_staging' found or created.")
    else:
        # Name the staging schema explicitly; this failure is distinct from the
        # base-schema failure handled above.
        raise BackgroundJobError(f'{self.internal_schema}_staging schema creation failed.')
    app.logger.info('Redshift EDL schema created.')
def create_indexes(self):
    """Build RDS indexes for the Data Science Advising schema."""
    ddl = resolve_sql_template('index_data_science_advising.template.sql')
    if not rds.execute(ddl):
        raise BackgroundJobError('Data Science Advising schema creation job failed to create indexes.')
    app.logger.info('Created Data Science Advising RDS indexes.')
def run(self):
    """Create the ASC schema in Redshift; return True on success."""
    app.logger.info('Starting ASC schema creation job...')
    app.logger.info('Executing SQL...')
    ddl = resolve_sql_template('create_asc_schema.template.sql')
    if not redshift.execute_ddl_script(ddl):
        raise BackgroundJobError('ASC schema creation failed.')
    app.logger.info(f"Schema '{app.config['REDSHIFT_SCHEMA_ASC']}' found or created.")
    return True
def create_schema(self):
    """Drop and recreate the external SIS terms schema in Redshift."""
    app.logger.info('Executing SQL...')
    # NOTE(review): external_schema is not defined in this method; presumably a
    # module-level value — confirm against the enclosing module.
    redshift.drop_external_schema(external_schema)
    ddl = resolve_sql_template('create_sis_terms_schema.template.sql')
    if not redshift.execute_ddl_script(ddl):
        raise BackgroundJobError('SIS terms schema creation job failed.')
    verify_external_schema(external_schema, ddl)
def create_historical_tables(self, external_schema):
    """Load historical SIS advising notes data into the external schema."""
    ddl = resolve_sql_template('create_sis_advising_notes_historical_schema.template.sql')
    if not redshift.execute_ddl_script(ddl):
        raise BackgroundJobError('SIS Advising Notes schema creation job failed to load historical data.')
    verify_external_schema(external_schema, ddl)
def run(self):
    """Create or verify RDS indexes; return True on success."""
    app.logger.info('Starting RDS index creation job...')
    app.logger.info('Executing SQL...')
    ddl = resolve_sql_template('create_rds_indexes.template.sql')
    if not rds.execute(ddl):
        raise BackgroundJobError('RDS index creation failed.')
    app.logger.info('RDS indexes found or created.')
    return True
def create_indexes(self):
    """Build RDS indexes for the E&I Advising Notes schema."""
    ddl = resolve_sql_template('index_e_i_advising_notes.template.sql')
    if not rds.execute(ddl):
        raise BackgroundJobError('E&I Advising Notes schema creation job failed to create indexes.')
    app.logger.info('Created E&I Advising Notes RDS indexes.')
def run(self):
    """Create intermediate tables in Redshift.

    Returns:
        A completion message on success; False on failure.
    """
    # f-string prefixes removed from literals with no placeholders (lint F541).
    app.logger.info('Starting intermediate table generation job...')
    resolved_ddl = resolve_sql_template('create_intermediate_schema.template.sql')
    if redshift.execute_ddl_script(resolved_ddl):
        return 'Intermediate table creation job completed.'
    else:
        app.logger.error('Intermediate table creation job failed.')
        return False
def run(self):
    """Generate intermediate Redshift tables, then refresh the SIS RDS indexes.

    Chooses between EDL and SIS source schemas via FEATURE_FLAG_EDL_SIS_VIEWS and
    supplies a source-appropriate SQL fragment for excluding withdrawn enrollments.

    Raises:
        BackgroundJobError: if the Redshift DDL or the RDS index update fails.
    """
    app.logger.info('Starting intermediate table generation job...')
    if app.config['FEATURE_FLAG_EDL_SIS_VIEWS']:
        sis_source_schema = app.config['REDSHIFT_SCHEMA_EDL']
        # EDL data carries an explicit withdrawal status on the enrollment row.
        where_clause_exclude_withdrawn = "AND en.enrollment_status_reason <> 'WDRW'"
    else:
        sis_source_schema = app.config['REDSHIFT_SCHEMA_SIS']
        # SIS data has no explicit withdrawal marker; infer withdrawal by the
        # absence of any active primary-section enrollment for the same course.
        where_clause_exclude_withdrawn = f"""/* Enrollment with no primary section is likely a withdrawal. */
            AND EXISTS (
                SELECT en0.term_id, en0.section_id, en0.ldap_uid
                FROM {app.config['REDSHIFT_SCHEMA_SIS']}.enrollments en0
                JOIN {app.config['REDSHIFT_SCHEMA_INTERMEDIATE']}.course_sections crs0
                    ON crs0.sis_section_id = en0.section_id
                    AND crs0.sis_term_id = en0.term_id
                WHERE en0.term_id = en.term_id
                AND en0.ldap_uid = en.ldap_uid
                AND crs0.sis_course_name = crs.sis_course_name
                AND crs0.sis_primary = TRUE
                AND en0.enrollment_status != 'D'
                AND en0.grade != 'W'
            )"""
    resolved_ddl_redshift = resolve_sql_template(
        'create_intermediate_schema.template.sql',
        current_term_id=current_term_id(),
        redshift_schema_sis=sis_source_schema,
        where_clause_exclude_withdrawn=where_clause_exclude_withdrawn,
    )
    if redshift.execute_ddl_script(resolved_ddl_redshift):
        app.logger.info('Redshift tables generated.')
    else:
        raise BackgroundJobError('Intermediate table creation job failed.')
    resolved_ddl_rds = resolve_sql_template('update_rds_indexes_sis.template.sql')
    if rds.execute(resolved_ddl_rds):
        app.logger.info('RDS indexes updated.')
    else:
        raise BackgroundJobError('Failed to update RDS indexes for intermediate schema.')
    return 'Intermediate table generation job completed.'
def test_execute_ddl_script(self, app, ensure_drop_schema): """Executes filled SQL template files one statement at a time.""" # TODO Test CREATE EXTERNAL SCHEMA and CREATE EXTERNAL TABLE statements. resolved_ddl = resolve_sql_template('test_db.template.sql') redshift.execute_ddl_script(resolved_ddl) schema = psycopg2.sql.Identifier(app.config['REDSHIFT_SCHEMA_BOAC']) result = redshift.fetch('SELECT COUNT(*) FROM {schema}.students', schema=schema) assert len(result) == 1 assert result[0]['count'] == 7
def run(self):
    """Restore Redshift user privileges.

    Returns:
        True on success.

    Raises:
        BackgroundJobError: if the privilege-restoration DDL fails.
    """
    app.logger.info('Starting Redshift user privileges job...')
    app.logger.info('Executing SQL...')
    resolved_ddl = resolve_sql_template('restore_redshift_user_privileges.template.sql')
    if redshift.execute_ddl_script(resolved_ddl):
        app.logger.info('Restored Redshift user privileges successfully.')
    else:
        # Unreachable `return False` that followed this raise has been removed.
        raise BackgroundJobError('Redshift user privilege restoration failed.')
    return True
def create_indexes(self):
    """Build SIS Advising Notes RDS indexes against the EDL or SIS-internal schema."""
    if self.feature_flag_edl:
        source_schema = app.config['REDSHIFT_SCHEMA_EDL']
    else:
        source_schema = app.config['REDSHIFT_SCHEMA_SIS_ADVISING_NOTES_INTERNAL']
    ddl = resolve_sql_template(
        'index_sis_advising_notes.template.sql',
        redshift_schema=source_schema,
    )
    if not rds.execute(ddl):
        raise BackgroundJobError('SIS Advising Notes schema creation job failed to create indexes.')
    app.logger.info('Created SIS Advising Notes RDS indexes.')
def create_rds_tables_and_indexes(self):
    """Build the OUA Slate RDS tables and their indexes."""
    ddl = resolve_sql_template('index_oua_admissions.template.sql')
    if not rds.execute(ddl):
        raise BackgroundJobError('OUA Slate schema creation job failed to create rds tables and indexes.')
    app.logger.info('Created OUA Slate RDS tables and indexes successfully')
def create_schema(self):
    """Drop and recreate the external Data Science Advising schema."""
    external_schema = app.config['REDSHIFT_SCHEMA_DATA_SCIENCE_ADVISING']
    redshift.drop_external_schema(external_schema)
    ddl = resolve_sql_template('create_data_science_advising_schema.template.sql')
    if not redshift.execute_ddl_script(ddl):
        raise BackgroundJobError('Data Science Advising schema creation job failed.')
    verify_external_schema(external_schema, ddl)
def run(self):
    """Drop and recreate the external EDW schema in Redshift."""
    app.logger.info('Starting EDW schema creation job...')
    app.logger.info('Executing SQL...')
    external_schema = app.config['REDSHIFT_SCHEMA_EDW']
    redshift.drop_external_schema(external_schema)
    ddl = resolve_sql_template('create_edw_schema.template.sql')
    if not redshift.execute_ddl_script(ddl):
        raise BackgroundJobError('EDW schema creation job failed.')
    verify_external_schema(external_schema, ddl)
    return 'EDW schema creation job completed.'