def run(self):
    app.logger.info('Starting student schema creation job...')
    app.logger.info('Executing SQL...')
    resolved_ddl = resolve_sql_template('create_student_schema.template.sql')
    if redshift.execute_ddl_script(resolved_ddl):
        app.logger.info(f"Schema '{app.config['REDSHIFT_SCHEMA_STUDENT']}' found or created.")
    else:
        app.logger.error('Student schema creation failed.')
        return False
    resolved_ddl_staging = resolve_sql_template(
        'create_student_schema.template.sql',
        redshift_schema_student=app.config['REDSHIFT_SCHEMA_STUDENT'] + '_staging',
    )
    if redshift.execute_ddl_script(resolved_ddl_staging):
        app.logger.info(f"Schema '{app.config['REDSHIFT_SCHEMA_STUDENT']}_staging' found or created.")
    else:
        app.logger.error('Student staging schema creation failed.')
        return False
    return True
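# The staging pass above overrides the template's redshift_schema_student variable.
# A minimal sketch of how that substitution might work, assuming {placeholder}-style
# tokens in the template files; the codebase's actual resolve_sql_template may differ:
def resolve_sql_template_sketch(template_string, **kwargs):
    substitutions = {
        # Defaults drawn from app config; per-call kwargs win, as in the
        # staging-schema call above.
        'redshift_schema_student': app.config['REDSHIFT_SCHEMA_STUDENT'],
    }
    substitutions.update(kwargs)
    for token, value in substitutions.items():
        template_string = template_string.replace('{' + token + '}', value)
    return template_string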
def run(self):
    app.logger.info('Start generating Canvas Caliper analytics')
    redshift_schema_caliper_analytics = app.config['REDSHIFT_SCHEMA_CALIPER']
    redshift_schema_lrs_external = app.config['REDSHIFT_SCHEMA_LRS']
    canvas_caliper_explode_table = 'canvas_caliper_explode'

    # Because the Caliper incrementals are provided by a Glue job running on a different schedule,
    # the latest batch may have been delivered before last midnight UTC.
    s3_caliper_daily_path = get_s3_daily_canvas_caliper_explode_path()
    if not s3.get_keys_with_prefix(s3_caliper_daily_path):
        s3_caliper_daily_path = get_s3_daily_canvas_caliper_explode_path(datetime.now() - timedelta(days=1))
        if not s3.get_keys_with_prefix(s3_caliper_daily_path):
            raise BackgroundJobError('No timely S3 Caliper extracts found')
        else:
            app.logger.info('Falling back to S3 Caliper extracts for yesterday')
    s3_caliper_daily_url = s3.build_s3_url(s3_caliper_daily_path)

    resolved_ddl_caliper_explode = resolve_sql_template(
        'create_lrs_canvas_explode_table.template.sql',
        canvas_caliper_explode_table=canvas_caliper_explode_table,
        loch_s3_caliper_explode_url=s3_caliper_daily_url,
    )
    redshift.drop_external_schema(redshift_schema_lrs_external)
    if redshift.execute_ddl_script(resolved_ddl_caliper_explode):
        app.logger.info('Caliper explode schema and table successfully created.')
    else:
        raise BackgroundJobError('Caliper explode schema and table creation failed.')

    # Sanity-check event times from the latest Caliper batch against previously transformed event times.
    def datetime_from_query(query):
        response = redshift.fetch(query)
        timestamp = response and response[0] and response[0].get('timestamp')
        if not timestamp:
            raise BackgroundJobError(f'Timestamp query failed to return data for comparison; aborting job: {query}')
        if isinstance(timestamp, str):
            timestamp = datetime.strptime(timestamp, '%Y-%m-%dT%H:%M:%S.%fZ')
        return timestamp

    earliest_untransformed = datetime_from_query(
        f'SELECT MIN(timestamp) AS timestamp FROM {redshift_schema_lrs_external}.{canvas_caliper_explode_table}',
    )
    latest_transformed = datetime_from_query(
        f'SELECT MAX(timestamp) AS timestamp FROM {redshift_schema_caliper_analytics}.canvas_caliper_user_requests',
    )
    if not earliest_untransformed or not latest_transformed:
        return False

    timestamp_diff = (earliest_untransformed - latest_transformed).total_seconds()
    lower_bound_tolerance, upper_bound_tolerance = app.config['LOCH_CANVAS_CALIPER_TIMESTAMP_DISCREPANCY_TOLERANCE']
    if timestamp_diff < lower_bound_tolerance or timestamp_diff > upper_bound_tolerance:
        raise BackgroundJobError(
            f'Unexpected difference between Caliper timestamps: latest transformed {latest_transformed}, '
            f'earliest untransformed {earliest_untransformed}',
        )

    resolved_ddl_caliper_analytics = resolve_sql_template('generate_caliper_analytics.template.sql')
    if redshift.execute_ddl_script(resolved_ddl_caliper_analytics):
        return 'Caliper analytics tables successfully created.'
    else:
        raise BackgroundJobError('Caliper analytics tables creation failed.')
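# The tolerance check above unpacks a two-element config value. An illustrative
# shape (these numbers are assumptions, not the deployed settings): allow the
# earliest untransformed event to be up to a minute older, or up to a day newer,
# than the latest transformed event.
LOCH_CANVAS_CALIPER_TIMESTAMP_DISCREPANCY_TOLERANCE = (-60, 86400)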
def test_execute_ddl_script(self, app, ensure_drop_schema):
    """Executes filled SQL template files one statement at a time."""
    # TODO Test CREATE EXTERNAL SCHEMA and CREATE EXTERNAL TABLE statements.
    resolved_ddl = resolve_sql_template('test_db.template.sql')
    redshift.execute_ddl_script(resolved_ddl)
    schema = psycopg2.sql.Identifier(app.config['REDSHIFT_SCHEMA_BOAC'])
    result = redshift.fetch('SELECT COUNT(*) FROM {schema}.students', schema=schema)
    assert len(result) == 1
    assert result[0]['count'] == 7
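# The fetch call above passes a psycopg2 Identifier for safe schema interpolation.
# A sketch of how such a fetch helper might compose the statement, assuming its
# keyword arguments are psycopg2.sql composables; the project's redshift.fetch
# may differ:
from psycopg2 import sql

def compose_statement(statement, **kwargs):
    # sql.SQL(...).format() quotes Identifiers correctly, unlike str.format().
    return sql.SQL(statement).format(**kwargs)

# e.g. compose_statement('SELECT COUNT(*) FROM {schema}.students', schema=sql.Identifier('boac_test'))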
def create_schema(self):
    base_s3_key = app.config['LOCH_S3_E_I_DATA_PATH']
    external_schema = app.config['REDSHIFT_SCHEMA_E_I_ADVISING_NOTES']
    redshift.drop_external_schema(external_schema)
    # Flatten E&I-sourced JSON files into two schema-friendly JSON files.
    notes = []
    topics = []
    for key in s3.get_keys_with_prefix(base_s3_key):
        if key.endswith('.json'):
            notes_json = s3.get_object_json(key)
            if notes_json and 'notes' in notes_json:
                notes += notes_json['notes']
    for note in notes:
        topics += _extract_topics(note)
    if s3.upload_json(obj=notes, s3_key=f'{base_s3_key}/aggregated_notes/data.json') \
            and s3.upload_json(obj=topics, s3_key=f'{base_s3_key}/aggregated_topics/data.json'):
        # Create schema
        app.logger.info('Executing SQL...')
        resolved_ddl = resolve_sql_template('create_e_i_advising_notes_schema.template.sql')
        if redshift.execute_ddl_script(resolved_ddl):
            verify_external_schema(external_schema, resolved_ddl)
        else:
            raise BackgroundJobError('E&I Advising Notes schema creation job failed.')
    else:
        raise BackgroundJobError('Failed to upload aggregated E&I advising notes and topics.')
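# _extract_topics is defined elsewhere in the module. A hypothetical sketch of
# its behavior, assuming each note dict carries an id and a list of topic
# strings; the field names here are assumptions, not the real source format:
def _extract_topics_sketch(note):
    return [
        {'note_id': note.get('id'), 'topic': topic}
        for topic in (note.get('topics') or [])
    ]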
def run(self):
    app.logger.info('Starting intermediate table generation job...')
    sis_source_schema = app.config['REDSHIFT_SCHEMA_EDL'] if app.config['FEATURE_FLAG_EDL_SIS_VIEWS'] \
        else app.config['REDSHIFT_SCHEMA_SIS']
    resolved_ddl_redshift = resolve_sql_template(
        'create_intermediate_schema.template.sql',
        current_term_id=current_term_id(),
        redshift_schema_sis=sis_source_schema,
    )
    if redshift.execute_ddl_script(resolved_ddl_redshift):
        app.logger.info('Redshift tables generated.')
    else:
        raise BackgroundJobError('Intermediate table creation job failed.')
    resolved_ddl_rds = resolve_sql_template('update_rds_indexes_sis.template.sql')
    if rds.execute(resolved_ddl_rds):
        app.logger.info('RDS indexes updated.')
    else:
        raise BackgroundJobError('Failed to update RDS indexes for intermediate schema.')
    return 'Intermediate table generation job completed.'
def run(self):
    app.logger.info('Starting Canvas schema creation job...')
    canvas_path = get_s3_canvas_daily_path()
    if not s3.get_keys_with_prefix(canvas_path):
        canvas_path = get_s3_canvas_daily_path(datetime.now() - timedelta(days=1))
        if not s3.get_keys_with_prefix(canvas_path):
            raise BackgroundJobError('No timely Canvas data found, aborting')
        else:
            app.logger.info("Falling back to yesterday's Canvas data")
    external_schema = app.config['REDSHIFT_SCHEMA_CANVAS']
    s3_prefix = 's3://' + app.config['LOCH_S3_BUCKET'] + '/'
    s3_canvas_data_url = s3_prefix + canvas_path
    s3_canvas_data_path_current_term = s3_prefix + berkeley.s3_canvas_data_path_current_term()
    redshift.drop_external_schema(external_schema)
    resolved_ddl = resolve_sql_template(
        'create_canvas_schema.template.sql',
        loch_s3_canvas_data_path_today=s3_canvas_data_url,
        loch_s3_canvas_data_path_current_term=s3_canvas_data_path_current_term,
    )
    if redshift.execute_ddl_script(resolved_ddl):
        verify_external_schema(external_schema, resolved_ddl)
        return 'Canvas schema creation job completed.'
    else:
        raise BackgroundJobError('Canvas schema creation job failed.')
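# get_s3_canvas_daily_path is imported from elsewhere. A plausible sketch,
# assuming a dated-prefix convention and a hypothetical LOCH_S3_CANVAS_DATA_PATH_DAILY
# config key (both are assumptions for illustration only):
def get_s3_canvas_daily_path_sketch(when=None):
    when = when or datetime.now()
    return f"{app.config['LOCH_S3_CANVAS_DATA_PATH_DAILY']}/{when.strftime('%Y-%m-%d')}"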
def verify_post_transform_statement_count(self, url):
    schema = app.config['REDSHIFT_SCHEMA_LRS']
    resolved_ddl_transient = resolve_sql_template(
        'create_lrs_canvas_explode_table.template.sql',
        redshift_schema_lrs_external=schema,
        canvas_caliper_explode_table='caliper_statements_explode_transient',
        loch_s3_caliper_explode_url=url,
    )
    if redshift.execute_ddl_script(resolved_ddl_transient):
        app.logger.info(f"caliper_statements_explode_transient table created in schema '{schema}'.")
    else:
        raise BackgroundJobError(f"Failed to create caliper_statements_explode_transient table in schema '{schema}'.")
    exploded_statement_response = redshift.fetch(f'select count(*) from {schema}.caliper_statements_explode_transient')
    if exploded_statement_response:
        exploded_statement_count = exploded_statement_response[0].get('count')
    else:
        raise BackgroundJobError(f"Failed to verify caliper_statements_explode_transient table in schema '{schema}'.")
    if exploded_statement_count == self.pre_transform_statement_count:
        app.logger.info(f'Verified {exploded_statement_count} transformed statements migrated to {url}.')
    else:
        raise BackgroundJobError(
            f'Discrepancy between pre-transform statement count ({self.pre_transform_statement_count} statements)'
            f' and transformed statements at {url} ({exploded_statement_count} statements).')
def verify_migration(self, incremental_url, incremental_schema):
    redshift.drop_external_schema(incremental_schema)
    resolved_ddl_transient = resolve_sql_template(
        'create_lrs_statements_table.template.sql',
        redshift_schema_lrs_external=incremental_schema,
        loch_s3_lrs_statements_path=incremental_url,
    )
    if redshift.execute_ddl_script(resolved_ddl_transient):
        app.logger.info(f"LRS incremental schema '{incremental_schema}' created.")
    else:
        raise BackgroundJobError(f"LRS incremental schema '{incremental_schema}' creation failed.")
    redshift_response = redshift.fetch(f'select count(*) from {incremental_schema}.statements')
    if redshift_response:
        redshift_statement_count = redshift_response[0].get('count')
    else:
        raise BackgroundJobError(f"Failed to verify LRS incremental schema '{incremental_schema}'.")
    if redshift_statement_count == self.lrs_statement_count:
        app.logger.info(f'Verified {redshift_statement_count} rows migrated from LRS to {incremental_url}.')
    else:
        raise BackgroundJobError(
            f'Discrepancy between LRS ({self.lrs_statement_count} statements)'
            f' and {incremental_url} ({redshift_statement_count} statements).')
def verify_unloaded_count(self, url):
    url = path.split(url)[0]
    schema = app.config['REDSHIFT_SCHEMA_LRS']
    resolved_ddl_transient_unloaded = resolve_sql_template(
        'create_lrs_statements_unloaded_table.template.sql',
        redshift_schema_lrs_external=schema,
        loch_s3_lrs_statements_unloaded_path=url,
    )
    if redshift.execute_ddl_script(resolved_ddl_transient_unloaded):
        app.logger.info(f"statements_unloaded table created in schema '{schema}'.")
    else:
        raise BackgroundJobError(f"Failed to create statements_unloaded table in schema '{schema}'.")
    redshift_response = redshift.fetch(f'select count(*) from {schema}.statements_unloaded')
    if redshift_response:
        unloaded_statement_count = redshift_response[0].get('count')
    else:
        raise BackgroundJobError('Failed to get unloaded statement count.')
    if unloaded_statement_count == self.lrs_statement_count:
        app.logger.info(f'Verified {unloaded_statement_count} statements unloaded from LRS to {url}.')
    else:
        raise BackgroundJobError(
            f'Discrepancy between LRS ({self.lrs_statement_count} statements)'
            f' and {url} ({unloaded_statement_count} statements).')
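# verify_post_transform_statement_count, verify_migration, and verify_unloaded_count
# share a count-comparison pattern. A hedged refactoring sketch, not part of the
# original module, that all three could delegate to:
def _verify_statement_count(self, expected, schema, table, destination):
    response = redshift.fetch(f'select count(*) from {schema}.{table}')
    if not response:
        raise BackgroundJobError(f'Failed to fetch count from {schema}.{table}.')
    actual = response[0].get('count')
    if actual != expected:
        raise BackgroundJobError(
            f'Discrepancy between expected count ({expected} statements)'
            f' and {destination} ({actual} statements).')
    app.logger.info(f'Verified {actual} statements at {destination}.')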
def create_schema(self):
    app.logger.info('Executing SQL...')
    template_sql = 'create_edl_schema.template.sql'
    resolved_ddl = resolve_sql_template(template_sql)
    if not redshift.execute_ddl_script(resolved_ddl):
        raise BackgroundJobError('EDL SIS schema creation job failed.')
    # Create staging schema
    resolved_ddl_staging = resolve_sql_template(
        template_sql,
        redshift_schema_edl=f'{self.internal_schema}_staging',
    )
    if redshift.execute_ddl_script(resolved_ddl_staging):
        app.logger.info(f"Schema '{self.internal_schema}_staging' found or created.")
    else:
        raise BackgroundJobError(f'{self.internal_schema}_staging schema creation failed.')
    app.logger.info('Redshift EDL schema created.')
def run(self):
    app.logger.info('Starting BOAC analytics job...')
    resolved_ddl = resolve_sql_template('create_boac_schema.template.sql')
    if redshift.execute_ddl_script(resolved_ddl):
        return 'BOAC analytics creation job completed.'
    else:
        app.logger.error('BOAC analytics creation job failed.')
        return False
def create_internal_schema(self, external_schema, daily_path):
    bucket = app.config['LOCH_S3_PROTECTED_BUCKET']
    s3_data_url = f's3://{bucket}/{daily_path}/advising-notes'
    resolved_ddl = resolve_sql_template(
        'create_sis_advising_notes_schema.template.sql',
        loch_s3_sis_notes_path_today=s3_data_url,
    )
    if redshift.execute_ddl_script(resolved_ddl):
        verify_external_schema(external_schema, resolved_ddl)
    else:
        raise BackgroundJobError('SIS Advising Notes schema creation job failed to load incremental data and create internal schema.')
def create_schema(self, external_schema):
    app.logger.info('Executing SQL...')
    redshift.drop_external_schema(external_schema)
    resolved_ddl = resolve_sql_template('create_sis_terms_schema.template.sql')
    if redshift.execute_ddl_script(resolved_ddl):
        verify_external_schema(external_schema, resolved_ddl)
    else:
        raise BackgroundJobError('SIS terms schema creation job failed.')
def create_historical_tables(self, external_schema):
    resolved_ddl = resolve_sql_template('create_sis_advising_notes_historical_schema.template.sql')
    if redshift.execute_ddl_script(resolved_ddl):
        verify_external_schema(external_schema, resolved_ddl)
    else:
        raise BackgroundJobError('SIS Advising Notes schema creation job failed to load historical data.')
def run(self):
    app.logger.info('Starting intermediate table generation job...')
    resolved_ddl = resolve_sql_template('create_intermediate_schema.template.sql')
    if redshift.execute_ddl_script(resolved_ddl):
        return 'Intermediate table creation job completed.'
    else:
        app.logger.error('Intermediate table creation job failed.')
        return False
def run(self):
    app.logger.info('Starting ASC schema creation job...')
    app.logger.info('Executing SQL...')
    resolved_ddl = resolve_sql_template('create_asc_schema.template.sql')
    if redshift.execute_ddl_script(resolved_ddl):
        app.logger.info(f"Schema '{app.config['REDSHIFT_SCHEMA_ASC']}' found or created.")
    else:
        raise BackgroundJobError('ASC schema creation failed.')
    return True
def run(self):
    app.logger.info('Starting Redshift user privileges job...')
    app.logger.info('Executing SQL...')
    resolved_ddl = resolve_sql_template('restore_redshift_user_privileges.template.sql')
    if redshift.execute_ddl_script(resolved_ddl):
        app.logger.info('Restored Redshift user privileges successfully.')
    else:
        raise BackgroundJobError('Redshift user privilege restoration failed.')
    return True
def run(self):
    app.logger.info('Starting Canvas schema creation job...')
    external_schema = app.config['REDSHIFT_SCHEMA_CANVAS']
    redshift.drop_external_schema(external_schema)
    resolved_ddl = resolve_sql_template('create_canvas_schema.template.sql')
    if redshift.execute_ddl_script(resolved_ddl):
        app.logger.info('Canvas schema creation job completed.')
        return verify_external_schema(external_schema, resolved_ddl)
    else:
        app.logger.error('Canvas schema creation job failed.')
        return False
def run(self):
    app.logger.info('Starting EDW schema creation job...')
    app.logger.info('Executing SQL...')
    external_schema = app.config['REDSHIFT_SCHEMA_EDW']
    redshift.drop_external_schema(external_schema)
    resolved_ddl = resolve_sql_template('create_edw_schema.template.sql')
    if redshift.execute_ddl_script(resolved_ddl):
        verify_external_schema(external_schema, resolved_ddl)
        return 'EDW schema creation job completed.'
    else:
        raise BackgroundJobError('EDW schema creation job failed.')
def create_schema(self):
    external_schema = app.config['REDSHIFT_SCHEMA_DATA_SCIENCE_ADVISING']
    redshift.drop_external_schema(external_schema)
    resolved_ddl = resolve_sql_template('create_data_science_advising_schema.template.sql')
    if redshift.execute_ddl_script(resolved_ddl):
        verify_external_schema(external_schema, resolved_ddl)
    else:
        raise BackgroundJobError('Data Science Advising schema creation job failed.')
def run(self):
    success = 0
    failure = 0
    berkeleyx_tenants = {
        'edx': {
            'database': 'berkeleyx_prod_ext',
            'instance': 'prod-analytics',
        },
        'edge': {
            'database': 'berkeleyx_prod_edge_ext',
            'instance': 'prod-edge-analytics',
        },
    }
    for tenant, value in berkeleyx_tenants.items():
        app.logger.info(f'Starting BerkeleyX schema creation job for {tenant}...')
        berkeleyx_data_path = 's3://{}/{}/{}'.format(
            app.config['LOCH_EDX_S3_BUCKET'],
            app.config['LOCH_EDX_S3_WEEKLY_DATA_PATH'],
            value['instance'],
        )
        berkeleyx_transaction_log_path = 's3://{}/{}/{}'.format(
            app.config['LOCH_EDX_S3_BUCKET'],
            app.config['LOCH_EDX_S3_TRANSACTION_LOG_PATH'],
            tenant,
        )
        external_schema = value['database'] + '_' + app.config['LOCH_EDX_NESSIE_ENV']
        redshift.drop_external_schema(external_schema)
        resolved_ddl = resolve_sql_template(
            'create_berkeleyx_schema.template.sql',
            loch_s3_berkeleyx_data_path=berkeleyx_data_path,
            loch_s3_berkeleyx_transaction_log_path=berkeleyx_transaction_log_path,
            redshift_berkeleyx_ext_schema=external_schema,
        )
        if redshift.execute_ddl_script(resolved_ddl):
            app.logger.info(f'BerkeleyX schema {external_schema} creation completed.')
            success += 1
        else:
            app.logger.error(f'BerkeleyX schema {external_schema} creation failed.')
            failure += 1
    if failure > 0:
        raise BackgroundJobError('BerkeleyX schema creation jobs failed')
    else:
        app.logger.info('BerkeleyX schema creation jobs completed successfully')
        return True
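# For illustration: with a hypothetical LOCH_EDX_NESSIE_ENV of 'prod' (an assumed
# value), the loop above would create external schemas 'berkeleyx_prod_ext_prod'
# and 'berkeleyx_prod_edge_ext_prod'.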
def create_external_schema(self):
    if not self.feature_flag_edl:
        # External schema is necessary when pulling term definitions from S3.
        redshift.drop_external_schema(self.redshift_schema)
        sql_template = 'create_terms_schema.template.sql'
        app.logger.info(f'Executing {sql_template}...')
        resolved_ddl = resolve_sql_template(sql_template)
        if redshift.execute_ddl_script(resolved_ddl):
            verify_external_schema(self.redshift_schema, resolved_ddl)
        else:
            raise BackgroundJobError('SIS terms schema creation job failed.')
def run(self):
    app.logger.info('Starting metadata schema creation job...')
    app.logger.info('Executing SQL...')
    resolved_ddl = resolve_sql_template('create_metadata_schema.template.sql')
    if redshift.execute_ddl_script(resolved_ddl):
        app.logger.info(f"Schema '{app.config['REDSHIFT_SCHEMA_METADATA']}' found or created.")
    else:
        app.logger.error('Metadata schema creation failed.')
        return False
    return True
def run(self):
    app.logger.info('Starting Gradescope schema creation job...')
    app.logger.info('Executing SQL...')
    # Add frequency-of-ingest and dynamic path creation logic once ingest mechanisms are confirmed.
    # For now, add a top-level hierarchy to catalog the bulk snapshot.
    external_schema = app.config['REDSHIFT_SCHEMA_GRADESCOPE']
    redshift.drop_external_schema(external_schema)
    resolved_ddl = resolve_sql_template('create_gradescope_schema_template.sql')
    if redshift.execute_ddl_script(resolved_ddl):
        verify_external_schema(external_schema, resolved_ddl)
        return 'Gradescope schema creation job completed.'
    else:
        raise BackgroundJobError('Gradescope schema creation job failed.')
def run(self):
    app.logger.info('Starting SIS schema creation job...')
    if not self.update_manifests():
        app.logger.info('Error updating manifests, will not execute schema creation SQL')
        return False
    app.logger.info('Executing SQL...')
    external_schema = app.config['REDSHIFT_SCHEMA_SIS']
    redshift.drop_external_schema(external_schema)
    resolved_ddl = resolve_sql_template('create_sis_schema.template.sql')
    if redshift.execute_ddl_script(resolved_ddl):
        verify_external_schema(external_schema, resolved_ddl)
    else:
        raise BackgroundJobError('SIS schema creation job failed.')
    return 'SIS schema creation job completed.'
def run(self):
    app.logger.info('Starting OUA Slate schema creation job...')
    app.logger.info('Executing SQL...')
    self.migrate_oua_sftp_data()
    external_schema = app.config['REDSHIFT_SCHEMA_OUA']
    redshift.drop_external_schema(external_schema)
    resolved_ddl = resolve_sql_template('create_oua_schema_template.sql')
    if redshift.execute_ddl_script(resolved_ddl):
        verify_external_schema(external_schema, resolved_ddl)
        self.create_rds_tables_and_indexes()
        app.logger.info('OUA Slate RDS indexes created.')
        return 'OUA schema creation job completed.'
    else:
        raise BackgroundJobError('OUA Slate schema creation job failed.')
def create_schema(self):
    external_schema = app.config['REDSHIFT_SCHEMA_ASC_ADVISING_NOTES']
    redshift.drop_external_schema(external_schema)
    asc_data_sftp_path = '/'.join([
        f"s3://{app.config['LOCH_S3_BUCKET']}",
        app.config['LOCH_S3_ASC_DATA_SFTP_PATH'],
    ])
    resolved_ddl = resolve_sql_template(
        'create_asc_advising_notes_schema.template.sql',
        asc_data_sftp_path=asc_data_sftp_path,
    )
    if redshift.execute_ddl_script(resolved_ddl):
        verify_external_schema(external_schema, resolved_ddl)
    else:
        raise BackgroundJobError('ASC Advising Notes schema creation job failed.')
def run(self):
    app.logger.info('Starting YCBM schema creation job...')
    external_schema = app.config['REDSHIFT_SCHEMA_YCBM']
    redshift.drop_external_schema(external_schema)
    sis_source_schema = app.config['REDSHIFT_SCHEMA_EDL'] if app.config['FEATURE_FLAG_EDL_SIS_VIEWS'] \
        else app.config['REDSHIFT_SCHEMA_SIS']
    resolved_ddl = resolve_sql_template(
        'create_ycbm_schema.template.sql',
        redshift_schema_sis=sis_source_schema,
    )
    if redshift.execute_ddl_script(resolved_ddl):
        verify_external_schema(external_schema, resolved_ddl)
        return 'YCBM schema creation job completed.'
    else:
        raise BackgroundJobError('YCBM schema creation job failed.')