def update_merged_feed_status(term_id, successes, failures):
    term_id = term_id or 'all'
    redshift.execute(
        'DELETE FROM {schema}.merged_feed_status WHERE sid = ANY(%s) AND term_id = %s',
        schema=_schema(),
        params=((successes + failures), term_id),
    )
    now = datetime.utcnow().isoformat()
    success_records = ['\t'.join([sid, term_id, 'success', now]) for sid in successes]
    failure_records = ['\t'.join([sid, term_id, 'failure', now]) for sid in failures]
    rows = success_records + failure_records
    s3_key = f'{get_s3_sis_api_daily_path()}/merged_feed_status.tsv'
    if not s3.upload_data('\n'.join(rows), s3_key):
        app.logger.error('Error uploading merged feed status updates to S3.')
        return
    query = resolve_sql_template_string(
        """
        COPY {redshift_schema_metadata}.merged_feed_status
            FROM '{loch_s3_sis_api_data_path}/merged_feed_status.tsv'
            IAM_ROLE '{redshift_iam_role}'
            DELIMITER '\\t'
            TIMEFORMAT 'YYYY-MM-DDTHH:MI:SS';
        """,
    )
    if not redshift.execute(query):
        app.logger.error('Error copying merged feed status updates to Redshift.')

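# A minimal usage sketch, not part of the pipeline: the helper above deletes any
# existing status rows for the given SIDs and term, then stages fresh rows through
# S3. The SIDs and term id below are hypothetical.
def _example_update_merged_feed_status():
    update_merged_feed_status(
        term_id='2178',
        successes=['11667051', '23456789'],
        failures=['34567890'],
    )
    # Passing term_id=None files the rows under the catch-all 'all' term.
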
def student_tables(app):
    """Use Postgres to mock the Redshift student schemas on local test runs."""
    from nessie.externals import rds, redshift
    from nessie.lib.util import resolve_sql_template_string, resolve_sql_template
    rds.execute(resolve_sql_template('create_rds_indexes.template.sql'))
    fixture_path = f"{app.config['BASE_DIR']}/fixtures"
    with open(f'{fixture_path}/students.sql', 'r') as sql_file:
        student_sql = sql_file.read()
    params = {}
    for key in [
        'sis_api_drops_and_midterms_11667051_2178',
        'sis_degree_progress_11667051',
        'sis_student_api_11667051',
        'sis_student_api_2345678901',
    ]:
        with open(f'{fixture_path}/{key}.json', 'r') as f:
            feed = f.read()
            if key.startswith('sis_student_api'):
                feed = json.dumps(json.loads(feed)['apiResponse']['response']['any']['students'][0])
            params[key] = feed
    redshift.execute(resolve_sql_template_string(student_sql), params=params)
    yield
    for schema in ['asc_test', 'coe_test', 'student_test']:
        rds.execute(f'DROP SCHEMA {schema} CASCADE')
        redshift.execute(f'DROP SCHEMA {schema} CASCADE')

def drop_staged_enrollment_term(term_id):
    redshift.execute(
        'DELETE FROM {schema}.{table} WHERE term_id = %s',
        schema=psycopg2.sql.Identifier(staging_schema()),
        table=psycopg2.sql.Identifier('student_enrollment_terms'),
        params=(term_id,),
    )

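# Hypothetical usage sketch: clear one term's rows from staging before re-importing
# that term's enrollment feeds. The term id is illustrative.
def _example_drop_staged_term():
    drop_staged_enrollment_term('2178')
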
def run(self, csids=None):
    if app.config['STUDENT_V1_API_PREFERRED']:
        return self.run_v1(csids)
    if not csids:
        csids = [row['sid'] for row in get_all_student_ids()]
    app.logger.info(f'Starting SIS student API import job for {len(csids)} students...')
    rows, failure_count = self.load_concurrently(csids)
    if (len(rows) == 0) and (failure_count > 0):
        raise BackgroundJobError('Failed to import SIS student API feeds: aborting job.')
    s3_key = f'{get_s3_sis_api_daily_path()}/profiles.tsv'
    app.logger.info(f'Will stash {len(rows)} feeds in S3: {s3_key}')
    if not s3.upload_tsv_rows(rows, s3_key):
        raise BackgroundJobError('Error on S3 upload: aborting job.')
    app.logger.info('Will copy S3 feeds into Redshift...')
    if not redshift.execute(f'TRUNCATE {self.redshift_schema}_staging.sis_api_profiles'):
        raise BackgroundJobError('Error truncating old staging rows: aborting job.')
    if not redshift.copy_tsv_from_s3(f'{self.redshift_schema}_staging.sis_api_profiles', s3_key):
        raise BackgroundJobError('Error on Redshift copy: aborting job.')
    staging_to_destination_query = resolve_sql_template_string(
        """
        DELETE FROM {redshift_schema_student}.sis_api_profiles
            WHERE sid IN
            (SELECT sid FROM {redshift_schema_student}_staging.sis_api_profiles);
        INSERT INTO {redshift_schema_student}.sis_api_profiles
            (SELECT * FROM {redshift_schema_student}_staging.sis_api_profiles);
        TRUNCATE {redshift_schema_student}_staging.sis_api_profiles;
        """,
    )
    if not redshift.execute(staging_to_destination_query):
        raise BackgroundJobError('Error on Redshift copy: aborting job.')
    return f'SIS student API import job completed: {len(rows)} succeeded, {failure_count} failed.'

def sis_note_tables(app):
    """Use Postgres to mock the Redshift SIS note schemas on local test runs."""
    from nessie.externals import redshift
    internal_schema = app.config['REDSHIFT_SCHEMA_SIS_ADVISING_NOTES_INTERNAL']
    redshift.execute(f'DROP SCHEMA IF EXISTS {internal_schema} CASCADE')
    redshift.execute(f'CREATE SCHEMA {internal_schema}')
    redshift.execute(
        f"""CREATE TABLE {internal_schema}.advising_note_attachments
        (
            advising_note_id character varying(513),
            sid character varying(256),
            student_note_nr character varying(256),
            created_by character varying(256),
            user_file_name character varying(256),
            sis_file_name character varying(781)
        )""",
    )
    redshift.execute(
        f"""INSERT INTO {internal_schema}.advising_note_attachments
        (advising_note_id, sid, student_note_nr, created_by, user_file_name, sis_file_name)
        VALUES
        ('12345678-00012', '12345678', '00012', '123', 'Sp_19_French_1B.pdf', '12345678_00012_1.pdf'),
        ('23456789-00003', '23456789', '00003', '123', 'Advising_Notes_test_attachment_document.png', '23456789_00003_1.png'),
        ('34567890-00014', '34567890', '00014', '123', 'notes_notes_notes.xls', '34567890_00014_2.xls')
        """,
    )
    yield
    redshift.execute(f'DROP SCHEMA {internal_schema} CASCADE')

def run(self, csids=None):
    if not csids:
        csids = [row['sid'] for row in get_all_student_ids()]
    app.logger.info(f'Starting SIS student API import job for {len(csids)} students...')
    rows = []
    success_count = 0
    failure_count = 0
    index = 1
    for csid in csids:
        app.logger.info(f'Fetching SIS student API for SID {csid} ({index} of {len(csids)})')
        feed = sis_student_api.get_student(csid)
        if feed:
            success_count += 1
            rows.append('\t'.join([str(csid), json.dumps(feed)]))
        else:
            failure_count += 1
            app.logger.error(f'SIS student API import failed for CSID {csid}.')
        index += 1
    s3_key = f'{get_s3_sis_api_daily_path()}/profiles.tsv'
    app.logger.info(f'Will stash {success_count} feeds in S3: {s3_key}')
    if not s3.upload_data('\n'.join(rows), s3_key):
        app.logger.error('Error on S3 upload: aborting job.')
        return False
    app.logger.info('Will copy S3 feeds into Redshift...')
    if not redshift.execute(f'TRUNCATE {self.destination_schema}_staging.sis_api_profiles'):
        app.logger.error('Error truncating old staging rows: aborting job.')
        return False
    if not redshift.copy_tsv_from_s3(f'{self.destination_schema}_staging.sis_api_profiles', s3_key):
        app.logger.error('Error on Redshift copy: aborting job.')
        return False
    staging_to_destination_query = resolve_sql_template_string(
        """
        DELETE FROM {redshift_schema_student}.sis_api_profiles
            WHERE sid IN
            (SELECT sid FROM {redshift_schema_student}_staging.sis_api_profiles);
        INSERT INTO {redshift_schema_student}.sis_api_profiles
            (SELECT * FROM {redshift_schema_student}_staging.sis_api_profiles);
        TRUNCATE {redshift_schema_student}_staging.sis_api_profiles;
        """,
    )
    if not redshift.execute(staging_to_destination_query):
        app.logger.error('Error on Redshift copy: aborting job.')
        return False
    return f'SIS student API import job completed: {success_count} succeeded, {failure_count} failed.'

def run(self):
    app.logger.info('Starting merged non-advisee profile generation job.')
    app.logger.info('Cleaning up old data...')
    redshift.execute('VACUUM; ANALYZE;')
    status = self.generate_feeds()
    # Clean up the workbench.
    redshift.execute('VACUUM; ANALYZE;')
    app.logger.info('Vacuumed and analyzed.')
    return status

def test_schema_creation_drop(self, app, caplog, ensure_drop_schema):
    """Can create and drop schemata on a real Redshift instance."""
    schema_name = app.config['REDSHIFT_SCHEMA_BOAC']
    schema = psycopg2.sql.Identifier(schema_name)
    with capture_app_logs(app):
        result = redshift.execute('CREATE SCHEMA {schema}', schema=schema)
        assert result == 'CREATE SCHEMA'
        result = redshift.execute('CREATE SCHEMA {schema}', schema=schema)
        assert result is None
        assert f'Schema "{schema_name}" already exists' in caplog.text
        result = redshift.execute('DROP SCHEMA {schema}', schema=schema)
        assert result == 'DROP SCHEMA'

def run(self, term_id=None, backfill_new_students=False):
    app.logger.info(f'Starting merged profile generation job (term_id={term_id}, backfill={backfill_new_students}).')
    app.logger.info('Cleaning up old data...')
    redshift.execute('VACUUM; ANALYZE;')
    if backfill_new_students:
        status = ''
        previous_backfills = {row['sid'] for row in get_successfully_backfilled_students()}
        sids = {row['sid'] for row in get_all_student_ids()}
        old_sids = sids.intersection(previous_backfills)
        new_sids = sids.difference(previous_backfills)
        # Any students without a previous backfill will have feeds generated for all terms.
        # Students with a previous backfill get an update for the requested term only.
        if len(new_sids):
            app.logger.info(f'Found {len(new_sids)} new students, will backfill all terms.')
            ImportTermGpas().run(csids=new_sids)
            backfill_status = self.generate_feeds(sids=list(new_sids))
            if not backfill_status:
                app.logger.warning('Backfill job aborted, will continue with non-backfill job.')
                backfill_status = 'aborted'
            else:
                app.logger.info('Backfill complete.')
            status += f'Backfill: {backfill_status}; non-backfill: '
        app.logger.info(f'Will continue merged feed job for {len(old_sids)} previously backfilled students.')
        continuation_status = self.generate_feeds(sids=list(old_sids), term_id=term_id)
        if not continuation_status:
            return False
        status += continuation_status
    else:
        status = self.generate_feeds(term_id)
    # Clean up the workbench.
    redshift.execute('VACUUM; ANALYZE;')
    app.logger.info('Vacuumed and analyzed.')
    return status

def run(self):
    app.logger.info('Starting BOA manually added advisees import job...')
    feed = get_manually_added_advisees()
    if feed.get('error'):
        raise BackgroundJobError('Error fetching manually added advisees feed: aborting job.')
    rows = [advisee['sid'].encode() for advisee in feed.get('feed')]
    s3_key = f'{get_s3_boa_api_daily_path()}/manually-added-advisees/manually-added-advisees.tsv'
    if not s3.upload_tsv_rows(rows, s3_key):
        raise BackgroundJobError('Error on S3 upload: aborting job.')
    app.logger.info('Copying data from S3 file to Redshift...')
    query = resolve_sql_template_string(
        """
        TRUNCATE {redshift_schema_advisee}.non_current_students;
        COPY {redshift_schema_advisee}.non_current_students
            FROM 's3://{s3_bucket}/{s3_key}'
            IAM_ROLE '{redshift_iam_role}'
            DELIMITER '\\t';
        """,
        s3_bucket=app.config['LOCH_S3_BUCKET'],
        s3_key=s3_key,
    )
    if not redshift.execute(query):
        raise BackgroundJobError('Error on Redshift copy: aborting job.')
    status = f'Imported {len(rows)} non-current students.'
    app.logger.info(f'BOA manually added advisees import job completed: {status}')
    return status

def run(self, sids=None):
    if not sids:
        sids = [row['sid'] for row in get_unfetched_non_advisees()]
    app.logger.info(f'Starting SIS student API import job for {len(sids)} non-advisees...')
    with tempfile.TemporaryFile() as feed_file:
        saved_sids, failure_count = self.load_concurrently(sids, feed_file)
        if saved_sids:
            student_schema.truncate_staging_table('sis_api_profiles_hist_enr')
            student_schema.write_file_to_staging('sis_api_profiles_hist_enr', feed_file, len(saved_sids))
    if saved_sids:
        staging_to_destination_query = resolve_sql_template_string(
            """
            DELETE FROM {redshift_schema_student}.sis_api_profiles_hist_enr
                WHERE sid IN
                (SELECT sid FROM {redshift_schema_student}_staging.sis_api_profiles_hist_enr);
            INSERT INTO {redshift_schema_student}.sis_api_profiles_hist_enr
                (SELECT * FROM {redshift_schema_student}_staging.sis_api_profiles_hist_enr);
            TRUNCATE {redshift_schema_student}_staging.sis_api_profiles_hist_enr;
            """,
        )
        if not redshift.execute(staging_to_destination_query):
            raise BackgroundJobError('Error on Redshift copy: aborting job.')
    return f'SIS student API non-advisee import job completed: {len(saved_sids)} succeeded, {failure_count} failed.'

def unload_to_etl(self, schema, bucket, timestamped=True):
    s3_url = 's3://' + bucket + '/' + app.config['LRS_CANVAS_INCREMENTAL_ETL_PATH_REDSHIFT']
    if timestamped:
        s3_url += '/' + localize_datetime(datetime.now()).strftime('%Y/%m/%d/statements_%Y%m%d_%H%M%S_')
    else:
        s3_url += '/statements'
    redshift_iam_role = app.config['REDSHIFT_IAM_ROLE']
    if not redshift.execute(
        f"""
        UNLOAD ('SELECT statement FROM {schema}.statements')
            TO '{s3_url}'
            IAM_ROLE '{redshift_iam_role}'
            ENCRYPTED
            DELIMITER AS ' '
            NULL AS ''
            ALLOWOVERWRITE
            PARALLEL OFF
            MAXFILESIZE 1 gb
        """,
    ):
        raise BackgroundJobError(f'Error executing Redshift unload to {s3_url}.')
    self.verify_unloaded_count(s3_url)

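# Hypothetical usage sketch, assuming an instance of the job class above: unload to a
# fixed (non-timestamped) S3 prefix so downstream readers see a stable key. The schema
# and bucket names are made up.
def _example_unload_to_etl(job):
    job.unload_to_etl(schema='lrs_statements', bucket='my-lrs-etl-bucket', timestamped=False)
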
def create_background_job_status(job_id):
    sql = """INSERT INTO {schema}.background_job_status
        (job_id, status, instance_id, created_at, updated_at)
        VALUES (%s, 'started', %s, current_timestamp, current_timestamp)"""
    return redshift.execute(
        sql,
        params=(job_id, _instance_id()),
        schema=_schema(),
    )

def create_canvas_sync_status(job_id, filename, canvas_table, source_url):
    sql = """INSERT INTO {schema}.canvas_sync_job_status
        (job_id, filename, canvas_table, source_url, status, instance_id, created_at, updated_at)
        VALUES (%s, %s, %s, %s, 'created', %s, current_timestamp, current_timestamp)"""
    return redshift.execute(
        sql,
        params=(job_id, filename, canvas_table, source_url, _instance_id()),
        schema=_schema(),
    )

def metadata_db(app):
    """Use Postgres to mock the Redshift metadata schema on local test runs."""
    from nessie.externals import redshift
    schema = app.config['REDSHIFT_SCHEMA_METADATA']
    redshift.execute(f'DROP SCHEMA IF EXISTS {schema} CASCADE')
    redshift.execute(f'CREATE SCHEMA IF NOT EXISTS {schema}')
    redshift.execute(
        f"""CREATE TABLE IF NOT EXISTS {schema}.background_job_status
        (
            job_id VARCHAR NOT NULL,
            status VARCHAR NOT NULL,
            instance_id VARCHAR,
            details VARCHAR(4096),
            created_at TIMESTAMP NOT NULL,
            updated_at TIMESTAMP NOT NULL
        )""",
    )
    redshift.execute(
        f"""CREATE TABLE IF NOT EXISTS {schema}.canvas_sync_job_status
        (
            job_id VARCHAR NOT NULL,
            filename VARCHAR NOT NULL,
            canvas_table VARCHAR NOT NULL,
            source_url VARCHAR NOT NULL,
            source_size BIGINT,
            destination_url VARCHAR,
            destination_size BIGINT,
            status VARCHAR NOT NULL,
            details VARCHAR,
            instance_id VARCHAR,
            created_at TIMESTAMP NOT NULL,
            updated_at TIMESTAMP NOT NULL
        )""",
    )
    redshift.execute(
        f"""CREATE TABLE IF NOT EXISTS {schema}.canvas_synced_snapshots
        (
            filename VARCHAR NOT NULL,
            canvas_table VARCHAR NOT NULL,
            url VARCHAR NOT NULL,
            size BIGINT NOT NULL,
            created_at TIMESTAMP NOT NULL,
            deleted_at TIMESTAMP
        )""",
    )

def run(self, term_id=None):
    app.logger.info('Starting merged profile generation job.')
    # This version of the code will always generate feeds for all terms and all advisees, but we
    # expect support for term-specific or backfill-specific feed generation will return soon.
    if term_id != 'all':
        app.logger.warning(f'Term-specific generation was requested for {term_id}, but all terms will be generated.')
    app.logger.info('Cleaning up old data...')
    redshift.execute('VACUUM; ANALYZE;')
    status = self.generate_feeds()
    # Clean up the workbench.
    redshift.execute('VACUUM; ANALYZE;')
    app.logger.info('Vacuumed and analyzed.')
    return status

def create_canvas_snapshot(key, size):
    canvas_table, filename = key.split('/')[-2:]
    url = s3.build_s3_url(key, credentials=False)
    sql = """INSERT INTO {schema}.canvas_synced_snapshots
        (filename, canvas_table, url, size, created_at)
        VALUES (%s, %s, %s, %s, current_timestamp)"""
    return redshift.execute(
        sql,
        params=(filename, canvas_table, url, size),
        schema=_schema(),
    )

def update_background_job_status(job_id, status, details=None):
    if details:
        details = details[:4096]
    sql = """UPDATE {schema}.background_job_status
        SET status=%s, updated_at=current_timestamp, details=%s
        WHERE job_id=%s"""
    return redshift.execute(
        sql,
        params=(status, details, job_id),
        schema=_schema(),
    )

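# Hypothetical sketch of the bookkeeping lifecycle that background jobs follow with the
# two helpers above: register the job as 'started', then record a terminal status. The
# job id, status string, and details are illustrative.
def _example_track_background_job():
    job_id = 'import_student_api_20190101_0001'
    create_background_job_status(job_id)
    update_background_job_status(job_id, 'succeeded', details='42 rows imported')
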
def import_advisor_attributes(self):
    csid_results = redshift.fetch(
        resolve_sql_template_string('SELECT DISTINCT advisor_sid FROM {redshift_schema_advisor_internal}.advisor_students'),
    )
    csids = [r['advisor_sid'] for r in csid_results]
    all_attributes = calnet.client(app).search_csids(csids)
    if len(csids) != len(all_attributes):
        ldap_csids = [person['csid'] for person in all_attributes]
        missing = set(csids) - set(ldap_csids)
        app.logger.warning(f'Looked for {len(csids)} advisor CSIDs but only found {len(all_attributes)}: missing {missing}')
    advisor_rows = []
    total_count = len(all_attributes)
    for index, a in enumerate(all_attributes):
        sid = a['csid']
        app.logger.info(f'CalNet import: Fetch attributes of advisor {sid} ({index + 1} of {total_count})')
        first_name, last_name = calnet.split_sortable_name(a)
        data = [
            a['uid'],
            sid,
            first_name,
            last_name,
            a['title'],
            calnet.get_dept_code(a),
            a['email'],
            a['campus_email'],
        ]
        advisor_rows.append(encoded_tsv_row(data))
    s3_key = f'{get_s3_calnet_daily_path()}/advisors/advisors.tsv'
    app.logger.info(f'Will stash {len(advisor_rows)} feeds in S3: {s3_key}')
    if not s3.upload_tsv_rows(advisor_rows, s3_key):
        raise BackgroundJobError('Error on S3 upload: aborting job.')
    app.logger.info('Will copy S3 feeds into Redshift...')
    query = resolve_sql_template_string(
        """
        TRUNCATE {redshift_schema_advisor_internal}.advisor_attributes;
        COPY {redshift_schema_advisor_internal}.advisor_attributes
            FROM '{loch_s3_calnet_data_path}/advisors/advisors.tsv'
            IAM_ROLE '{redshift_iam_role}'
            DELIMITER '\\t';
        """,
    )
    if not redshift.execute(query):
        app.logger.error('Error on Redshift copy: aborting job.')
        return False

def unload_to_etl(self, schema, bucket):
    timestamp_path = localize_datetime(datetime.now()).strftime('%Y/%m/%d/statements_%Y%m%d_%H%M%S_')
    credentials = ';'.join([
        f"aws_access_key_id={app.config['AWS_ACCESS_KEY_ID']}",
        f"aws_secret_access_key={app.config['AWS_SECRET_ACCESS_KEY']}",
    ])
    return redshift.execute(f"""
        UNLOAD ('SELECT statement FROM {schema}.statements')
            TO 's3://{bucket}/{app.config['LRS_CANVAS_INCREMENTAL_ETL_PATH_REDSHIFT']}/{timestamp_path}'
            CREDENTIALS '{credentials}'
            DELIMITER AS ' '
            NULL AS ''
            ALLOWOVERWRITE
            PARALLEL OFF
            MAXFILESIZE 1 gb
        """)

def unload_enrollment_terms(term_ids):
    query = resolve_sql_template_string(
        """
        UNLOAD ('SELECT *, GETDATE() AS analytics_generated_at
            FROM {schema}.student_enrollment_terms
            WHERE term_id=ANY(\\'{{{term_ids}}}\\')')
            TO '{loch_s3_boac_analytics_incremental_path}/student_enrollment_terms'
            IAM_ROLE '{redshift_iam_role}'
            ENCRYPTED
            DELIMITER AS '\\t'
            ALLOWOVERWRITE
            GZIP;
        """,
        schema=student_schema(),
        term_ids=','.join(term_ids),
    )
    if not redshift.execute(query):
        raise BackgroundJobError('Error on Redshift unload: aborting job.')

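# Hypothetical usage sketch: unload several terms at once. The function joins the term
# ids into a Redshift array literal, so they must be strings; the ids here are made up.
def _example_unload_enrollment_terms():
    unload_enrollment_terms(['2172', '2175', '2178'])
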
def update_canvas_sync_status(job_id, key, status, **kwargs):
    filename = key.split('/')[-1]
    destination_url = s3.build_s3_url(key, credentials=False)
    sql = """UPDATE {schema}.canvas_sync_job_status
        SET destination_url=%s, status=%s, updated_at=current_timestamp"""
    params = [destination_url, status]
    # Append optional columns only when the caller supplied them.
    for col in ['details', 'source_size', 'destination_size']:
        if kwargs.get(col):
            sql += f', {col}=%s'
            params.append(kwargs[col])
    sql += ' WHERE job_id=%s AND filename=%s'
    params += [job_id, filename]
    return redshift.execute(
        sql,
        params=tuple(params),
        schema=_schema(),
    )

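# Hypothetical sketch of the Canvas sync bookkeeping flow across the helpers above:
# create a status row when a file sync starts, update it on completion, and record the
# synced snapshot. The job id, S3 key, URL, and sizes are illustrative.
def _example_track_canvas_sync():
    job_id = 'sync_file_to_loch_20190101_0001'
    key = 'canvas/requests/requests-00.gz'
    create_canvas_sync_status(job_id, 'requests-00.gz', 'requests', 'https://example.com/requests-00.gz')
    update_canvas_sync_status(job_id, key, 'complete', source_size=1024, destination_size=1024)
    create_canvas_snapshot(key, size=1024)
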
def upload_to_staging(self, table):
    rows = self.rows[table]
    s3_key = f'{get_s3_sis_api_daily_path()}/staging_{table}.tsv'
    app.logger.info(f'Will stash {len(rows)} feeds in S3: {s3_key}')
    if not s3.upload_data('\n'.join(rows), s3_key):
        app.logger.error('Error on S3 upload: aborting job.')
        return False
    app.logger.info('Will copy S3 feeds into Redshift...')
    query = resolve_sql_template_string(
        """
        COPY {staging_schema}.{table}
            FROM '{loch_s3_sis_api_data_path}/staging_{table}.tsv'
            IAM_ROLE '{redshift_iam_role}'
            DELIMITER '\\t';
        """,
        staging_schema=self.staging_schema,
        table=table,
    )
    if not redshift.execute(query):
        app.logger.error('Error on Redshift copy: aborting job.')
        return False

def upload_to_staging(table, rows, term_id=None):
    if term_id:
        tsv_filename = f'staging_{table}_{term_id}.tsv'
    else:
        tsv_filename = f'staging_{table}.tsv'
    s3_key = f'{get_s3_sis_api_daily_path()}/{tsv_filename}'
    app.logger.info(f'Will stash {len(rows)} feeds in S3: {s3_key}')
    if not s3.upload_tsv_rows(rows, s3_key):
        raise BackgroundJobError('Error on S3 upload: aborting job.')
    app.logger.info('Will copy S3 feeds into Redshift...')
    query = resolve_sql_template_string(
        """
        COPY {staging_schema}.{table}
            FROM '{loch_s3_sis_api_data_path}/{tsv_filename}'
            IAM_ROLE '{redshift_iam_role}'
            DELIMITER '\\t';
        """,
        staging_schema=staging_schema(),
        table=table,
        tsv_filename=tsv_filename,
    )
    if not redshift.execute(query):
        raise BackgroundJobError('Error on Redshift copy: aborting job.')

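# Hypothetical usage sketch: stage one term's worth of rows, assuming encoded_tsv_row
# (the helper used by the jobs above) is in scope. The table name, row content, and
# term id are illustrative, not a real table layout.
def _example_upload_to_staging():
    rows = [encoded_tsv_row(['11667051', '2178', '{"units": 15}'])]
    upload_to_staging('sis_api_drops_and_midterms', rows, term_id='2178')
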
def upload_file_to_staging(table, term_file, row_count, term_id):
    tsv_filename = f'staging_{table}_{term_id}.tsv' if term_id else f'staging_{table}.tsv'
    s3_key = f'{get_s3_sis_api_daily_path()}/{tsv_filename}'
    app.logger.info(f'Will stash {row_count} feeds in S3: {s3_key}')
    # Be kind; rewind.
    term_file.seek(0)
    if not s3.upload_data(term_file, s3_key):
        raise BackgroundJobError(f'Failed to upload {row_count} records to s3:{s3_key}. Aborting job.')
    app.logger.info('Will copy S3 feeds into Redshift...')
    query = resolve_sql_template_string(
        """
        COPY {staging_schema}.{table}
            FROM '{loch_s3_sis_api_data_path}/{tsv_filename}'
            IAM_ROLE '{redshift_iam_role}'
            DELIMITER '\\t';
        """,
        staging_schema=staging_schema(),
        table=table,
        tsv_filename=tsv_filename,
    )
    if not redshift.execute(query):
        raise BackgroundJobError('Error on Redshift copy: aborting job.')

def truncate_staging_table(table):
    redshift.execute(
        'TRUNCATE {schema}.{table}',
        schema=psycopg2.sql.Identifier(staging_schema()),
        table=psycopg2.sql.Identifier(table),
    )

def update_redshift_academic_standing(self):
    redshift.execute(
        f"""TRUNCATE {student_schema()}.academic_standing;
        INSERT INTO {student_schema()}.academic_standing
            SELECT sid, term_id, acad_standing_action, acad_standing_status, action_date
            FROM {self.redshift_schema_sis}.academic_standing;""",
    )

def run(self, term_id=None):
    if not term_id:
        term_id = current_term_id()
    canvas_course_ids = [row['canvas_course_id'] for row in get_enrolled_canvas_sites_for_term(term_id)]
    app.logger.info(f'Starting Canvas enrollments API import job for term {term_id}, {len(canvas_course_ids)} course sites...')
    rows = []
    success_count = 0
    failure_count = 0
    index = 1
    for course_id in canvas_course_ids:
        app.logger.info(f'Fetching Canvas enrollments API for course id {course_id}, term {term_id} ({index} of {len(canvas_course_ids)})')
        feed = canvas_api.get_course_enrollments(course_id)
        if feed:
            success_count += 1
            for enrollment in feed:
                user_id = str(enrollment.get('user_id'))
                last_activity_at = str(enrollment.get('last_activity_at') or '')
                rows.append('\t'.join([str(course_id), user_id, str(term_id), last_activity_at, json.dumps(enrollment)]))
        else:
            failure_count += 1
            app.logger.error(f'Canvas enrollments API import failed for course id {course_id}.')
        index += 1
    s3_key = f'{get_s3_sis_api_daily_path()}/canvas_api_enrollments_{term_id}.tsv'
    app.logger.info(f'Will stash {success_count} feeds in S3: {s3_key}')
    if not s3.upload_data('\n'.join(rows), s3_key):
        app.logger.error('Error on S3 upload: aborting job.')
        return False
    app.logger.info('Will copy S3 feeds into Redshift...')
    query = resolve_sql_template_string(
        """
        DELETE FROM {redshift_schema_student}_staging.canvas_api_enrollments WHERE term_id = '{term_id}';
        COPY {redshift_schema_student}_staging.canvas_api_enrollments
            FROM '{loch_s3_sis_api_data_path}/canvas_api_enrollments_{term_id}.tsv'
            IAM_ROLE '{redshift_iam_role}'
            DELIMITER '\\t'
            TIMEFORMAT 'YYYY-MM-DDTHH:MI:SSZ';
        DELETE FROM {redshift_schema_student}.canvas_api_enrollments
            WHERE term_id = '{term_id}'
            AND course_id IN
            (SELECT course_id FROM {redshift_schema_student}_staging.canvas_api_enrollments WHERE term_id = '{term_id}');
        INSERT INTO {redshift_schema_student}.canvas_api_enrollments
            (SELECT * FROM {redshift_schema_student}_staging.canvas_api_enrollments WHERE term_id = '{term_id}');
        DELETE FROM {redshift_schema_student}_staging.canvas_api_enrollments WHERE term_id = '{term_id}';
        """,
        term_id=term_id,
    )
    if not redshift.execute(query):
        app.logger.error('Error on Redshift copy: aborting job.')
        return False
    return (
        f'Canvas enrollments API import completed for term {term_id}: {success_count} succeeded, '
        f'{failure_count} failed.'
    )

def run(self):
    app.logger.info('Starting ASC profile generation job...')
    asc_rows = redshift.fetch(
        'SELECT * FROM {schema}.students ORDER BY sid, UPPER(team_name)',
        schema=asc_schema_identifier,
    )
    profile_rows = []
    sids_for_inactive_deletion = []
    for sid, rows_for_student in groupby(asc_rows, operator.itemgetter('sid')):
        rows_for_student = list(rows_for_student)
        # Since BOAC believes (falsely) that isActiveAsc and statusAsc are attributes of a student, not
        # a team membership, a bit of brutal simplification is needed. Students who are active in at
        # least one sport have inactive team memberships dropped.
        any_active_athletics = reduce(operator.or_, [r['active'] for r in rows_for_student], False)
        if any_active_athletics:
            rows_for_student = [r for r in rows_for_student if r['active']]
            sids_for_inactive_deletion.append(sid)
        athletics_profile = {
            'athletics': [],
            'inIntensiveCohort': rows_for_student[0]['intensive'],
            'isActiveAsc': rows_for_student[0]['active'],
            'statusAsc': rows_for_student[0]['status_asc'],
        }
        for row in rows_for_student:
            athletics_profile['athletics'].append({
                'groupCode': row['group_code'],
                'groupName': row['group_name'],
                'name': row['group_name'],
                'teamCode': row['team_code'],
                'teamName': row['team_name'],
            })
        profile_rows.append(encoded_tsv_row([sid, json.dumps(athletics_profile)]))
    s3_key = f'{get_s3_asc_daily_path()}/athletics_profiles.tsv'
    app.logger.info(f'Will stash {len(profile_rows)} feeds in S3: {s3_key}')
    if not s3.upload_tsv_rows(profile_rows, s3_key):
        raise BackgroundJobError('Error on S3 upload: aborting job.')
    app.logger.info('Will copy S3 feeds into Redshift...')
    query = resolve_sql_template_string(
        """
        TRUNCATE {redshift_schema_asc}.student_profiles;
        COPY {redshift_schema_asc}.student_profiles
            FROM '{loch_s3_asc_data_path}/athletics_profiles.tsv'
            IAM_ROLE '{redshift_iam_role}'
            DELIMITER '\\t';
        """,
    )
    if not redshift.execute(query):
        app.logger.error('Error on Redshift copy: aborting job.')
        return False
    with rds.transaction() as transaction:
        if self.refresh_rds_indexes(asc_rows, transaction):
            transaction.commit()
            app.logger.info('Refreshed RDS indexes.')
        else:
            transaction.rollback()
            raise BackgroundJobError('Error refreshing RDS indexes.')
    if sids_for_inactive_deletion:
        redshift.execute(
            f'DELETE FROM {asc_schema}.students WHERE active IS false AND sid = ANY(%s)',
            params=(sids_for_inactive_deletion,),
        )
        rds.execute(
            f'DELETE FROM {asc_schema}.students WHERE active IS false AND sid = ANY(%s)',
            params=(sids_for_inactive_deletion,),
        )
    return 'ASC profile generation complete.'

def run(self):
    app.logger.info('Starting COE schema creation job...')
    redshift.drop_external_schema(external_schema)
    resolved_ddl = resolve_sql_template('create_coe_schema.template.sql')
    # TODO This DDL drops and recreates the internal schema before the external schema is verified. We
    # ought to set up proper staging in conjunction with verification. It's also possible that a
    # persistent external schema isn't needed.
    if redshift.execute_ddl_script(resolved_ddl):
        app.logger.info('COE external schema created.')
        verify_external_schema(external_schema, resolved_ddl)
    else:
        raise BackgroundJobError('COE external schema creation failed.')
    coe_rows = redshift.fetch(
        'SELECT * FROM {schema}.students ORDER BY sid',
        schema=internal_schema_identifier,
    )
    profile_rows = []
    index = 1
    for sid, rows_for_student in groupby(coe_rows, operator.itemgetter('sid')):
        app.logger.info(f'Generating COE profile for SID {sid} ({index} of {len(coe_rows)})')
        index += 1
        row_for_student = list(rows_for_student)[0]
        coe_profile = {
            'advisorUid': row_for_student.get('advisor_ldap_uid'),
            'gender': row_for_student.get('gender'),
            'ethnicity': row_for_student.get('ethnicity'),
            'minority': row_for_student.get('minority'),
            'didPrep': row_for_student.get('did_prep'),
            'prepEligible': row_for_student.get('prep_eligible'),
            'didTprep': row_for_student.get('did_tprep'),
            'tprepEligible': row_for_student.get('tprep_eligible'),
            'sat1read': row_for_student.get('sat1read'),
            'sat1math': row_for_student.get('sat1math'),
            'sat2math': row_for_student.get('sat2math'),
            'inMet': row_for_student.get('in_met'),
            'gradTerm': row_for_student.get('grad_term'),
            'gradYear': row_for_student.get('grad_year'),
            'probation': row_for_student.get('probation'),
            'status': row_for_student.get('status'),
        }
        profile_rows.append(encoded_tsv_row([sid, json.dumps(coe_profile)]))
    s3_key = f'{get_s3_coe_daily_path()}/coe_profiles.tsv'
    app.logger.info(f'Will stash {len(profile_rows)} feeds in S3: {s3_key}')
    if not s3.upload_tsv_rows(profile_rows, s3_key):
        raise BackgroundJobError('Error on S3 upload: aborting job.')
    app.logger.info('Will copy S3 feeds into Redshift...')
    query = resolve_sql_template_string(
        """
        COPY {redshift_schema_coe}.student_profiles
            FROM '{loch_s3_coe_data_path}/coe_profiles.tsv'
            IAM_ROLE '{redshift_iam_role}'
            DELIMITER '\\t';
        """,
    )
    if not redshift.execute(query):
        raise BackgroundJobError('Error on Redshift copy: aborting job.')
    with rds.transaction() as transaction:
        if self.refresh_rds_indexes(coe_rows, transaction):
            transaction.commit()
            app.logger.info('Refreshed RDS indexes.')
        else:
            transaction.rollback()
            raise BackgroundJobError('Error refreshing RDS indexes.')
    return 'COE internal schema created.'