def run(self, csids=None):
    if not csids:
        csids = [row['sid'] for row in get_all_student_ids()]
    app.logger.info(f'Starting SIS student API import job for {len(csids)} students...')

    rows, failure_count = self.load(csids)
    if (len(rows) == 0) and (failure_count > 0):
        raise BackgroundJobError('Failed to import SIS student API feeds: aborting job.')

    s3_key = f'{get_s3_sis_api_daily_path()}/profiles.tsv'
    app.logger.info(f'Will stash {len(rows)} feeds in S3: {s3_key}')
    if not s3.upload_tsv_rows(rows, s3_key):
        raise BackgroundJobError('Error on S3 upload: aborting job.')

    app.logger.info('Will copy S3 feeds into Redshift...')
    if not redshift.execute(f'TRUNCATE {student_schema()}_staging.sis_api_profiles'):
        raise BackgroundJobError('Error truncating old staging rows: aborting job.')
    if not redshift.copy_tsv_from_s3(f'{student_schema()}_staging.sis_api_profiles', s3_key):
        raise BackgroundJobError('Error on Redshift copy: aborting job.')

    # Swap staging rows into the destination table: delete destination rows for any SID present in
    # staging, insert the fresh staging rows, then clear staging for the next run.
    staging_to_destination_query = resolve_sql_template_string(
        """
        DELETE FROM {redshift_schema}.sis_api_profiles
            WHERE sid IN (SELECT sid FROM {redshift_schema}_staging.sis_api_profiles);
        INSERT INTO {redshift_schema}.sis_api_profiles
            (SELECT * FROM {redshift_schema}_staging.sis_api_profiles);
        TRUNCATE {redshift_schema}_staging.sis_api_profiles;
        """,
        redshift_schema=student_schema(),
    )
    if not redshift.execute(staging_to_destination_query):
        raise BackgroundJobError('Error on Redshift copy: aborting job.')

    return f'SIS student API import job completed: {len(rows)} succeeded, {failure_count} failed.'
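# A minimal invocation sketch, not part of the job itself. It assumes run() above is a method on the
# SIS student API import background-job class and reuses this module's imports; the class name and
# the sample CSIDs are assumptions for illustration only.
job = ImportSisStudentApi()  # hypothetical class name for the job defined above
status = job.run(csids=['11667051', '2345678901'])  # omit csids to import every known student
app.logger.info(status)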
def refresh_rds_indexes(self, sids, transaction):
    if not (
        self._delete_rds_rows('student_academic_status', sids, transaction)
        and self._refresh_rds_academic_status(transaction)
        and self._delete_rds_rows('student_holds', sids, transaction)
        and self._refresh_rds_holds(transaction)
        and self._delete_rds_rows('student_names', sids, transaction)
        and self._refresh_rds_names(transaction)
        and self._delete_rds_rows('student_majors', sids, transaction)
        and self._refresh_rds_majors(transaction)
        and self._delete_rds_rows('student_profiles', sids, transaction)
        and self._refresh_rds_profiles(transaction)
        and self._delete_rds_rows('intended_majors', sids, transaction)
        and self._refresh_rds_intended_majors(transaction)
        and self._delete_rds_rows('academic_standing', sids, transaction)
        and self._refresh_rds_academic_standing(transaction)
        and self._delete_rds_rows('minors', sids, transaction)
        and self._refresh_rds_minors(transaction)
        and self._index_rds_email_address(transaction)
        and self._index_rds_entering_term(transaction)
        and refresh_rds_demographics(self.rds_schema, self.rds_dblink_to_redshift, student_schema(), transaction)
    ):
        return False
    return True
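# _delete_rds_rows() is called repeatedly above but is not shown in this excerpt. The sketch below is
# one plausible shape for it on the same class, assuming the transaction object accepts psycopg2-style
# parameters as it does elsewhere in this module; treat the signature and SQL as an assumption.
def _delete_rds_rows(self, table, sids, transaction):
    # Remove only the rows for the SIDs being refreshed, so untouched students keep their index rows.
    sql = f'DELETE FROM {self.rds_schema}.{table} WHERE sid = ANY(%s)'
    return transaction.execute(sql, params=(sids,))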
def run(self, sids=None):
    if not sids:
        sids = [row['sid'] for row in get_unfetched_non_advisees()]
    app.logger.info(f'Starting SIS student API import job for {len(sids)} non-advisees...')

    with tempfile.TemporaryFile() as feed_file:
        saved_sids, failure_count = self.load_concurrently(sids, feed_file)
        if saved_sids:
            sis_profiles_hist_enr = student_schema_table('sis_profiles_hist_enr')
            truncate_staging_table(sis_profiles_hist_enr)
            write_file_to_staging(sis_profiles_hist_enr, feed_file, len(saved_sids))

    if saved_sids:
        # Swap the freshly staged rows into the destination table, then clear staging.
        staging_to_destination_query = resolve_sql_template_string(
            """
            DELETE FROM {redshift_schema}.{sis_profiles_hist_enr}
                WHERE sid IN (SELECT sid FROM {redshift_schema}_staging.{sis_profiles_hist_enr});
            INSERT INTO {redshift_schema}.{sis_profiles_hist_enr}
                (SELECT * FROM {redshift_schema}_staging.{sis_profiles_hist_enr});
            TRUNCATE {redshift_schema}_staging.{sis_profiles_hist_enr};
            """,
            redshift_schema=student_schema(),
            sis_profiles_hist_enr=sis_profiles_hist_enr,
        )
        if not redshift.execute(staging_to_destination_query):
            raise BackgroundJobError('Error on Redshift copy: aborting job.')

    return f'SIS student API non-advisee import job completed: {len(saved_sids)} succeeded, {failure_count} failed.'
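# truncate_staging_table() and write_file_to_staging() are not defined in this excerpt. Judging from
# how the rest of this module moves data (truncate staging, upload to S3, COPY into Redshift), the
# sketch below shows one way the first helper could work; the body is an assumption, not the actual
# implementation, and write_file_to_staging() presumably does the matching S3 upload plus COPY.
def truncate_staging_table(table):
    # Hypothetical helper: clear leftovers from a previous run before loading the new feed file.
    if not redshift.execute(f'TRUNCATE {student_schema()}_staging.{table}'):
        raise BackgroundJobError('Error truncating old staging rows: aborting job.')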
def unload_enrollment_terms(term_ids):
    query = resolve_sql_template_string(
        """
        UNLOAD ('SELECT *, GETDATE() AS analytics_generated_at
            FROM {schema}.student_enrollment_terms
            WHERE term_id=ANY(\''{{{term_ids}}}\'')')
            TO '{loch_s3_boac_analytics_incremental_path}/student_enrollment_terms'
            IAM_ROLE '{redshift_iam_role}'
            ENCRYPTED
            DELIMITER AS '\\t'
            ALLOWOVERWRITE
            GZIP;
        """,
        schema=student_schema(),
        term_ids=','.join(term_ids),
    )
    if not redshift.execute(query):
        raise BackgroundJobError('Error on Redshift unload: aborting job.')
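# Example call, assuming term ids are passed as SIS term id strings (the exact values below are
# illustrative assumptions). The UNLOAD writes gzipped, tab-delimited slices of
# student_enrollment_terms for just those terms to the incremental analytics S3 path.
unload_enrollment_terms(['2202', '2205'])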
def refresh_from_staging(table, term_id, sids, transaction, truncate_staging=True):
    # If our job is restricted to a particular term id or set of sids, then drop rows from the destination table
    # matching those restrictions. If there are no restrictions, the entire destination table can be truncated.
    refresh_conditions = []
    refresh_params = []
    if term_id:
        refresh_conditions.append('term_id = %s')
        refresh_params.append(term_id)
    if sids:
        refresh_conditions.append('sid = ANY(%s)')
        refresh_params.append(sids)

    def _success():
        app.logger.info(f'Populated {student_schema()}.{table} from staging schema.')

    def _rollback():
        transaction.rollback()
        raise BackgroundJobError(f'Failed to populate table {student_schema()}.{table} from staging schema.')

    if not refresh_conditions:
        transaction.execute(
            'TRUNCATE {schema}.{table}',
            schema=psycopg2.sql.Identifier(student_schema()),
            table=psycopg2.sql.Identifier(table),
        )
        app.logger.info(f'Truncated destination table {student_schema()}.{table}.')
        refreshed = transaction.execute(
            'INSERT INTO {schema}.{table} (SELECT * FROM {staging_schema}.{table})',
            schema=psycopg2.sql.Identifier(student_schema()),
            staging_schema=psycopg2.sql.Identifier(staging_schema()),
            table=psycopg2.sql.Identifier(table),
        )
        _success() if refreshed else _rollback()
    else:
        delete_sql = 'DELETE FROM {schema}.{table} WHERE ' + ' AND '.join(refresh_conditions)
        transaction.execute(
            delete_sql,
            schema=psycopg2.sql.Identifier(student_schema()),
            table=psycopg2.sql.Identifier(table),
            params=tuple(refresh_params),
        )
        app.logger.info(
            f'Deleted existing rows from destination table {student_schema()}.{table} '
            f"(term_id={term_id or 'all'}, {len(sids) if sids else 'all'} sids).",
        )
        insert_sql = (
            'INSERT INTO {schema}.{table} (SELECT * FROM {staging_schema}.{table} WHERE '
            + ' AND '.join(refresh_conditions) + ')'
        )
        refreshed = transaction.execute(
            insert_sql,
            schema=psycopg2.sql.Identifier(student_schema()),
            staging_schema=psycopg2.sql.Identifier(staging_schema()),
            table=psycopg2.sql.Identifier(table),
            params=tuple(refresh_params),
        )
        _success() if refreshed else _rollback()

    # The staging table can now be truncated, unless we're running a job distributed between workers.
    if truncate_staging:
        transaction.execute(
            'TRUNCATE {schema}.{table}',
            schema=psycopg2.sql.Identifier(staging_schema()),
            table=psycopg2.sql.Identifier(table),
        )
        app.logger.info(f'Truncated staging table {staging_schema()}.{table}.')
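# A sketch of how refresh_from_staging() might be driven. It assumes the caller opens a transaction on
# the same connection wrapper whose execute()/rollback() methods are used above; the transaction()
# context manager and the table names chosen here are assumptions drawn from elsewhere in this excerpt,
# not a confirmed call site. Passing term_id=None and sids=None takes the truncate-and-reload path.
with redshift.transaction() as transaction:
    for table in ('sis_api_profiles', 'student_enrollment_terms'):
        refresh_from_staging(table, term_id=None, sids=None, transaction=transaction)
    transaction.commit()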
def run(self, load_mode='new'):
    all_sids = [row['sid'] for row in get_all_student_ids()]
    previous_backfills = {row['sid'] for row in get_sids_with_registration_imports()}

    if load_mode == 'new':
        sids = list(set(all_sids).difference(previous_backfills))
    elif load_mode == 'batch':
        new_sids = list(set(all_sids).difference(previous_backfills))
        limit = app.config['CYCLICAL_API_IMPORT_BATCH_SIZE'] - len(new_sids)
        if limit > 0:
            oldest_backfills = [row['sid'] for row in get_active_sids_with_oldest_registration_imports(limit=limit)]
            sids = new_sids + oldest_backfills
        else:
            sids = new_sids
    elif load_mode == 'all':
        sids = all_sids

    app.logger.info(f'Starting registrations/demographics import job for {len(sids)} students...')

    rows = {
        'term_gpas': [],
        'last_registrations': [],
        'api_demographics': [],
    }
    successes, failures = self.get_registration_data_per_sids(rows, sids)
    if load_mode != 'new' and (len(successes) == 0) and (len(failures) > 0):
        raise BackgroundJobError('Failed to import registration histories: aborting job.')

    for key in rows.keys():
        s3_key = f'{get_s3_sis_api_daily_path(use_edl_if_feature_flag=True)}/{key}.tsv'
        app.logger.info(f'Will stash {len(successes)} feeds in S3: {s3_key}')
        if not s3.upload_tsv_rows(rows[key], s3_key):
            raise BackgroundJobError('Error on S3 upload: aborting job.')
        app.logger.info('Will copy S3 feeds into Redshift...')
        if not redshift.execute(f'TRUNCATE {student_schema()}_staging.student_{key}'):
            raise BackgroundJobError('Error truncating old staging rows: aborting job.')
        if not redshift.copy_tsv_from_s3(f'{student_schema()}_staging.student_{key}', s3_key):
            raise BackgroundJobError('Error on Redshift copy: aborting job.')
        staging_to_destination_query = resolve_sql_template_string(
            """
            DELETE FROM {student_schema}.student_{table_key}
                WHERE sid IN (SELECT sid FROM {student_schema}_staging.student_{table_key});
            INSERT INTO {student_schema}.student_{table_key}
                (SELECT * FROM {student_schema}_staging.student_{table_key});
            TRUNCATE TABLE {student_schema}_staging.student_{table_key};
            """,
            table_key=key,
            student_schema=student_schema(),
        )
        if not redshift.execute(staging_to_destination_query):
            raise BackgroundJobError('Error inserting staging entries into destination: aborting job.')

    with rds.transaction() as transaction:
        if self.refresh_rds_indexes(sids, rows['term_gpas'], transaction):
            transaction.commit()
            app.logger.info('Refreshed RDS indexes.')
        else:
            transaction.rollback()
            raise BackgroundJobError('Failed to refresh RDS indexes.')

    update_registration_import_status(successes, failures)

    return f'Registrations import completed: {len(successes)} succeeded, {len(failures)} failed.'
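# Illustrative load_mode semantics for the job above; the class name is an assumption, and any other
# load_mode value would leave sids unset and fail at the first logging call.
job = ImportRegistrations()  # hypothetical class name for the job defined above
job.run(load_mode='new')    # only SIDs with no previous registration import
job.run(load_mode='batch')  # new SIDs first, topped up to CYCLICAL_API_IMPORT_BATCH_SIZE with the stalest imports
job.run(load_mode='all')    # every known SID, regardless of import history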
def run(self, load_mode='batch'):
    new_sids = [row['sid'] for row in get_non_advisees_without_registration_imports()]

    # Owing to the size of the non-advisee population, a one-shot load of all these slow feeds may not
    # finish successfully without interfering with other work. Therefore the default approach is to apply a strict
    # upper limit on the number of feeds loaded in any one job run, no matter how many SIDs remain to be processed.
    #
    # (With the logic change in NS-1155 to pre-screen SIDs for student affiliation in CalNet, the cutoff is less
    # likely to be triggered.)
    if load_mode == 'new':
        sids = new_sids
    elif load_mode == 'batch':
        max_batch = app.config['HIST_ENR_REGISTRATIONS_IMPORT_BATCH_SIZE']
        if max_batch >= len(new_sids):
            sids = new_sids
        else:
            sids = new_sids[0:max_batch]

    app.logger.info(f'Starting import of historical registration data for {len(sids)} students...')
    redshift.execute('VACUUM; ANALYZE;')

    rows = {
        'term_gpas': [],
        'last_registrations': [],
    }
    successes, failures = self.get_registration_data_per_sids(rows, sids, include_demographics=False)

    for key in rows.keys():
        if len(rows[key]) > 0:
            s3_key = f'{get_s3_sis_api_daily_path()}/{key}.tsv'
            app.logger.info(f'Upload {key} data to s3:{s3_key}. The file represents {len(rows[key])} students.')
            if not s3.upload_tsv_rows(rows[key], s3_key):
                raise BackgroundJobError(f'Error during S3 upload: {s3_key}. Aborting job.')

            staging_table = f'{student_schema()}_staging.hist_enr_{key}'
            if not redshift.execute(f'TRUNCATE {staging_table}'):
                raise BackgroundJobError('Error truncating old staging rows: aborting job.')

            app.logger.info(f'Populate {staging_table} (Redshift table) with s3:{s3_key}')
            if not redshift.copy_tsv_from_s3(staging_table, s3_key):
                raise BackgroundJobError('Error on Redshift copy: aborting job.')

            app.logger.info(f'Insert student data into {student_schema()}.hist_enr_{key}')
            staging_to_destination_query = resolve_sql_template_string(
                """
                DELETE FROM {student_schema}.hist_enr_{table_key}
                    WHERE sid IN (SELECT sid FROM {student_schema}_staging.hist_enr_{table_key});
                INSERT INTO {student_schema}.hist_enr_{table_key}
                    (SELECT * FROM {student_schema}_staging.hist_enr_{table_key});
                TRUNCATE TABLE {student_schema}_staging.hist_enr_{table_key};
                """,
                table_key=key,
                student_schema=student_schema(),
            )
            if not redshift.execute(staging_to_destination_query):
                raise BackgroundJobError('Error inserting staging entries into destination: aborting job.')

    redshift.execute('VACUUM; ANALYZE;')

    return f'Finished import of historical registration data: {len(successes)} successes and {len(failures)} failures.'