def run(self, csids=None):
    if app.config['STUDENT_V1_API_PREFERRED']:
        return self.run_v1(csids)
    if not csids:
        csids = [row['sid'] for row in get_all_student_ids()]
    app.logger.info(f'Starting SIS student API import job for {len(csids)} students...')
    rows, failure_count = self.load_concurrently(csids)
    if (len(rows) == 0) and (failure_count > 0):
        raise BackgroundJobError('Failed to import SIS student API feeds: aborting job.')
    s3_key = f'{get_s3_sis_api_daily_path()}/profiles.tsv'
    app.logger.info(f'Will stash {len(rows)} feeds in S3: {s3_key}')
    if not s3.upload_tsv_rows(rows, s3_key):
        raise BackgroundJobError('Error on S3 upload: aborting job.')
    app.logger.info('Will copy S3 feeds into Redshift...')
    if not redshift.execute(f'TRUNCATE {self.redshift_schema}_staging.sis_api_profiles'):
        raise BackgroundJobError('Error truncating old staging rows: aborting job.')
    if not redshift.copy_tsv_from_s3(f'{self.redshift_schema}_staging.sis_api_profiles', s3_key):
        raise BackgroundJobError('Error on Redshift copy: aborting job.')
    staging_to_destination_query = resolve_sql_template_string(
        """
        DELETE FROM {redshift_schema_student}.sis_api_profiles
            WHERE sid IN (SELECT sid FROM {redshift_schema_student}_staging.sis_api_profiles);
        INSERT INTO {redshift_schema_student}.sis_api_profiles
            (SELECT * FROM {redshift_schema_student}_staging.sis_api_profiles);
        TRUNCATE {redshift_schema_student}_staging.sis_api_profiles;
        """,
    )
    if not redshift.execute(staging_to_destination_query):
        raise BackgroundJobError('Error on Redshift copy: aborting job.')
    return f'SIS student API import job completed: {len(rows)} succeeded, {failure_count} failed.'
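
# `load_concurrently` is defined elsewhere in this class. A minimal sketch of
# what it might look like, assuming a thread pool wrapped around the same
# per-SID fetch that the sequential variant below performs; the pool size, and
# the use of concurrent.futures at all, are assumptions rather than the
# project's confirmed implementation.
def load_concurrently(self, csids):
    from concurrent.futures import ThreadPoolExecutor
    rows = []
    failure_count = 0

    def _fetch(csid):
        return csid, sis_student_api.get_student(csid)

    # Fetch feeds in parallel; collect TSV rows as results come back.
    with ThreadPoolExecutor(max_workers=8) as executor:
        for csid, feed in executor.map(_fetch, csids):
            if feed:
                rows.append('\t'.join([str(csid), json.dumps(feed)]))
            else:
                failure_count += 1
                app.logger.error(f'SIS student API import failed for CSID {csid}.')
    return rows, failure_count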
def run(self, csids=None):
    if not csids:
        csids = [row['sid'] for row in get_all_student_ids()]
    app.logger.info(f'Starting SIS student API import job for {len(csids)} students...')
    rows = []
    success_count = 0
    failure_count = 0
    index = 1
    for csid in csids:
        app.logger.info(f'Fetching SIS student API for SID {csid} ({index} of {len(csids)})')
        feed = sis_student_api.get_student(csid)
        if feed:
            success_count += 1
            rows.append('\t'.join([str(csid), json.dumps(feed)]))
        else:
            failure_count += 1
            app.logger.error(f'SIS student API import failed for CSID {csid}.')
        index += 1
    s3_key = f'{get_s3_sis_api_daily_path()}/profiles.tsv'
    app.logger.info(f'Will stash {success_count} feeds in S3: {s3_key}')
    if not s3.upload_data('\n'.join(rows), s3_key):
        app.logger.error('Error on S3 upload: aborting job.')
        return False
    app.logger.info('Will copy S3 feeds into Redshift...')
    if not redshift.execute(f'TRUNCATE {self.destination_schema}_staging.sis_api_profiles'):
        app.logger.error('Error truncating old staging rows: aborting job.')
        return False
    if not redshift.copy_tsv_from_s3(f'{self.destination_schema}_staging.sis_api_profiles', s3_key):
        app.logger.error('Error on Redshift copy: aborting job.')
        return False
    staging_to_destination_query = resolve_sql_template_string(
        """
        DELETE FROM {redshift_schema_student}.sis_api_profiles
            WHERE sid IN (SELECT sid FROM {redshift_schema_student}_staging.sis_api_profiles);
        INSERT INTO {redshift_schema_student}.sis_api_profiles
            (SELECT * FROM {redshift_schema_student}_staging.sis_api_profiles);
        TRUNCATE {redshift_schema_student}_staging.sis_api_profiles;
        """,
    )
    if not redshift.execute(staging_to_destination_query):
        app.logger.error('Error on Redshift copy: aborting job.')
        return False
    return f'SIS student API import job completed: {success_count} succeeded, {failure_count} failed.'
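
# `resolve_sql_template_string` is assumed to interpolate the {tokens} in these
# SQL templates from app config plus any keyword overrides (table_key, term_id
# and so on in the jobs below). A sketch under that assumption; the config key
# name is illustrative.
def resolve_sql_template_string(template_string, **kwargs):
    interpolations = {
        'redshift_schema_student': app.config['REDSHIFT_SCHEMA_STUDENT'],
    }
    interpolations.update(kwargs)
    # str.format raises KeyError on any token left unresolved, which surfaces
    # template/config mismatches early.
    return template_string.format(**interpolations)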
def _upload_file_to_staging(self, table, _file):
    tsv_filename = f'staging_{table}.tsv'
    s3_key = f'{get_s3_edl_daily_path()}/{tsv_filename}'
    app.logger.info(f'Will stash {table} feeds in S3: {s3_key}')
    if not s3.upload_file(_file, s3_key):
        raise BackgroundJobError('Error on S3 upload: aborting job.')
    app.logger.info('Will copy S3 feeds into Redshift...')
    if not redshift.copy_tsv_from_s3(f'{self.internal_schema}.{table}', s3_key):
        raise BackgroundJobError('Error on Redshift copy: aborting job.')
def _upload_file_to_staging(table, _file):
    tsv_filename = f'staging_{table}.tsv'
    s3_key = f'{get_s3_edl_daily_path()}/{tsv_filename}'
    app.logger.info(f'Will stash {table} feeds in S3: {s3_key}')
    if not s3.upload_file(_file, s3_key):
        raise BackgroundJobError('Error on S3 upload: aborting job.')
    app.logger.info('Will copy S3 feeds into Redshift...')
    if not redshift.copy_tsv_from_s3(f"{app.config['REDSHIFT_SCHEMA_EDL']}.{table}", s3_key):
        raise BackgroundJobError('Error on Redshift copy: aborting job.')
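
# Hypothetical caller for either _upload_file_to_staging variant above: build
# the TSV in a temporary file, rewind, and hand over the open file object. This
# assumes s3.upload_file accepts a file-like object, as the _file parameter
# name suggests; the table name is illustrative.
import tempfile

def _stage_rows(rows):
    with tempfile.TemporaryFile() as feed_file:
        for row in rows:
            feed_file.write(row.encode() + b'\n')
        feed_file.seek(0)  # rewind so the upload reads from the start
        _upload_file_to_staging('student_demographics', feed_file)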
def run(self, load_mode='new'):
    all_sids = [row['sid'] for row in get_all_student_ids()]
    previous_backfills = {row['sid'] for row in get_sids_with_registration_imports()}
    if load_mode == 'new':
        sids = list(set(all_sids).difference(previous_backfills))
    elif load_mode == 'batch':
        new_sids = list(set(all_sids).difference(previous_backfills))
        limit = app.config['CYCLICAL_API_IMPORT_BATCH_SIZE'] - len(new_sids)
        if limit > 0:
            oldest_backfills = [row['sid'] for row in get_active_sids_with_oldest_registration_imports(limit=limit)]
            sids = new_sids + oldest_backfills
        else:
            sids = new_sids
    elif load_mode == 'all':
        sids = all_sids
    app.logger.info(f'Starting registrations/demographics import job for {len(sids)} students...')
    rows = {
        'term_gpas': [],
        'last_registrations': [],
        'api_demographics': [],
    }
    successes, failures = self.get_registration_data_per_sids(rows, sids)
    if load_mode != 'new' and (len(successes) == 0) and (len(failures) > 0):
        raise BackgroundJobError('Failed to import registration histories: aborting job.')
    for key in rows.keys():
        s3_key = f'{get_s3_sis_api_daily_path(use_edl_if_feature_flag=True)}/{key}.tsv'
        app.logger.info(f'Will stash {len(successes)} feeds in S3: {s3_key}')
        if not s3.upload_tsv_rows(rows[key], s3_key):
            raise BackgroundJobError('Error on S3 upload: aborting job.')
        app.logger.info('Will copy S3 feeds into Redshift...')
        if not redshift.execute(f'TRUNCATE {student_schema()}_staging.student_{key}'):
            raise BackgroundJobError('Error truncating old staging rows: aborting job.')
        if not redshift.copy_tsv_from_s3(f'{student_schema()}_staging.student_{key}', s3_key):
            raise BackgroundJobError('Error on Redshift copy: aborting job.')
        staging_to_destination_query = resolve_sql_template_string(
            """
            DELETE FROM {student_schema}.student_{table_key}
                WHERE sid IN (SELECT sid FROM {student_schema}_staging.student_{table_key});
            INSERT INTO {student_schema}.student_{table_key}
                (SELECT * FROM {student_schema}_staging.student_{table_key});
            TRUNCATE TABLE {student_schema}_staging.student_{table_key};
            """,
            table_key=key,
            student_schema=student_schema(),
        )
        if not redshift.execute(staging_to_destination_query):
            raise BackgroundJobError('Error inserting staging entries into destination: aborting job.')
    with rds.transaction() as transaction:
        if self.refresh_rds_indexes(sids, rows['term_gpas'], transaction):
            transaction.commit()
            app.logger.info('Refreshed RDS indexes.')
        else:
            transaction.rollback()
            raise BackgroundJobError('Failed to refresh RDS indexes.')
    update_registration_import_status(successes, failures)
    return f'Registrations import completed: {len(successes)} succeeded, {len(failures)} failed.'
def run(self, csids=None):
    if not csids:
        csids = [row['sid'] for row in get_all_student_ids()]
    app.logger.info(f'Starting SIS degree progress API import job for {len(csids)} students...')
    rows = []
    success_count = 0
    no_information_count = 0
    failure_count = 0
    index = 1
    # TODO The SIS degree progress API will return useful data only for students with a UGRD current registration.
    # We get that registration from the SIS student API, which is imported concurrently with this job. Is there an
    # alternative way to filter out non-UGRD students?
    for csid in csids:
        app.logger.info(f'Fetching degree progress API for SID {csid} ({index} of {len(csids)})')
        feed = sis_degree_progress_api.parsed_degree_progress(csid)
        if feed:
            success_count += 1
            rows.append('\t'.join([str(csid), json.dumps(feed)]))
        elif feed == {}:
            app.logger.info(f'No degree progress information found for SID {csid}.')
            no_information_count += 1
        else:
            failure_count += 1
            app.logger.error(f'SIS get_degree_progress failed for SID {csid}.')
        index += 1
    s3_key = f'{get_s3_sis_api_daily_path()}/degree_progress.tsv'
    app.logger.info(f'Will stash {success_count} feeds in S3: {s3_key}')
    if not s3.upload_data('\n'.join(rows), s3_key):
        app.logger.error('Error on S3 upload: aborting job.')
        return False
    app.logger.info('Will copy S3 feeds into Redshift...')
    if not redshift.execute(f'TRUNCATE {self.destination_schema}_staging.sis_api_degree_progress'):
        app.logger.error('Error truncating old staging rows: aborting job.')
        return False
    if not redshift.copy_tsv_from_s3(f'{self.destination_schema}_staging.sis_api_degree_progress', s3_key):
        app.logger.error('Error on Redshift copy: aborting job.')
        return False
    staging_to_destination_query = resolve_sql_template_string(
        """
        DELETE FROM {redshift_schema_student}.sis_api_degree_progress
            WHERE sid IN (SELECT sid FROM {redshift_schema_student}_staging.sis_api_degree_progress);
        INSERT INTO {redshift_schema_student}.sis_api_degree_progress
            (SELECT * FROM {redshift_schema_student}_staging.sis_api_degree_progress);
        TRUNCATE {redshift_schema_student}_staging.sis_api_degree_progress;
        """,
    )
    if not redshift.execute(staging_to_destination_query):
        app.logger.error('Error on Redshift copy: aborting job.')
        return False
    return (
        f'SIS degree progress API import job completed: {success_count} succeeded, '
        f'{no_information_count} returned no information, {failure_count} failed.'
    )
def run(self, load_mode='batch'):
    new_sids = [row['sid'] for row in get_non_advisees_without_registration_imports()]
    # The size of the non-advisee population makes it unlikely that a one-shot load of all these slow feeds will
    # finish successfully without interfering with other work. Therefore the default approach is to apply a strict
    # upper limit on the number of feeds loaded in any one job run, no matter how many SIDs remain to be processed.
    #
    # (With the logic change in NS-1155 to pre-screen SIDs for student affiliation in CalNet, the cutoff is less
    # likely to be triggered.)
    if load_mode == 'new':
        sids = new_sids
    elif load_mode == 'batch':
        max_batch = app.config['HIST_ENR_REGISTRATIONS_IMPORT_BATCH_SIZE']
        if max_batch >= len(new_sids):
            sids = new_sids
        else:
            sids = new_sids[:max_batch]
    app.logger.info(f'Starting import of historical registration data for {len(sids)} students...')
    redshift.execute('VACUUM; ANALYZE;')
    rows = {
        'term_gpas': [],
        'last_registrations': [],
    }
    successes, failures = self.get_registration_data_per_sids(rows, sids, include_demographics=False)
    for key in rows.keys():
        if len(rows[key]) > 0:
            s3_key = f'{get_s3_sis_api_daily_path()}/{key}.tsv'
            app.logger.info(f'Upload {key} data to s3:{s3_key}. The file represents {len(rows[key])} students.')
            if not s3.upload_tsv_rows(rows[key], s3_key):
                raise BackgroundJobError(f'Error during S3 upload: {s3_key}. Aborting job.')
            staging_table = f'{student_schema()}_staging.hist_enr_{key}'
            if not redshift.execute(f'TRUNCATE {staging_table}'):
                raise BackgroundJobError('Error truncating old staging rows: aborting job.')
            app.logger.info(f'Populate {staging_table} (Redshift table) with s3:{s3_key}')
            if not redshift.copy_tsv_from_s3(staging_table, s3_key):
                raise BackgroundJobError('Error on Redshift copy: aborting job.')
            app.logger.info(f'Insert student data into {student_schema()}.hist_enr_{key}')
            staging_to_destination_query = resolve_sql_template_string(
                """
                DELETE FROM {student_schema}.hist_enr_{table_key}
                    WHERE sid IN (SELECT sid FROM {student_schema}_staging.hist_enr_{table_key});
                INSERT INTO {student_schema}.hist_enr_{table_key}
                    (SELECT * FROM {student_schema}_staging.hist_enr_{table_key});
                TRUNCATE TABLE {student_schema}_staging.hist_enr_{table_key};
                """,
                table_key=key,
                student_schema=student_schema(),
            )
            if not redshift.execute(staging_to_destination_query):
                raise BackgroundJobError('Error inserting staging entries into destination: aborting job.')
    redshift.execute('VACUUM; ANALYZE;')
    return f'Finished import of historical registration data: {len(successes)} successes and {len(failures)} failures.'
def run(self, csids=None, term_id=None):
    if not csids:
        csids = [row['sid'] for row in get_all_student_ids()]
    if not term_id:
        term_id = current_term_id()
    app.logger.info(f'Starting SIS enrollments API import job for term {term_id}, {len(csids)} students...')
    rows = []
    success_count = 0
    no_enrollments_count = 0
    failure_count = 0
    index = 1
    for csid in csids:
        app.logger.info(f'Fetching SIS enrollments API for SID {csid}, term {term_id} ({index} of {len(csids)})')
        feed = sis_enrollments_api.get_drops_and_midterms(csid, term_id)
        if feed:
            success_count += 1
            rows.append('\t'.join([str(csid), str(term_id), json.dumps(feed)]))
        elif feed is False:
            app.logger.info(f'SID {csid} returned no enrollments for term {term_id}.')
            no_enrollments_count += 1
        else:
            failure_count += 1
            app.logger.error(f'SIS enrollments API import failed for CSID {csid}.')
        index += 1
    s3_key = f'{get_s3_sis_api_daily_path()}/drops_and_midterms_{term_id}.tsv'
    app.logger.info(f'Will stash {success_count} feeds in S3: {s3_key}')
    if not s3.upload_data('\n'.join(rows), s3_key):
        app.logger.error('Error on S3 upload: aborting job.')
        return False
    app.logger.info('Will copy S3 feeds into Redshift...')
    if not redshift.execute(
        f"DELETE FROM {self.destination_schema}_staging.sis_api_drops_and_midterms WHERE term_id = '{term_id}'",
    ):
        app.logger.error('Error truncating old staging rows: aborting job.')
        return False
    if not redshift.copy_tsv_from_s3(f'{self.destination_schema}_staging.sis_api_drops_and_midterms', s3_key):
        app.logger.error('Error on Redshift copy: aborting job.')
        return False
    staging_to_destination_query = resolve_sql_template_string(
        """
        DELETE FROM {redshift_schema_student}.sis_api_drops_and_midterms
            WHERE term_id = '{term_id}'
            AND sid IN
            (SELECT sid FROM {redshift_schema_student}_staging.sis_api_drops_and_midterms WHERE term_id = '{term_id}');
        INSERT INTO {redshift_schema_student}.sis_api_drops_and_midterms
            (SELECT * FROM {redshift_schema_student}_staging.sis_api_drops_and_midterms WHERE term_id = '{term_id}');
        DELETE FROM {redshift_schema_student}_staging.sis_api_drops_and_midterms WHERE term_id = '{term_id}';
        """,
        term_id=term_id,
    )
    if not redshift.execute(staging_to_destination_query):
        app.logger.error('Error on Redshift copy: aborting job.')
        return False
    return (
        f'SIS enrollments API import completed for term {term_id}: {success_count} succeeded, '
        f'{no_enrollments_count} returned no enrollments, {failure_count} failed.'
    )
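
# Design note on the job above: because drops-and-midterms rows are scoped by
# term, the job deletes only the matching term_id from staging and destination
# rather than truncating, so feeds already staged for other terms survive a
# rerun. The surrounding jobs, which stage one table per run, can simply
# TRUNCATE their staging tables.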
def run(self, csids=None):
    if not csids:
        csids = [row['sid'] for row in get_all_student_ids()]
    app.logger.info(f'Starting term GPA import job for {len(csids)} students...')
    rows = []
    success_count = 0
    no_registrations_count = 0
    failure_count = 0
    index = 1
    for csid in csids:
        app.logger.info(f'Fetching term GPAs for SID {csid}, ({index} of {len(csids)})')
        feed = sis_student_api.get_term_gpas(csid)
        if feed:
            success_count += 1
            for term_id, term_data in feed.items():
                rows.append('\t'.join([
                    str(csid),
                    str(term_id),
                    str(term_data.get('gpa') or '0'),
                    str(term_data.get('unitsTakenForGpa') or '0'),
                ]))
        elif feed == {}:
            app.logger.info(f'No registrations found for SID {csid}.')
            no_registrations_count += 1
        else:
            failure_count += 1
            app.logger.error(f'Term GPA import failed for SID {csid}.')
        index += 1
    if success_count == 0:
        app.logger.error('Failed to import term GPAs: aborting job.')
        return False
    s3_key = f'{get_s3_sis_api_daily_path()}/term_gpas.tsv'
    app.logger.info(f'Will stash {success_count} feeds in S3: {s3_key}')
    if not s3.upload_data('\n'.join(rows), s3_key):
        app.logger.error('Error on S3 upload: aborting job.')
        return False
    app.logger.info('Will copy S3 feeds into Redshift...')
    if not redshift.execute(f'TRUNCATE {self.destination_schema}_staging.student_term_gpas'):
        app.logger.error('Error truncating old staging rows: aborting job.')
        return False
    if not redshift.copy_tsv_from_s3(f'{self.destination_schema}_staging.student_term_gpas', s3_key):
        app.logger.error('Error on Redshift copy: aborting job.')
        return False
    staging_to_destination_query = resolve_sql_template_string(
        """
        DELETE FROM {redshift_schema_student}.student_term_gpas
            WHERE sid IN (SELECT sid FROM {redshift_schema_student}_staging.student_term_gpas);
        INSERT INTO {redshift_schema_student}.student_term_gpas
            (SELECT * FROM {redshift_schema_student}_staging.student_term_gpas);
        TRUNCATE TABLE {redshift_schema_student}_staging.student_term_gpas;
        """,
    )
    if not redshift.execute(staging_to_destination_query):
        app.logger.error('Error inserting staging entries into destination: aborting job.')
        return False
    with rds.transaction() as transaction:
        if self.refresh_rds_indexes(csids, rows, transaction):
            transaction.commit()
            app.logger.info('Refreshed RDS indexes.')
        else:
            transaction.rollback()
            app.logger.error('Failed to refresh RDS indexes.')
            return False
    return (
        f'Term GPA import completed: {success_count} succeeded, '
        f'{no_registrations_count} returned no registrations, {failure_count} failed.'
    )
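
# `refresh_rds_indexes` is defined elsewhere. A minimal sketch, assuming the
# transaction object exposes an execute() that returns falsy on failure, and
# that the self.rds_schema attribute and RDS table below exist; rows arrive as
# the tab-delimited strings built in the job above.
def refresh_rds_indexes(self, csids, rows, transaction):
    if csids:
        sid_list = ','.join([f"'{sid}'" for sid in csids])
        # Replace any existing term GPA rows for the imported students.
        if not transaction.execute(
            f'DELETE FROM {self.rds_schema}.student_term_gpas WHERE sid IN ({sid_list})',
        ):
            return False
        for row in rows:
            sid, term_id, gpa, units = row.split('\t')
            if not transaction.execute(
                f"""INSERT INTO {self.rds_schema}.student_term_gpas (sid, term_id, gpa, units_taken_for_gpa)
                    VALUES ('{sid}', '{term_id}', {gpa}, {units})""",
            ):
                return False
    return True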
def run(self, load_mode='batch'):
    new_sids = [row['sid'] for row in get_non_advisees_without_registration_imports()]
    # The size of the non-advisee population makes it unlikely that a one-shot load of all these slow feeds will
    # finish successfully without interfering with other work. Therefore the default approach is to apply a strict
    # upper limit on the number of feeds loaded in any one job run, no matter how many SIDs remain to be processed.
    if load_mode == 'new':
        sids = new_sids
    elif load_mode == 'batch':
        max_batch = app.config['HIST_ENR_REGISTRATIONS_IMPORT_BATCH_SIZE']
        if max_batch >= len(new_sids):
            sids = new_sids
        else:
            sids = new_sids[:max_batch]
    app.logger.info(f'Starting registrations import job for {len(sids)} non-advisees...')
    rows = {
        'term_gpas': [],
        'last_registrations': [],
    }
    successes, failures = self.load_concurrently(rows, sids)
    if len(successes) > 0:
        for key in rows.keys():
            s3_key = f'{get_s3_sis_api_daily_path()}/{key}.tsv'
            app.logger.info(f'Will stash {len(successes)} feeds in S3: {s3_key}')
            if not s3.upload_tsv_rows(rows[key], s3_key):
                raise BackgroundJobError('Error on S3 upload: aborting job.')
            app.logger.info('Will copy S3 feeds into Redshift...')
            if not redshift.execute(f'TRUNCATE {self.redshift_schema}_staging.hist_enr_{key}'):
                raise BackgroundJobError('Error truncating old staging rows: aborting job.')
            if not redshift.copy_tsv_from_s3(f'{self.redshift_schema}_staging.hist_enr_{key}', s3_key):
                raise BackgroundJobError('Error on Redshift copy: aborting job.')
            staging_to_destination_query = resolve_sql_template_string(
                """
                DELETE FROM {redshift_schema_student}.hist_enr_{table_key}
                    WHERE sid IN (SELECT sid FROM {redshift_schema_student}_staging.hist_enr_{table_key});
                INSERT INTO {redshift_schema_student}.hist_enr_{table_key}
                    (SELECT * FROM {redshift_schema_student}_staging.hist_enr_{table_key});
                TRUNCATE TABLE {redshift_schema_student}_staging.hist_enr_{table_key};
                """,
                table_key=key,
            )
            if not redshift.execute(staging_to_destination_query):
                raise BackgroundJobError('Error inserting staging entries into destination: aborting job.')
    return f'Registrations import completed: {len(successes)} succeeded, {len(failures)} failed.'