def run(self, csids=None):
    if app.config['STUDENT_V1_API_PREFERRED']:
        return self.run_v1(csids)
    if not csids:
        csids = [row['sid'] for row in get_all_student_ids()]
    app.logger.info(f'Starting SIS student API import job for {len(csids)} students...')
    rows, failure_count = self.load_concurrently(csids)
    if (len(rows) == 0) and (failure_count > 0):
        raise BackgroundJobError('Failed to import SIS student API feeds: aborting job.')
    s3_key = f'{get_s3_sis_api_daily_path()}/profiles.tsv'
    app.logger.info(f'Will stash {len(rows)} feeds in S3: {s3_key}')
    if not s3.upload_tsv_rows(rows, s3_key):
        raise BackgroundJobError('Error on S3 upload: aborting job.')
    app.logger.info('Will copy S3 feeds into Redshift...')
    if not redshift.execute(f'TRUNCATE {self.redshift_schema}_staging.sis_api_profiles'):
        raise BackgroundJobError('Error truncating old staging rows: aborting job.')
    if not redshift.copy_tsv_from_s3(f'{self.redshift_schema}_staging.sis_api_profiles', s3_key):
        raise BackgroundJobError('Error on Redshift copy: aborting job.')
    staging_to_destination_query = resolve_sql_template_string(
        """
        DELETE FROM {redshift_schema_student}.sis_api_profiles
            WHERE sid IN
            (SELECT sid FROM {redshift_schema_student}_staging.sis_api_profiles);
        INSERT INTO {redshift_schema_student}.sis_api_profiles
            (SELECT * FROM {redshift_schema_student}_staging.sis_api_profiles);
        TRUNCATE {redshift_schema_student}_staging.sis_api_profiles;
        """,
    )
    if not redshift.execute(staging_to_destination_query):
        raise BackgroundJobError('Error on Redshift copy: aborting job.')
    return f'SIS student API import job completed: {len(rows)} succeeded, {failure_count} failed.'

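# `load_concurrently` is called above but not shown in this section. A minimal sketch of
# one plausible implementation, assuming a thread pool over the same
# `sis_student_api.get_student` call used by the sequential version of this job below;
# the pool size and TSV row encoding here are illustrative assumptions, not the
# confirmed implementation.
def load_concurrently(self, csids):
    from concurrent.futures import ThreadPoolExecutor
    rows = []
    failure_count = 0

    def fetch(csid):
        return csid, sis_student_api.get_student(csid)

    # Fan the API calls out across a small worker pool; collect results as they arrive.
    with ThreadPoolExecutor(max_workers=8) as pool:
        for csid, feed in pool.map(fetch, csids):
            if feed:
                rows.append('\t'.join([str(csid), json.dumps(feed)]))
            else:
                failure_count += 1
                app.logger.error(f'SIS student API import failed for CSID {csid}.')
    return rows, failure_count
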
def run(self, csids=None):
    if not csids:
        csids = [row['sid'] for row in get_all_student_ids()]
    app.logger.info(f'Starting CalNet import job for {len(csids)} students...')
    _put_calnet_data_to_s3(csids)
    app.logger.info('CalNet import: done')
    return True

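# `_put_calnet_data_to_s3` is a module-level helper not shown in this section. A rough
# sketch of the shape it likely takes, assuming a batched directory lookup and the same
# S3 upload helpers used by the other import jobs; `calnet_lookup` and
# `get_s3_calnet_data_path` are hypothetical names standing in for the real helpers.
def _put_calnet_data_to_s3(csids):
    rows = []
    # Query the directory in batches to stay under lookup size limits.
    for i in range(0, len(csids), 1000):
        for attrs in calnet_lookup(csids[i:i + 1000]):  # hypothetical batched lookup
            rows.append('\t'.join([str(attrs.get('csid')), json.dumps(attrs)]))
    s3_key = f'{get_s3_calnet_data_path()}/persons.tsv'  # hypothetical path helper
    if not s3.upload_data('\n'.join(rows), s3_key):
        raise BackgroundJobError('Error on S3 upload: aborting job.')
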
def run(self, advisee_csids=None, instructor_uids=None):
    if not advisee_csids:
        advisee_csids = [row['sid'] for row in get_all_student_ids()]
    if not instructor_uids:
        instructor_uids = [row['instructor_uid'] for row in get_all_instructor_uids()]
    _put_advisee_data_to_s3(advisee_csids)
    _put_instructor_data_to_s3(instructor_uids)
    return True

def run(self, csids=None):
    if not csids:
        csids = [row['sid'] for row in get_all_student_ids()]
    app.logger.info(f'Starting SIS student API import job for {len(csids)} students...')
    rows = []
    success_count = 0
    failure_count = 0
    index = 1
    for csid in csids:
        app.logger.info(f'Fetching SIS student API for SID {csid} ({index} of {len(csids)})')
        feed = sis_student_api.get_student(csid)
        if feed:
            success_count += 1
            rows.append('\t'.join([str(csid), json.dumps(feed)]))
        else:
            failure_count += 1
            app.logger.error(f'SIS student API import failed for CSID {csid}.')
        index += 1
    s3_key = f'{get_s3_sis_api_daily_path()}/profiles.tsv'
    app.logger.info(f'Will stash {success_count} feeds in S3: {s3_key}')
    if not s3.upload_data('\n'.join(rows), s3_key):
        app.logger.error('Error on S3 upload: aborting job.')
        return False
    app.logger.info('Will copy S3 feeds into Redshift...')
    if not redshift.execute(f'TRUNCATE {self.destination_schema}_staging.sis_api_profiles'):
        app.logger.error('Error truncating old staging rows: aborting job.')
        return False
    if not redshift.copy_tsv_from_s3(f'{self.destination_schema}_staging.sis_api_profiles', s3_key):
        app.logger.error('Error on Redshift copy: aborting job.')
        return False
    staging_to_destination_query = resolve_sql_template_string(
        """
        DELETE FROM {redshift_schema_student}.sis_api_profiles
            WHERE sid IN
            (SELECT sid FROM {redshift_schema_student}_staging.sis_api_profiles);
        INSERT INTO {redshift_schema_student}.sis_api_profiles
            (SELECT * FROM {redshift_schema_student}_staging.sis_api_profiles);
        TRUNCATE {redshift_schema_student}_staging.sis_api_profiles;
        """,
    )
    if not redshift.execute(staging_to_destination_query):
        app.logger.error('Error on Redshift copy: aborting job.')
        return False
    return f'SIS student API import job completed: {success_count} succeeded, {failure_count} failed.'

def run(self, term_id=None, backfill_new_students=False):
    app.logger.info(f'Starting merged profile generation job (term_id={term_id}, backfill={backfill_new_students}).')
    app.logger.info('Cleaning up old data...')
    redshift.execute('VACUUM; ANALYZE;')
    if backfill_new_students:
        status = ''
        previous_backfills = {row['sid'] for row in get_successfully_backfilled_students()}
        sids = {row['sid'] for row in get_all_student_ids()}
        old_sids = sids.intersection(previous_backfills)
        new_sids = sids.difference(previous_backfills)
        # Any students without a previous backfill will have feeds generated for all terms.
        # Students with a previous backfill get an update for the requested term only.
        if len(new_sids):
            app.logger.info(f'Found {len(new_sids)} new students, will backfill all terms.')
            ImportTermGpas().run(csids=new_sids)
            backfill_status = self.generate_feeds(sids=list(new_sids))
            if not backfill_status:
                app.logger.warning('Backfill job aborted, will continue with non-backfill job.')
                backfill_status = 'aborted'
            else:
                app.logger.info('Backfill complete.')
            status += f'Backfill: {backfill_status}; non-backfill: '
        app.logger.info(f'Will continue merged feed job for {len(old_sids)} previously backfilled students.')
        continuation_status = self.generate_feeds(sids=list(old_sids), term_id=term_id)
        if not continuation_status:
            return False
        status += continuation_status
    else:
        status = self.generate_feeds(term_id)
    # Clean up the workbench.
    redshift.execute('VACUUM; ANALYZE;')
    app.logger.info('Vacuumed and analyzed.')
    return status

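# Worked example of the backfill split above: SIDs never seen in a previous backfill
# get feeds generated for all terms, while previously backfilled SIDs get a
# single-term refresh only.
sids = {'11667051', '22225678', '33339999'}
previous_backfills = {'11667051'}
assert sids.difference(previous_backfills) == {'22225678', '33339999'}  # new: all terms
assert sids.intersection(previous_backfills) == {'11667051'}  # old: requested term only
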
def run(self, load_mode='new'):
    all_sids = [row['sid'] for row in get_all_student_ids()]
    previous_backfills = {row['sid'] for row in get_sids_with_registration_imports()}
    if load_mode == 'new':
        sids = list(set(all_sids).difference(previous_backfills))
    elif load_mode == 'batch':
        new_sids = list(set(all_sids).difference(previous_backfills))
        limit = app.config['CYCLICAL_API_IMPORT_BATCH_SIZE'] - len(new_sids)
        if limit > 0:
            oldest_backfills = [row['sid'] for row in get_active_sids_with_oldest_registration_imports(limit=limit)]
            sids = new_sids + oldest_backfills
        else:
            sids = new_sids
    elif load_mode == 'all':
        sids = all_sids
    else:
        raise BackgroundJobError(f'Unrecognized load_mode {load_mode}: aborting job.')
    app.logger.info(f'Starting registrations/demographics import job for {len(sids)} students...')
    rows = {
        'term_gpas': [],
        'last_registrations': [],
        'api_demographics': [],
    }
    successes, failures = self.get_registration_data_per_sids(rows, sids)
    if load_mode != 'new' and (len(successes) == 0) and (len(failures) > 0):
        raise BackgroundJobError('Failed to import registration histories: aborting job.')
    for key in rows.keys():
        s3_key = f'{get_s3_sis_api_daily_path(use_edl_if_feature_flag=True)}/{key}.tsv'
        app.logger.info(f'Will stash {len(successes)} feeds in S3: {s3_key}')
        if not s3.upload_tsv_rows(rows[key], s3_key):
            raise BackgroundJobError('Error on S3 upload: aborting job.')
        app.logger.info('Will copy S3 feeds into Redshift...')
        if not redshift.execute(f'TRUNCATE {student_schema()}_staging.student_{key}'):
            raise BackgroundJobError('Error truncating old staging rows: aborting job.')
        if not redshift.copy_tsv_from_s3(f'{student_schema()}_staging.student_{key}', s3_key):
            raise BackgroundJobError('Error on Redshift copy: aborting job.')
        staging_to_destination_query = resolve_sql_template_string(
            """
            DELETE FROM {student_schema}.student_{table_key}
                WHERE sid IN
                (SELECT sid FROM {student_schema}_staging.student_{table_key});
            INSERT INTO {student_schema}.student_{table_key}
                (SELECT * FROM {student_schema}_staging.student_{table_key});
            TRUNCATE TABLE {student_schema}_staging.student_{table_key};
            """,
            table_key=key,
            student_schema=student_schema(),
        )
        if not redshift.execute(staging_to_destination_query):
            raise BackgroundJobError('Error inserting staging entries into destination: aborting job.')
    with rds.transaction() as transaction:
        if self.refresh_rds_indexes(sids, rows['term_gpas'], transaction):
            transaction.commit()
            app.logger.info('Refreshed RDS indexes.')
        else:
            transaction.rollback()
            raise BackgroundJobError('Failed to refresh RDS indexes.')
    update_registration_import_status(successes, failures)
    return f'Registrations import completed: {len(successes)} succeeded, {len(failures)} failed.'

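# `get_registration_data_per_sids` fills the three row lists passed in and returns
# (successes, failures) SID lists. A minimal sketch, assuming one registrations call
# per SID; `fetch_registration_demographics` is a hypothetical stand-in for the real
# API call, and the feed keys shown are assumptions.
def get_registration_data_per_sids(self, rows, sids):
    successes = []
    failures = []
    for sid in sids:
        feed = fetch_registration_demographics(sid)  # hypothetical API call
        if not feed:
            failures.append(sid)
            app.logger.error(f'Registration import failed for SID {sid}.')
            continue
        successes.append(sid)
        # One TSV row per term for GPAs; one JSON feed row each for the latest
        # registration and demographics.
        for term_id, term_data in feed.get('term_gpas', {}).items():
            rows['term_gpas'].append('\t'.join([
                str(sid),
                str(term_id),
                str(term_data.get('gpa') or '0'),
                str(term_data.get('unitsTakenForGpa') or '0'),
            ]))
        rows['last_registrations'].append('\t'.join([str(sid), json.dumps(feed.get('last_registration', {}))]))
        rows['api_demographics'].append('\t'.join([str(sid), json.dumps(feed.get('demographics', {}))]))
    return successes, failures
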
def run(self, csids=None):
    if not csids:
        csids = [row['sid'] for row in get_all_student_ids()]
    app.logger.info(f'Starting SIS degree progress API import job for {len(csids)} students...')
    rows = []
    success_count = 0
    no_information_count = 0
    failure_count = 0
    index = 1
    # TODO The SIS degree progress API will return useful data only for students with a UGRD current
    # registration. We get that registration from the SIS student API, which is imported concurrently
    # with this job. Is there an alternative way to filter out non-UGRD students?
    for csid in csids:
        app.logger.info(f'Fetching degree progress API for SID {csid} ({index} of {len(csids)})')
        feed = sis_degree_progress_api.parsed_degree_progress(csid)
        if feed:
            success_count += 1
            rows.append('\t'.join([str(csid), json.dumps(feed)]))
        elif feed == {}:
            app.logger.info(f'No degree progress information found for SID {csid}.')
            no_information_count += 1
        else:
            failure_count += 1
            app.logger.error(f'SIS get_degree_progress failed for SID {csid}.')
        index += 1
    s3_key = f'{get_s3_sis_api_daily_path()}/degree_progress.tsv'
    app.logger.info(f'Will stash {success_count} feeds in S3: {s3_key}')
    if not s3.upload_data('\n'.join(rows), s3_key):
        app.logger.error('Error on S3 upload: aborting job.')
        return False
    app.logger.info('Will copy S3 feeds into Redshift...')
    if not redshift.execute(f'TRUNCATE {self.destination_schema}_staging.sis_api_degree_progress'):
        app.logger.error('Error truncating old staging rows: aborting job.')
        return False
    if not redshift.copy_tsv_from_s3(f'{self.destination_schema}_staging.sis_api_degree_progress', s3_key):
        app.logger.error('Error on Redshift copy: aborting job.')
        return False
    staging_to_destination_query = resolve_sql_template_string(
        """
        DELETE FROM {redshift_schema_student}.sis_api_degree_progress
            WHERE sid IN
            (SELECT sid FROM {redshift_schema_student}_staging.sis_api_degree_progress);
        INSERT INTO {redshift_schema_student}.sis_api_degree_progress
            (SELECT * FROM {redshift_schema_student}_staging.sis_api_degree_progress);
        TRUNCATE {redshift_schema_student}_staging.sis_api_degree_progress;
        """,
    )
    if not redshift.execute(staging_to_destination_query):
        app.logger.error('Error on Redshift copy: aborting job.')
        return False
    return (
        f'SIS degree progress API import job completed: {success_count} succeeded, '
        f'{no_information_count} returned no information, {failure_count} failed.'
    )

def run(self, csids=None, term_id=None):
    if not csids:
        csids = [row['sid'] for row in get_all_student_ids()]
    if not term_id:
        term_id = current_term_id()
    app.logger.info(f'Starting SIS enrollments API import job for term {term_id}, {len(csids)} students...')
    rows = []
    success_count = 0
    no_enrollments_count = 0
    failure_count = 0
    index = 1
    for csid in csids:
        app.logger.info(f'Fetching SIS enrollments API for SID {csid}, term {term_id} ({index} of {len(csids)})')
        feed = sis_enrollments_api.get_drops_and_midterms(csid, term_id)
        if feed:
            success_count += 1
            rows.append('\t'.join([str(csid), str(term_id), json.dumps(feed)]))
        elif feed is False:
            app.logger.info(f'SID {csid} returned no enrollments for term {term_id}.')
            no_enrollments_count += 1
        else:
            failure_count += 1
            app.logger.error(f'SIS enrollments API import failed for CSID {csid}.')
        index += 1
    s3_key = f'{get_s3_sis_api_daily_path()}/drops_and_midterms_{term_id}.tsv'
    app.logger.info(f'Will stash {success_count} feeds in S3: {s3_key}')
    if not s3.upload_data('\n'.join(rows), s3_key):
        app.logger.error('Error on S3 upload: aborting job.')
        return False
    app.logger.info('Will copy S3 feeds into Redshift...')
    if not redshift.execute(
        f"DELETE FROM {self.destination_schema}_staging.sis_api_drops_and_midterms WHERE term_id = '{term_id}'",
    ):
        app.logger.error('Error truncating old staging rows: aborting job.')
        return False
    if not redshift.copy_tsv_from_s3(f'{self.destination_schema}_staging.sis_api_drops_and_midterms', s3_key):
        app.logger.error('Error on Redshift copy: aborting job.')
        return False
    staging_to_destination_query = resolve_sql_template_string(
        """
        DELETE FROM {redshift_schema_student}.sis_api_drops_and_midterms
            WHERE term_id = '{term_id}'
            AND sid IN
            (SELECT sid FROM {redshift_schema_student}_staging.sis_api_drops_and_midterms WHERE term_id = '{term_id}');
        INSERT INTO {redshift_schema_student}.sis_api_drops_and_midterms
            (SELECT * FROM {redshift_schema_student}_staging.sis_api_drops_and_midterms WHERE term_id = '{term_id}');
        DELETE FROM {redshift_schema_student}_staging.sis_api_drops_and_midterms
            WHERE term_id = '{term_id}';
        """,
        term_id=term_id,
    )
    if not redshift.execute(staging_to_destination_query):
        app.logger.error('Error on Redshift copy: aborting job.')
        return False
    return (
        f'SIS enrollments API import completed for term {term_id}: {success_count} succeeded, '
        f'{no_enrollments_count} returned no enrollments, {failure_count} failed.'
    )

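# `resolve_sql_template_string` substitutes `{placeholders}` such as
# `{redshift_schema_student}` from app config, with keyword arguments like `term_id`
# above overriding or extending that mapping. A minimal sketch of the substitution
# semantics only; the real helper resolves more config-driven names, and the config
# key shown is an assumption.
def resolve_sql_template_string(template_string, **kwargs):
    template_data = {
        'redshift_schema_student': app.config['REDSHIFT_SCHEMA_STUDENT'],  # assumed key
    }
    # Keyword arguments (e.g. term_id, table_key) take precedence over config values.
    template_data.update(kwargs)
    return template_string.format(**template_data)
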
def _get_advisee_sids_without_photos():
    previous_imports = {row['sid'] for row in get_advisee_sids_with_photos()}
    advisee_sids = {row['sid'] for row in get_all_student_ids()}
    return list(advisee_sids.difference(previous_imports))

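# Usage sketch for the helper above: a photo import run over only the advisees whose
# photos have not yet been stored. `import_photo` is a hypothetical per-student helper,
# not the confirmed implementation.
def run(self):
    sids = _get_advisee_sids_without_photos()
    app.logger.info(f'Starting photo import job for {len(sids)} students...')
    for sid in sids:
        import_photo(sid)  # hypothetical: fetch from the source system, store to S3
    return f'Photo import completed for {len(sids)} students.'
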
def run(self, csids=None):
    if not csids:
        csids = [row['sid'] for row in get_all_student_ids()]
    app.logger.info(f'Starting term GPA import job for {len(csids)} students...')
    rows = []
    success_count = 0
    no_registrations_count = 0
    failure_count = 0
    index = 1
    for csid in csids:
        app.logger.info(f'Fetching term GPAs for SID {csid} ({index} of {len(csids)})')
        feed = sis_student_api.get_term_gpas(csid)
        if feed:
            success_count += 1
            for term_id, term_data in feed.items():
                rows.append('\t'.join([
                    str(csid),
                    str(term_id),
                    str(term_data.get('gpa') or '0'),
                    str(term_data.get('unitsTakenForGpa') or '0'),
                ]))
        elif feed == {}:
            app.logger.info(f'No registrations found for SID {csid}.')
            no_registrations_count += 1
        else:
            failure_count += 1
            app.logger.error(f'Term GPA import failed for SID {csid}.')
        index += 1
    if success_count == 0:
        app.logger.error('Failed to import term GPAs: aborting job.')
        return False
    s3_key = f'{get_s3_sis_api_daily_path()}/term_gpas.tsv'
    app.logger.info(f'Will stash {success_count} feeds in S3: {s3_key}')
    if not s3.upload_data('\n'.join(rows), s3_key):
        app.logger.error('Error on S3 upload: aborting job.')
        return False
    app.logger.info('Will copy S3 feeds into Redshift...')
    if not redshift.execute(f'TRUNCATE {self.destination_schema}_staging.student_term_gpas'):
        app.logger.error('Error truncating old staging rows: aborting job.')
        return False
    if not redshift.copy_tsv_from_s3(f'{self.destination_schema}_staging.student_term_gpas', s3_key):
        app.logger.error('Error on Redshift copy: aborting job.')
        return False
    staging_to_destination_query = resolve_sql_template_string(
        """
        DELETE FROM {redshift_schema_student}.student_term_gpas
            WHERE sid IN
            (SELECT sid FROM {redshift_schema_student}_staging.student_term_gpas);
        INSERT INTO {redshift_schema_student}.student_term_gpas
            (SELECT * FROM {redshift_schema_student}_staging.student_term_gpas);
        TRUNCATE TABLE {redshift_schema_student}_staging.student_term_gpas;
        """,
    )
    if not redshift.execute(staging_to_destination_query):
        app.logger.error('Error inserting staging entries into destination: aborting job.')
        return False
    with rds.transaction() as transaction:
        if self.refresh_rds_indexes(csids, rows, transaction):
            transaction.commit()
            app.logger.info('Refreshed RDS indexes.')
        else:
            transaction.rollback()
            app.logger.error('Failed to refresh RDS indexes.')
            return False
    return (
        f'Term GPA import completed: {success_count} succeeded, '
        f'{no_registrations_count} returned no registrations, {failure_count} failed.'
    )

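# `refresh_rds_indexes` is shared with the registrations/demographics job above. A
# rough sketch, assuming it replaces the per-student term GPA rows in an RDS index
# table inside the caller's transaction; the table name, column layout, and
# `transaction.execute` parameter style are assumptions.
def refresh_rds_indexes(self, sids, rows, transaction):
    # Drop existing rows for the imported SIDs, then reload from the fresh TSV rows.
    if sids and not transaction.execute(
        f'DELETE FROM {self.rds_schema}.student_term_gpas WHERE sid = ANY(%s)',
        params=(list(sids),),
    ):
        return False
    for row in rows:
        sid, term_id, gpa, units = row.split('\t')
        if not transaction.execute(
            f'INSERT INTO {self.rds_schema}.student_term_gpas VALUES (%s, %s, %s, %s)',
            params=(sid, term_id, gpa, units),
        ):
            return False
    return True
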
def run(self, csids=None):
    if not csids:
        all_sids = get_all_student_ids()
        if all_sids:
            csids = [row['sid'] for row in all_sids]
    app.logger.info(f'Starting SIS degree progress API import job for {len(csids)} students...')
    rows = []
    success_count = 0
    no_information_count = 0
    failure_count = 0
    index = 1
    # TODO The SIS degree progress API will return useful data only for students with a UGRD current
    # registration. We get that registration from the SIS student API, which is imported concurrently
    # with this job. Is there an alternative way to filter out non-UGRD students?
    for csid in csids:
        app.logger.info(f'Fetching degree progress API for SID {csid} ({index} of {len(csids)})')
        feed = sis_degree_progress_api.parsed_degree_progress(csid)
        if feed:
            success_count += 1
            rows.append(encoded_tsv_row([csid, json.dumps(feed)]))
        elif feed == {}:
            app.logger.info(f'No degree progress information found for SID {csid}.')
            no_information_count += 1
        else:
            failure_count += 1
            app.logger.error(f'SIS get_degree_progress failed for SID {csid}.')
        index += 1
    s3_key = f'{get_s3_sis_api_daily_path()}/degree_progress/degree_progress.tsv'
    app.logger.info(f'Will stash {success_count} feeds in S3: {s3_key}')
    if not s3.upload_tsv_rows(rows, s3_key):
        raise BackgroundJobError('Error on S3 upload: aborting job.')
    app.logger.info('Will copy S3 feeds into Redshift...')
    if not redshift.execute(f'TRUNCATE {self.redshift_schema}_staging.sis_api_degree_progress'):
        raise BackgroundJobError('Error truncating old staging rows: aborting job.')
    query = resolve_sql_template_string(
        """
        CREATE EXTERNAL SCHEMA {redshift_schema_student}_staging_ext_tmp FROM data catalog
            DATABASE '{redshift_schema_student}_staging_ext_tmp'
            IAM_ROLE '{redshift_iam_role}'
            CREATE EXTERNAL DATABASE IF NOT EXISTS;
        CREATE EXTERNAL TABLE {redshift_schema_student}_staging_ext_tmp.sis_api_degree_progress (
            sid VARCHAR,
            feed VARCHAR(MAX)
        )
        ROW FORMAT DELIMITED
        FIELDS TERMINATED BY '\\t'
        STORED AS TEXTFILE
        LOCATION '{loch_s3_sis_api_data_path}/degree_progress';
        DELETE FROM {redshift_schema_student}_staging.sis_api_degree_progress
            WHERE sid IN
            (SELECT sid FROM {redshift_schema_student}_staging_ext_tmp.sis_api_degree_progress);
        INSERT INTO {redshift_schema_student}_staging.sis_api_degree_progress
            (SELECT * FROM {redshift_schema_student}_staging_ext_tmp.sis_api_degree_progress);
        DROP TABLE {redshift_schema_student}_staging_ext_tmp.sis_api_degree_progress;
        DROP SCHEMA {redshift_schema_student}_staging_ext_tmp;
        DELETE FROM {redshift_schema_student}.sis_api_degree_progress
            WHERE sid IN
            (SELECT sid FROM {redshift_schema_student}_staging.sis_api_degree_progress);
        INSERT INTO {redshift_schema_student}.sis_api_degree_progress
            (SELECT * FROM {redshift_schema_student}_staging.sis_api_degree_progress);
        TRUNCATE {redshift_schema_student}_staging.sis_api_degree_progress;
        """,
    )
    if not redshift.execute_ddl_script(query):
        raise BackgroundJobError('Error on Redshift copy: aborting job.')
    redshift.execute('VACUUM; ANALYZE;')
    return (
        f'SIS degree progress API import job completed: {success_count} succeeded, '
        f'{no_information_count} returned no information, {failure_count} failed.'
    )

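# `encoded_tsv_row` replaces the bare '\t'.join seen in the older jobs above. A minimal
# sketch of the idea, assuming the goal is to escape embedded backslashes, tabs, and
# newlines so each feed always occupies exactly one TSV row; the exact escaping in the
# real helper may differ.
def encoded_tsv_row(values):
    def encode(value):
        # Escape the backslash first so later escapes are not double-processed.
        return str(value).replace('\\', '\\\\').replace('\t', '\\t').replace('\n', '\\n')
    return '\t'.join([encode(v) for v in values])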