def _query_edl(self, rows, sids):
    successes = []
    for edl_row in get_edl_student_registrations(sids):
        sid = edl_row['student_id']
        if sid not in successes:
            # Based on the SQL order_by, the first result per SID will be 'last_registration'.
            successes.append(sid)
            rows['last_registrations'].append(
                encoded_tsv_row([sid, json.dumps(edl_registration_to_json(edl_row))]),
            )
        rows['term_gpas'].append(
            encoded_tsv_row(
                [
                    sid,
                    edl_row['term_id'],
                    edl_row['current_term_gpa'] or '0',
                    edl_row.get('unt_taken_gpa') or '0',  # TODO: Does EDL give us 'unitsTakenForGpa'?
                ],
            ),
        )
        if self.include_demographics:
            rows[self.demographics_key].append(
                encoded_tsv_row([sid, json.dumps(edl_demographics_to_json(edl_row))]),
            )
    failures = list(np.setdiff1d(sids, successes))
    return successes, failures
def collect_merged_profiles(self, sids, feed_file, index_file, names_file):
    successes = []
    sis_profile_feeds = queries.get_non_advisee_api_feeds(sids)
    for row in sis_profile_feeds:
        sid = row['sid']
        uid = row['uid']
        sis_api_feed = row['sis_feed']
        sis_profile = parse_merged_sis_profile({
            'sis_profile_feed': sis_api_feed,
            'last_registration_feed': row['last_registration_feed'],
        })
        merged_profile = {
            'sid': sid,
            'uid': uid,
            'sisProfile': sis_profile,
        }
        self.fill_names_from_sis_profile(sis_api_feed, merged_profile)
        feed_file.write(encoded_tsv_row([sid, uid, json.dumps(merged_profile)]) + b'\n')

        first_name = merged_profile.get('firstName', '')
        last_name = merged_profile.get('lastName', '')
        level = str(sis_profile.get('level', {}).get('code') or '')
        gpa = str(sis_profile.get('cumulativeGPA') or '')
        units = str(sis_profile.get('cumulativeUnits') or '')
        transfer = str(sis_profile.get('transfer') or False)
        expected_grad_term = str(sis_profile.get('expectedGraduationTerm', {}).get('id') or '')
        terms_in_attendance = str(sis_profile.get('termsInAttendance', {}) or '')
        index_file.write(
            encoded_tsv_row([
                sid, uid, first_name, last_name, level, gpa, units, transfer,
                expected_grad_term, terms_in_attendance,
            ]) + b'\n',
        )

        names_file.write(
            encoded_tsv_row([
                sid,
                merged_profile.get('uid'),
                merged_profile.get('firstName'),
                merged_profile.get('lastName'),
            ]) + b'\n',
        )
        successes.append(sid)
    return len(successes)
def _load_from_student_api(self, all_sids):
    # Students API will not return 'unitsTransferEarned' and 'unitsTransferAccepted' data
    # for incoming transfer students unless we request an 'as-of-date' in their enrolled term.
    near_future = (datetime.now() + timedelta(days=60)).strftime('%Y-%m-%d')

    chunked_sids = [all_sids[i:i + 100] for i in range(0, len(all_sids), 100)]
    rows = []
    failure_count = 0
    app_obj = app._get_current_object()
    start_loop = timer()
    with ThreadPoolExecutor(max_workers=self.max_threads) as executor:
        for result in executor.map(async_get_feeds, repeat(app_obj), chunked_sids, repeat(near_future)):
            remaining_sids = set(result['sids'])
            feeds = result['feeds']
            if feeds:
                for feed in feeds:
                    sid = next(_id['id'] for _id in feed['identifiers'] if _id['type'] == 'student-id')
                    remaining_sids.discard(sid)
                    rows.append(encoded_tsv_row([sid, json.dumps(feed)]))
            if remaining_sids:
                failure_count += len(remaining_sids)
                app.logger.error(f'SIS student API import failed for SIDs {remaining_sids}.')
    app.logger.info(f'Wanted {len(all_sids)} students; got {len(rows)} in {timer() - start_loop} secs')
    return rows, failure_count
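# Illustrative, self-contained sketch (not part of this module) of the fan-out idiom used above:
# SIDs are sliced into batches of 100 and fetched concurrently, with itertools.repeat() supplying
# the constant arguments (the Flask app object, the as-of date) alongside the varying chunks.
# 'fetch_batch' below is a stand-in for async_get_feeds; only the call/return shape is taken from
# the code above, the rest is an assumption for demonstration purposes.
from concurrent.futures import ThreadPoolExecutor
from itertools import repeat


def fetch_batch(app_obj, sid_chunk, as_of_date):
    # Stand-in for async_get_feeds: returns the {'sids': ..., 'feeds': ...} shape the caller expects.
    feeds = [{'identifiers': [{'type': 'student-id', 'id': sid}]} for sid in sid_chunk]
    return {'sids': sid_chunk, 'feeds': feeds}


def fan_out(all_sids, app_obj=None, as_of_date=None, max_threads=8):
    chunked_sids = [all_sids[i:i + 100] for i in range(0, len(all_sids), 100)]
    results = []
    with ThreadPoolExecutor(max_workers=max_threads) as executor:
        # executor.map pairs each chunk with the repeated constant arguments, in order.
        for result in executor.map(fetch_batch, repeat(app_obj), chunked_sids, repeat(as_of_date)):
            results.append(result)
    return results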
def load_concurrently(self, all_sids, feed_file):
    chunked_sids = [all_sids[i:i + 100] for i in range(0, len(all_sids), 100)]
    saved_sids = []
    failure_count = 0
    app_obj = app._get_current_object()
    start_loop = timer()
    with ThreadPoolExecutor(max_workers=self.max_threads) as executor:
        for result in executor.map(async_get_feeds, repeat(app_obj), chunked_sids):
            remaining_sids = set(result['sids'])
            feeds = result['feeds']
            if feeds:
                for feed in feeds:
                    sid = next((_id.get('id') for _id in feed['identifiers'] if _id.get('type') == 'student-id'), None)
                    uid = next((_id.get('id') for _id in feed['identifiers'] if _id.get('type') == 'campus-uid'), None)
                    if not sid or not uid:
                        continue
                    feed_file.write(encoded_tsv_row([sid, uid, json.dumps(feed)]) + b'\n')
                    remaining_sids.discard(sid)
                    saved_sids.append(sid)
            if remaining_sids:
                failure_count += len(remaining_sids)
                app.logger.error(f'SIS student API import failed for non-advisees {remaining_sids}.')
    app.logger.info(f'Wanted {len(all_sids)} non-advisees; got {len(saved_sids)} in {timer() - start_loop} secs')
    return saved_sids, failure_count
def _query_student_api(self, rows, sids):
    successes = []
    failures = []
    app_obj = app._get_current_object()
    start_loop = timer()
    with ThreadPoolExecutor(max_workers=app.config['STUDENT_API_MAX_THREADS']) as executor:
        for result in executor.map(self._async_get_feed, repeat(app_obj), sids):
            sid = result['sid']
            full_feed = result['feed']
            if full_feed:
                successes.append(sid)
                rows['last_registrations'].append(
                    encoded_tsv_row([sid, json.dumps(full_feed.get('last_registration', {}))]),
                )
                gpa_feed = full_feed.get('term_gpas', {})
                if gpa_feed:
                    for term_id, term_data in gpa_feed.items():
                        row = [
                            sid,
                            term_id,
                            (term_data.get('gpa') or '0'),
                            (term_data.get('unitsTakenForGpa') or '0'),
                        ]
                        rows['term_gpas'].append(encoded_tsv_row(row))
                else:
                    app.logger.info(f'No past UGRD registrations found for SID {sid}.')
                demographics = full_feed.get('demographics', {})
                if demographics:
                    rows['api_demographics'].append(
                        encoded_tsv_row([sid, json.dumps(demographics)]),
                    )
            else:
                failures.append(sid)
                app.logger.error(f'Registration history import failed for SID {sid}.')
    app.logger.info(f'Wanted {len(sids)} students; got {len(successes)} in {timer() - start_loop} secs')
    return successes, failures
def import_advisor_attributes(self):
    csid_results = redshift.fetch(
        resolve_sql_template_string('SELECT DISTINCT advisor_sid FROM {redshift_schema_advisor_internal}.advisor_students'),
    )
    csids = [r['advisor_sid'] for r in csid_results]
    all_attributes = calnet.client(app).search_csids(csids)
    if len(csids) != len(all_attributes):
        ldap_csids = [person['csid'] for person in all_attributes]
        missing = set(csids) - set(ldap_csids)
        app.logger.warning(f'Looked for {len(csids)} advisor CSIDs but only found {len(all_attributes)}: missing {missing}')

    advisor_rows = []
    total_count = len(all_attributes)
    for index, a in enumerate(all_attributes):
        sid = a['csid']
        app.logger.info(f'CalNet import: Fetch attributes of advisor {sid} ({index + 1} of {total_count})')
        first_name, last_name = calnet.split_sortable_name(a)
        data = [
            a['uid'],
            sid,
            first_name,
            last_name,
            a['title'],
            calnet.get_dept_code(a),
            a['email'],
            a['campus_email'],
        ]
        advisor_rows.append(encoded_tsv_row(data))

    s3_key = f'{get_s3_calnet_daily_path()}/advisors/advisors.tsv'
    app.logger.info(f'Will stash {len(advisor_rows)} feeds in S3: {s3_key}')
    if not s3.upload_tsv_rows(advisor_rows, s3_key):
        raise BackgroundJobError('Error on S3 upload: aborting job.')

    app.logger.info('Will copy S3 feeds into Redshift...')
    query = resolve_sql_template_string(
        """
        TRUNCATE {redshift_schema_advisor_internal}.advisor_attributes;
        COPY {redshift_schema_advisor_internal}.advisor_attributes
            FROM '{loch_s3_calnet_data_path}/advisors/advisors.tsv'
            IAM_ROLE '{redshift_iam_role}'
            DELIMITER '\\t';
        """,
    )
    if not redshift.execute(query):
        app.logger.error('Error on Redshift copy: aborting job.')
        return False
def collect_merged_enrollments(self, sids, term_id, feed_file):
    rows = queries.get_non_advisee_sis_enrollments(sids, term_id)
    enrollments_by_student = map_sis_enrollments(rows)
    merge_dropped_classes(enrollments_by_student, queries.get_non_advisee_enrollment_drops(sids, term_id))
    merge_term_gpas(enrollments_by_student, queries.get_non_advisee_term_gpas(sids, term_id))
    enrollments_by_student = enrollments_by_student.get(term_id, {})
    for (sid, enrollments_feed) in enrollments_by_student.items():
        feed_file.write(encoded_tsv_row([sid, term_id, json.dumps(enrollments_feed)]) + b'\n')
    return len(enrollments_by_student.keys())
def generate_term_feeds(self, sids, feed_file):
    enrollment_stream = queries.stream_sis_enrollments(sids=sids)
    term_gpa_stream = queries.stream_term_gpas(sids=sids)
    term_gpa_tracker = {'term_id': '9999', 'sid': '', 'term_gpas': []}
    row_count = 0
    try:
        term_gpa_results = groupby(term_gpa_stream, lambda r: (str(r['term_id']), r['sid']))
        for term_id, term_enrollments_grp in groupby(enrollment_stream, operator.itemgetter('sis_term_id')):
            term_id = str(term_id)
            term_name = berkeley.term_name_for_sis_id(term_id)
            for sid, enrollments_grp in groupby(term_enrollments_grp, operator.itemgetter('sid')):
                term_feed = None
                for is_dropped, enrollments_subgroup in groupby(enrollments_grp, operator.itemgetter('dropped')):
                    if not is_dropped:
                        term_feed = merge_enrollment(enrollments_subgroup, term_id, term_name)
                    else:
                        if not term_feed:
                            term_feed = empty_term_feed(term_id, term_name)
                        append_drops(term_feed, enrollments_subgroup)
                while term_gpa_tracker['term_id'] > term_id or (
                        term_gpa_tracker['term_id'] == term_id and term_gpa_tracker['sid'] < sid):
                    (term_gpa_tracker['term_id'], term_gpa_tracker['sid']), term_gpa_tracker['term_gpas'] = next(term_gpa_results)
                if term_gpa_tracker['term_id'] == term_id and term_gpa_tracker['sid'] == sid:
                    append_term_gpa(term_feed, term_gpa_tracker['term_gpas'])
                feed_file.write(encoded_tsv_row([sid, term_id, json.dumps(term_feed)]) + b'\n')
                row_count += 1
    finally:
        enrollment_stream.close()
        term_gpa_stream.close()
    return row_count
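# Toy, self-contained sketch of the cursor logic above: both streams are assumed to arrive sorted
# the same way (term_id descending, then sid ascending), so the GPA groupby can be treated as a
# forward-only cursor that is advanced until it catches up with the (term_id, sid) currently being
# written. The data and names below are illustrative only, not taken from the real queries.
from itertools import groupby

enrollments = [
    {'term_id': '2182', 'sid': '100'},
    {'term_id': '2182', 'sid': '200'},
    {'term_id': '2178', 'sid': '100'},
]
term_gpas = [
    {'term_id': '2182', 'sid': '200', 'gpa': 3.5},
    {'term_id': '2178', 'sid': '100', 'gpa': 3.2},
]

gpa_cursor = groupby(term_gpas, lambda r: (r['term_id'], r['sid']))
tracker = {'term_id': '9999', 'sid': '', 'gpas': []}
for (term_id, sid), _grp in groupby(enrollments, lambda r: (r['term_id'], r['sid'])):
    # Advance the GPA cursor while it is still "ahead" of the current enrollment key.
    while tracker['term_id'] > term_id or (tracker['term_id'] == term_id and tracker['sid'] < sid):
        (tracker['term_id'], tracker['sid']), grp = next(gpa_cursor)
        tracker['gpas'] = list(grp)
    if (tracker['term_id'], tracker['sid']) == (term_id, sid):
        print(sid, term_id, tracker['gpas'])  # GPA rows found for this student and term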
def load_concurrently_v1(self, csids):
    rows = []
    failure_count = 0
    app_obj = app._get_current_object()
    start_loop = timer()
    with ThreadPoolExecutor(max_workers=self.max_threads) as executor:
        for result in executor.map(async_get_feed_v1, repeat(app_obj), csids):
            csid = result['sid']
            feed = result['feed']
            if feed:
                rows.append(encoded_tsv_row([csid, json.dumps(feed)]))
            else:
                failure_count += 1
                app.logger.error(f'SIS student API V1 import failed for CSID {csid}.')
    app.logger.info(f'Wanted {len(csids)} students; got {len(rows)} in {timer() - start_loop} secs')
    return rows, failure_count
def refresh_student_enrollment_term(self, term_id, enrollment_term_map):
    with tempfile.TemporaryFile() as enrollment_term_file:
        for (sid, sid_term_feed) in enrollment_term_map.items():
            enrollment_term_file.write(encoded_tsv_row([sid, term_id, json.dumps(sid_term_feed)]) + b'\n')

        drop_staged_enrollment_term(term_id)
        write_file_to_staging('student_enrollment_terms', enrollment_term_file, len(enrollment_term_map), term_id)

    with redshift.transaction() as transaction:
        refresh_from_staging('student_enrollment_terms', term_id, None, transaction, truncate_staging=False)
        if not transaction.commit():
            raise BackgroundJobError(f'Final transaction commit failed on enrollment term refresh (term_id={term_id}).')
def run(self, term_id=None):
    if not term_id:
        term_id = current_term_id()
    canvas_course_ids = [row['canvas_course_id'] for row in get_enrolled_canvas_sites_for_term(term_id)]

    app.logger.info(f'Starting Canvas enrollments API import job for term {term_id}, {len(canvas_course_ids)} course sites...')

    rows = []
    success_count = 0
    failure_count = 0
    index = 1
    for course_id in canvas_course_ids:
        app.logger.info(f'Fetching Canvas enrollments API for course id {course_id}, term {term_id} ({index} of {len(canvas_course_ids)})')
        feed = canvas_api.get_course_enrollments(course_id)
        if feed:
            success_count += 1
            for enrollment in feed:
                user_id = enrollment.get('user_id')
                last_activity_at = enrollment.get('last_activity_at') or ''
                rows.append(encoded_tsv_row([course_id, user_id, term_id, last_activity_at, json.dumps(enrollment)]))
        else:
            failure_count += 1
            app.logger.error(f'Canvas enrollments API import failed for course id {course_id}.')
        index += 1

    s3_key = f'{get_s3_sis_api_daily_path()}/canvas_api_enrollments_{term_id}.tsv'
    app.logger.info(f'Will stash {success_count} feeds in S3: {s3_key}')
    if not s3.upload_tsv_rows(rows, s3_key):
        raise BackgroundJobError('Error on S3 upload: aborting job.')

    app.logger.info('Will copy S3 feeds into Redshift...')
    query = resolve_sql_template_string(
        """
        DELETE FROM {redshift_schema_student}_staging.canvas_api_enrollments WHERE term_id = '{term_id}';
        COPY {redshift_schema_student}_staging.canvas_api_enrollments
            FROM '{loch_s3_sis_api_data_path}/canvas_api_enrollments_{term_id}.tsv'
            IAM_ROLE '{redshift_iam_role}'
            DELIMITER '\\t'
            TIMEFORMAT 'YYYY-MM-DDTHH:MI:SSZ';
        DELETE FROM {redshift_schema_student}.canvas_api_enrollments
            WHERE term_id = '{term_id}'
            AND course_id IN
            (SELECT course_id FROM {redshift_schema_student}_staging.canvas_api_enrollments WHERE term_id = '{term_id}');
        INSERT INTO {redshift_schema_student}.canvas_api_enrollments
            (SELECT * FROM {redshift_schema_student}_staging.canvas_api_enrollments WHERE term_id = '{term_id}');
        DELETE FROM {redshift_schema_student}_staging.canvas_api_enrollments
            WHERE term_id = '{term_id}';
        """,
        term_id=term_id,
    )
    if not redshift.execute(query):
        raise BackgroundJobError('Error on Redshift copy: aborting job.')

    return (
        f'Canvas enrollments API import completed for term {term_id}: {success_count} succeeded, '
        f'{failure_count} failed.'
    )
def run(self):
    app.logger.info('Starting ASC profile generation job...')
    asc_rows = redshift.fetch(
        'SELECT * FROM {schema}.students ORDER by sid, UPPER(team_name)',
        schema=asc_schema_identifier,
    )

    profile_rows = []
    sids_for_inactive_deletion = []

    for sid, rows_for_student in groupby(asc_rows, operator.itemgetter('sid')):
        rows_for_student = list(rows_for_student)
        # Since BOAC believes (falsely) that isActiveAsc and statusAsc are attributes of a student, not
        # a team membership, a bit of brutal simplification is needed. Students who are active in at least
        # one sport have inactive team memberships dropped.
        any_active_athletics = reduce(operator.or_, [r['active'] for r in rows_for_student], False)
        if any_active_athletics:
            rows_for_student = [r for r in rows_for_student if r['active']]
            sids_for_inactive_deletion.append(sid)
        athletics_profile = {
            'athletics': [],
            'inIntensiveCohort': rows_for_student[0]['intensive'],
            'isActiveAsc': rows_for_student[0]['active'],
            'statusAsc': rows_for_student[0]['status_asc'],
        }
        for row in rows_for_student:
            athletics_profile['athletics'].append({
                'groupCode': row['group_code'],
                'groupName': row['group_name'],
                'name': row['group_name'],
                'teamCode': row['team_code'],
                'teamName': row['team_name'],
            })
        profile_rows.append(encoded_tsv_row([sid, json.dumps(athletics_profile)]))

    s3_key = f'{get_s3_asc_daily_path()}/athletics_profiles.tsv'
    app.logger.info(f'Will stash {len(profile_rows)} feeds in S3: {s3_key}')
    if not s3.upload_tsv_rows(profile_rows, s3_key):
        raise BackgroundJobError('Error on S3 upload: aborting job.')

    app.logger.info('Will copy S3 feeds into Redshift...')
    query = resolve_sql_template_string(
        """
        TRUNCATE {redshift_schema_asc}.student_profiles;
        COPY {redshift_schema_asc}.student_profiles
            FROM '{loch_s3_asc_data_path}/athletics_profiles.tsv'
            IAM_ROLE '{redshift_iam_role}'
            DELIMITER '\\t';
        """,
    )
    if not redshift.execute(query):
        app.logger.error('Error on Redshift copy: aborting job.')
        return False

    with rds.transaction() as transaction:
        if self.refresh_rds_indexes(asc_rows, transaction):
            transaction.commit()
            app.logger.info('Refreshed RDS indexes.')
        else:
            transaction.rollback()
            raise BackgroundJobError('Error refreshing RDS indexes.')

    if sids_for_inactive_deletion:
        redshift.execute(
            f'DELETE FROM {asc_schema}.students WHERE active IS false AND sid = ANY(%s)',
            params=(sids_for_inactive_deletion,),
        )
        rds.execute(
            f'DELETE FROM {asc_schema}.students WHERE active IS false AND sid = ANY(%s)',
            params=(sids_for_inactive_deletion,),
        )

    return 'ASC profile generation complete.'
def run(self):
    app.logger.info('Starting COE schema creation job...')
    redshift.drop_external_schema(external_schema)
    resolved_ddl = resolve_sql_template('create_coe_schema.template.sql')
    # TODO This DDL drops and recreates the internal schema before the external schema is verified. We
    # ought to set up proper staging in conjunction with verification. It's also possible that a persistent
    # external schema isn't needed.
    if redshift.execute_ddl_script(resolved_ddl):
        app.logger.info('COE external schema created.')
        verify_external_schema(external_schema, resolved_ddl)
    else:
        raise BackgroundJobError('COE external schema creation failed.')

    coe_rows = redshift.fetch(
        'SELECT * FROM {schema}.students ORDER by sid',
        schema=internal_schema_identifier,
    )

    profile_rows = []
    index = 1
    for sid, rows_for_student in groupby(coe_rows, operator.itemgetter('sid')):
        app.logger.info(f'Generating COE profile for SID {sid} ({index} of {len(coe_rows)})')
        index += 1
        row_for_student = list(rows_for_student)[0]
        coe_profile = {
            'advisorUid': row_for_student.get('advisor_ldap_uid'),
            'gender': row_for_student.get('gender'),
            'ethnicity': row_for_student.get('ethnicity'),
            'minority': row_for_student.get('minority'),
            'didPrep': row_for_student.get('did_prep'),
            'prepEligible': row_for_student.get('prep_eligible'),
            'didTprep': row_for_student.get('did_tprep'),
            'tprepEligible': row_for_student.get('tprep_eligible'),
            'sat1read': row_for_student.get('sat1read'),
            'sat1math': row_for_student.get('sat1math'),
            'sat2math': row_for_student.get('sat2math'),
            'inMet': row_for_student.get('in_met'),
            'gradTerm': row_for_student.get('grad_term'),
            'gradYear': row_for_student.get('grad_year'),
            'probation': row_for_student.get('probation'),
            'status': row_for_student.get('status'),
        }
        profile_rows.append(encoded_tsv_row([sid, json.dumps(coe_profile)]))

    s3_key = f'{get_s3_coe_daily_path()}/coe_profiles.tsv'
    app.logger.info(f'Will stash {len(profile_rows)} feeds in S3: {s3_key}')
    if not s3.upload_tsv_rows(profile_rows, s3_key):
        raise BackgroundJobError('Error on S3 upload: aborting job.')

    app.logger.info('Will copy S3 feeds into Redshift...')
    query = resolve_sql_template_string(
        """
        COPY {redshift_schema_coe}.student_profiles
            FROM '{loch_s3_coe_data_path}/coe_profiles.tsv'
            IAM_ROLE '{redshift_iam_role}'
            DELIMITER '\\t';
        """,
    )
    if not redshift.execute(query):
        raise BackgroundJobError('Error on Redshift copy: aborting job.')

    with rds.transaction() as transaction:
        if self.refresh_rds_indexes(coe_rows, transaction):
            transaction.commit()
            app.logger.info('Refreshed RDS indexes.')
        else:
            transaction.rollback()
            raise BackgroundJobError('Error refreshing RDS indexes.')

    return 'COE internal schema created.'
def run(self, term_id=None):
    if not term_id:
        term_id = current_term_id()
    canvas_course_ids = [row['canvas_course_id'] for row in get_enrolled_canvas_sites_for_term(term_id)]

    app.logger.info(f'Starting Canvas enrollments API import job for term {term_id}, {len(canvas_course_ids)} course sites...')

    rows = []
    success_count = 0
    failure_count = 0
    index = 1
    for course_id in canvas_course_ids:
        app.logger.info(f'Fetching Canvas enrollments API for course id {course_id}, term {term_id} ({index} of {len(canvas_course_ids)})')
        feed = canvas_api.get_course_enrollments(course_id)
        if feed:
            success_count += 1
            for enrollment in feed:
                user_id = enrollment.get('user_id')
                last_activity_at = enrollment.get('last_activity_at') or ''
                rows.append(encoded_tsv_row([course_id, user_id, term_id, last_activity_at, json.dumps(enrollment)]))
        else:
            failure_count += 1
            app.logger.error(f'Canvas enrollments API import failed for course id {course_id}.')
        index += 1

    s3_key = f'{get_s3_sis_api_daily_path()}/canvas_api_enrollments/canvas_api_enrollments_{term_id}.tsv'
    app.logger.info(f'Will stash {success_count} feeds in S3: {s3_key}')
    if not s3.upload_tsv_rows(rows, s3_key):
        raise BackgroundJobError('Error on S3 upload: aborting job.')

    app.logger.info('Will copy S3 feeds into Redshift...')
    query = resolve_sql_template_string(
        """
        CREATE EXTERNAL SCHEMA {redshift_schema_student}_staging_ext_tmp FROM data catalog
            DATABASE '{redshift_schema_student}_staging_ext_tmp'
            IAM_ROLE '{redshift_iam_role}'
            CREATE EXTERNAL DATABASE IF NOT EXISTS;
        CREATE EXTERNAL TABLE {redshift_schema_student}_staging_ext_tmp.canvas_api_enrollments (
            course_id VARCHAR,
            user_id VARCHAR,
            term_id VARCHAR,
            last_activity_at TIMESTAMP,
            feed VARCHAR
        )
        ROW FORMAT DELIMITED
        FIELDS TERMINATED BY '\\t'
        STORED AS TEXTFILE
        LOCATION '{loch_s3_sis_api_data_path}/canvas_api_enrollments';
        DELETE FROM {redshift_schema_student}_staging.canvas_api_enrollments WHERE term_id = '{term_id}';
        INSERT INTO {redshift_schema_student}_staging.canvas_api_enrollments
            (SELECT * FROM {redshift_schema_student}_staging_ext_tmp.canvas_api_enrollments);
        DROP TABLE {redshift_schema_student}_staging_ext_tmp.canvas_api_enrollments;
        DROP SCHEMA {redshift_schema_student}_staging_ext_tmp;
        DELETE FROM {redshift_schema_student}.canvas_api_enrollments
            WHERE term_id = '{term_id}'
            AND course_id IN
            (SELECT course_id FROM {redshift_schema_student}_staging.canvas_api_enrollments WHERE term_id = '{term_id}');
        INSERT INTO {redshift_schema_student}.canvas_api_enrollments
            (SELECT * FROM {redshift_schema_student}_staging.canvas_api_enrollments WHERE term_id = '{term_id}');
        DELETE FROM {redshift_schema_student}_staging.canvas_api_enrollments
            WHERE term_id = '{term_id}';
        """,
        term_id=term_id,
    )
    if not redshift.execute_ddl_script(query):
        raise BackgroundJobError('Error on Redshift copy: aborting job.')

    return (
        f'Canvas enrollments API import completed for term {term_id}: {success_count} succeeded, '
        f'{failure_count} failed.'
    )
def run(self, csids=None):
    if not csids:
        all_sids = get_all_student_ids()
        if all_sids:
            csids = [row['sid'] for row in all_sids]
    app.logger.info(f'Starting SIS degree progress API import job for {len(csids)} students...')

    rows = []
    success_count = 0
    no_information_count = 0
    failure_count = 0
    index = 1

    # TODO The SIS degree progress API will return useful data only for students with a UGRD current registration.
    # We get that registration from the SIS student API, which is imported concurrently with this job. Is there an
    # alternative way to filter out non-UGRD students?
    for csid in csids:
        app.logger.info(f'Fetching degree progress API for SID {csid} ({index} of {len(csids)})')
        feed = sis_degree_progress_api.parsed_degree_progress(csid)
        if feed:
            success_count += 1
            rows.append(encoded_tsv_row([csid, json.dumps(feed)]))
        elif feed == {}:
            app.logger.info(f'No degree progress information found for SID {csid}.')
            no_information_count += 1
        else:
            failure_count += 1
            app.logger.error(f'SIS get_degree_progress failed for SID {csid}.')
        index += 1

    s3_key = f'{get_s3_sis_api_daily_path()}/degree_progress.tsv'
    app.logger.info(f'Will stash {success_count} feeds in S3: {s3_key}')
    if not s3.upload_tsv_rows(rows, s3_key):
        raise BackgroundJobError('Error on S3 upload: aborting job.')

    app.logger.info('Will copy S3 feeds into Redshift...')
    if not redshift.execute(f'TRUNCATE {self.redshift_schema}_staging.sis_api_degree_progress'):
        raise BackgroundJobError('Error truncating old staging rows: aborting job.')
    if not redshift.copy_tsv_from_s3(f'{self.redshift_schema}_staging.sis_api_degree_progress', s3_key):
        raise BackgroundJobError('Error on Redshift copy: aborting job.')
    staging_to_destination_query = resolve_sql_template_string(
        """
        DELETE FROM {redshift_schema_student}.sis_api_degree_progress
            WHERE sid IN
            (SELECT sid FROM {redshift_schema_student}_staging.sis_api_degree_progress);
        INSERT INTO {redshift_schema_student}.sis_api_degree_progress
            (SELECT * FROM {redshift_schema_student}_staging.sis_api_degree_progress);
        TRUNCATE {redshift_schema_student}_staging.sis_api_degree_progress;
        """,
    )
    if not redshift.execute(staging_to_destination_query):
        raise BackgroundJobError('Error on Redshift copy: aborting job.')

    return (
        f'SIS degree progress API import job completed: {success_count} succeeded, '
        f'{no_information_count} returned no information, {failure_count} failed.'
    )
def run(self, csids=None):
    if not csids:
        all_sids = get_all_student_ids()
        if all_sids:
            csids = [row['sid'] for row in all_sids]
    app.logger.info(f'Starting SIS degree progress API import job for {len(csids)} students...')

    rows = []
    success_count = 0
    no_information_count = 0
    failure_count = 0
    index = 1

    # TODO The SIS degree progress API will return useful data only for students with a UGRD current registration.
    # We get that registration from the SIS student API, which is imported concurrently with this job. Is there an
    # alternative way to filter out non-UGRD students?
    for csid in csids:
        app.logger.info(f'Fetching degree progress API for SID {csid} ({index} of {len(csids)})')
        feed = sis_degree_progress_api.parsed_degree_progress(csid)
        if feed:
            success_count += 1
            rows.append(encoded_tsv_row([csid, json.dumps(feed)]))
        elif feed == {}:
            app.logger.info(f'No degree progress information found for SID {csid}.')
            no_information_count += 1
        else:
            failure_count += 1
            app.logger.error(f'SIS get_degree_progress failed for SID {csid}.')
        index += 1

    s3_key = f'{get_s3_sis_api_daily_path()}/degree_progress/degree_progress.tsv'
    app.logger.info(f'Will stash {success_count} feeds in S3: {s3_key}')
    if not s3.upload_tsv_rows(rows, s3_key):
        raise BackgroundJobError('Error on S3 upload: aborting job.')

    app.logger.info('Will copy S3 feeds into Redshift...')
    if not redshift.execute(f'TRUNCATE {self.redshift_schema}_staging.sis_api_degree_progress'):
        raise BackgroundJobError('Error truncating old staging rows: aborting job.')
    query = resolve_sql_template_string(
        """
        CREATE EXTERNAL SCHEMA {redshift_schema_student}_staging_ext_tmp FROM data catalog
            DATABASE '{redshift_schema_student}_staging_ext_tmp'
            IAM_ROLE '{redshift_iam_role}'
            CREATE EXTERNAL DATABASE IF NOT EXISTS;
        CREATE EXTERNAL TABLE {redshift_schema_student}_staging_ext_tmp.sis_api_degree_progress (
            sid VARCHAR,
            feed VARCHAR(MAX)
        )
        ROW FORMAT DELIMITED
        FIELDS TERMINATED BY '\\t'
        STORED AS TEXTFILE
        LOCATION '{loch_s3_sis_api_data_path}/degree_progress';
        DELETE FROM {redshift_schema_student}_staging.sis_api_degree_progress
            WHERE sid IN
            (SELECT sid FROM {redshift_schema_student}_staging_ext_tmp.sis_api_degree_progress);
        INSERT INTO {redshift_schema_student}_staging.sis_api_degree_progress
            (SELECT * FROM {redshift_schema_student}_staging_ext_tmp.sis_api_degree_progress);
        DROP TABLE {redshift_schema_student}_staging_ext_tmp.sis_api_degree_progress;
        DROP SCHEMA {redshift_schema_student}_staging_ext_tmp;
        DELETE FROM {redshift_schema_student}.sis_api_degree_progress
            WHERE sid IN
            (SELECT sid FROM {redshift_schema_student}_staging.sis_api_degree_progress);
        INSERT INTO {redshift_schema_student}.sis_api_degree_progress
            (SELECT * FROM {redshift_schema_student}_staging.sis_api_degree_progress);
        TRUNCATE {redshift_schema_student}_staging.sis_api_degree_progress;
        """,
    )
    if not redshift.execute_ddl_script(query):
        raise BackgroundJobError('Error on Redshift copy: aborting job.')

    redshift.execute('VACUUM; ANALYZE;')

    return (
        f'SIS degree progress API import job completed: {success_count} succeeded, '
        f'{no_information_count} returned no information, {failure_count} failed.'
    )
def run(self):
    app.logger.info('ASC import: Fetch team and student athlete data from ASC API')
    api_results = get_asc_feed()
    if 'error' in api_results:
        raise BackgroundJobError('ASC import: Error from external API: {}'.format(api_results['error']))
    elif not api_results:
        raise BackgroundJobError('ASC import: API returned zero students')

    sync_date = api_results[0]['SyncDate']
    if sync_date != api_results[-1]['SyncDate']:
        raise BackgroundJobError(f'ASC import: SyncDate conflict in ASC API: {api_results[0]} vs. {api_results[-1]}')

    rows = []
    for r in api_results:
        if r['AcadYr'] == app.config['ASC_THIS_ACAD_YR'] and r['SportCode']:
            asc_code = r['SportCodeCore']
            if asc_code in SPORT_TRANSLATIONS:
                group_code = r['SportCode']
                data = [
                    r['SID'],
                    str(r.get('ActiveYN', 'No') == 'Yes'),
                    str(r.get('IntensiveYN', 'No') == 'Yes'),
                    r.get('SportStatus', ''),
                    group_code,
                    _unambiguous_group_name(r['Sport'], group_code),
                    SPORT_TRANSLATIONS[asc_code],
                    r['SportCore'],
                ]
                rows.append(encoded_tsv_row(data))
            else:
                sid = r['SID']
                app.logger.error(f'ASC import: Unmapped asc_code {asc_code} has ActiveYN for sid={sid}')

    s3_key = f'{get_s3_asc_daily_path()}/asc_api_raw_response_{sync_date}.tsv'
    if not s3.upload_tsv_rows(rows, s3_key):
        raise BackgroundJobError('Error on S3 upload: aborting job.')

    app.logger.info('Copy data in S3 file to Redshift...')
    query = resolve_sql_template_string(
        """
        TRUNCATE {redshift_schema_asc}.students;
        COPY {redshift_schema_asc}.students
            FROM 's3://{s3_bucket}/{s3_key}'
            IAM_ROLE '{redshift_iam_role}'
            DELIMITER '\\t';
        """,
        s3_bucket=app.config['LOCH_S3_BUCKET'],
        s3_key=s3_key,
    )
    if not redshift.execute(query):
        raise BackgroundJobError('Error on Redshift copy: aborting job.')

    status = {
        'this_sync_date': sync_date,
        'api_results_count': len(api_results),
    }
    app.logger.info(f'ASC import: Successfully completed import job: {str(status)}')
    return status