def _put_instructor_data_to_s3(uids):
    app.logger.info(f'Starting CalNet import job for {len(uids)} instructors...')
    all_attributes = calnet.client(app).search_uids(uids)
    if len(uids) != len(all_attributes):
        ldap_uids = [person['uid'] for person in all_attributes]
        missing = set(uids) - set(ldap_uids)
        app.logger.warning(f'Looked for {len(uids)} instructor UIDs but only found {len(all_attributes)}: missing {missing}')
    serialized_data = ''
    for a in all_attributes:
        uid = a['uid']
        affiliations = a['affiliations']
        first_name, last_name = calnet.split_sortable_name(a)
        serialized_data += json.dumps({
            'affiliations': ','.join(affiliations) if isinstance(affiliations, list) else affiliations,
            'campus_email': a['campus_email'],
            'dept_code': calnet.get_dept_code(a),
            'email': a['email'],
            'first_name': first_name,
            'last_name': last_name,
            'ldap_uid': uid,
            'csid': a['csid'],
            'title': a['title'],
        }) + '\n'
    s3.upload_data(serialized_data, f'{get_s3_calnet_daily_path()}/instructors/instructors.json')
    app.logger.info(f'Uploaded data for {len(all_attributes)} instructors')

def _put_booking_data_to_s3(date):
    datestamp = date.strftime('%Y-%m-%d')
    app.logger.info(f'Starting YCBM bookings import for {datestamp}...')
    bookings = ycbm_api.get_bookings_for_date(date)
    if not bookings:
        app.logger.info(f'No bookings found for {datestamp}')
        return
    imported_at = utc_now().strftime('%Y-%m-%dT%H:%M:%SZ')
    serialized_data = ''
    for b in bookings:
        b['importedAt'] = imported_at
        # Make JsonSerDe schema creation easier in Redshift: transform arrays to dicts, and output
        # one JSON record per line in the S3 text file.
        answers_dict = {}
        for a in b.get('answers', []):
            if 'code' in a and 'string' in a:
                answers_dict[a['code'].lower()] = a['string']
        b['answers'] = answers_dict
        serialized_data += json.dumps(b) + '\n'
    # Upload one copy to the daily path, which we keep for a few days in S3 in case something goes
    # wrong and we need to recover an earlier run.
    s3.upload_data(serialized_data, f'{get_s3_ycbm_daily_path()}/bookings/{datestamp}/bookings.json')
    # Upload one copy to the archive path, which we expect to keep as our permanent record.
    s3.upload_data(
        serialized_data,
        f"{app.config['LOCH_S3_YCBM_DATA_PATH']}/archive/{hashed_datestamp(date)}/bookings/bookings.json",
    )
    app.logger.info(f'Uploaded data for {len(bookings)} bookings on {datestamp}')

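# Illustration of the 'answers' reshaping above (sample codes and values are assumptions, not real
# YCBM data): a booking whose answers arrive as a list of {'code', 'string'} pairs is flattened into
# a plain dict, so each serialized line maps onto one JsonSerDe column per answer code in Redshift.
#
#   before: {'answers': [{'code': 'EMAIL', 'string': 'oski@berkeley.edu'}, {'code': 'UID', 'string': '123456'}]}
#   after:  {'answers': {'email': 'oski@berkeley.edu', 'uid': '123456'}}
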
def _put_advisee_data_to_s3(sids):
    app.logger.info(f'Starting CalNet import job for {len(sids)} advisees...')
    all_attributes = calnet.client(app).search_csids(sids)
    if len(sids) != len(all_attributes):
        ldap_sids = [person['csid'] for person in all_attributes]
        missing = set(sids) - set(ldap_sids)
        app.logger.warning(f'Looked for {len(sids)} advisee SIDs but only found {len(all_attributes)}: missing {missing}')
    serialized_data = ''
    for a in all_attributes:
        sid = a['csid']
        affiliations = a['affiliations']
        first_name, last_name = calnet.split_sortable_name(a)
        # JsonSerDe in Redshift schema creation requires one and only one JSON record per line in the S3 text file.
        serialized_data += json.dumps({
            'affiliations': ','.join(affiliations) if isinstance(affiliations, list) else affiliations,
            'campus_email': a['campus_email'],
            'email': a['email'],
            'first_name': first_name,
            'last_name': last_name,
            'ldap_uid': a['uid'],
            'sid': sid,
        }) + '\n'
    s3.upload_data(serialized_data, f'{get_s3_calnet_daily_path()}/advisees/advisees.json')
    app.logger.info(f'Uploaded data for {len(all_attributes)} advisees')

def update_manifests(self):
    app.logger.info('Updating manifests...')
    # Because the SIS S3 copy is managed by a different application running on a different schedule,
    # it may have been made before midnight by Nessie-time.
    s3_sis_daily = get_s3_sis_daily_path()
    if not s3.get_keys_with_prefix(s3_sis_daily):
        s3_sis_daily = get_s3_sis_daily_path(datetime.now() - timedelta(days=1))
        if not s3.get_keys_with_prefix(s3_sis_daily):
            raise BackgroundJobError('No timely SIS S3 data found')
        else:
            app.logger.info('Falling back to SIS S3 daily data for yesterday')
    courses_daily = s3.get_keys_with_prefix(s3_sis_daily + '/courses', full_objects=True)
    courses_historical = s3.get_keys_with_prefix(app.config['LOCH_S3_SIS_DATA_PATH'] + '/historical/courses', full_objects=True)
    enrollments_daily = s3.get_keys_with_prefix(s3_sis_daily + '/enrollments', full_objects=True)
    enrollments_historical = s3.get_keys_with_prefix(app.config['LOCH_S3_SIS_DATA_PATH'] + '/historical/enrollments', full_objects=True)

    def deduplicate(prefix, s3list):
        filename_map = {}
        for s3obj in s3list:
            m = re.match(r'.+/(.+\.gz)', s3obj['Key'])
            if m:
                filename_map[m[1]] = s3obj
        for term_id in reverse_term_ids(include_future_terms=True):
            filename = f'{prefix}-{term_id}.gz'
            if filename not in filename_map:
                raise BackgroundJobError(f'Expected filename {filename} not found in S3, aborting')
        return list(filename_map.values())

    all_courses = deduplicate('courses', courses_daily + courses_historical)
    all_enrollments = deduplicate('enrollments', enrollments_daily + enrollments_historical)

    def to_manifest_entry(_object):
        return {
            'url': f"s3://{app.config['LOCH_S3_BUCKET']}/{_object['Key']}",
            'meta': {'content_length': _object['Size']},
        }

    def to_manifest(objects):
        return {
            'entries': [to_manifest_entry(o) for o in objects],
        }

    courses_manifest = json.dumps(to_manifest(all_courses))
    enrollments_manifest = json.dumps(to_manifest(all_enrollments))
    courses_result = s3.upload_data(courses_manifest, app.config['LOCH_S3_SIS_DATA_PATH'] + '/manifests/courses.json')
    enrollments_result = s3.upload_data(enrollments_manifest, app.config['LOCH_S3_SIS_DATA_PATH'] + '/manifests/enrollments.json')
    return courses_result and enrollments_result

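# For reference, a minimal sketch of the manifest document that update_manifests() writes to S3
# (bucket name, key, and size below are illustrative assumptions, not values from a real run):
#
#   {
#       'entries': [
#           {
#               'url': 's3://example-loch-bucket/sis-data/daily/2021-08-01/courses/courses-2218.gz',
#               'meta': {'content_length': 1024},
#           },
#       ],
#   }
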
def update_merged_feed_status(term_id, successes, failures):
    term_id = term_id or 'all'
    redshift.execute(
        'DELETE FROM {schema}.merged_feed_status WHERE sid = ANY(%s) AND term_id = %s',
        schema=_schema(),
        params=((successes + failures), term_id),
    )
    now = datetime.utcnow().isoformat()
    success_records = ['\t'.join([sid, term_id, 'success', now]) for sid in successes]
    failure_records = ['\t'.join([sid, term_id, 'failure', now]) for sid in failures]
    rows = success_records + failure_records
    s3_key = f'{get_s3_sis_api_daily_path()}/merged_feed_status.tsv'
    if not s3.upload_data('\n'.join(rows), s3_key):
        app.logger.error('Error uploading merged feed status updates to S3.')
        return
    query = resolve_sql_template_string(
        """
        COPY {redshift_schema_metadata}.merged_feed_status
            FROM '{loch_s3_sis_api_data_path}/merged_feed_status.tsv'
            IAM_ROLE '{redshift_iam_role}'
            DELIMITER '\\t'
            TIMEFORMAT 'YYYY-MM-DDTHH:MI:SS';
        """,
    )
    if not redshift.execute(query):
        app.logger.error('Error copying merged feed status updates to Redshift.')

def run(self, csids=None):
    if not csids:
        csids = [row['sid'] for row in get_all_student_ids()]
    app.logger.info(f'Starting SIS student API import job for {len(csids)} students...')
    rows = []
    success_count = 0
    failure_count = 0
    index = 1
    for csid in csids:
        app.logger.info(f'Fetching SIS student API for SID {csid} ({index} of {len(csids)})')
        feed = sis_student_api.get_student(csid)
        if feed:
            success_count += 1
            rows.append('\t'.join([str(csid), json.dumps(feed)]))
        else:
            failure_count += 1
            app.logger.error(f'SIS student API import failed for CSID {csid}.')
        index += 1
    s3_key = f'{get_s3_sis_api_daily_path()}/profiles.tsv'
    app.logger.info(f'Will stash {success_count} feeds in S3: {s3_key}')
    if not s3.upload_data('\n'.join(rows), s3_key):
        app.logger.error('Error on S3 upload: aborting job.')
        return False
    app.logger.info('Will copy S3 feeds into Redshift...')
    if not redshift.execute(f'TRUNCATE {self.destination_schema}_staging.sis_api_profiles'):
        app.logger.error('Error truncating old staging rows: aborting job.')
        return False
    if not redshift.copy_tsv_from_s3(f'{self.destination_schema}_staging.sis_api_profiles', s3_key):
        app.logger.error('Error on Redshift copy: aborting job.')
        return False
    staging_to_destination_query = resolve_sql_template_string(
        """
        DELETE FROM {redshift_schema_student}.sis_api_profiles
            WHERE sid IN (SELECT sid FROM {redshift_schema_student}_staging.sis_api_profiles);
        INSERT INTO {redshift_schema_student}.sis_api_profiles
            (SELECT * FROM {redshift_schema_student}_staging.sis_api_profiles);
        TRUNCATE {redshift_schema_student}_staging.sis_api_profiles;
        """,
    )
    if not redshift.execute(staging_to_destination_query):
        app.logger.error('Error on Redshift copy: aborting job.')
        return False
    return f'SIS student API import job completed: {success_count} succeeded, {failure_count} failed.'

def run(self, csids=None):
    if not csids:
        csids = _get_advisee_sids_without_photos()
    id_mappings = get_advisee_ids(csids=csids)
    id_mappings_by_sid = {r['sid']: r['ldap_uid'] for r in id_mappings}
    app.logger.info(f'Starting student photo import job for {len(csids)} students...')
    successes = []
    failures = []
    photo_not_found = []
    index = 0
    for csid in csids:
        index += 1
        app.logger.info(f'Fetching photo for SID {csid} ({index} of {len(csids)})')
        uid = id_mappings_by_sid.get(csid)
        if not uid:
            app.logger.error(f'No UID found for SID {csid}.')
            failures.append(csid)
            continue
        photo = get_cal1card_photo(uid)
        if photo:
            s3_photo_key = f"{app.config['LOCH_S3_CAL1CARD_PHOTOS_PATH']}/{uid}.jpg"
            if s3.upload_data(photo, s3_photo_key, bucket=app.config['LOCH_S3_PUBLIC_BUCKET']):
                successes.append(csid)
            else:
                app.logger.error(f'Photo upload failed for SID {csid}.')
                failures.append(csid)
        elif photo is False:
            app.logger.info(f'No photo returned for SID {csid}.')
            photo_not_found.append(csid)
        elif photo is None:
            app.logger.error(f'Photo import failed for SID {csid}.')
            failures.append(csid)
        sleep(app.config['CAL1CARD_PHOTO_API_THROTTLE'])
    if len(successes) == 0 and len(photo_not_found) == 0 and len(failures) > 0:
        raise BackgroundJobError('Failed to import student photos.')
    else:
        update_photo_import_status(successes, failures, photo_not_found)
    status = 'Student photo import completed: '
    if len(successes):
        status += f'{len(successes)} succeeded, '
    if len(photo_not_found):
        status += f'{len(photo_not_found)} had no photo available, '
    status += f'{len(failures)} failed.'
    return status

def _put_calnet_data_to_s3(sids):
    all_attributes = calnet.client(app).search_csids(sids)
    if len(sids) != len(all_attributes):
        ldap_sids = [person['csid'] for person in all_attributes]
        missing = set(sids) - set(ldap_sids)
        app.logger.warning(f'Looked for {len(sids)} SIDs but only found {len(all_attributes)}: missing {missing}')
    serialized_data = ''
    total_count = len(all_attributes)
    for index, a in enumerate(all_attributes):
        sid = a['csid']
        app.logger.info(f'CalNet import: Fetch attributes of student {sid} ({index + 1} of {total_count})')
        affiliations = a['affiliations']
        first_name, last_name = _split_sortable_name(a)
        # JsonSerDe in Redshift schema creation requires one and only one JSON record per line in the S3 text file.
        serialized_data += json.dumps({
            'affiliations': ','.join(affiliations) if isinstance(affiliations, list) else affiliations,
            'campus_email': a['campus_email'],
            'email': a['email'],
            'first_name': first_name,
            'last_name': last_name,
            'ldap_uid': a['uid'],
            'sid': sid,
        }) + '\n'
    s3.upload_data(serialized_data, f'{get_s3_calnet_daily_path()}/persons.json')

def upload_to_staging(self, table):
    rows = self.rows[table]
    s3_key = f'{get_s3_sis_api_daily_path()}/staging_{table}.tsv'
    app.logger.info(f'Will stash {len(rows)} feeds in S3: {s3_key}')
    if not s3.upload_data('\n'.join(rows), s3_key):
        app.logger.error('Error on S3 upload: aborting job.')
        return False
    app.logger.info('Will copy S3 feeds into Redshift...')
    query = resolve_sql_template_string(
        """
        COPY {staging_schema}.{table}
            FROM '{loch_s3_sis_api_data_path}/staging_{table}.tsv'
            IAM_ROLE '{redshift_iam_role}'
            DELIMITER '\\t';
        """,
        staging_schema=self.staging_schema,
        table=table,
    )
    if not redshift.execute(query):
        app.logger.error('Error on Redshift copy: aborting job.')
        return False

def upload_file_to_staging(table, term_file, row_count, term_id):
    tsv_filename = f'staging_{table}_{term_id}.tsv' if term_id else f'staging_{table}.tsv'
    s3_key = f'{get_s3_sis_api_daily_path()}/{tsv_filename}'
    app.logger.info(f'Will stash {row_count} feeds in S3: {s3_key}')
    # Be kind; rewind.
    term_file.seek(0)
    if not s3.upload_data(term_file, s3_key):
        raise BackgroundJobError(f'Failed to upload {row_count} records to s3:{s3_key}. Aborting job.')
    app.logger.info('Will copy S3 feeds into Redshift...')
    query = resolve_sql_template_string(
        """
        COPY {staging_schema}.{table}
            FROM '{loch_s3_sis_api_data_path}/{tsv_filename}'
            IAM_ROLE '{redshift_iam_role}'
            DELIMITER '\\t';
        """,
        staging_schema=staging_schema(),
        table=table,
        tsv_filename=tsv_filename,
    )
    if not redshift.execute(query):
        raise BackgroundJobError('Error on Redshift copy: aborting job.')

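# A hypothetical caller sketch for upload_file_to_staging, assuming rows for a single term have been
# spooled to a temporary file as tab-separated lines (the table name, term id, and temp-file handling
# here are illustrative assumptions, not part of the original jobs):
#
#   with tempfile.TemporaryFile() as term_file:
#       row_count = 0
#       for row in rows_for_term:
#           term_file.write('\t'.join(row).encode() + b'\n')
#           row_count += 1
#       upload_file_to_staging('sis_api_drops_and_midterms', term_file, row_count, term_id='2218')
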
def run(self, term_ids=None):
    if not term_ids:
        term_ids = reverse_term_ids()
    app.logger.info(f'Starting SIS terms API import job for {len(term_ids)} terms...')
    rows = []
    success_count = 0
    failure_count = 0
    index = 1
    for term_id in term_ids:
        app.logger.info(f'Fetching SIS terms API for term id {term_id} ({index} of {len(term_ids)})')
        feed = sis_terms_api.get_term(term_id)
        if feed:
            success_count += 1
            for academic_career_term in feed:
                for session in academic_career_term.get('sessions', []):
                    rows.append(
                        '\t'.join([
                            academic_career_term.get('id', ''),
                            academic_career_term.get('name', ''),
                            academic_career_term.get('academicCareer', {}).get('code', ''),
                            academic_career_term.get('beginDate', ''),
                            academic_career_term.get('endDate', ''),
                            session.get('id', ''),
                            session.get('name', ''),
                            session.get('beginDate', ''),
                            session.get('endDate', ''),
                        ]),
                    )
        else:
            failure_count += 1
            app.logger.error(f'SIS terms API import failed for term id {term_id}.')
        index += 1
    s3_key = f'{get_s3_sis_api_daily_path()}/terms.tsv'
    app.logger.info(f'Will stash {len(rows)} rows from {success_count} feeds in S3: {s3_key}')
    if not s3.upload_data('\n'.join(rows), s3_key):
        app.logger.error('Error on S3 upload: aborting job.')
        return False
    app.logger.info('Will copy S3 feeds into Redshift...')
    with redshift.transaction() as transaction:
        if self.update_redshift(term_ids, transaction):
            transaction.commit()
            app.logger.info('Updated Redshift.')
        else:
            transaction.rollback()
            app.logger.error('Failed to update Redshift.')
            return False
    with rds.transaction() as transaction:
        if self.update_rds(rows, term_ids, transaction):
            transaction.commit()
            app.logger.info('Updated RDS.')
        else:
            transaction.rollback()
            app.logger.error('Failed to update RDS.')
            return False
    return f'SIS terms API import job completed: {success_count} succeeded, {failure_count} failed.'

def run(self):
    app.logger.info('Starting ASC profile generation job...')
    asc_rows = redshift.fetch(
        'SELECT * FROM {schema}.students ORDER BY sid, UPPER(team_name)',
        schema=asc_schema_identifier,
    )
    profile_rows = []
    sids_for_inactive_deletion = []
    for sid, rows_for_student in groupby(asc_rows, operator.itemgetter('sid')):
        rows_for_student = list(rows_for_student)
        # Since BOAC believes (falsely) that isActiveAsc and statusAsc are attributes of a student, not
        # a team membership, a bit of brutal simplification is needed. Students who are active in at least
        # one sport have inactive team memberships dropped.
        any_active_athletics = reduce(operator.or_, [r['active'] for r in rows_for_student], False)
        if any_active_athletics:
            rows_for_student = [r for r in rows_for_student if r['active']]
            sids_for_inactive_deletion.append(sid)
        athletics_profile = {
            'athletics': [],
            'inIntensiveCohort': rows_for_student[0]['intensive'],
            'isActiveAsc': rows_for_student[0]['active'],
            'statusAsc': rows_for_student[0]['status_asc'],
        }
        for row in rows_for_student:
            athletics_profile['athletics'].append({
                'groupCode': row['group_code'],
                'groupName': row['group_name'],
                'name': row['group_name'],
                'teamCode': row['team_code'],
                'teamName': row['team_name'],
            })
        profile_rows.append('\t'.join([str(sid), json.dumps(athletics_profile)]))
    s3_key = f'{get_s3_asc_daily_path()}/athletics_profiles.tsv'
    app.logger.info(f'Will stash {len(profile_rows)} feeds in S3: {s3_key}')
    if not s3.upload_data('\n'.join(profile_rows), s3_key):
        app.logger.error('Error on S3 upload: aborting job.')
        return False
    app.logger.info('Will copy S3 feeds into Redshift...')
    query = resolve_sql_template_string(
        """
        TRUNCATE {redshift_schema_asc}.student_profiles;
        COPY {redshift_schema_asc}.student_profiles
            FROM '{loch_s3_asc_data_path}/athletics_profiles.tsv'
            IAM_ROLE '{redshift_iam_role}'
            DELIMITER '\\t';
        VACUUM;
        ANALYZE;
        """,
    )
    if not redshift.execute(query):
        app.logger.error('Error on Redshift copy: aborting job.')
        return False
    with rds.transaction() as transaction:
        if self.refresh_rds_indexes(asc_rows, transaction):
            transaction.commit()
            app.logger.info('Refreshed RDS indexes.')
        else:
            transaction.rollback()
            app.logger.error('Error refreshing RDS indexes.')
            return False
    if sids_for_inactive_deletion:
        redshift.execute(
            f'DELETE FROM {asc_schema}.students WHERE active IS false AND sid = ANY(%s)',
            params=(sids_for_inactive_deletion,),
        )
        rds.execute(
            f'DELETE FROM {asc_schema}.students WHERE active IS false AND sid = ANY(%s)',
            params=(sids_for_inactive_deletion,),
        )
    return 'ASC profile generation complete.'

def run(self, csids=None):
    if not csids:
        csids = [row['sid'] for row in get_all_student_ids()]
    app.logger.info(f'Starting SIS degree progress API import job for {len(csids)} students...')
    rows = []
    success_count = 0
    no_information_count = 0
    failure_count = 0
    index = 1
    # TODO The SIS degree progress API will return useful data only for students with a UGRD current registration.
    # We get that registration from the SIS student API, which is imported concurrently with this job. Is there an
    # alternative way to filter out non-UGRD students?
    for csid in csids:
        app.logger.info(f'Fetching degree progress API for SID {csid} ({index} of {len(csids)})')
        feed = sis_degree_progress_api.parsed_degree_progress(csid)
        if feed:
            success_count += 1
            rows.append('\t'.join([str(csid), json.dumps(feed)]))
        elif feed == {}:
            app.logger.info(f'No degree progress information found for SID {csid}.')
            no_information_count += 1
        else:
            failure_count += 1
            app.logger.error(f'SIS get_degree_progress failed for SID {csid}.')
        index += 1
    s3_key = f'{get_s3_sis_api_daily_path()}/degree_progress.tsv'
    app.logger.info(f'Will stash {success_count} feeds in S3: {s3_key}')
    if not s3.upload_data('\n'.join(rows), s3_key):
        app.logger.error('Error on S3 upload: aborting job.')
        return False
    app.logger.info('Will copy S3 feeds into Redshift...')
    if not redshift.execute(f'TRUNCATE {self.destination_schema}_staging.sis_api_degree_progress'):
        app.logger.error('Error truncating old staging rows: aborting job.')
        return False
    if not redshift.copy_tsv_from_s3(f'{self.destination_schema}_staging.sis_api_degree_progress', s3_key):
        app.logger.error('Error on Redshift copy: aborting job.')
        return False
    staging_to_destination_query = resolve_sql_template_string(
        """
        DELETE FROM {redshift_schema_student}.sis_api_degree_progress
            WHERE sid IN (SELECT sid FROM {redshift_schema_student}_staging.sis_api_degree_progress);
        INSERT INTO {redshift_schema_student}.sis_api_degree_progress
            (SELECT * FROM {redshift_schema_student}_staging.sis_api_degree_progress);
        TRUNCATE {redshift_schema_student}_staging.sis_api_degree_progress;
        """,
    )
    if not redshift.execute(staging_to_destination_query):
        app.logger.error('Error on Redshift copy: aborting job.')
        return False
    return (
        f'SIS degree progress API import job completed: {success_count} succeeded, '
        f'{no_information_count} returned no information, {failure_count} failed.'
    )

def run(self):
    app.logger.info('Starting COE schema creation job...')
    redshift.drop_external_schema(external_schema)
    resolved_ddl = resolve_sql_template('create_coe_schema.template.sql')
    # TODO This DDL drops and recreates the internal schema before the external schema is verified. We
    # ought to set up proper staging in conjunction with verification. It's also possible that a persistent
    # external schema isn't needed.
    if redshift.execute_ddl_script(resolved_ddl):
        app.logger.info('COE external schema created.')
        if not verify_external_schema(external_schema, resolved_ddl):
            return False
    else:
        app.logger.error('COE external schema creation failed.')
        return False
    coe_rows = redshift.fetch(
        'SELECT * FROM {schema}.students ORDER BY sid',
        schema=internal_schema_identifier,
    )
    profile_rows = []
    index = 1
    for sid, rows_for_student in groupby(coe_rows, operator.itemgetter('sid')):
        app.logger.info(f'Generating COE profile for SID {sid} ({index} of {len(coe_rows)})')
        index += 1
        row_for_student = list(rows_for_student)[0]
        coe_profile = {
            'advisorUid': row_for_student.get('advisor_ldap_uid'),
        'gender': row_for_student.get('gender'),
            'ethnicity': row_for_student.get('ethnicity'),
            'minority': row_for_student.get('minority'),
            'didPrep': row_for_student.get('did_prep'),
            'prepEligible': row_for_student.get('prep_eligible'),
            'didTprep': row_for_student.get('did_tprep'),
            'tprepEligible': row_for_student.get('tprep_eligible'),
        }
        profile_rows.append('\t'.join([str(sid), json.dumps(coe_profile)]))
    s3_key = f'{get_s3_coe_daily_path()}/coe_profiles.tsv'
    app.logger.info(f'Will stash {len(profile_rows)} feeds in S3: {s3_key}')
    if not s3.upload_data('\n'.join(profile_rows), s3_key):
        app.logger.error('Error on S3 upload: aborting job.')
        return False
    app.logger.info('Will copy S3 feeds into Redshift...')
    query = resolve_sql_template_string(
        """
        COPY {redshift_schema_coe}.student_profiles
            FROM '{loch_s3_coe_data_path}/coe_profiles.tsv'
            IAM_ROLE '{redshift_iam_role}'
            DELIMITER '\\t';
        VACUUM;
        ANALYZE;
        """,
    )
    if not redshift.execute(query):
        app.logger.error('Error on Redshift copy: aborting job.')
        return False
    with rds.transaction() as transaction:
        if self.refresh_rds_indexes(coe_rows, transaction):
            transaction.commit()
            app.logger.info('Refreshed RDS indexes.')
        else:
            transaction.rollback()
            app.logger.error('Error refreshing RDS indexes.')
            return False
    return 'COE internal schema created.'

def run(self, csids=None, term_id=None):
    if not csids:
        csids = [row['sid'] for row in get_all_student_ids()]
    if not term_id:
        term_id = current_term_id()
    app.logger.info(f'Starting SIS enrollments API import job for term {term_id}, {len(csids)} students...')
    rows = []
    success_count = 0
    no_enrollments_count = 0
    failure_count = 0
    index = 1
    for csid in csids:
        app.logger.info(f'Fetching SIS enrollments API for SID {csid}, term {term_id} ({index} of {len(csids)})')
        feed = sis_enrollments_api.get_drops_and_midterms(csid, term_id)
        if feed:
            success_count += 1
            rows.append('\t'.join([str(csid), str(term_id), json.dumps(feed)]))
        elif feed is False:
            app.logger.info(f'SID {csid} returned no enrollments for term {term_id}.')
            no_enrollments_count += 1
        else:
            failure_count += 1
            app.logger.error(f'SIS enrollments API import failed for CSID {csid}.')
        index += 1
    s3_key = f'{get_s3_sis_api_daily_path()}/drops_and_midterms_{term_id}.tsv'
    app.logger.info(f'Will stash {success_count} feeds in S3: {s3_key}')
    if not s3.upload_data('\n'.join(rows), s3_key):
        app.logger.error('Error on S3 upload: aborting job.')
        return False
    app.logger.info('Will copy S3 feeds into Redshift...')
    if not redshift.execute(
        f"DELETE FROM {self.destination_schema}_staging.sis_api_drops_and_midterms WHERE term_id = '{term_id}'",
    ):
        app.logger.error('Error truncating old staging rows: aborting job.')
        return False
    if not redshift.copy_tsv_from_s3(f'{self.destination_schema}_staging.sis_api_drops_and_midterms', s3_key):
        app.logger.error('Error on Redshift copy: aborting job.')
        return False
    staging_to_destination_query = resolve_sql_template_string(
        """
        DELETE FROM {redshift_schema_student}.sis_api_drops_and_midterms
            WHERE term_id = '{term_id}'
            AND sid IN
            (SELECT sid FROM {redshift_schema_student}_staging.sis_api_drops_and_midterms WHERE term_id = '{term_id}');
        INSERT INTO {redshift_schema_student}.sis_api_drops_and_midterms
            (SELECT * FROM {redshift_schema_student}_staging.sis_api_drops_and_midterms WHERE term_id = '{term_id}');
        DELETE FROM {redshift_schema_student}_staging.sis_api_drops_and_midterms
            WHERE term_id = '{term_id}';
        """,
        term_id=term_id,
    )
    if not redshift.execute(staging_to_destination_query):
        app.logger.error('Error on Redshift copy: aborting job.')
        return False
    return (
        f'SIS enrollments API import completed for term {term_id}: {success_count} succeeded, '
        f'{no_enrollments_count} returned no enrollments, {failure_count} failed.'
    )

def run(self, csids=None):
    if not csids:
        csids = [row['sid'] for row in get_all_student_ids()]
    app.logger.info(f'Starting term GPA import job for {len(csids)} students...')
    rows = []
    success_count = 0
    no_registrations_count = 0
    failure_count = 0
    index = 1
    for csid in csids:
        app.logger.info(f'Fetching term GPAs for SID {csid} ({index} of {len(csids)})')
        feed = sis_student_api.get_term_gpas(csid)
        if feed:
            success_count += 1
            for term_id, term_data in feed.items():
                rows.append('\t'.join([
                    str(csid),
                    str(term_id),
                    str(term_data.get('gpa') or '0'),
                    str(term_data.get('unitsTakenForGpa') or '0'),
                ]))
        elif feed == {}:
            app.logger.info(f'No registrations found for SID {csid}.')
            no_registrations_count += 1
        else:
            failure_count += 1
            app.logger.error(f'Term GPA import failed for SID {csid}.')
        index += 1
    if success_count == 0:
        app.logger.error('Failed to import term GPAs: aborting job.')
        return False
    s3_key = f'{get_s3_sis_api_daily_path()}/term_gpas.tsv'
    app.logger.info(f'Will stash {success_count} feeds in S3: {s3_key}')
    if not s3.upload_data('\n'.join(rows), s3_key):
        app.logger.error('Error on S3 upload: aborting job.')
        return False
    app.logger.info('Will copy S3 feeds into Redshift...')
    if not redshift.execute(f'TRUNCATE {self.destination_schema}_staging.student_term_gpas'):
        app.logger.error('Error truncating old staging rows: aborting job.')
        return False
    if not redshift.copy_tsv_from_s3(f'{self.destination_schema}_staging.student_term_gpas', s3_key):
        app.logger.error('Error on Redshift copy: aborting job.')
        return False
    staging_to_destination_query = resolve_sql_template_string(
        """
        DELETE FROM {redshift_schema_student}.student_term_gpas
            WHERE sid IN (SELECT sid FROM {redshift_schema_student}_staging.student_term_gpas);
        INSERT INTO {redshift_schema_student}.student_term_gpas
            (SELECT * FROM {redshift_schema_student}_staging.student_term_gpas);
        TRUNCATE TABLE {redshift_schema_student}_staging.student_term_gpas;
        """,
    )
    if not redshift.execute(staging_to_destination_query):
        app.logger.error('Error inserting staging entries into destination: aborting job.')
        return False
    with rds.transaction() as transaction:
        if self.refresh_rds_indexes(csids, rows, transaction):
            transaction.commit()
            app.logger.info('Refreshed RDS indexes.')
        else:
            transaction.rollback()
            app.logger.error('Failed to refresh RDS indexes.')
            return False
    return (
        f'Term GPA import completed: {success_count} succeeded, '
        f'{no_registrations_count} returned no registrations, {failure_count} failed.'
    )

def _upload_data_to_s3(self, daily_path, historical_path):
    s3.upload_data('some futuristic course data', f'{daily_path}/courses/courses-2182.gz')
    s3.upload_data('some futuristic enrollment data', f'{daily_path}/enrollments/enrollments-2182.gz')
    s3.upload_data('some new course data', f'{daily_path}/courses/courses-2178.gz')
    s3.upload_data('some new enrollment data', f'{daily_path}/enrollments/enrollments-2178.gz')
    s3.upload_data('some old course data', f'{historical_path}/courses/courses-2175.gz')
    s3.upload_data('some old enrollment data', f'{historical_path}/enrollments/enrollments-2175.gz')
    s3.upload_data('some older course data', f'{historical_path}/courses/courses-2172.gz')
    s3.upload_data('some older enrollment data', f'{historical_path}/enrollments/enrollments-2172.gz')
    s3.upload_data('some perfectly antique course data', f'{historical_path}/courses/courses-2168.gz')
    s3.upload_data('some perfectly antique enrollment data', f'{historical_path}/enrollments/enrollments-2168.gz')
    s3.upload_data('some ancient course data', f'{historical_path}/courses/courses-2165.gz')
    s3.upload_data('some ancient enrollment data', f'{historical_path}/enrollments/enrollments-2165.gz')

def run(self):
    app.logger.info('ASC import: Fetch team and student athlete data from ASC API')
    api_results = get_asc_feed()
    if 'error' in api_results:
        app.logger.error('ASC import: Error from external API: {}'.format(api_results['error']))
        status = False
    elif not api_results:
        app.logger.error('ASC import: API returned zero students')
        status = False
    else:
        sync_date = api_results[0]['SyncDate']
        if sync_date != api_results[-1]['SyncDate']:
            app.logger.error(f'ASC import: SyncDate conflict in ASC API: {api_results[0]} vs. {api_results[-1]}')
            status = False
        else:
            rows = []
            for r in api_results:
                if r['AcadYr'] == app.config['ASC_THIS_ACAD_YR'] and r['SportCode']:
                    asc_code = r['SportCodeCore']
                    if asc_code in SPORT_TRANSLATIONS:
                        group_code = r['SportCode']
                        data = [
                            r['SID'],
                            str(r.get('ActiveYN', 'No') == 'Yes'),
                            str(r.get('IntensiveYN', 'No') == 'Yes'),
                            r.get('SportStatus', ''),
                            group_code,
                            _unambiguous_group_name(r['Sport'], group_code),
                            SPORT_TRANSLATIONS[asc_code],
                            r['SportCore'],
                        ]
                        rows.append('\t'.join(data))
                    else:
                        sid = r['SID']
                        app.logger.error(f'ASC import: Unmapped asc_code {asc_code} has ActiveYN for sid={sid}')
            s3_key = f'{get_s3_asc_daily_path()}/asc_api_raw_response_{sync_date}.tsv'
            if not s3.upload_data('\n'.join(rows), s3_key):
                app.logger.error('Error on S3 upload: aborting job.')
                return False
            status = {
                'this_sync_date': sync_date,
                'api_results_count': len(api_results),
            }
            app.logger.info('Copy data in S3 file to Redshift...')
            query = resolve_sql_template_string(
                """
                TRUNCATE {redshift_schema_asc}.students;
                COPY {redshift_schema_asc}.students
                    FROM 's3://{s3_bucket}/{s3_key}'
                    IAM_ROLE '{redshift_iam_role}'
                    DELIMITER '\\t';
                VACUUM;
                ANALYZE;
                """,
                s3_bucket=app.config['LOCH_S3_BUCKET'],
                s3_key=s3_key,
            )
            if not redshift.execute(query):
                app.logger.error('Error on Redshift copy: aborting job.')
                return False
            app.logger.info(f'ASC import: Successfully completed import job: {str(status)}')
    return status

def run(self, term_id=None):
    if not term_id:
        term_id = current_term_id()
    canvas_course_ids = [row['canvas_course_id'] for row in get_enrolled_canvas_sites_for_term(term_id)]
    app.logger.info(f'Starting Canvas enrollments API import job for term {term_id}, {len(canvas_course_ids)} course sites...')
    rows = []
    success_count = 0
    failure_count = 0
    index = 1
    for course_id in canvas_course_ids:
        app.logger.info(f'Fetching Canvas enrollments API for course id {course_id}, term {term_id} ({index} of {len(canvas_course_ids)})')
        feed = canvas_api.get_course_enrollments(course_id)
        if feed:
            success_count += 1
            for enrollment in feed:
                user_id = str(enrollment.get('user_id'))
                last_activity_at = str(enrollment.get('last_activity_at') or '')
                rows.append('\t'.join([str(course_id), user_id, str(term_id), last_activity_at, json.dumps(enrollment)]))
        else:
            failure_count += 1
            app.logger.error(f'Canvas enrollments API import failed for course id {course_id}.')
        index += 1
    s3_key = f'{get_s3_sis_api_daily_path()}/canvas_api_enrollments_{term_id}.tsv'
    app.logger.info(f'Will stash {success_count} feeds in S3: {s3_key}')
    if not s3.upload_data('\n'.join(rows), s3_key):
        app.logger.error('Error on S3 upload: aborting job.')
        return False
    app.logger.info('Will copy S3 feeds into Redshift...')
    query = resolve_sql_template_string(
        """
        DELETE FROM {redshift_schema_student}_staging.canvas_api_enrollments WHERE term_id = '{term_id}';
        COPY {redshift_schema_student}_staging.canvas_api_enrollments
            FROM '{loch_s3_sis_api_data_path}/canvas_api_enrollments_{term_id}.tsv'
            IAM_ROLE '{redshift_iam_role}'
            DELIMITER '\\t'
            TIMEFORMAT 'YYYY-MM-DDTHH:MI:SSZ';
        DELETE FROM {redshift_schema_student}.canvas_api_enrollments
            WHERE term_id = '{term_id}'
            AND course_id IN
            (SELECT course_id FROM {redshift_schema_student}_staging.canvas_api_enrollments WHERE term_id = '{term_id}');
        INSERT INTO {redshift_schema_student}.canvas_api_enrollments
            (SELECT * FROM {redshift_schema_student}_staging.canvas_api_enrollments WHERE term_id = '{term_id}');
        DELETE FROM {redshift_schema_student}_staging.canvas_api_enrollments
            WHERE term_id = '{term_id}';
        """,
        term_id=term_id,
    )
    if not redshift.execute(query):
        app.logger.error('Error on Redshift copy: aborting job.')
        return False
    return (
        f'Canvas enrollments API import completed for term {term_id}: {success_count} succeeded, '
        f'{failure_count} failed.'
    )

def test_fallback_update_manifests(self, app):
    """Uses yesterday's news if today's is unavailable."""
    with mock_s3(app):
        yesterday = datetime.now() - timedelta(days=1)
        daily_path = get_s3_sis_daily_path(yesterday)
        historical_path = app.config['LOCH_S3_SIS_DATA_PATH'] + '/historical'
        manifest_path = app.config['LOCH_S3_SIS_DATA_PATH'] + '/manifests'
        s3.upload_data('some new course data', f'{daily_path}/courses/courses-aaa.gz')
        s3.upload_data('some more new course data', f'{daily_path}/courses/courses-bbb.gz')
        s3.upload_data('some new enrollment data', f'{daily_path}/enrollments/enrollments-ccc.gz')
        s3.upload_data('some old course data', f'{historical_path}/courses/courses-ddd.gz')
        s3.upload_data('some old enrollment data', f'{historical_path}/enrollments/enrollments-eee.gz')
        s3.upload_data('some perfectly antique enrollment data', f'{historical_path}/enrollments/enrollments-fff.gz')
        assert CreateSisSchema().update_manifests()

        courses_manifest = json.loads(s3.get_object_text(manifest_path + '/courses.json'))
        assert len(courses_manifest['entries']) == 3
        assert courses_manifest['entries'][0]['url'] == f's3://{app.config["LOCH_S3_BUCKET"]}/{daily_path}/courses/courses-aaa.gz'
        assert courses_manifest['entries'][0]['meta']['content_length'] == 20

        enrollments_manifest = json.loads(s3.get_object_text(manifest_path + '/enrollments.json'))
        assert len(enrollments_manifest['entries']) == 3
        assert (
            enrollments_manifest['entries'][2]['url']
            == f's3://{app.config["LOCH_S3_BUCKET"]}/{historical_path}/enrollments/enrollments-fff.gz'
        )
        assert enrollments_manifest['entries'][2]['meta']['content_length'] == 38