def test_term_id_lists(self, app):
    """Verify that term-id helper lists nest and partition as expected."""
    all_ids = set(berkeley.reverse_term_ids(include_future_terms=True, include_legacy_terms=True))
    canvas_ids = set(berkeley.reverse_term_ids())
    future_ids = set(berkeley.future_term_ids())
    legacy_ids = set(berkeley.legacy_term_ids())
    # Canvas-integrated terms are a strict subset of the full term list.
    assert canvas_ids < all_ids
    # Every configured boundary term must resolve to an id in the full list.
    for config_key in ('EARLIEST_LEGACY_TERM', 'EARLIEST_TERM', 'CURRENT_TERM', 'FUTURE_TERM'):
        assert berkeley.sis_term_id_for_name(app.config[config_key]) in all_ids
    assert berkeley.current_term_id() in canvas_ids
    assert berkeley.earliest_term_id() in canvas_ids
    # Future terms: disjoint from Canvas-integrated terms, contained in the full list.
    assert future_ids.isdisjoint(canvas_ids)
    assert future_ids < all_ids
    assert berkeley.future_term_id() in future_ids
    # Legacy terms: likewise disjoint and contained.
    assert legacy_ids.isdisjoint(canvas_ids)
    assert legacy_ids < all_ids
    assert berkeley.earliest_legacy_term_id() in berkeley.legacy_term_ids()
def test_auto_terms(self, app, current_term_index):
    """Verify term ids derived under the current_term_index fixture."""
    every_term_id = set(berkeley.reverse_term_ids(include_future_terms=True, include_legacy_terms=True))
    integrated_term_ids = set(berkeley.reverse_term_ids())
    assert integrated_term_ids < every_term_id
    # Fixture pins the calendar to Spring 2018.
    assert berkeley.current_term_id() == '2182'
    assert berkeley.future_term_id() == '2188'
    assert berkeley.s3_canvas_data_path_current_term() == 'canvas-data/term/spring-2018'
def generate_merged_enrollment_terms(self, merged_profile, term_id=None):
    """Merge SIS enrollments, term GPAs and Canvas analytics for one student.

    Appends one TSV row per non-empty term to self.rows['student_enrollment_terms'].

    :param merged_profile: dict with at least 'uid', 'sid' and 'canvasUserId' keys.
    :param term_id: optional single SIS term id; when omitted, all
        Canvas-integrated terms (berkeley.reverse_term_ids()) are processed.
        A term_id outside that list is silently skipped.
    """
    if term_id and term_id not in berkeley.reverse_term_ids():
        return
    elif term_id:
        term_ids = [term_id]
    else:
        term_ids = berkeley.reverse_term_ids()
    uid = merged_profile.get('uid')
    sid = merged_profile.get('sid')
    canvas_user_id = merged_profile.get('canvasUserId')
    canvas_courses_feed = get_canvas_courses_feed(uid)
    # Fold this student's Canvas courses into the shared site map held on the job.
    merge_canvas_site_map(self.canvas_site_map, canvas_courses_feed)
    terms_feed = get_merged_enrollment_terms(uid, sid, term_ids, canvas_courses_feed, self.canvas_site_map)
    term_gpas = self.fetch_term_gpas(sid)
    relative_submission_counts = get_relative_submission_counts(canvas_user_id)
    for term_id in term_ids:
        app.logger.debug(
            f'Generating merged enrollment term (uid={uid}, sid={sid}, term_id={term_id})'
        )
        ts = datetime.now().timestamp()
        term_feed = terms_feed.get(term_id)
        # Only emit a row when the term has enrollments or unmatched Canvas sites.
        if term_feed and (len(term_feed['enrollments']) or len(term_feed['unmatchedCanvasSites'])):
            # Attach the matching term GPA (if any), coercing numerics to float.
            term_gpa = next(
                (t for t in term_gpas if t['term_id'] == term_id), None)
            if term_gpa:
                term_feed['termGpa'] = {
                    'gpa': float(term_gpa['gpa']),
                    'unitsTakenForGpa': float(term_gpa['units_taken_for_gpa']),
                }
            # Rebuild our Canvas courses list to remove any courses that were screened out during association (for instance,
            # dropped or athletic enrollments).
            canvas_courses = []
            for enrollment in term_feed.get('enrollments', []):
                canvas_courses += enrollment['canvasSites']
            canvas_courses += term_feed.get('unmatchedCanvasSites', [])
            # Decorate the Canvas courses list with per-course statistics and return summary statistics.
            app.logger.debug(
                f'Generating enrollment term analytics (uid={uid}, sid={sid}, term_id={term_id})'
            )
            term_feed['analytics'] = mean_course_analytics_for_user(
                canvas_courses,
                canvas_user_id,
                relative_submission_counts,
                self.canvas_site_map,
            )
            # Stage a tab-separated row for bulk load: sid, term_id, JSON feed.
            self.rows['student_enrollment_terms'].append('\t'.join(
                [str(sid), str(term_id), json.dumps(term_feed)]))
            app.logger.debug(
                f'Enrollment term merge complete (uid={uid}, sid={sid}, term_id={term_id}, '
                f'{datetime.now().timestamp() - ts} seconds)')
def generate_student_enrollments_table(self, non_advisee_sids):
    """Regenerate the historical non-advisee enrollment terms table.

    :param non_advisee_sids: SIDs of students who are not current advisees.
    :return: total number of staged-and-refreshed enrollment rows.
    """
    # Split all S3/Redshift operations by term in hope of not overloading memory or other resources.
    # (Using finer-grained batches of SIDs would probably involve replacing the staging table by a Spectrum
    # external table.)
    total_count = 0
    table_name = 'student_enrollment_terms_hist_enr'
    student_schema.truncate_staging_table(table_name)
    for term_id in reverse_term_ids(include_future_terms=True, include_legacy_terms=True):
        with tempfile.TemporaryFile() as feed_file:
            term_count = self.collect_merged_enrollments(
                non_advisee_sids, term_id, feed_file)
            if term_count:
                student_schema.write_file_to_staging(
                    table_name,
                    feed_file,
                    term_count,
                    term_id,
                )
        # The temp file can be closed before the Redshift refresh runs.
        if term_count:
            with redshift.transaction() as transaction:
                student_schema.refresh_from_staging(
                    table_name,
                    term_id,
                    non_advisee_sids,
                    transaction,
                )
            total_count += term_count
    app.logger.info('Non-advisee term enrollment generation complete.')
    return total_count
def merge_canvas_analytics_for_term(self, term_id, feed_path):
    """Fetch the stashed Canvas site map for one Canvas-integrated term.

    Returns an empty dict for terms outside the Canvas-integrated range;
    raises BackgroundJobError when the S3 object is missing or empty.
    """
    if term_id not in reverse_term_ids():
        return {}
    site_map_key = f'{feed_path}canvas_site_map_{term_id}.json'
    site_map = s3.get_object_json(site_map_key)
    if site_map:
        return site_map
    raise BackgroundJobError(
        f'Failed to retrieve Canvas site map at {site_map_key}, aborting'
    )
def deduplicate(prefix, s3list):
    """Collapse an S3 listing to one object per trailing .gz filename.

    Later entries in s3list win when filenames collide, which deduplicates
    objects stored under multiple key prefixes.

    :param prefix: expected filename prefix; one '{prefix}-{term_id}.gz' file
        must be present for every Canvas-integrated or future term.
    :param s3list: list of S3 object dicts, each with a 'Key' entry.
    :return: the deduplicated object dicts.
    :raises BackgroundJobError: if any expected per-term file is missing.
    """
    filename_map = {}
    for s3obj in s3list:
        m = re.match(r'.+/(.+\.gz)', s3obj['Key'])
        if m:
            filename_map[m[1]] = s3obj
    for term_id in reverse_term_ids(include_future_terms=True):
        filename = f'{prefix}-{term_id}.gz'
        if filename not in filename_map:
            # Bug fix: the f-string previously contained no placeholder and
            # always printed a literal '(unknown)' instead of the filename.
            raise BackgroundJobError(f'Expected filename {filename} not found in S3, aborting')
    return list(filename_map.values())
def generate_feeds(self):
    """Generate merged student profile and enrollment-term feeds.

    Orchestrates the full merge: builds profile tables, stashes advisee maps
    in S3, refreshes future/legacy enrollment terms inline, queues
    Canvas-integrated terms to worker nodes, then polls until those jobs
    finish before refreshing RDS.

    :return: a human-readable status string on success.
    :raises BackgroundJobError: on any failed stage, or if any queued
        enrollment-term job reported an error.
    """
    # Translation between canvas_user_id and UID/SID is needed to merge Canvas analytics data and SIS enrollment-based data.
    advisees_by_canvas_id = {}
    advisees_by_sid = {}
    self.successes = []
    self.failures = []
    # NOTE(review): the two dicts above are presumably populated in place by
    # generate_student_profile_tables — confirm in that method.
    profile_tables = self.generate_student_profile_tables(
        advisees_by_canvas_id, advisees_by_sid)
    if not profile_tables:
        raise BackgroundJobError(
            'Failed to generate student profile tables.')
    feed_path = app.config['LOCH_S3_BOAC_ANALYTICS_DATA_PATH'] + '/feeds/'
    s3.upload_json(advisees_by_canvas_id, feed_path + 'advisees_by_canvas_id.json')
    upload_student_term_maps(advisees_by_sid)
    # Avoid processing Canvas analytics data for future terms and pre-CS terms.
    for term_id in (future_term_ids() + legacy_term_ids()):
        enrollment_term_map = s3.get_object_json(
            feed_path + f'enrollment_term_map_{term_id}.json')
        if enrollment_term_map:
            GenerateMergedEnrollmentTerm().refresh_student_enrollment_term(
                term_id, enrollment_term_map)
    canvas_integrated_term_ids = reverse_term_ids()
    app.logger.info(
        f'Will queue analytics generation for {len(canvas_integrated_term_ids)} terms on worker nodes.'
    )
    result = queue_merged_enrollment_term_jobs(self.job_id, canvas_integrated_term_ids)
    if not result:
        raise BackgroundJobError('Failed to queue enrollment term jobs.')
    refresh_all_from_staging(profile_tables)
    self.update_redshift_academic_standing()
    self.update_rds_profile_indexes()
    app.logger.info(
        'Profile generation complete; waiting for enrollment term generation to finish.'
    )
    # Poll once per second until no queued job is still 'created' or 'started'.
    while True:
        sleep(1)
        enrollment_results = get_merged_enrollment_term_job_status(
            self.job_id)
        if not enrollment_results:
            raise BackgroundJobError('Failed to refresh RDS indexes.')
        any_pending_job = next(
            (row for row in enrollment_results
             if row['status'] == 'created' or row['status'] == 'started'),
            None)
        if not any_pending_job:
            break
    app.logger.info('Exporting analytics data for archival purposes.')
    unload_enrollment_terms([current_term_id(), future_term_id()])
    app.logger.info('Refreshing enrollment terms in RDS.')
    with rds.transaction() as transaction:
        if self.refresh_rds_enrollment_terms(None, transaction):
            transaction.commit()
            app.logger.info('Refreshed RDS enrollment terms.')
        else:
            transaction.rollback()
            raise BackgroundJobError(
                'Failed to refresh RDS enrollment terms.')
    # Build the summary status, flagging any worker-node errors.
    status_string = f'Generated merged profiles ({len(self.successes)} successes, {len(self.failures)} failures).'
    errored = False
    for row in enrollment_results:
        status_string += f" {row['details']}"
        if row['status'] == 'error':
            errored = True
    truncate_staging_table('student_enrollment_terms')
    if errored:
        raise BackgroundJobError(status_string)
    else:
        return status_string
def run(self, term_ids=None):
    """Import SIS terms API feeds into S3, Redshift and RDS.

    For each term, fetches the SIS terms API feed and flattens each
    (academic career term, session) pair into a TSV row, uploads the rows to
    S3, then copies them into Redshift and RDS within transactions.

    :param term_ids: optional list of SIS term ids; defaults to all
        Canvas-integrated terms (reverse_term_ids()).
    :return: a summary string on success, False on S3/Redshift/RDS failure.
    """
    if not term_ids:
        term_ids = reverse_term_ids()
    app.logger.info(
        f'Starting SIS terms API import job for {len(term_ids)} terms...')
    rows = []
    success_count = 0
    failure_count = 0
    # enumerate replaces the original hand-rolled index counter.
    for index, term_id in enumerate(term_ids, start=1):
        app.logger.info(
            f'Fetching SIS terms API for term id {term_id} ({index} of {len(term_ids)})'
        )
        feed = sis_terms_api.get_term(term_id)
        if feed:
            success_count += 1
            # One TSV row per (academic career term, session) pair; missing
            # feed values default to empty strings.
            for academic_career_term in feed:
                for session in academic_career_term.get('sessions', []):
                    rows.append(
                        '\t'.join([
                            academic_career_term.get('id', ''),
                            academic_career_term.get('name', ''),
                            academic_career_term.get('academicCareer', {}).get('code', ''),
                            academic_career_term.get('beginDate', ''),
                            academic_career_term.get('endDate', ''),
                            session.get('id', ''),
                            session.get('name', ''),
                            session.get('beginDate', ''),
                            session.get('endDate', ''),
                        ]),
                    )
        else:
            failure_count += 1
            app.logger.error(
                f'SIS terms API import failed for term id {term_id}.')
    s3_key = f'{get_s3_sis_api_daily_path()}/terms.tsv'
    app.logger.info(
        f'Will stash {len(rows)} rows from {success_count} feeds in S3: {s3_key}'
    )
    if not s3.upload_data('\n'.join(rows), s3_key):
        app.logger.error('Error on S3 upload: aborting job.')
        return False
    app.logger.info('Will copy S3 feeds into Redshift...')
    with redshift.transaction() as transaction:
        if self.update_redshift(term_ids, transaction):
            transaction.commit()
            app.logger.info('Updated Redshift.')
        else:
            transaction.rollback()
            app.logger.error('Failed to update Redshift.')
            return False
    with rds.transaction() as transaction:
        if self.update_rds(rows, term_ids, transaction):
            transaction.commit()
            app.logger.info('Updated RDS.')
        else:
            transaction.rollback()
            app.logger.error('Failed to update RDS.')
            return False
    return f'SIS terms API import job completed: {success_count} succeeded, {failure_count} failed.'