def merge_analytics_data_for_term(self, term_id):
    feed_path = app.config['LOCH_S3_BOAC_ANALYTICS_DATA_PATH'] + '/feeds/'
    advisees_by_canvas_id_path = feed_path + 'advisees_by_canvas_id.json'
    advisees_by_canvas_id = s3.get_object_json(advisees_by_canvas_id_path)
    if not advisees_by_canvas_id:
        raise BackgroundJobError(
            f'Failed to retrieve advisee map at {advisees_by_canvas_id_path}, aborting'
        )
    enrollment_term_map_path = feed_path + f'enrollment_term_map_{term_id}.json'
    enrollment_term_map = s3.get_object_json(enrollment_term_map_path)
    if not enrollment_term_map:
        raise BackgroundJobError(
            f'Failed to retrieve enrollment term map at {enrollment_term_map_path}, aborting'
        )
    canvas_site_map = self.merge_canvas_analytics_for_term(term_id, feed_path)
    self.merge_course_analytics_for_term(term_id, canvas_site_map, enrollment_term_map, advisees_by_canvas_id)
    self.merge_advisee_assignment_submissions_for_term(term_id, enrollment_term_map, advisees_by_canvas_id)
    return enrollment_term_map
def create_schema(self):
    base_s3_key = app.config['LOCH_S3_E_I_DATA_PATH']
    external_schema = app.config['REDSHIFT_SCHEMA_E_I_ADVISING_NOTES']
    redshift.drop_external_schema(external_schema)
    # Flatten E&I-sourced JSON files into two schema-friendly JSON files.
    notes = []
    topics = []
    for key in s3.get_keys_with_prefix(base_s3_key):
        if key.endswith('.json'):
            notes_json = s3.get_object_json(key)
            if notes_json and 'notes' in notes_json:
                notes += notes_json['notes']
    for note in notes:
        topics += _extract_topics(note)
    if s3.upload_json(obj=notes, s3_key=f'{base_s3_key}/aggregated_notes/data.json') \
            and s3.upload_json(obj=topics, s3_key=f'{base_s3_key}/aggregated_topics/data.json'):
        # Create schema
        app.logger.info('Executing SQL...')
        resolved_ddl = resolve_sql_template('create_e_i_advising_notes_schema.template.sql')
        if redshift.execute_ddl_script(resolved_ddl):
            verify_external_schema(external_schema, resolved_ddl)
        else:
            raise BackgroundJobError('E&I Advising Notes schema creation job failed.')
    else:
        raise BackgroundJobError('Failed to upload aggregated E&I advising notes and topics.')
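# `create_schema` above depends on a module-level `_extract_topics` helper that is not shown
# in this section. A minimal sketch of what it might look like, assuming each note is a dict
# carrying its own `id` and an optional list of topic strings under a `topics` key (the
# `note_id`/`topic` output field names are illustrative, not taken from the source):
def _extract_topics(note):
    # Emit one {note_id, topic} row per topic so topics land in their own flattened file.
    return [{'note_id': note.get('id'), 'topic': topic} for topic in (note.get('topics') or [])]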
def merge_canvas_analytics_for_term(self, term_id, feed_path):
    # Return an empty map for terms outside the Canvas-integrated range.
    if term_id not in reverse_term_ids():
        return {}
    canvas_site_map_path = feed_path + f'canvas_site_map_{term_id}.json'
    canvas_site_map = s3.get_object_json(canvas_site_map_path)
    if not canvas_site_map:
        raise BackgroundJobError(
            f'Failed to retrieve Canvas site map at {canvas_site_map_path}, aborting'
        )
    return canvas_site_map
def transform(self, s3_source, s3_dest, key=None):
    objects = s3.get_keys_with_prefix(s3_source)
    app.logger.info(f'Will transform {len(objects)} objects from {s3_source} and put results to {s3_dest}.')
    skip_count = 0
    for o in objects:
        file_name = o.split('/')[-1]
        # Skip objects already transformed on a previous run.
        if s3.object_exists(f'{s3_dest}/{file_name}'):
            skip_count += 1
            continue
        object_json = s3.get_object_json(o)
        canvas_api_data = object_json.get(key) if key else object_json
        with tempfile.TemporaryFile() as result:
            # The course id is the second-to-last underscore-delimited token in the file name.
            course_id = int(file_name.split('_')[-2])
            # Stamp each record with its course id and write newline-delimited JSON.
            for record in canvas_api_data:
                record['course_id'] = course_id
                result.write(json.dumps(record).encode() + b'\n')
            # Rewind so the upload reads the temp file from the beginning.
            result.seek(0)
            s3.upload_file(result, f'{s3_dest}/{file_name}')
    app.logger.info(f'Transformed {len(objects) - skip_count} new objects; skipped {skip_count} existing objects.')
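# A hypothetical invocation of `transform`, assuming Canvas API exports live under a
# 'canvas-api' prefix and each source object wraps its records under a 'quiz_submissions'
# key (both names are illustrative, not taken from the source):
#
#     self.transform(
#         s3_source='canvas-api/quiz_submissions',
#         s3_dest='canvas-api/quiz_submissions_transformed',
#         key='quiz_submissions',
#     )
#
# The existence check against s3_dest makes the job idempotent: reruns only pick up
# objects that were not transformed on a previous pass.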
def generate_feeds(self):
    # Translation between canvas_user_id and UID/SID is needed to merge Canvas analytics data
    # and SIS enrollment-based data.
    advisees_by_canvas_id = {}
    advisees_by_sid = {}
    self.successes = []
    self.failures = []
    profile_tables = self.generate_student_profile_tables(advisees_by_canvas_id, advisees_by_sid)
    if not profile_tables:
        raise BackgroundJobError('Failed to generate student profile tables.')
    feed_path = app.config['LOCH_S3_BOAC_ANALYTICS_DATA_PATH'] + '/feeds/'
    s3.upload_json(advisees_by_canvas_id, feed_path + 'advisees_by_canvas_id.json')
    upload_student_term_maps(advisees_by_sid)

    # Avoid processing Canvas analytics data for future terms and pre-CS terms.
    for term_id in (future_term_ids() + legacy_term_ids()):
        enrollment_term_map = s3.get_object_json(feed_path + f'enrollment_term_map_{term_id}.json')
        if enrollment_term_map:
            GenerateMergedEnrollmentTerm().refresh_student_enrollment_term(term_id, enrollment_term_map)

    canvas_integrated_term_ids = reverse_term_ids()
    app.logger.info(f'Will queue analytics generation for {len(canvas_integrated_term_ids)} terms on worker nodes.')
    result = queue_merged_enrollment_term_jobs(self.job_id, canvas_integrated_term_ids)
    if not result:
        raise BackgroundJobError('Failed to queue enrollment term jobs.')

    refresh_all_from_staging(profile_tables)
    self.update_redshift_academic_standing()
    self.update_rds_profile_indexes()
    app.logger.info('Profile generation complete; waiting for enrollment term generation to finish.')

    # Poll until no queued enrollment term job remains in 'created' or 'started' status.
    while True:
        sleep(1)
        enrollment_results = get_merged_enrollment_term_job_status(self.job_id)
        if not enrollment_results:
            raise BackgroundJobError('Failed to refresh RDS indexes.')
        any_pending_job = next(
            (row for row in enrollment_results if row['status'] in ('created', 'started')),
            None,
        )
        if not any_pending_job:
            break

    app.logger.info('Exporting analytics data for archival purposes.')
    unload_enrollment_terms([current_term_id(), future_term_id()])

    app.logger.info('Refreshing enrollment terms in RDS.')
    with rds.transaction() as transaction:
        if self.refresh_rds_enrollment_terms(None, transaction):
            transaction.commit()
            app.logger.info('Refreshed RDS enrollment terms.')
        else:
            transaction.rollback()
            raise BackgroundJobError('Failed to refresh RDS enrollment terms.')

    # Collect per-term job details and surface any errors in the final status.
    status_string = f'Generated merged profiles ({len(self.successes)} successes, {len(self.failures)} failures).'
    errored = False
    for row in enrollment_results:
        status_string += f" {row['details']}"
        if row['status'] == 'error':
            errored = True
    truncate_staging_table('student_enrollment_terms')
    if errored:
        raise BackgroundJobError(status_string)
    else:
        return status_string