def test_sync_canvas_snapshots(self, app, metadata_db, caplog):
    """Dispatches a complete sync job against fixtures."""
    caplog.set_level(logging.INFO)
    with capture_app_logs(app):
        with mock_s3(app):
            result = SyncCanvasSnapshots().run_wrapped()
            assert 'Canvas snapshot sync job dispatched to workers' in result
            assert_background_job_status('sync')
            assert 'Dispatched S3 sync of snapshot quiz_dim-00000-0ab80c7c.gz' in caplog.text
            assert 'Dispatched S3 sync of snapshot requests-00098-b14782f5.gz' in caplog.text
            assert '311 successful dispatches, 0 failures' in caplog.text

            schema = app.config['RDS_SCHEMA_METADATA']
            count_results = rds.fetch(f'SELECT count(*) FROM {schema}.canvas_sync_job_status')
            assert count_results[0]['count'] == 311

            canvas_status_results = rds.fetch(f'SELECT DISTINCT status FROM {schema}.canvas_sync_job_status')
            assert len(canvas_status_results) == 1
            assert canvas_status_results[0]['status'] == 'created'

            sync_results = rds.fetch(f'SELECT * FROM {schema}.canvas_sync_job_status LIMIT 1')
            assert sync_results[0]['job_id'].startswith('sync_')
            assert sync_results[0]['filename'] == 'account_dim-00000-5eb7ee9e.gz'
            assert sync_results[0]['canvas_table'] == 'account_dim'
            assert 'account_dim/part-00505-5c40f1f3-b611-4f64-a007-67b775e984fe.c000.txt.gz' in sync_results[0]['source_url']
            assert sync_results[0]['destination_url'] is None
            assert sync_results[0]['details'] is None
            assert sync_results[0]['created_at']
            assert sync_results[0]['updated_at']
def test_canvas_sync_metadata(self, app, metadata_db):
    """When given a job id, updates metadata on file sync."""
    url = 'http://shakespeare.mit.edu/Poetry/sonnet.XLV.html'
    key = 'canvas/sonnet_submission_dim/sonnet-xlv.txt'
    with mock_s3(app):
        with open(_get_fixtures_path() + '/sonnet_xlv.html', 'r') as file:
            responses.add(responses.GET, url, body=file.read(), headers={'Content-Length': '767'})

        # Run two successive sync jobs on the same file. The first succeeds, the second is skipped as a duplicate.
        metadata.create_canvas_sync_status('job_1', 'sonnet-xlv.txt', 'sonnet_submission_dim', url)
        result = SyncFileToS3().run(url=url, key=key, canvas_sync_job_id='job_1')
        assert result is True
        metadata.create_canvas_sync_status('job_2', 'sonnet-xlv.txt', 'sonnet_submission_dim', url)
        result = SyncFileToS3().run(url=url, key=key, canvas_sync_job_id='job_2')
        assert result is False

        schema = app.config['RDS_SCHEMA_METADATA']
        sync_metadata = rds.fetch(f'SELECT * FROM {schema}.canvas_sync_job_status')
        snapshot_metadata = rds.fetch(f'SELECT * FROM {schema}.canvas_synced_snapshots')

        assert len(sync_metadata) == 2
        assert sync_metadata[0]['job_id'] == 'job_1'
        assert sync_metadata[0]['destination_url'] == 's3://mock-bucket/canvas/sonnet_submission_dim/sonnet-xlv.txt'
        assert sync_metadata[0]['status'] == 'complete'
        assert sync_metadata[0]['source_size'] == 767
        assert sync_metadata[0]['destination_size'] == 767
        assert sync_metadata[0]['updated_at'] > sync_metadata[0]['created_at']

        assert sync_metadata[1]['job_id'] == 'job_2'
        assert sync_metadata[1]['destination_url'] == 's3://mock-bucket/canvas/sonnet_submission_dim/sonnet-xlv.txt'
        assert sync_metadata[1]['status'] == 'duplicate'
        assert sync_metadata[1]['source_size'] is None
        assert sync_metadata[1]['destination_size'] is None
        assert sync_metadata[1]['updated_at'] > sync_metadata[1]['created_at']

        assert len(snapshot_metadata) == 1
        assert snapshot_metadata[0]['filename'] == 'sonnet-xlv.txt'
        assert snapshot_metadata[0]['canvas_table'] == 'sonnet_submission_dim'
        assert snapshot_metadata[0]['url'] == 's3://mock-bucket/canvas/sonnet_submission_dim/sonnet-xlv.txt'
        assert snapshot_metadata[0]['size'] == 767
        assert snapshot_metadata[0]['created_at']
        assert snapshot_metadata[0]['deleted_at'] is None
def test_import_student_photos(self, app, metadata_db, student_tables, caplog):
    from nessie.jobs.import_student_photos import ImportStudentPhotos
    caplog.set_level(logging.DEBUG)
    with capture_app_logs(app):
        with mock_s3(app):
            result = ImportStudentPhotos().run_wrapped()
            assert result == 'Student photo import completed: 1 succeeded, 9 had no photo available, 0 failed.'
            response = s3.get_keys_with_prefix('cal1card-data/photos')
            assert len(response) == 1
            assert response[0] == 'cal1card-data/photos/61889.jpg'

            success_rows = rds.fetch(
                f"SELECT * FROM {app.config['RDS_SCHEMA_METADATA']}.photo_import_status WHERE status = 'success'",
            )
            assert len(success_rows) == 1
            assert success_rows[0]['sid'] == '11667051'

            failure_rows = rds.fetch(
                f"SELECT * FROM {app.config['RDS_SCHEMA_METADATA']}.photo_import_status WHERE status = 'failure'",
            )
            assert len(failure_rows) == 0

            not_found_rows = rds.fetch(
                f"SELECT * FROM {app.config['RDS_SCHEMA_METADATA']}.photo_import_status WHERE status = 'photo_not_found'",
            )
            assert len(not_found_rows) == 9
def test_run(self, app, metadata_db):
    """Uploads Canvas grade change logs to S3, then stores feeds in Redshift."""
    with mock_s3(app):
        with override_config(app, 'TEST_CANVAS_COURSE_IDS', [1492459, 1488704, 1491827]):
            result = ImportCanvasGradeChangeLog().run_wrapped()
            assert result
            assert 'Canvas grade change log import completed for term 2178: 3 succeeded, ' in result
            assert '0 failed.' in result
            assert_background_job_status('ImportCanvasGradeChangeLog')

            schema = app.config['RDS_SCHEMA_METADATA']
            count_results = rds.fetch(f'SELECT count(*) FROM {schema}.canvas_api_import_job_status')
            assert count_results[0]['count'] == 3

            canvas_status_results = rds.fetch(f'SELECT DISTINCT status FROM {schema}.canvas_api_import_job_status')
            assert len(canvas_status_results) == 1
            assert canvas_status_results[0]['status'] == 'created'

            sync_results = rds.fetch(f'SELECT * FROM {schema}.canvas_api_import_job_status LIMIT 1')
            assert sync_results[0]['job_id'].startswith('ImportCanvasGradeChangeLog_')
            assert sync_results[0]['course_id'] == '1492459'
            assert sync_results[0]['table_name'] == 'grade_change_log'
            assert sync_results[0]['details'] is None
            assert sync_results[0]['created_at']
            assert sync_results[0]['updated_at']
def get_failures_from_last_sync():
    last_job_id = None
    failures = []
    job_id_result = rds.fetch(
        f"""SELECT MAX(job_id) AS last_job_id FROM {_rds_schema()}.canvas_sync_job_status
            WHERE job_id LIKE %s""",
        # The pattern is passed as a bound parameter, so a single '%' wildcard suffices; '%%' escaping
        # is only needed when the wildcard is written directly into the SQL string.
        params=['sync%'],
    )
    if not job_id_result:
        app.logger.error('Failed to retrieve id for last sync job')
    else:
        last_job_id = job_id_result[0]['last_job_id']
        failures_query = f"""SELECT * FROM {_rds_schema()}.canvas_sync_job_status WHERE job_id = %s
            AND (status NOT IN ('complete', 'duplicate') OR destination_size != source_size)"""
        failures = rds.fetch(failures_query, params=[last_job_id])
    return {'job_id': last_job_id, 'failures': failures}
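# Hedged usage sketch (assumption, not from the source): how the dict returned above
# might be consumed to report on the outcome of the most recent sync run.
last_sync = get_failures_from_last_sync()
if last_sync['failures']:
    print(f"{len(last_sync['failures'])} snapshot(s) from job {last_sync['job_id']} need to be re-synced")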
def get_active_sids_with_oldest_registration_imports(limit):
    active_sids = [r['sid'] for r in get_all_student_ids()]
    sql = f"""SELECT sid FROM {metadata_schema()}.registration_import_status
        WHERE sid = ANY(%s)
        AND status = 'success'
        ORDER BY updated_at
        LIMIT %s"""
    return rds.fetch(sql, params=(active_sids, limit))
def refresh_term_index(self, app):
    from nessie.jobs.create_sis_terms_schema import CreateSisTermsSchema
    CreateSisTermsSchema().refresh_current_term_index()
    rds_schema = app.config['RDS_SCHEMA_SIS_TERMS']
    rows = rds.fetch(f'SELECT * FROM {rds_schema}.current_term_index')
    assert len(rows) == 1
    return rows[0]
def import_appointment_advisors(self):
    sis_notes_schema = app.config['RDS_SCHEMA_SIS_ADVISING_NOTES']
    advisor_schema_redshift = app.config['REDSHIFT_SCHEMA_ADVISOR_INTERNAL']
    advisor_sids_from_sis_appointments = set(
        [r['advisor_sid'] for r in rds.fetch(f'SELECT DISTINCT advisor_sid FROM {sis_notes_schema}.advising_appointments')],
    )
    advisor_sids_from_advisors = set(
        [r['sid'] for r in redshift.fetch(f'SELECT DISTINCT sid FROM {advisor_schema_redshift}.advisor_departments')],
    )
    advisor_sids = list(advisor_sids_from_sis_appointments | advisor_sids_from_advisors)
    advisor_attributes = calnet.client(app).search_csids(advisor_sids)
    if not advisor_attributes:
        raise BackgroundJobError('Failed to fetch note author attributes.')
    unique_advisor_attributes = list({adv['uid']: adv for adv in advisor_attributes}.values())

    with rds.transaction() as transaction:
        insertable_rows = []
        for entry in unique_advisor_attributes:
            first_name, last_name = calnet.split_sortable_name(entry)
            insertable_rows.append(tuple((entry.get('uid'), entry.get('csid'), first_name, last_name)))
        result = transaction.insert_bulk(
            f'INSERT INTO {sis_notes_schema}.advising_appointment_advisors (uid, sid, first_name, last_name) VALUES %s',
            insertable_rows,
        )
        if result:
            transaction.commit()
            app.logger.info('Imported appointment advisor attributes.')
        else:
            transaction.rollback()
            raise BackgroundJobError('Failed to import appointment advisor attributes.')
def _advisor_attributes_by_uid(self):
    asc_schema = app.config['RDS_SCHEMA_ASC']
    e_i_schema = app.config['RDS_SCHEMA_E_I']
    advisor_uids_from_asc_notes = set(
        [r['advisor_uid'] for r in rds.fetch(f'SELECT DISTINCT advisor_uid FROM {asc_schema}.advising_notes')],
    )
    advisor_uids_from_e_i_notes = set(
        [r['advisor_uid'] for r in rds.fetch(f'SELECT DISTINCT advisor_uid FROM {e_i_schema}.advising_notes')],
    )
    advisor_uids = list(advisor_uids_from_asc_notes | advisor_uids_from_e_i_notes)
    return calnet.client(app).search_uids(advisor_uids)
def _advisor_attributes_by_email(self):
    data_science_schema = app.config['RDS_SCHEMA_DATA_SCIENCE']
    sql = f"""
        SELECT DISTINCT advisor_email FROM {data_science_schema}.advising_notes
        WHERE advisor_email IS NOT NULL
    """
    advisor_emails = set([r['advisor_email'] for r in rds.fetch(sql)])
    return calnet.client(app).search_emails(list(advisor_emails))
def assert_background_job_status(prefix):
    from flask import current_app as app
    schema = app.config['RDS_SCHEMA_METADATA']
    background_job_status_results = rds.fetch(f'SELECT * FROM {schema}.background_job_status')
    assert len(background_job_status_results) == 1
    assert background_job_status_results[0]['job_id'].startswith(f'{prefix}_')
    assert background_job_status_results[0]['status'] == 'succeeded'
    assert background_job_status_results[0]['created_at']
    assert background_job_status_results[0]['updated_at'] > background_job_status_results[0]['created_at']
def get_sis_current_term(self, for_date):
    rows = rds.fetch(
        f"""SELECT *, DATE(term_ends + INTERVAL '10 DAYS') AS grace_period_ends
            FROM {self.rds_schema}.term_definitions
            WHERE DATE(term_ends + INTERVAL '10 DAYS') >= '{for_date}'
            ORDER BY term_id ASC LIMIT 2""",
    )
    if rows:
        # Stay on the earlier term until its ten-day grace period has lapsed, unless the later term has already begun.
        return rows[1] if (for_date >= rows[1]['term_begins'] or for_date > rows[0]['grace_period_ends']) else rows[0]
def get_merged_enrollment_term_job_status(master_job_id):
    return rds.fetch(
        f"""SELECT * FROM {_rds_schema()}.merged_enrollment_term_job_queue
            WHERE master_job_id=%s
            ORDER BY term_id
        """,
        params=(master_job_id,),
        log_query=False,
    )
def most_recent_background_job_status(job_id_prefix, status=None):
    sql = f'SELECT * FROM {_rds_schema()}.background_job_status WHERE job_id LIKE %s'
    params = [f'{job_id_prefix}%']
    if status:
        sql += ' AND status = %s'
        params += [status]
    sql += ' ORDER BY updated_at DESC LIMIT 1'
    result = rds.fetch(
        sql,
        params,
    )
    if result and result[0]:
        return result[0]
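# Illustrative call (assumption, not from the source): fetch the latest successful
# run of a job family by its job_id prefix, then read fields off the returned row.
last_run = most_recent_background_job_status('ImportRegistrations', status='succeeded')
if last_run:
    print(last_run['job_id'], last_run['updated_at'])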
def test_metadata_tracked(self, app, metadata_db, student_tables, caplog):
    from nessie.jobs.import_registrations import ImportRegistrations
    rows = rds.fetch('SELECT * FROM nessie_metadata_test.registration_import_status')
    assert len(rows) == 0
    caplog.set_level(logging.DEBUG)
    with capture_app_logs(app):
        with mock_s3(app):
            ImportRegistrations().run_wrapped()
            rows = rds.fetch('SELECT * FROM nessie_metadata_test.registration_import_status')
            assert len(rows) == 10
            assert len([r for r in rows if r['status'] == 'failure']) == 8
            assert next(r['status'] for r in rows if r['sid'] == '11667051') == 'success'

            result = ImportRegistrations().run_wrapped()
            assert result == 'Registrations import completed: 0 succeeded, 8 failed.'
            result = ImportRegistrations().run_wrapped(load_mode='all')
            assert result == 'Registrations import completed: 2 succeeded, 8 failed.'

            rds.execute("DELETE FROM nessie_metadata_test.registration_import_status WHERE sid = '11667051'")
            result = ImportRegistrations().run_wrapped()
            assert result == 'Registrations import completed: 1 succeeded, 8 failed.'
            # Re-fetch so the assertion reflects the latest import run rather than the stale rows above.
            rows = rds.fetch('SELECT * FROM nessie_metadata_test.registration_import_status')
            assert next(r['status'] for r in rows if r['sid'] == '11667051') == 'success'

            rds.execute("UPDATE nessie_metadata_test.registration_import_status SET status='failure' WHERE sid = '11667051'")
            result = ImportRegistrations().run_wrapped()
            assert result == 'Registrations import completed: 1 succeeded, 8 failed.'
            # Re-fetch again before asserting on the updated status.
            rows = rds.fetch('SELECT * FROM nessie_metadata_test.registration_import_status')
            assert next(r['status'] for r in rows if r['sid'] == '11667051') == 'success'
def test_refresh_current_term_index(self, mock_datetime, app, term_definitions):
    mock_datetime.now.return_value = datetime(year=2018, month=5, day=1, hour=5, minute=21)
    from nessie.jobs.create_sis_terms_schema import CreateSisTermsSchema
    CreateSisTermsSchema().refresh_current_term_index()
    rds_schema = app.config['RDS_SCHEMA_SIS_TERMS']
    rows = rds.fetch(f'SELECT * FROM {rds_schema}.current_term_index')
    assert len(rows) == 1
    assert rows[0]['current_term_name'] == 'Spring 2018'
    assert rows[0]['future_term_name'] == 'Fall 2018'
def poll_merged_enrollment_term_job_queue():
    result = rds.fetch(
        f"""UPDATE {_rds_schema()}.merged_enrollment_term_job_queue
            SET status='started', instance_id=%s
            WHERE id = (
                SELECT id FROM {_rds_schema()}.merged_enrollment_term_job_queue
                WHERE status = 'created'
                LIMIT 1
                FOR UPDATE SKIP LOCKED
            )
            RETURNING id, master_job_id, term_id
        """,
        params=(_instance_id(),),
        log_query=False,
    )
    if result and result[0]:
        return result[0]
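# Hedged sketch (assumption, not from the source): a worker loop draining the queue
# polled above. Because the claiming UPDATE uses FOR UPDATE SKIP LOCKED, concurrent
# workers never pick up the same 'created' row. The run_term_job helper is hypothetical.
def run_term_jobs_until_queue_empty():
    while True:
        row = poll_merged_enrollment_term_job_queue()
        if not row:
            break  # no unclaimed rows remain
        run_term_job(row['master_job_id'], row['term_id'])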
def _advisor_attributes_by_sid(self):
    sis_notes_schema = app.config['RDS_SCHEMA_SIS_ADVISING_NOTES']
    advisor_schema_redshift = app.config['REDSHIFT_SCHEMA_ADVISOR_INTERNAL']
    advisor_sids_from_sis_notes = set(
        [r['advisor_sid'] for r in rds.fetch(f'SELECT DISTINCT advisor_sid FROM {sis_notes_schema}.advising_notes')],
    )
    advisor_sids_from_advisors = set(
        [r['sid'] for r in redshift.fetch(f'SELECT DISTINCT sid FROM {advisor_schema_redshift}.advisor_departments')],
    )
    advisor_sids = list(advisor_sids_from_sis_notes | advisor_sids_from_advisors)
    return calnet.client(app).search_csids(advisor_sids)
def test_import_registrations_batch_mode(self, app, metadata_db, student_tables, caplog):
    from nessie.jobs.import_registrations import ImportRegistrations
    with mock_s3(app):
        ImportRegistrations().run_wrapped()
        rows = rds.fetch('SELECT * FROM nessie_metadata_test.registration_import_status')
        assert len(rows) == 10

        with override_config(app, 'CYCLICAL_API_IMPORT_BATCH_SIZE', 9):

            def _success_history_after_batch_import():
                result = ImportRegistrations().run_wrapped(load_mode='batch')
                assert result == 'Registrations import completed: 1 succeeded, 8 failed.'
                rows = rds.fetch(
                    "SELECT * FROM nessie_metadata_test.registration_import_status WHERE status = 'success' ORDER BY updated_at",
                )
                assert len(rows) == 2
                assert rows[0]['updated_at'] < rows[1]['updated_at']
                return (rows[0]['sid'], rows[1]['sid'])

            sid_1, sid_2 = _success_history_after_batch_import()
            assert _success_history_after_batch_import() == (sid_2, sid_1)
            assert _success_history_after_batch_import() == (sid_1, sid_2)
def get_advisee_sids_with_photos():
    sql = f"""SELECT sid FROM {metadata_schema()}.photo_import_status
        WHERE status = 'success'"""
    return rds.fetch(sql)
def get_sis_term_for_id(self, term_id):
    sql = f"SELECT * FROM {self.rds_schema}.term_definitions WHERE term_id = '{term_id}' LIMIT 1"
    rows = rds.fetch(sql)
    return rows and rows[0]
def get_sis_current_term(self, for_date):
    sql = f"SELECT * FROM {self.rds_schema}.term_definitions WHERE term_ends > '{for_date}' ORDER BY term_id ASC LIMIT 1"
    rows = rds.fetch(sql)
    return rows and rows[0]
def test_resync_canvas_snapshots(self, app, metadata_db, caplog):
    """Dispatches a complete resync job against fixtures."""
    caplog.set_level(logging.INFO)
    snapshots = canvas_data.get_snapshots()['files']

    def mock_metadata(job_id, snapshot, status, destination_size):
        metadata.create_canvas_sync_status(job_id, snapshot['filename'], snapshot['table'], snapshot['url'])
        key = '/'.join([get_s3_canvas_daily_path(), snapshot['table'], snapshot['filename']])
        metadata.update_canvas_sync_status(job_id, key, status, source_size=1048576, destination_size=destination_size)

    old_sync_job = 'sync_152550000'
    latest_sync_job = 'sync_152560000'

    # The older job should be ignored by the resync.
    for snapshot in snapshots[0:5]:
        mock_metadata(old_sync_job, snapshot, 'complete', 1048576)
    for snapshot in snapshots[5:10]:
        mock_metadata(old_sync_job, snapshot, 'error', None)

    # The latest job synced five files successfully and ran into three problems.
    for snapshot in snapshots[10:15]:
        mock_metadata(latest_sync_job, snapshot, 'complete', 1048576)
    stalled = snapshots[15]
    errored = snapshots[16]
    size_discrepancy = snapshots[17]
    mock_metadata(latest_sync_job, stalled, 'streaming', None)
    mock_metadata(latest_sync_job, errored, 'error', None)
    mock_metadata(latest_sync_job, size_discrepancy, 'complete', 65536)

    schema = app.config['RDS_SCHEMA_METADATA']
    with capture_app_logs(app):
        assert rds.fetch(f'SELECT count(*) FROM {schema}.canvas_sync_job_status')[0]['count'] == 18
        with mock_s3(app):
            result = ResyncCanvasSnapshots().run_wrapped()
            assert 'Canvas snapshot resync job dispatched to workers' in result
            assert_background_job_status('resync')
            assert f"Dispatched S3 resync of snapshot {stalled['filename']}" in caplog.text
            assert f"Dispatched S3 resync of snapshot {errored['filename']}" in caplog.text
            assert f"Dispatched S3 resync of snapshot {size_discrepancy['filename']}" in caplog.text
            assert '3 successful dispatches, 0 failures' in caplog.text

            assert rds.fetch(f'SELECT count(*) FROM {schema}.canvas_sync_job_status')[0]['count'] == 21
            resync_results = rds.fetch(f"SELECT * FROM {schema}.canvas_sync_job_status WHERE job_id LIKE 'resync%'")
            assert len(resync_results) == 3

            urls = []
            for r in resync_results:
                assert r['job_id'].startswith('resync_')
                assert r['filename']
                assert r['canvas_table']
                assert r['created_at']
                assert r['updated_at']
                urls.append(r['source_url'])
            assert stalled['url'] in urls
            assert errored['url'] in urls
            assert size_discrepancy['url'] in urls
def get_current_term_index():
    rds_terms_schema = app.config['RDS_SCHEMA_TERMS']
    sql = f'SELECT * FROM {rds_terms_schema}.current_term_index LIMIT 1'
    rows = rds.fetch(sql)
    return rows and rows[0]
def get_sids_with_registration_imports():
    sql = f"""SELECT sid FROM {metadata_schema()}.registration_import_status
        WHERE status = 'success'"""
    return rds.fetch(sql)
def background_job_status_by_date(created_date):
    sql = f'SELECT * FROM {_rds_schema()}.background_job_status WHERE cast(created_at as date) = %s'
    return rds.fetch(
        sql,
        params=[created_date.strftime('%Y-%m-%d')],
    )
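# Illustrative call (assumption, not from the source): list background job statuses
# created on a given day; any object with a strftime method works for created_date.
from datetime import date

for row in background_job_status_by_date(date(2023, 5, 1)):
    print(row['job_id'], row['status'])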