def test_server_error(self, app, caplog):
    """Logs unexpected server errors."""
    rate_limit_response = MockResponse(429, {}, '{"message": "Easy, tiger."}')
    with capture_app_logs(app), register_mock(canvas_data.get_snapshots, rate_limit_response):
        # The mocked 429 should be logged and surface as a falsey result.
        result = canvas_data.get_snapshots()
        assert '429 Client Error: Too Many Requests' in caplog.text
        assert not result
def test_get_snapshots(self, app):
    """Returns fixture data."""
    snapshots = canvas_data.get_snapshots()
    assert snapshots['incomplete'] is False
    assert snapshots['schemaVersion'] == '2.0.0'
    files = snapshots['files']
    assert len(files) == 348
    # Spot-check the first file entry against the fixture.
    first_file = files[0]
    assert first_file['filename'] == 'account_dim-00000-5eb7ee9e.gz'
    assert first_file['table'] == 'account_dim'
    assert first_file['partial'] is False
    expected_url_prefix = 'https://hosted-data-work.s3.amazonaws.com/20180320T160000.415/dw_split/12345600000054321'
    assert first_file['url'].startswith(expected_url_prefix)
def test_resync_canvas_snapshots(self, app, metadata_db, caplog):
    """Dispatches a complete resync job against fixtures."""
    caplog.set_level(logging.INFO)
    snapshots = canvas_data.get_snapshots()['files']

    def record_sync_status(job_id, snapshot, status, destination_size):
        # Seed a sync-status row for the snapshot, then move it to the given status.
        metadata.create_canvas_sync_status(job_id, snapshot['filename'], snapshot['table'], snapshot['url'])
        key = f"{get_s3_canvas_daily_path()}/{snapshot['table']}/{snapshot['filename']}"
        metadata.update_canvas_sync_status(
            job_id, key, status, source_size=1048576, destination_size=destination_size)

    old_sync_job = 'sync_152550000'
    latest_sync_job = 'sync_152560000'

    # The older job should be ignored by the resync.
    for snapshot in snapshots[:5]:
        record_sync_status(old_sync_job, snapshot, 'complete', 1048576)
    for snapshot in snapshots[5:10]:
        record_sync_status(old_sync_job, snapshot, 'error', None)

    # The latest job synced five files successfully and ran into three problems.
    for snapshot in snapshots[10:15]:
        record_sync_status(latest_sync_job, snapshot, 'complete', 1048576)
    stalled, errored, size_discrepancy = snapshots[15], snapshots[16], snapshots[17]
    record_sync_status(latest_sync_job, stalled, 'streaming', None)
    record_sync_status(latest_sync_job, errored, 'error', None)
    record_sync_status(latest_sync_job, size_discrepancy, 'complete', 65536)

    schema = app.config['RDS_SCHEMA_METADATA']
    count_sql = f'SELECT count(*) FROM {schema}.canvas_sync_job_status'
    with capture_app_logs(app):
        assert rds.fetch(count_sql)[0]['count'] == 18
        with mock_s3(app):
            result = ResyncCanvasSnapshots().run_wrapped()
            assert 'Canvas snapshot resync job dispatched to workers' in result
            assert_background_job_status('resync')
            # Exactly the three problem snapshots should have been re-dispatched.
            for problem in (stalled, errored, size_discrepancy):
                assert f"Dispatched S3 resync of snapshot {problem['filename']}" in caplog.text
            assert '3 successful dispatches, 0 failures' in caplog.text
            assert rds.fetch(count_sql)[0]['count'] == 21
            resync_results = rds.fetch(
                f"SELECT * FROM {schema}.canvas_sync_job_status WHERE job_id LIKE 'resync%'")
            assert len(resync_results) == 3
            source_urls = []
            for row in resync_results:
                assert row['job_id'].startswith('resync_')
                assert row['filename']
                assert row['canvas_table']
                assert row['created_at']
                assert row['updated_at']
                source_urls.append(row['source_url'])
            for problem in (stalled, errored, size_discrepancy):
                assert problem['url'] in source_urls
def run(self, cleanup=True):
    """Dispatch S3 sync jobs for complete (non-partial) 'requests' snapshots.

    Records a sync-status metadata row per dispatched snapshot; when
    ``cleanup`` is True, also removes obsolete requests snapshots from S3.
    Returns a summary string with dispatch success/failure counts.
    """
    job_id = self.generate_job_id()
    app.logger.info(f'Starting Canvas snapshot sync job... (id={job_id})')
    snapshot_response = canvas_data.get_snapshots()
    if not snapshot_response:
        raise BackgroundJobError('Error retrieving Canvas data snapshots, aborting job.')
    snapshots = snapshot_response.get('files', [])

    # Only full (non-partial) snapshots of the requests table are synced here.
    snapshots_to_sync = [
        s for s in snapshots
        if s['table'] == 'requests' and s['partial'] is False
    ]
    app.logger.info(
        f'Will sync {len(snapshots_to_sync)} of {len(snapshots)} available files from Canvas Data.')

    success = 0
    failure = 0
    for snapshot in snapshots_to_sync:
        metadata.create_canvas_sync_status(
            job_id=job_id,
            filename=snapshot['filename'],
            canvas_table=snapshot['table'],
            source_url=snapshot['url'],
        )
        key = '/'.join([
            app.config['LOCH_S3_CANVAS_DATA_PATH_HISTORICAL'],
            snapshot['table'],
            snapshot['filename'],
        ])
        response = dispatch(
            'sync_file_to_s3',
            data={'canvas_sync_job_id': job_id, 'url': snapshot['url'], 'key': key},
        )
        if response:
            app.logger.info('Dispatched S3 sync of snapshot ' + snapshot['filename'])
            success += 1
        else:
            app.logger.error('Failed to dispatch S3 sync of snapshot ' + snapshot['filename'])
            metadata.update_canvas_sync_status(
                job_id, key, 'error', details=f'Failed to dispatch: {response}')
            failure += 1

    if cleanup:
        app.logger.info('Will remove obsolete snapshots from S3.')
        current_snapshot_filenames = [s['filename'] for s in snapshots_to_sync]
        requests_prefix = app.config['LOCH_S3_CANVAS_DATA_PATH_HISTORICAL'] + '/requests'
        # Delete everything under the requests prefix except the files just synced.
        if not s3.delete_objects_with_prefix(requests_prefix, whitelist=current_snapshot_filenames):
            app.logger.error('Cleanup of obsolete snapshots failed.')
    return f'Canvas snapshot sync job dispatched to workers ({success} successful dispatches, {failure} failures).'
def run(self, cleanup=True):
    """Dispatch S3 sync jobs for the latest Canvas Data snapshots.

    All snapshots are synced for tables other than 'requests'. For the large
    'requests' table, only partial snapshots and snapshots dated on or after
    the configured cutoff (LOCH_CANVAS_DATA_REQUESTS_CUTOFF_DATE) are synced.
    When ``cleanup`` is True, obsolete requests snapshots are removed from S3
    after dispatch. Returns a summary string with success/failure counts.

    :param cleanup: if True (default), delete obsolete requests snapshots from S3
    :raises BackgroundJobError: if the Canvas snapshot listing cannot be retrieved
    """
    job_id = self.generate_job_id()
    app.logger.info(f'Starting Canvas snapshot sync job... (id={job_id})')
    snapshot_response = canvas_data.get_snapshots()
    if not snapshot_response:
        raise BackgroundJobError('Error retrieving Canvas data snapshots, aborting job.')
    snapshots = snapshot_response.get('files', [])

    # Raw string fixes the invalid '\d' escape sequence (a SyntaxWarning on
    # modern Python); compiling once hoists the pattern out of the filter loop.
    requests_date_pattern = re.compile(r'requests/(20\d{6})')

    def should_sync(snapshot):
        # Sync all snapshots for tables other than requests. For the requests
        # table, sync partial snapshots and those at or after the cutoff date.
        if snapshot['table'] != 'requests' or snapshot['partial'] is True:
            return True
        match = requests_date_pattern.search(snapshot['url'])
        return match is not None and (
            match[1] >= app.config['LOCH_CANVAS_DATA_REQUESTS_CUTOFF_DATE'])

    snapshots_to_sync = [s for s in snapshots if should_sync(s)]
    app.logger.info(
        f'Will sync {len(snapshots_to_sync)} of {len(snapshots)} available files from Canvas Data.')

    success = 0
    failure = 0
    for snapshot in snapshots_to_sync:
        metadata.create_canvas_sync_status(
            job_id=job_id,
            filename=snapshot['filename'],
            canvas_table=snapshot['table'],
            source_url=snapshot['url'],
        )
        # Requests snapshots land under the current-term path; all other
        # tables go to the daily path.
        if snapshot['table'] == 'requests':
            base_path = berkeley.s3_canvas_data_path_current_term()
        else:
            base_path = get_s3_canvas_daily_path()
        key = '/'.join([base_path, snapshot['table'], snapshot['filename']])
        response = dispatch('sync_file_to_s3', data={
            'canvas_sync_job_id': job_id,
            'url': snapshot['url'],
            'key': key,
        })
        if not response:
            app.logger.error('Failed to dispatch S3 sync of snapshot ' + snapshot['filename'])
            metadata.update_canvas_sync_status(
                job_id, key, 'error', details=f'Failed to dispatch: {response}')
            failure += 1
        else:
            app.logger.info('Dispatched S3 sync of snapshot ' + snapshot['filename'])
            success += 1

    if cleanup:
        app.logger.info('Will remove obsolete snapshots from S3.')
        current_snapshot_filenames = [s['filename'] for s in snapshots_to_sync]
        requests_prefix = berkeley.s3_canvas_data_path_current_term() + '/requests'
        # Delete everything under the requests prefix except the files just synced.
        delete_result = s3.delete_objects_with_prefix(
            requests_prefix, whitelist=current_snapshot_filenames)
        if not delete_result:
            app.logger.error('Cleanup of obsolete snapshots failed.')
    return f'Canvas snapshot sync job dispatched to workers ({success} successful dispatches, {failure} failures).'