def test_remove_obsolete_files(self, app, caplog, cleanup_s3):
    """Removes files from S3 following prefix and whitelist rules."""
    caplog.set_level(logging.INFO)
    with capture_app_logs(app):
        prefix1 = app.config['LOCH_S3_PREFIX_TESTEXT'] + '/001'
        prefix2 = app.config['LOCH_S3_PREFIX_TESTEXT'] + '/002'

        assert s3.upload_from_url('http://shakespeare.mit.edu/Poetry/sonnet.XX.html', prefix1 + '/xx/sonnet-xx.html')
        assert s3.upload_from_url('http://shakespeare.mit.edu/Poetry/sonnet.XXI.html', prefix1 + '/xxi/sonnet-xxi.html')
        assert s3.upload_from_url('http://shakespeare.mit.edu/Poetry/sonnet.XXII.html', prefix1 + '/xxii/sonnet-xxii.html')
        assert s3.upload_from_url('http://shakespeare.mit.edu/Poetry/sonnet.XLV.html', prefix2 + '/xlv/sonnet-xlv.html')

        whitelist = ['sonnet-xxi.html', 'sonnet-xxii.html']
        assert s3.delete_objects_with_prefix(prefix1, whitelist) is True

        assert f'3 key(s) matching prefix "{prefix1}"' in caplog.text
        assert '2 key(s) in whitelist' in caplog.text
        assert 'will delete 1 object(s)' in caplog.text

        assert s3.object_exists(prefix1 + '/xx/sonnet-xx.html') is False
        assert s3.object_exists(prefix1 + '/xxi/sonnet-xxi.html') is True
        assert s3.object_exists(prefix1 + '/xxii/sonnet-xxii.html') is True
        assert s3.object_exists(prefix2 + '/xlv/sonnet-xlv.html') is True
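# The assertions above pin down the contract of s3.delete_objects_with_prefix: list every key
# under a prefix, keep any whose basename is whitelisted, delete the rest, and log the counts.
# Below is a minimal sketch of a helper satisfying that contract, assuming boto3 and a Flask
# app context; it is illustrative only and not the actual s3 module implementation.
import boto3
from flask import current_app as app


def delete_objects_with_prefix(prefix, whitelist=()):
    """Delete keys under `prefix`, skipping any whose basename appears in `whitelist`."""
    client = boto3.client('s3')
    bucket = app.config['LOCH_S3_BUCKET']
    paginator = client.get_paginator('list_objects_v2')
    keys = []
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        keys += [obj['Key'] for obj in page.get('Contents', [])]
    keys_to_delete = [k for k in keys if k.split('/')[-1] not in whitelist]
    app.logger.info(
        f'{len(keys)} key(s) matching prefix "{prefix}", {len(whitelist)} key(s) in whitelist, '
        f'will delete {len(keys_to_delete)} object(s)')
    if keys_to_delete:
        client.delete_objects(Bucket=bucket, Delete={'Objects': [{'Key': k} for k in keys_to_delete]})
    return True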
def test_file_upload_and_delete(self, app, cleanup_s3):
    """Can upload and delete files in S3."""
    url1 = 'http://shakespeare.mit.edu/Poetry/sonnet.XLV.html'
    key1 = app.config['LOCH_S3_PREFIX_TESTEXT'] + '/00001/sonnet-xlv.html'
    url2 = 'http://shakespeare.mit.edu/Poetry/sonnet.LXII.html'
    key2 = app.config['LOCH_S3_PREFIX_TESTEXT'] + '/00002/sonnet-xlii.html'

    assert s3.object_exists(key1) is False
    assert s3.upload_from_url(url1, key1)['ContentLength'] == 767
    assert s3.object_exists(key1) is True
    assert s3.get_keys_with_prefix(app.config['LOCH_S3_PREFIX_TESTEXT'] + '/00001') == [key1]

    assert s3.object_exists(key2) is False
    assert s3.upload_from_url(url2, key2)['ContentLength'] == 743
    assert s3.object_exists(key2) is True
    assert s3.get_keys_with_prefix(app.config['LOCH_S3_PREFIX_TESTEXT'] + '/00002') == [key2]

    client = s3.get_client()
    contents1 = client.get_object(Bucket=app.config['LOCH_S3_BUCKET'], Key=key1)['Body'].read().decode('utf-8')
    assert 'These present-absent with swift motion slide' in contents1
    contents2 = client.get_object(Bucket=app.config['LOCH_S3_BUCKET'], Key=key2)['Body'].read().decode('utf-8')
    assert 'Beated and chopp\'d with tann\'d antiquity' in contents2
def test_s3_upload_error_handling(self, app, caplog, bad_bucket):
    """Handles and logs connection errors on S3 upload."""
    with capture_app_logs(app):
        url = 'http://shakespeare.mit.edu/Poetry/sonnet.XLV.html'
        key = app.config['LOCH_S3_PREFIX_TESTEXT'] + '/00001/sonnet-xlv.html'

        with pytest.raises(ValueError):
            s3.upload_from_url(url, key)

        assert 'Error on S3 upload' in caplog.text
        assert 'the bucket \'not-a-bucket-nohow\' does not exist, or is forbidden for access' in caplog.text
# Activate the responses mock so the 500 stub below actually intercepts the GET.
@responses.activate
def test_source_url_error_handling(self, app, caplog):
    """Handles and logs connection errors to source URL."""
    with capture_app_logs(app):
        url = 'http://shakespeare.mit.edu/Poetry/sonnet.XLV.html'
        key = app.config['LOCH_S3_PREFIX_TESTEXT'] + '/00001/sonnet-xlv.html'
        responses.add(responses.GET, url, status=500, body='{"message": "Internal server error."}')

        with pytest.raises(ConnectionError):
            s3.upload_from_url(url, key)

        assert 'Received unexpected status code, aborting S3 upload' in caplog.text
        assert 'status=500' in caplog.text
        assert 'body={"message": "Internal server error."}' in caplog.text
        assert f'url={url}' in caplog.text
        assert f'key={key}' in caplog.text
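# Taken together, these tests describe s3.upload_from_url's contract: stream the source URL into
# the bucket, return the object's head metadata (including ContentLength), raise ConnectionError
# on an unexpected source status, and raise ValueError after logging 'Error on S3 upload' when
# the bucket write fails. The sketch below is one plausible shape for such a helper, assuming
# requests + boto3; the real module's internals may differ.
import boto3
import requests
from botocore.exceptions import ClientError
from flask import current_app as app


def upload_from_url(url, key, on_stream_opened=None):
    response = requests.get(url, stream=True)
    if response.status_code != 200:
        app.logger.error(
            'Received unexpected status code, aborting S3 upload '
            f'(status={response.status_code}, body={response.text}, url={url}, key={key})')
        raise ConnectionError(f'Unexpected status from source URL: {response.status_code}')
    if on_stream_opened:
        # Give callers a look at the source headers (e.g. Content-Length) before streaming.
        on_stream_opened(response.headers)
    bucket = app.config['LOCH_S3_BUCKET']
    client = boto3.client('s3')
    try:
        client.upload_fileobj(response.raw, bucket, key)
        # Return head metadata so callers can read ContentLength, as the tests above expect.
        return client.head_object(Bucket=bucket, Key=key)
    except ClientError as e:
        app.logger.error(f'Error on S3 upload (url={url}, key={key}): {e}')
        raise ValueError(e) from e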
def run(self, url, key, canvas_sync_job_id=None):
    """Stream a file from `url` into S3 at `key`, updating Canvas sync status along the way."""
    if canvas_sync_job_id:
        update_canvas_sync_status(canvas_sync_job_id, key, 'started')
    if s3.object_exists(key):
        # Don't re-upload a file that is already in the bucket; mark the sync as a duplicate.
        app.logger.info(f'Key {key} exists, skipping upload')
        if canvas_sync_job_id:
            update_canvas_sync_status(canvas_sync_job_id, key, 'duplicate')
        return False
    else:
        app.logger.info(f'Key {key} does not exist, starting upload')
        try:
            def update_streaming_status(headers):
                update_canvas_sync_status(canvas_sync_job_id, key, 'streaming', source_size=headers.get('Content-Length'))

            response = s3.upload_from_url(url, key, on_stream_opened=update_streaming_status)
            if response and canvas_sync_job_id:
                destination_size = response.get('ContentLength')
                update_canvas_sync_status(canvas_sync_job_id, key, 'complete', destination_size=destination_size)
                create_canvas_snapshot(key, size=destination_size)
            return True
        except (ClientError, ConnectionError, ValueError) as e:
            if canvas_sync_job_id:
                update_canvas_sync_status(canvas_sync_job_id, key, 'error', details=str(e))
            return False
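# Hypothetical usage of the job above; the wrapping class name, URL, key, and job id are
# illustrative only and not taken from the codebase:
#
#     job = SyncFileToS3()
#     job.run(
#         url='https://example.instructure.com/files/123/download',
#         key='canvas/daily/2020/08/14/requests-00000.gz',
#         canvas_sync_job_id='sync_123',
#     )
#
# Returns False without re-uploading when the key already exists (status 'duplicate');
# returns True after a successful streamed upload (status 'streaming', then 'complete').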
def process_archives(self, frequency, datestamp, job_id):
    s3_key = app.config['LOCH_S3_PIAZZA_DATA_PATH']
    self.sid = app.config['PIAZZA_API_SID']  # the Piazza school ID for Berkeley
    self.session_id = app.config['PIAZZA_API_SESSIONID']  # a 'random string', but we still get it from config
    self.headers = {
        'Content-Type': 'application/json',
        'CSRF-Token': self.session_id,
    }
    try:
        list_of_archives = self.get_list_of_archives(self.headers)
        archives_to_process = self.select_archives_by_type_and_date(list_of_archives, frequency, datestamp)
        if not archives_to_process:
            app.logger.debug(f'{frequency}/{datestamp}: no archives found for these criteria')
            return f'{frequency}/{datestamp}: no archives found for these criteria'
        for file_number, archive_file in enumerate(archives_to_process):
            download_url = self.piazza_api('school.generate_url', self.headers, {'sid': self.sid, 'name': archive_file['name']})
            download_url = json.loads(download_url.text)['result']
            app.logger.debug('Download URL: ' + download_url)
            piazza_file_name = archive_file['name']
            # piazza_file_name is like 'daily_2020-08-14.zip' or 'full_2020-08-14.zip';
            # in S3 it will end up in e.g. .../piazza-data/daily/2020/08/14/daily_2020-08-14.zip
            parts = '/'.join(split(r'[._-]', piazza_file_name)[0:4])
            s3_file = f'{s3_key}/{parts}/{piazza_file_name}.zip'

            def update_streaming_status(headers):
                update_background_job_status(job_id, 'streaming', details=f"{s3_file}, size={headers.get('Content-Length')}")

            response = s3.upload_from_url(download_url, s3_file, on_stream_opened=update_streaming_status)
            if response and job_id:
                destination_size = response.get('ContentLength')
                update_background_job_status(job_id, 'stream complete', details=f'{s3_file}, stream complete, size={destination_size}')
    except Exception as e:
        # Let the people upstairs know; they're in charge.
        raise e
    return ', '.join(f"{a['name']}: {a['size']} bytes" for a in archives_to_process)
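# Worked example of the key-path derivation above; assumes `from re import split`,
# which the bare split() call in process_archives implies.
from re import split

piazza_file_name = 'daily_2020-08-14.zip'
parts = '/'.join(split(r'[._-]', piazza_file_name)[0:4])
assert parts == 'daily/2020/08/14'
# The archive therefore lands under LOCH_S3_PIAZZA_DATA_PATH + '/daily/2020/08/14/'.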