def test_remove_obsolete_files(self, app, caplog, cleanup_s3):
    """Removes files from S3 following prefix and whitelist rules."""
    caplog.set_level(logging.INFO)
    with capture_app_logs(app):
        prefix1 = app.config['LOCH_S3_PREFIX_TESTEXT'] + '/001'
        prefix2 = app.config['LOCH_S3_PREFIX_TESTEXT'] + '/002'
        assert s3.upload_from_url(
            'http://shakespeare.mit.edu/Poetry/sonnet.XX.html',
            prefix1 + '/xx/sonnet-xx.html',
        )
        assert s3.upload_from_url(
            'http://shakespeare.mit.edu/Poetry/sonnet.XXI.html',
            prefix1 + '/xxi/sonnet-xxi.html',
        )
        assert s3.upload_from_url(
            'http://shakespeare.mit.edu/Poetry/sonnet.XXII.html',
            prefix1 + '/xxii/sonnet-xxii.html',
        )
        assert s3.upload_from_url(
            'http://shakespeare.mit.edu/Poetry/sonnet.XLV.html',
            prefix2 + '/xlv/sonnet-xlv.html',
        )
        whitelist = ['sonnet-xxi.html', 'sonnet-xxii.html']
        assert s3.delete_objects_with_prefix(prefix1, whitelist) is True
        assert f'3 key(s) matching prefix "{prefix1}"' in caplog.text
        assert '2 key(s) in whitelist' in caplog.text
        assert 'will delete 1 object(s)' in caplog.text
        assert s3.object_exists(prefix1 + '/xx/sonnet-xx.html') is False
        assert s3.object_exists(prefix1 + '/xxi/sonnet-xxi.html') is True
        assert s3.object_exists(prefix1 + '/xxii/sonnet-xxii.html') is True
        assert s3.object_exists(prefix2 + '/xlv/sonnet-xlv.html') is True
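# A minimal sketch of a delete_objects_with_prefix helper consistent with the
# assertions above, assuming plain boto3 (the project's real helper may differ):
# list every key under the prefix, keep keys whose basename appears in the
# whitelist, and batch-delete the rest.
import boto3


def delete_objects_with_prefix_sketch(bucket, prefix, whitelist=()):
    client = boto3.client('s3')
    paginator = client.get_paginator('list_objects_v2')
    # Collect all keys under the prefix.
    keys = [
        obj['Key']
        for page in paginator.paginate(Bucket=bucket, Prefix=prefix)
        for obj in page.get('Contents', [])
    ]
    # Whitelisted basenames survive; everything else is deleted.
    doomed = [k for k in keys if k.split('/')[-1] not in whitelist]
    if doomed:
        client.delete_objects(Bucket=bucket, Delete={'Objects': [{'Key': k} for k in doomed]})
    return True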
def test_file_upload_and_delete(self, app, cleanup_s3):
    """Can upload and delete files in S3."""
    url1 = 'http://shakespeare.mit.edu/Poetry/sonnet.XLV.html'
    key1 = app.config['LOCH_S3_PREFIX_TESTEXT'] + '/00001/sonnet-xlv.html'
    url2 = 'http://shakespeare.mit.edu/Poetry/sonnet.LXII.html'
    key2 = app.config['LOCH_S3_PREFIX_TESTEXT'] + '/00002/sonnet-lxii.html'
    assert s3.object_exists(key1) is False
    assert s3.upload_from_url(url1, key1)['ContentLength'] == 767
    assert s3.object_exists(key1) is True
    assert s3.get_keys_with_prefix(app.config['LOCH_S3_PREFIX_TESTEXT'] + '/00001') == [key1]
    assert s3.object_exists(key2) is False
    assert s3.upload_from_url(url2, key2)['ContentLength'] == 743
    assert s3.object_exists(key2) is True
    assert s3.get_keys_with_prefix(app.config['LOCH_S3_PREFIX_TESTEXT'] + '/00002') == [key2]
    client = s3.get_client()
    contents1 = client.get_object(Bucket=app.config['LOCH_S3_BUCKET'], Key=key1)['Body'].read().decode('utf-8')
    assert 'These present-absent with swift motion slide' in contents1
    contents2 = client.get_object(Bucket=app.config['LOCH_S3_BUCKET'], Key=key2)['Body'].read().decode('utf-8')
    assert 'Beated and chopp\'d with tann\'d antiquity' in contents2
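# The test above also exercises s3.get_keys_with_prefix. A minimal sketch of
# such a helper, assuming plain boto3 (the real implementation may differ):
import boto3


def get_keys_with_prefix_sketch(bucket, prefix):
    paginator = boto3.client('s3').get_paginator('list_objects_v2')
    keys = []
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        # Pages with no matches omit the 'Contents' key entirely.
        keys.extend(obj['Key'] for obj in page.get('Contents', []))
    return keys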
def test_s3_nonexistent_object(self, app, caplog, bad_bucket):
    """Returns false on S3 checks for nonexistent objects."""
    with capture_app_logs(app):
        key = app.config['LOCH_S3_PREFIX_TESTEXT'] + '/00001/sonnet-xlv.html'
        response = s3.object_exists(key)
        assert response is False
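# A minimal sketch of an object_exists helper consistent with the test above,
# assuming boto3's head_object and treating any ClientError (e.g. a 404) as
# "does not exist"; the project's real helper may be more discriminating.
import boto3
from botocore.exceptions import ClientError


def object_exists_sketch(bucket, key):
    try:
        boto3.client('s3').head_object(Bucket=bucket, Key=key)
        return True
    except ClientError:
        return False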
def transform(self, s3_source, s3_dest, job_id):
    objects = s3.get_keys_with_prefix(s3_source)
    if len(objects) == 0:
        message = f'Zero objects found in {s3_source}. Quitting.'
        app.logger.info(message)
        return message
    app.logger.info(f'Will transform {len(objects)} objects from {s3_source} and put results to {s3_dest}.')
    objects_updated = 0
    new_objects = 0
    objects_in_error = 0
    total_objects = 0
    for o in objects:
        # file_name is like 'daily_2020-08-14.zip'.
        file_name = o.split('/')[-1]
        app.logger.debug(f'Processing {file_name}')
        piazza_zip_file = s3.get_object_compressed_text_reader(o)
        for subfile in piazza_zip_file.namelist():
            if not subfile.endswith('.json'):
                # Not a JSON file; skip it.
                continue
            try:
                json_file = subfile.split('/')[-1]
                course_id = subfile.split('/')[-2]
                file_type = json_file.split('_')[0]
                record = piazza_zip_file.read(subfile)
                with tempfile.TemporaryFile() as result:
                    s3_object = f'{s3_dest}/{file_type}/{course_id}/{json_file}'
                    if s3.object_exists(s3_object):
                        objects_updated += 1
                    else:
                        new_objects += 1
                    result.write(record)
                    s3.upload_file(result, s3_object)
                total_objects += 1
                # Update the job queue every 1000 files.
                if total_objects % 1000 == 0:
                    message = f'{subfile}, {total_objects} so far; '\
                        + f'{new_objects} new files; '\
                        + f'{objects_updated} existing files; '\
                        + f'{objects_in_error} files in error '\
                        + f'({len(objects)} objects in all)'
                    update_background_job_status(job_id, 'transforming', details=message)
            except Exception as e:
                app.logger.error(f'Could not extract {subfile}')
                app.logger.error(e)
                objects_in_error += 1
    message = f'Transformed {len(objects)} input files; created {new_objects} new objects; '\
        + f'updated {objects_updated} existing objects; {objects_in_error} objects in error.'
    app.logger.info(message)
    return message
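# The transform above relies on s3.get_object_compressed_text_reader returning
# a zipfile.ZipFile-like object (it calls .namelist() and .read()). A minimal
# sketch of such a helper under that assumption, using plain boto3:
import io
import zipfile

import boto3


def get_object_compressed_text_reader_sketch(bucket, key):
    # Pull the whole archive into memory and wrap it in a ZipFile reader.
    body = boto3.client('s3').get_object(Bucket=bucket, Key=key)['Body'].read()
    return zipfile.ZipFile(io.BytesIO(body))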
def test_canvas_sync_metadata(self, app, metadata_db):
    """Makes an API call and puts the result in S3."""
    with mock_s3(app):
        bucket = app.config['LOCH_S3_BUCKET']
        path = '/api/v1/audit/grade_change/courses/7654321'
        s3_key = f'{bucket}/grade_change_log/grade_change_log_7654321'
        result = ImportCanvasApiData().run_wrapped(
            course_id='7654321',
            path=path,
            s3_key=s3_key,
            job_id='ImportCanvasGradeChangeLog_123',
        )
        assert result is True
        assert s3.object_exists(f'{s3_key}_0.json') is True
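# mock_s3(app) above looks like a context manager that stands up a fake S3
# bucket for the duration of the test. A minimal sketch of such a wrapper,
# assuming the moto library (whether the project actually uses moto is an
# assumption):
import contextlib

import boto3
import moto


@contextlib.contextmanager
def mock_s3_sketch(app):
    with moto.mock_s3():
        # Create the configured bucket inside the fake S3 backend.
        boto3.client('s3').create_bucket(Bucket=app.config['LOCH_S3_BUCKET'])
        yield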
def transform(self, s3_source, s3_dest, key=None):
    objects = s3.get_keys_with_prefix(s3_source)
    app.logger.info(f'Will transform {len(objects)} objects from {s3_source} and put results to {s3_dest}.')
    skip_count = 0
    for o in objects:
        file_name = o.split('/')[-1]
        if s3.object_exists(f'{s3_dest}/{file_name}'):
            skip_count += 1
            continue
        canvas_api_data = s3.get_object_json(o).get(key) if key else s3.get_object_json(o)
        with tempfile.TemporaryFile() as result:
            course_id = int(file_name.split('_')[-2])
            # Stamp each record with its course id and write newline-delimited JSON.
            for record in canvas_api_data:
                record['course_id'] = course_id
                result.write(json.dumps(record).encode() + b'\n')
            s3.upload_file(result, f'{s3_dest}/{file_name}')
    app.logger.info(f'Transformed {len(objects) - skip_count} new objects; skipped {skip_count} existing objects.')
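# This transform assumes s3.get_object_json fetches an object and parses its
# body as JSON. A minimal sketch of that helper under the same assumption:
import json

import boto3


def get_object_json_sketch(bucket, key):
    body = boto3.client('s3').get_object(Bucket=bucket, Key=key)['Body'].read()
    return json.loads(body)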
def run(self, url, key, canvas_sync_job_id=None):
    if canvas_sync_job_id:
        update_canvas_sync_status(canvas_sync_job_id, key, 'started')
    if s3.object_exists(key):
        app.logger.info(f'Key {key} exists, skipping upload')
        if canvas_sync_job_id:
            update_canvas_sync_status(canvas_sync_job_id, key, 'duplicate')
        return False
    else:
        app.logger.info(f'Key {key} does not exist, starting upload')
        try:
            def update_streaming_status(headers):
                update_canvas_sync_status(
                    canvas_sync_job_id, key, 'streaming',
                    source_size=headers.get('Content-Length'),
                )
            response = s3.upload_from_url(url, key, on_stream_opened=update_streaming_status)
            if response and canvas_sync_job_id:
                destination_size = response.get('ContentLength')
                update_canvas_sync_status(
                    canvas_sync_job_id, key, 'complete',
                    destination_size=destination_size,
                )
                create_canvas_snapshot(key, size=destination_size)
            return True
        except (ClientError, ConnectionError, ValueError) as e:
            if canvas_sync_job_id:
                update_canvas_sync_status(canvas_sync_job_id, key, 'error', details=str(e))
            return False
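# run() above passes an on_stream_opened callback that reads the source's
# 'Content-Length' header. A minimal sketch of an upload_from_url helper with
# that hook, assuming requests + boto3 (the project's real helper may differ):
# stream the URL, fire the callback with the response headers, and pipe the
# body to S3.
import boto3
import requests


def upload_from_url_sketch(url, bucket, key, on_stream_opened=None):
    client = boto3.client('s3')
    with requests.get(url, stream=True) as response:
        response.raise_for_status()
        if on_stream_opened:
            on_stream_opened(response.headers)
        # response.raw is a file-like object; upload_fileobj streams it to S3.
        client.upload_fileobj(response.raw, bucket, key)
    # Mirror the dict-with-ContentLength shape the caller reads above.
    return client.head_object(Bucket=bucket, Key=key)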