def test_delete_remove_leaked_stored_objects_and_uploaded_files(self):
    workflow = Workflow.create_and_init()
    # If the user deletes a workflow, all data associated with that
    # workflow should disappear. Postgres handles DB objects; but Django's
    # ORM doesn't do a great job with StoredObjects and UploadedFiles.
    #
    # This test isn't about minutiae. It's just: if the user deletes a
    # Workflow, make sure all data gets deleted.
    #
    # TODO fix all other bugs that leak data.
    wf_module = workflow.tabs.first().wf_modules.create(
        order=0, slug="step-1", module_id_name="x"
    )

    # Add a StoredObject ... and leak it
    wf_module.store_fetched_table(pd.DataFrame({"A": [1, 2]}))
    stored_object_key = wf_module.stored_objects.first().key
    wf_module.stored_objects.all()._raw_delete("default")  # skip S3-delete

    # Add an UploadedFile that is missing its DB entry. (Even if we fix all
    # bugs that leak an S3 object after deleting a DB entry [and 2019-06-03
    # there are still more], we'll still need to handle missing DB entries
    # from legacy code.)
    uploaded_file_key = f"{wf_module.uploaded_file_prefix}{uuid.uuid4()}.csv"
    minio.put_bytes(minio.UserFilesBucket, uploaded_file_key, b"A\nb")

    workflow.delete()
    self.assertFalse(minio.exists(minio.StoredObjectsBucket, stored_object_key))
    self.assertFalse(minio.exists(minio.UserFilesBucket, uploaded_file_key))
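# A minimal sketch -- not the project's actual implementation -- of the
# cleanup behavior the test above pins down: deleting a Workflow must also
# delete the S3 objects under each step's prefixes, even when the DB rows
# that pointed at them are already gone. `list_keys()` is a hypothetical
# listing helper; the real minio wrapper may expose listing differently.
def delete_workflow_s3_data(workflow):
    for tab in workflow.tabs.all():
        for wf_module in tab.wf_modules.all():
            # Uploaded files all live under one per-step prefix, so a
            # prefix scan catches leaked objects that have no DB row.
            for key in list_keys(
                minio.UserFilesBucket, prefix=wf_module.uploaded_file_prefix
            ):
                minio.remove(minio.UserFilesBucket, key)
            # StoredObjects would need an equivalent per-step prefix scan
            # in minio.StoredObjectsBucket.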
def test_abort_upload(self):
    user = User.objects.create(username='******', email='*****@*****.**')
    workflow = Workflow.create_and_init(owner=user)
    uuid = str(uuidgen.uuid4())
    key = f'wf-123/wfm-234/{uuid}.csv'
    wf_module = workflow.tabs.first().wf_modules.create(
        order=0,
        module_id_name='x',
        inprogress_file_upload_id=None,
        inprogress_file_upload_key=key,
        inprogress_file_upload_last_accessed_at=timezone.now(),
    )
    # Let's pretend the user has uploaded at least partial data.
    minio.put_bytes(
        minio.UserFilesBucket,
        key,
        b'1234567',
        ContentDisposition="attachment; filename*=UTF-8''file.csv",
    )
    response = self.run_handler(
        abort_upload, user=user, workflow=workflow, wfModuleId=wf_module.id, key=key
    )
    self.assertResponse(response, data=None)
    wf_module.refresh_from_db()
    self.assertIsNone(wf_module.inprogress_file_upload_id)
    self.assertIsNone(wf_module.inprogress_file_upload_key)
    self.assertIsNone(wf_module.inprogress_file_upload_last_accessed_at)
    # Ensure the file is deleted from S3
    self.assertFalse(minio.exists(minio.UserFilesBucket, key))
def test_convert_to_uploaded_file_happy_path(self):
    workflow = Workflow.create_and_init()
    wf_module = workflow.tabs.first().wf_modules.create(
        order=0, slug="step-1", module_id_name="x"
    )
    ipu = wf_module.in_progress_uploads.create()
    minio.put_bytes(ipu.Bucket, ipu.get_upload_key(), b"1234567")
    uploaded_file = ipu.convert_to_uploaded_file("test sheet.xlsx")
    self.assertEqual(uploaded_file.uuid, str(ipu.id))
    final_key = wf_module.uploaded_file_prefix + str(ipu.id) + ".xlsx"
    # The new file on S3 has the right bytes and metadata
    self.assertEqual(
        minio.get_object_with_data(minio.UserFilesBucket, final_key)["Body"],
        b"1234567",
    )
    self.assertEqual(
        minio.client.head_object(Bucket=minio.UserFilesBucket, Key=final_key)[
            "ContentDisposition"
        ],
        "attachment; filename*=UTF-8''test%20sheet.xlsx",
    )
    # The InProgressUpload is completed
    self.assertEqual(ipu.is_completed, True)
    ipu.refresh_from_db()
    self.assertEqual(ipu.is_completed, True)  # also in the DB
    # The temporary upload object is deleted
    self.assertFalse(minio.exists(minio.UserFilesBucket, ipu.get_upload_key()))
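# A rough sketch of what `convert_to_uploaded_file(filename)` has to do to
# satisfy the test above -- the real implementation may differ. It copies
# the temp object to its final key with a Content-Disposition that names
# the file, records an UploadedFile row, marks the upload completed, and
# deletes the temp object. Assumes `from pathlib import PurePath` and
# `import urllib.parse`; `minio.client` is boto3-style, as used above.
def convert_to_uploaded_file(self, filename):
    suffix = PurePath(filename).suffix  # e.g. ".xlsx"
    final_key = self.wf_module.uploaded_file_prefix + str(self.id) + suffix
    minio.client.copy_object(
        Bucket=minio.UserFilesBucket,
        Key=final_key,
        CopySource={"Bucket": self.Bucket, "Key": self.get_upload_key()},
        MetadataDirective="REPLACE",
        ContentDisposition=(
            "attachment; filename*=UTF-8''" + urllib.parse.quote(filename)
        ),
    )
    size = minio.client.head_object(Bucket=minio.UserFilesBucket, Key=final_key)[
        "ContentLength"
    ]
    uploaded_file = self.wf_module.uploaded_files.create(
        name=filename,
        size=size,
        uuid=str(self.id),
        bucket=minio.UserFilesBucket,
        key=final_key,
    )
    self.is_completed = True
    self.save(update_fields=["is_completed"])
    minio.remove(self.Bucket, self.get_upload_key())
    return uploaded_file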
def test_delete_wfmodule(self):
    result = ProcessResult(pandas.DataFrame({"a": [1]}))
    self.wf_module.cache_render_result(self.delta.id, result)
    parquet_key = self.wf_module.cached_render_result.parquet_key
    self.wf_module.delete()
    self.assertFalse(minio.exists(minio.CachedRenderResultsBucket, parquet_key))
def test_delete_deletes_from_s3(self):
    minio.put_bytes(minio.StoredObjectsBucket, "test.dat", b"abcd")
    workflow = Workflow.create_and_init()
    wf_module = workflow.tabs.first().wf_modules.create(order=0, slug="step-1")
    so = wf_module.stored_objects.create(
        size=4, bucket=minio.StoredObjectsBucket, key="test.dat", hash="123"
    )
    so.delete()
    self.assertFalse(minio.exists(minio.StoredObjectsBucket, "test.dat"))
def test_delete_wf_module_deletes_from_s3(self):
    minio.put_bytes(minio.StoredObjectsBucket, 'test.dat', b'abcd')
    workflow = Workflow.objects.create()
    tab = workflow.tabs.create(position=0)
    wf_module = tab.wf_modules.create(order=0)
    wf_module.stored_objects.create(
        size=4, bucket=minio.StoredObjectsBucket, key='test.dat', hash='123'
    )
    wf_module.delete()
    self.assertFalse(minio.exists(minio.StoredObjectsBucket, 'test.dat'))
def test_delete_s3_data_leaked_file(self):
    # Delete a file with our UUID but without an UploadedFile.
    workflow = Workflow.create_and_init()
    wf_module = workflow.tabs.first().wf_modules.create(
        order=0, slug="step-1", module_id_name="x"
    )
    ipu = wf_module.in_progress_uploads.create()
    key = wf_module.uploaded_file_prefix + str(ipu.id) + ".xlsx"
    minio.put_bytes(minio.UserFilesBucket, key, b"1234567")
    ipu.delete_s3_data()
    self.assertFalse(minio.exists(minio.UserFilesBucket, key))
def test_set_to_empty(self):
    result = ProcessResult(pandas.DataFrame({"a": [1]}))
    self.wf_module.cache_render_result(self.delta.id, result)
    parquet_key = self.wf_module.cached_render_result.parquet_key
    db_wf_module = WfModule.objects.get(id=self.wf_module.id)
    db_wf_module.clear_cached_render_result()
    self.assertIsNone(db_wf_module.cached_render_result)
    db_wf_module.refresh_from_db()
    self.assertIsNone(db_wf_module.cached_render_result)
    self.assertFalse(minio.exists(minio.CachedRenderResultsBucket, parquet_key))
def test_finish_upload_happy_path(self, send_delta):
    user = User.objects.create(username="******", email="*****@*****.**")
    workflow = Workflow.create_and_init(owner=user)
    wf_module = workflow.tabs.first().wf_modules.create(
        order=0, slug="step-1", module_id_name="x"
    )
    in_progress_upload = wf_module.in_progress_uploads.create(
        id="147a9f5d-5b3e-41c3-a968-a84a5a9d587f"
    )
    key = in_progress_upload.get_upload_key()
    minio.put_bytes(in_progress_upload.Bucket, key, b"1234567")
    send_delta.side_effect = async_noop
    response = self.run_handler(
        finish_upload,
        user=user,
        workflow=workflow,
        wfModuleId=wf_module.id,
        key=key,
        filename="test sheet.csv",
    )
    self.assertResponse(
        response, data={"uuid": "147a9f5d-5b3e-41c3-a968-a84a5a9d587f"}
    )
    # The temporary upload object is deleted
    self.assertFalse(minio.exists(in_progress_upload.Bucket, key))
    # A new UploadedFile is created
    uploaded_file = wf_module.uploaded_files.first()
    self.assertEqual(uploaded_file.name, "test sheet.csv")
    self.assertEqual(uploaded_file.size, 7)
    self.assertEqual(uploaded_file.uuid, "147a9f5d-5b3e-41c3-a968-a84a5a9d587f")
    self.assertEqual(uploaded_file.bucket, in_progress_upload.Bucket)
    final_key = (
        f"wf-{workflow.id}/wfm-{wf_module.id}"
        "/147a9f5d-5b3e-41c3-a968-a84a5a9d587f.csv"
    )
    self.assertEqual(uploaded_file.key, final_key)
    # The file has the right bytes and metadata
    self.assertEqual(
        minio.get_object_with_data(minio.UserFilesBucket, final_key)["Body"],
        b"1234567",
    )
    self.assertEqual(
        minio.client.head_object(Bucket=minio.UserFilesBucket, Key=final_key)[
            "ContentDisposition"
        ],
        "attachment; filename*=UTF-8''test%20sheet.csv",
    )
    # wf_module is updated
    send_delta.assert_called()
def test_abort_upload_happy_path_after_complete(self):
    user = User.objects.create(username="******", email="*****@*****.**")
    workflow = Workflow.create_and_init(owner=user)
    wf_module = workflow.tabs.first().wf_modules.create(
        order=0, slug="step-1", module_id_name="x"
    )
    in_progress_upload = wf_module.in_progress_uploads.create(
        id="147a9f5d-5b3e-41c3-a968-a84a5a9d587f"
    )
    key = in_progress_upload.get_upload_key()
    minio.put_bytes(in_progress_upload.Bucket, key, b"1234567")
    response = self.run_handler(
        abort_upload, user=user, workflow=workflow, wfModuleId=wf_module.id, key=key
    )
    self.assertResponse(response, data=None)
    wf_module.refresh_from_db()
    self.assertFalse(minio.exists(in_progress_upload.Bucket, key))
def test_delete_s3_data_ignore_non_leaked_file(self):
    # A file with our UUID that an UploadedFile row points to is not
    # leaked, so delete_s3_data() must leave it alone.
    workflow = Workflow.create_and_init()
    wf_module = workflow.tabs.first().wf_modules.create(
        order=0, slug="step-1", module_id_name="x"
    )
    ipu = wf_module.in_progress_uploads.create()
    key = wf_module.uploaded_file_prefix + str(ipu.id) + ".xlsx"
    minio.put_bytes(minio.UserFilesBucket, key, b"1234567")
    wf_module.uploaded_files.create(
        name="text.xlsx",
        size=7,
        uuid=str(ipu.id),
        bucket=minio.UserFilesBucket,
        key=key,
    )
    ipu.delete_s3_data()
    self.assertTrue(minio.exists(minio.UserFilesBucket, key))
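# For reference, a plausible sketch of `InProgressUpload.delete_s3_data()`
# consistent with the two tests above -- not necessarily the real code. It
# removes the temporary upload object, plus any file named after our UUID
# that no UploadedFile row claims. `list_keys()` is a hypothetical listing
# helper, and the `wf_module` relation name is assumed from the tests.
def delete_s3_data(self):
    # Remove the temporary upload object, if any
    if minio.exists(self.Bucket, self.get_upload_key()):
        minio.remove(self.Bucket, self.get_upload_key())
    # Remove any file named after our UUID that no UploadedFile row claims
    prefix = self.wf_module.uploaded_file_prefix + str(self.id)
    claimed = set(self.wf_module.uploaded_files.values_list("key", flat=True))
    for key in list_keys(minio.UserFilesBucket, prefix=prefix):
        if key not in claimed:  # leaked: no DB row points at it
            minio.remove(minio.UserFilesBucket, key)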
def test_abort(self):
    workflow = Workflow.create_and_init()
    wf_module = workflow.tabs.first().wf_modules.create(
        order=0, slug="step-123", module_id_name="x", file_upload_api_token="abc123"
    )
    upload = wf_module.in_progress_uploads.create()
    key = upload.get_upload_key()
    minio.put_bytes(upload.Bucket, key, b"1234567")
    response = self.client.delete(
        f"/api/v1/workflows/{workflow.id}/steps/step-123/uploads/{upload.id}",
        HTTP_AUTHORIZATION="Bearer abc123",
    )
    self.assertEqual(response.status_code, 200)
    self.assertEqual(json.loads(response.content), {})
    self.assertFalse(minio.exists(upload.Bucket, key))  # file was deleted
    upload.refresh_from_db()
    self.assertTrue(upload.is_completed)
def test_complete_happy_path(self, queue_render, send_delta):
    send_delta.return_value = async_noop()
    queue_render.return_value = async_noop()
    _init_module("x")
    workflow = Workflow.create_and_init()
    wf_module = workflow.tabs.first().wf_modules.create(
        order=0, slug="step-123", module_id_name="x", file_upload_api_token="abc123"
    )
    upload = wf_module.in_progress_uploads.create()
    uuid = str(upload.id)
    key = upload.get_upload_key()
    minio.put_bytes(upload.Bucket, key, b"1234567")
    response = self.client.post(
        f"/api/v1/workflows/{workflow.id}/steps/step-123/uploads/{upload.id}",
        {"filename": "test.csv"},
        content_type="application/json",
        HTTP_AUTHORIZATION="Bearer abc123",
    )
    self.assertEqual(response.status_code, 200)
    # Upload and its S3 data were deleted
    self.assertFalse(minio.exists(upload.Bucket, key))
    upload.refresh_from_db()
    self.assertTrue(upload.is_completed)
    # Final upload was created
    uploaded_file = wf_module.uploaded_files.first()
    self.assertEqual(
        uploaded_file.key, f"wf-{workflow.id}/wfm-{wf_module.id}/{uuid}.csv"
    )
    self.assertEqual(
        minio.get_object_with_data(minio.UserFilesBucket, uploaded_file.key)["Body"],
        b"1234567",
    )
    self.assertEqual(uploaded_file.name, "test.csv")
    # Return value includes uuid
    data = json.loads(response.content)
    self.assertEqual(data["uuid"], uuid)
    self.assertEqual(data["name"], "test.csv")
    self.assertEqual(data["size"], 7)
    # Send deltas
    send_delta.assert_called()
    queue_render.assert_called()
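# Illustrative client-side flow for the API the two tests above exercise,
# written with `requests`. The host name, the `workflow_id`/`upload_id`
# variables, and the step of uploading bytes to S3 are assumptions; only
# the complete (POST) and abort (DELETE) endpoints appear in these tests.
import requests

base = (
    f"https://app.example.com/api/v1/workflows/{workflow_id}"
    "/steps/step-123/uploads"
)
headers = {"Authorization": "Bearer abc123"}  # the step's file_upload_api_token
# ... after uploading the file bytes to the upload's S3 key ...
response = requests.post(
    f"{base}/{upload_id}", json={"filename": "test.csv"}, headers=headers
)
response.raise_for_status()
print(response.json()["uuid"])  # UUID of the new UploadedFile
# Or abandon the upload instead:
requests.delete(f"{base}/{upload_id}", headers=headers)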
def move_uploaded_file(workflow, wf_module, uploaded_file):
    """
    Move files from /uuid.ext to /wf-1/wfm-2/uuid.ext.

    This helps delete leaked files and find problem files.
    """
    from server import minio

    bucket = uploaded_file.bucket
    old_key = uploaded_file.key
    if '/' in old_key:
        return  # already moved

    new_key = f'wf-{workflow.id}/wfm-{wf_module.id}/{old_key}'
    logger.info('Move %s/%s to %s/%s', bucket, old_key, bucket, new_key)
    try:
        minio.copy(bucket, new_key, f'{bucket}/{old_key}')
        minio.remove(bucket, old_key)
    except minio.error.NoSuchKey:
        # old_key is missing. Two possibilities:
        #
        # 1. We're re-running this script after it failed once with
        #    atomic=True (which used to be set, by accident); the move
        #    already succeeded but the DB doesn't know it. In that case,
        #    continue, because this error actually means, "all is well."
        # 2. The file didn't exist to begin with. In that case, write a
        #    blank file in its stead. That way the user will remark, "hey,
        #    Workbench ate my file!" instead of seeing undefined behavior
        #    (which is worse).
        #    https://www.pivotaltracker.com/story/show/163336822
        if minio.exists(bucket, new_key):
            pass  # "all is well"
        else:
            # write an empty file
            minio.put_bytes(bucket, new_key, b'')
            uploaded_file.size = 0
            uploaded_file.save(update_fields=['size'])
    uploaded_file.key = new_key
    uploaded_file.save(update_fields=['key'])
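# A sketch of how `move_uploaded_file` might be driven from a Django data
# migration. The app label, model name, and relation names are assumptions
# inferred from the tests above, not confirmed by this file.
def forward(apps, schema_editor):
    UploadedFile = apps.get_model("server", "UploadedFile")
    for uploaded_file in UploadedFile.objects.select_related(
        "wf_module__tab__workflow"
    ):
        wf_module = uploaded_file.wf_module
        move_uploaded_file(wf_module.tab.workflow, wf_module, uploaded_file)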