def test_metadata_comes_from_db_columns(self):
    columns = [
        Column("A", ColumnType.Number(format="{:,.2f}")),
        Column("B", ColumnType.Datetime()),
        Column("C", ColumnType.Text()),
    ]
    result = RenderResult(
        arrow_table(
            {"A": [1], "B": [datetime.datetime.now()], "C": ["x"]},
            columns=columns,
        )
    )
    cache_render_result(self.workflow, self.wf_module, self.delta.id, result)
    # Delete from disk entirely, to prove we did not read.
    minio.remove(BUCKET, crr_parquet_key(self.wf_module.cached_render_result))

    # Load _new_ CachedRenderResult -- from DB columns, not memory
    fresh_wf_module = WfModule.objects.get(id=self.wf_module.id)
    cached_result = fresh_wf_module.cached_render_result

    self.assertEqual(cached_result.table_metadata, TableMetadata(1, columns))

def test_fetch_result_deleted_file_means_none(self):
    workflow = Workflow.create_and_init()
    tab = workflow.tabs.first()
    wf_module = tab.wf_modules.create(
        order=0,
        slug="step-1",
        module_id_name="x",
        last_relevant_delta_id=workflow.last_delta_id,
    )
    with parquet_file({"A": [1]}) as path:
        so = create_stored_object(workflow.id, wf_module.id, path)
    wf_module.stored_data_version = so.stored_at
    wf_module.save(update_fields=["stored_data_version"])
    # Now delete the file on S3 -- but leave the DB pointing to it.
    minio.remove(so.bucket, so.key)

    def render(*args, fetch_result, **kwargs):
        self.assertIsNone(fetch_result)
        return RenderResult()

    with self._stub_module(render):
        self.run_with_async_db(
            execute_wfmodule(
                self.chroot_context,
                workflow,
                wf_module,
                {},
                Tab(tab.slug, tab.name),
                RenderResult(),
                {},
                self.output_path,
            )
        )

def _delete_from_s3_pre_delete(sender, instance, **kwargs):
    """
    Delete file from S3, pre-delete.

    Why pre-delete and not post-delete? Because our user expects the file to
    be _gone_, completely, forever -- that's what "delete" means to the user.
    If deletion fails, we need the link to remain in our database -- that's
    how the user will know it isn't deleted.
    """
    if instance.bucket and instance.key:
        minio.remove(instance.bucket, instance.key)

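The handler above only takes effect once it is connected to a model's pre_delete signal. A minimal wiring sketch, assuming the sender is a model named StoredObject (that name is an assumption, not taken from the snippet):

from django.db.models.signals import pre_delete

# Fire before the row is deleted: if the S3 delete raises, the database row
# survives, which is exactly the guarantee the docstring describes.
pre_delete.connect(_delete_from_s3_pre_delete, sender=StoredObject)
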
def delete_upload_stored_objects(apps, _):
    from cjwstate import minio

    StoredObject = apps.get_model("server", "StoredObject")
    for obj in StoredObject.objects.filter(wf_module__module_id_name="upload").all():
        try:
            minio.remove(obj.bucket, obj.key)
        except FileNotFoundError:
            # We're resuming, or the file never existed anyway. (We never
            # designed for errors, and we changed error behavior over time;
            # it's possible some uploads never had data.)
            pass
        obj.delete()

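The (apps, _) signature marks this as a Django data-migration function. A sketch of how it might be registered, reusing the "server" app label from the snippet; the dependency name is a placeholder, not the real migration:

from django.db import migrations


class Migration(migrations.Migration):
    dependencies = [("server", "0001_initial")]  # placeholder; the real dependency differs

    operations = [
        # Forward-only: deleted S3 data cannot be restored, so reversing is a no-op.
        migrations.RunPython(
            delete_upload_stored_objects, reverse_code=migrations.RunPython.noop
        )
    ]
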
def delete_s3_data(self):
    """
    Delete all data from S3 that is part of this upload.

    Call this within a Workflow.cooperative_lock(). This always leaves S3 and
    the database in a consistent state.
    """
    key = self.get_upload_key()
    minio.abort_multipart_uploads_by_prefix(self.Bucket, key)
    minio.remove(self.Bucket, key)
    if not self.wf_module.uploaded_files.filter(uuid=str(self.id)).count():
        # If there's no UploadedFile even though we copied this file to where
        # the UploadedFile _should_ point, then we've leaked that copy.
        # Delete. See "tricky leak here" in convert_to_uploaded_file().
        final_key_prefix = (
            self.wf_module.uploaded_file_prefix + str(self.id)
        )  # no ".xlsx"-type suffix
        minio.remove_by_prefix(minio.UserFilesBucket, final_key_prefix)

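A sketch of the calling pattern the docstring asks for, assuming Workflow.cooperative_lock() is a context manager and in_progress_upload is an instance of the class this method belongs to (both names are assumptions):

with workflow.cooperative_lock():
    # Inside the lock, S3 and the database move together: delete the upload's
    # S3 data first, then the row that points at it.
    in_progress_upload.delete_s3_data()
    in_progress_upload.delete()
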
def test_fetch_result_deleted_file_means_none(self):
    workflow = Workflow.create_and_init()
    tab = workflow.tabs.first()
    wf_module = tab.wf_modules.create(
        order=0,
        slug="step-1",
        module_id_name="x",
        last_relevant_delta_id=workflow.last_delta_id,
    )
    with parquet_file({"A": [1]}) as path:
        so = create_stored_object(workflow.id, wf_module.id, path)
    wf_module.stored_data_version = so.stored_at
    wf_module.save(update_fields=["stored_data_version"])
    # Now delete the file on S3 -- but leave the DB pointing to it.
    minio.remove(minio.StoredObjectsBucket, so.key)

    def render(*args, fetch_result, **kwargs):
        self.assertIsNone(fetch_result)
        return RenderResult()

    module_zipfile = create_module_zipfile(
        "x",
        python_code=textwrap.dedent(
            """
            import pandas as pd

            def render(table, params, *, fetch_result, **kwargs):
                assert fetch_result is None
                return pd.DataFrame()
            """
        ),
    )

    with self.assertLogs(level=logging.INFO):
        self.run_with_async_db(
            execute_wfmodule(
                self.chroot_context,
                workflow,
                wf_module,
                module_zipfile,
                {},
                Tab(tab.slug, tab.name),
                RenderResult(),
                {},
                self.output_path,
            )
        )

def move_uploaded_file(workflow, wf_module, uploaded_file):
    """
    Move files from /uuid.ext to /wf-1/wfm-2/uuid.ext.

    This helps delete leaked files and find problem files.
    """
    from cjwstate import minio

    bucket = uploaded_file.bucket
    old_key = uploaded_file.key
    if "/" in old_key:
        return

    new_key = f"wf-{workflow.id}/wfm-{wf_module.id}/{old_key}"

    # Plain %s-style logging string (no f-prefix): logger fills in the args.
    logger.info("Move %s/%s to %s/%s", bucket, old_key, bucket, new_key)
    try:
        minio.copy(bucket, new_key, f"{bucket}/{old_key}")
        minio.remove(bucket, old_key)
    except minio.error.NoSuchKey:
        # old_key is missing. Two possibilities:
        #
        # 1. We're re-running this script after it failed once with
        #    atomic=True (which used to be set, by accident); the move already
        #    succeeded but the DB doesn't know it. In that case, continue
        #    because this error actually means, "all is well."
        # 2. The file didn't exist to begin with. In that case, write a blank
        #    file in its stead. That way the user will remark, "hey, Workbench
        #    ate my file!" instead of undefined behavior (which is worse).
        #    https://www.pivotaltracker.com/story/show/163336822
        if minio.exists(bucket, new_key):
            pass  # "all is well"
        else:
            # write an empty file
            minio.put_bytes(bucket, new_key, b"")
            uploaded_file.size = 0
            uploaded_file.save(update_fields=["size"])
    uploaded_file.key = new_key
    uploaded_file.save(update_fields=["key"])

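A hedged sketch of the data migration that might drive this helper, assuming an UploadedFile model registered under the "server" app and a wf_module → tab → workflow relation (both are assumptions, not taken from the snippet):

def move_all_uploaded_files(apps, schema_editor):
    UploadedFile = apps.get_model("server", "UploadedFile")
    for uploaded_file in UploadedFile.objects.select_related(
        "wf_module__tab__workflow"
    ).iterator():
        wf_module = uploaded_file.wf_module
        move_uploaded_file(wf_module.tab.workflow, wf_module, uploaded_file)
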
def auto_delete_file_on_delete(sender, instance, **kwargs):
    # Delete S3 data when UploadedFile is deleted
    minio.remove(instance.bucket, instance.key)

def _clear() -> None:
    try:
        minio.remove(Bucket, Key)
    except minio.error.NoSuchKey:
        pass