def test_metadata_comes_from_db_columns(self):
    columns = [
        Column("A", ColumnType.Number(format="{:,.2f}")),
        Column("B", ColumnType.Datetime()),
        Column("C", ColumnType.Text()),
    ]
    result = RenderResult(
        arrow_table(
            {"A": [1], "B": [datetime.datetime.now()], "C": ["x"]},
            columns=columns,
        )
    )
    cache_render_result(self.workflow, self.wf_module, self.delta.id, result)
    # Delete from disk entirely, to prove we did not read.
    minio.remove(BUCKET, crr_parquet_key(self.wf_module.cached_render_result))
    # Load _new_ CachedRenderResult -- from DB columns, not memory
    fresh_wf_module = WfModule.objects.get(id=self.wf_module.id)
    cached_result = fresh_wf_module.cached_render_result
    self.assertEqual(cached_result.table_metadata, TableMetadata(1, columns))

def test_metadata_does_not_require_file_read(self):
    columns = [
        Column("A", ColumnType.Number(format="{:,.2f}")),
        Column("B", ColumnType.Timestamp()),
        Column("C", ColumnType.Text()),
        Column("D", ColumnType.Date("month")),
    ]
    with arrow_table_context(
        make_column("A", [1], format="{:,.2f}"),
        make_column("B", [datetime.datetime(2021, 4, 13)]),
        make_column("C", ["c"]),
        make_column("D", [datetime.date(2021, 4, 1)], unit="month"),
    ) as (path, table):
        result = LoadedRenderResult(
            path=path, table=table, columns=columns, errors=[], json={}
        )
        cache_render_result(self.workflow, self.step, 1, result)
    # Delete from disk entirely, to prove we did not read.
    s3.remove(BUCKET, crr_parquet_key(self.step.cached_render_result))
    # Load _new_ CachedRenderResult -- from DB columns, not memory
    fresh_step = Step.objects.get(id=self.step.id)
    cached_result = fresh_step.cached_render_result
    self.assertEqual(cached_result.table_metadata, TableMetadata(1, columns))

def test_invalid_parquet_is_corrupt_cache_error(self):
    result = RenderResult(arrow_table({"A": [1]}))
    cache_render_result(self.workflow, self.wf_module, self.delta.id, result)
    crr = self.wf_module.cached_render_result
    minio.put_bytes(BUCKET, crr_parquet_key(crr), b"NOT PARQUET")
    with tempfile_context() as arrow_path:
        with self.assertRaises(CorruptCacheError):
            load_cached_render_result(crr, arrow_path)

def test_delete_step(self):
    result = RenderResult(
        arrow_table({"A": [1]}),
        [RenderError(I18nMessage("X", {}, None), [])],
        {},
    )
    cache_render_result(self.workflow, self.step, 1, result)
    parquet_key = crr_parquet_key(self.step.cached_render_result)
    self.step.delete()
    self.assertFalse(s3.exists(BUCKET, parquet_key))

def test_delete_wfmodule(self):
    result = RenderResult(
        arrow_table({"A": [1]}), [RenderError(I18nMessage("X", []), [])], {}
    )
    cache_render_result(self.workflow, self.wf_module, self.delta.id, result)
    parquet_key = crr_parquet_key(self.wf_module.cached_render_result)
    self.wf_module.delete()
    self.assertFalse(minio.exists(BUCKET, parquet_key))

def test_clear(self):
    result = RenderResult(arrow_table({"A": [1]}))
    cache_render_result(self.workflow, self.wf_module, self.delta.id, result)
    parquet_key = crr_parquet_key(self.wf_module.cached_render_result)
    clear_cached_render_result_for_wf_module(self.wf_module)
    db_wf_module = WfModule.objects.get(id=self.wf_module.id)
    self.assertIsNone(db_wf_module.cached_render_result)
    self.assertFalse(minio.exists(BUCKET, parquet_key))

def test_delete_step(self):
    write_to_rendercache(
        self.workflow,
        self.step,
        1,
        table=make_table(make_column("A", [1])),
        errors=[RenderError(I18nMessage("X", {}, None), [])],
        json={"foo": "bar"},
    )
    parquet_key = crr_parquet_key(self.step.cached_render_result)
    self.step.delete()
    self.assertFalse(s3.exists(BUCKET, parquet_key))

def test_invalid_parquet_is_corrupt_cache_error(self):
    with arrow_table_context(make_column("A", ["x"])) as (path, table):
        result = LoadedRenderResult(
            path=path,
            table=table,
            columns=[Column("A", ColumnType.Text())],
            errors=[],
            json={},
        )
        cache_render_result(self.workflow, self.step, 1, result)
    crr = self.step.cached_render_result
    s3.put_bytes(BUCKET, crr_parquet_key(crr), b"NOT PARQUET")
    with tempfile_context() as arrow_path:
        with self.assertRaises(CorruptCacheError):
            with open_cached_render_result(crr) as loaded:
                pass

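# A minimal caller-side sketch (not from this codebase) of the pattern the
# two corrupt-cache tests above imply: CorruptCacheError means the file in
# storage and the DB metadata disagree, so a caller treats it like a cache
# miss. `load_table_or_rerender` and `schedule_render` are hypothetical
# names, not functions from this project.
def load_table_or_rerender(workflow, step):
    crr = step.cached_render_result
    try:
        with open_cached_render_result(crr) as loaded:
            return loaded.table
    except CorruptCacheError:
        # Hypothetical fallback: treat a corrupt cache as a miss and
        # queue a fresh render.
        schedule_render(workflow, step)
        return None
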
def test_clear(self):
    with arrow_table_context(make_column("A", [1])) as (path, table):
        result = LoadedRenderResult(
            path=path,
            table=table,
            columns=[Column("A", ColumnType.Number(format="{:,}"))],
            errors=[],
            json={},
        )
        cache_render_result(self.workflow, self.step, 1, result)
    parquet_key = crr_parquet_key(self.step.cached_render_result)
    clear_cached_render_result_for_step(self.step)
    db_step = Step.objects.get(id=self.step.id)
    self.assertIsNone(db_step.cached_render_result)
    self.assertFalse(s3.exists(BUCKET, parquet_key))

def test_cache_render_result(self):
    with arrow_table_context(make_column("A", [1])) as (table_path, table):
        result = LoadedRenderResult(
            path=table_path,
            table=table,
            columns=[Column("A", ColumnType.Number(format="{:,}"))],
            errors=[
                RenderError(
                    I18nMessage("e1", {"text": "hi"}, None),
                    [
                        QuickFix(
                            I18nMessage("q1", {"var": 2}, None),
                            QuickFixAction.PrependStep("filter", {"a": "x"}),
                        )
                    ],
                ),
                RenderError(I18nMessage("e2", {}, None), []),
            ],
            json={"foo": "bar"},
        )
        cache_render_result(self.workflow, self.step, 1, result)

    cached = self.step.cached_render_result
    self.assertEqual(cached.step_id, self.step.id)
    self.assertEqual(cached.delta_id, 1)
    self.assertEqual(
        crr_parquet_key(cached),
        f"wf-{self.workflow.id}/wfm-{self.step.id}/delta-1.dat",
    )

    # Reading completely freshly from the DB should give the same thing
    db_step = Step.objects.get(id=self.step.id)
    from_db = db_step.cached_render_result
    self.assertEqual(from_db, cached)

    with open_cached_render_result(from_db) as result2:
        assert_arrow_table_equals(
            result2.table, make_table(make_column("A", [1], format="{:,}"))
        )
        self.assertEqual(
            result2.columns, [Column("A", ColumnType.Number(format="{:,}"))]
        )

def test_cache_render_result(self):
    result = RenderResult(
        arrow_table({"A": [1]}),
        [
            RenderError(
                I18nMessage("e1", [1, "x"]),
                [
                    QuickFix(
                        I18nMessage("q1", []),
                        QuickFixAction.PrependStep("filter", {"a": "x"}),
                    )
                ],
            ),
            RenderError(I18nMessage("e2", []), []),
        ],
        {"foo": "bar"},
    )
    cache_render_result(self.workflow, self.wf_module, self.delta.id, result)

    cached = self.wf_module.cached_render_result
    self.assertEqual(cached.wf_module_id, self.wf_module.id)
    self.assertEqual(cached.delta_id, self.delta.id)
    self.assertEqual(
        crr_parquet_key(cached),
        f"wf-{self.workflow.id}/wfm-{self.wf_module.id}/delta-{self.delta.id}.dat",
    )

    # Reading completely freshly from the DB should give the same thing
    db_wf_module = WfModule.objects.get(id=self.wf_module.id)
    from_db = db_wf_module.cached_render_result
    self.assertEqual(from_db, cached)

    with open_cached_render_result(from_db) as result2:
        assert_render_result_equals(result2, result)

def _duplicate_with_slug_and_delta_id(self, to_tab, slug, last_relevant_delta_id):
    # Initialize but don't save
    new_step = WfModule(
        tab=to_tab,
        slug=slug,
        module_id_name=self.module_id_name,
        fetch_errors=self.fetch_errors,
        stored_data_version=self.stored_data_version,
        order=self.order,
        notes=self.notes,
        is_collapsed=self.is_collapsed,
        auto_update_data=False,
        next_update=None,
        update_interval=self.update_interval,
        last_update_check=self.last_update_check,
        last_relevant_delta_id=last_relevant_delta_id,
        params=self.params,
        secrets={},  # DO NOT COPY SECRETS
    )

    # Copy cached render result, if there is one.
    #
    # If we duplicate a Workflow mid-render, the cached render result might
    # not have any useful data. But that's okay: just kick off a new
    # render. The common case (all-rendered Workflow) will produce a
    # fully-rendered duplicate Workflow.
    #
    # We cannot copy the cached result if the destination Tab has a
    # different name than this one: tab_name is passed to render(), so even
    # an exactly-duplicated WfModule can have a different output.
    cached_result = self.cached_render_result
    if cached_result is not None and self.tab.name == to_tab.name:
        # assuming file-copy succeeds, copy cached results.
        new_step.cached_render_result_delta_id = new_step.last_relevant_delta_id
        for attr in ("status", "errors", "json", "columns", "nrows"):
            full_attr = f"cached_render_result_{attr}"
            setattr(new_step, full_attr, getattr(self, full_attr))

        new_step.save()  # so there is a new_step.id for parquet_key

        # Now new_step.cached_render_result will return a
        # CachedRenderResult, because all the DB values are set. It'll have
        # a .parquet_key ... but there won't be a file there (because we
        # never wrote it).
        from cjwstate.rendercache.io import BUCKET, crr_parquet_key

        old_parquet_key = crr_parquet_key(cached_result)
        new_parquet_key = crr_parquet_key(new_step.cached_render_result)

        try:
            minio.copy(
                minio.CachedRenderResultsBucket,
                new_parquet_key,
                "%(Bucket)s/%(Key)s" % {"Bucket": BUCKET, "Key": old_parquet_key},
            )
        except minio.error.NoSuchKey:
            # DB and filesystem are out of sync. CachedRenderResult handles
            # such cases gracefully. So `new_result` will behave exactly
            # like `cached_result`.
            pass
    else:
        new_step.save()

    # Duplicate the current stored data only, not the history
    if self.stored_data_version is not None:
        self.stored_objects.get(stored_at=self.stored_data_version).duplicate(
            new_step
        )

    # Duplicate the "selected" file, if there is one; otherwise, duplicate
    # the most-recently-uploaded file.
    #
    # We special-case the 'upload' module because it's the only one that
    # has 'file' params right now. (If that ever changes, we'll want to
    # change a few things: upload paths should include param name, and this
    # test will need to check module_zipfile to find the param name of the
    # file.)
    if self.module_id_name == "upload":
        uuid = self.params["file"]
        uploaded_file = self.uploaded_files.filter(uuid=uuid).first()
        if uploaded_file is not None:
            new_key = uploaded_file.key.replace(
                self.uploaded_file_prefix, new_step.uploaded_file_prefix
            )
            assert new_key != uploaded_file.key
            # TODO handle file does not exist
            minio.copy(
                minio.UserFilesBucket,
                new_key,
                f"{minio.UserFilesBucket}/{uploaded_file.key}",
            )
            new_step.uploaded_files.create(
                created_at=uploaded_file.created_at,
                name=uploaded_file.name,
                size=uploaded_file.size,
                uuid=uploaded_file.uuid,
                key=new_key,
            )

    return new_step

def _duplicate_with_slug_and_delta_id(self, to_tab, slug, last_relevant_delta_id):
    # Initialize but don't save
    new_step = Step(
        tab=to_tab,
        slug=slug,
        module_id_name=self.module_id_name,
        fetch_errors=self.fetch_errors,
        stored_data_version=self.stored_data_version,
        order=self.order,
        notes=self.notes,
        is_collapsed=self.is_collapsed,
        auto_update_data=False,
        next_update=None,
        update_interval=self.update_interval,
        last_update_check=self.last_update_check,
        last_relevant_delta_id=last_relevant_delta_id,
        params=self.params,
        secrets={},  # DO NOT COPY SECRETS
    )

    # Copy cached render result, if there is one.
    #
    # If we duplicate a Workflow mid-render, the cached render result might
    # not have any useful data. But that's okay: just kick off a new
    # render. The common case (all-rendered Workflow) will produce a
    # fully-rendered duplicate Workflow.
    #
    # We cannot copy the cached result if the destination Tab has a
    # different name than this one: tab_name is passed to render(), so even
    # an exactly-duplicated Step can have a different output.
    cached_result = self.cached_render_result
    if cached_result is not None and self.tab.name == to_tab.name:
        # assuming file-copy succeeds, copy cached results.
        new_step.cached_render_result_delta_id = new_step.last_relevant_delta_id
        for attr in ("status", "errors", "json", "columns", "nrows"):
            full_attr = f"cached_render_result_{attr}"
            setattr(new_step, full_attr, getattr(self, full_attr))

        new_step.save()  # so there is a new_step.id for parquet_key

        # Now new_step.cached_render_result will return a
        # CachedRenderResult, because all the DB values are set. It'll have
        # a .parquet_key ... but there won't be a file there (because we
        # never wrote it).
        from cjwstate.rendercache.io import BUCKET, crr_parquet_key

        old_parquet_key = crr_parquet_key(cached_result)
        new_parquet_key = crr_parquet_key(new_step.cached_render_result)

        try:
            s3.copy(
                s3.CachedRenderResultsBucket,
                new_parquet_key,
                "%(Bucket)s/%(Key)s" % {"Bucket": BUCKET, "Key": old_parquet_key},
            )
        except s3.layer.error.NoSuchKey:
            # DB and filesystem are out of sync. CachedRenderResult handles
            # such cases gracefully. So `new_result` will behave exactly
            # like `cached_result`.
            pass
    else:
        new_step.save()

    # Duplicate the current stored data only, not the history
    if self.stored_data_version is not None:
        self.stored_objects.get(stored_at=self.stored_data_version).duplicate(
            new_step
        )

    # For each "file" param, duplicate the "selected" uploaded_file if there
    # is one.
    #
    # We assume any UUID in `params` that points to an uploaded file _is_
    # a file-dtype param. ([adamhooper, 2020-07-14] when the assumption does
    # not hold, will this cause DB errors? Not sure, but it's not a security
    # risk.)
    #
    # Why not check the param schema? Because we'd need to define behavior
    # for when the module doesn't exist, or its version is changed, or its
    # code breaks.... bah! These behaviors don't line up with any user
    # expectations. Users want to copy the thing they see.
    for uuid_str in self.params.values():
        if not isinstance(uuid_str, str):
            continue
        try:
            UUID(uuid_str)
        except ValueError:
            continue
        uploaded_file = self.uploaded_files.filter(uuid=uuid_str).first()
        if not uploaded_file:
            continue

        new_key = uploaded_file.key.replace(
            self.uploaded_file_prefix, new_step.uploaded_file_prefix
        )
        assert new_key != uploaded_file.key
        # TODO handle file does not exist
        s3.copy(
            s3.UserFilesBucket,
            new_key,
            f"{s3.UserFilesBucket}/{uploaded_file.key}",
        )
        new_step.uploaded_files.create(
            created_at=uploaded_file.created_at,
            name=uploaded_file.name,
            size=uploaded_file.size,
            uuid=uploaded_file.uuid,
            key=new_key,
        )

    return new_step

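# A minimal sketch (not a test from this codebase) of how the
# cached-result-copying branch of _duplicate_with_slug_and_delta_id might be
# exercised. It reuses names that appear in the tests above (self.workflow,
# self.step, write_to_rendercache, BUCKET, crr_parquet_key, s3); the Tab
# creation kwargs and the assumptions that write_to_rendercache's errors/json
# default to empty and that the fixture step's last_relevant_delta_id is 1
# are all guesses.
def test_duplicate_copies_cached_render_result(self):
    write_to_rendercache(
        self.workflow, self.step, 1, table=make_table(make_column("A", [1]))
    )
    # Same tab name, so the cached result is eligible for copying
    to_tab = self.workflow.tabs.create(
        slug="tab-2", name=self.step.tab.name, position=1
    )
    new_step = self.step._duplicate_with_slug_and_delta_id(
        to_tab, "step-2", self.step.last_relevant_delta_id
    )
    # The duplicate points at its own parquet key, and the file was copied
    new_key = crr_parquet_key(new_step.cached_render_result)
    self.assertTrue(s3.exists(BUCKET, new_key))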