def downloaded_parquet_file(crr: CachedRenderResult, dir=None) -> ContextManager[Path]:
    """Download the cached render result and yield its local Parquet path.

    Cheaper than open_cached_render_result() because the downloaded file is
    never parsed. Reach for this when you suspect you won't need the table
    data itself.

    Raise CorruptCacheError if the cached data is missing.

    Usage:

        try:
            with rendercache.downloaded_parquet_file(crr) as path:
                # do something with `path`, a `pathlib.Path`
        except rendercache.CorruptCacheError:
            # file does not exist....
    """
    with contextlib.ExitStack() as stack:
        # enter_context() is where s3 actually downloads; a missing object
        # surfaces as FileNotFoundError right here, which we translate into
        # the cache-specific error callers are documented to catch.
        try:
            parquet_path = stack.enter_context(
                s3.temporarily_download(BUCKET, crr_parquet_key(crr), dir=dir)
            )
        except FileNotFoundError:
            raise CorruptCacheError
        yield parquet_path
def test_fetch_integration(self, send_update, queue_render):
    """End-to-end fetch: the module runs, its result lands in storage,
    a render is queued, and clients are notified."""
    queue_render.side_effect = async_value(None)
    send_update.side_effect = async_value(None)

    # Arrange: a workflow with one step backed by a tiny fetch module.
    wf = Workflow.create_and_init()
    create_module_zipfile(
        "mod",
        python_code=(
            "import pandas as pd\ndef fetch(params): return pd.DataFrame({'A': [1]})\ndef render(table, params): return table"
        ),
    )
    wf_step = wf.tabs.first().steps.create(
        order=0, slug="step-1", module_id_name="mod"
    )
    cjwstate.modules.init_module_system()

    # Act: run the fetch (it logs, so capture logging to keep output clean).
    fetch_time = datetime.datetime.now()
    with self.assertLogs(level=logging.INFO):
        self.run_with_async_db(
            fetch.fetch(workflow_id=wf.id, step_id=wf_step.id, now=fetch_time)
        )

    # Assert: the stored Parquet object holds the fetched table.
    wf_step.refresh_from_db()
    stored = wf_step.stored_objects.get(stored_at=wf_step.stored_data_version)
    with s3.temporarily_download(s3.StoredObjectsBucket, stored.key) as parquet_path:
        result = pyarrow.parquet.read_table(str(parquet_path), use_threads=False)
        assert_arrow_table_equals(result, {"A": [1]})

    # Assert: downstream side effects fired.
    wf.refresh_from_db()
    queue_render.assert_called_with(wf.id, wf.last_delta_id)
    send_update.assert_called()
def test_fetch_integration(self, send_update, queue_render):
    """End-to-end fetch: the module runs, its result lands in storage,
    a render is queued, and clients are notified."""
    queue_render.side_effect = async_value(None)
    send_update.side_effect = async_value(None)

    # Arrange: a workflow with one step backed by a tiny fetch module.
    wf = Workflow.create_and_init()
    create_module_zipfile(
        "mod",
        python_code=(
            "import pandas as pd\ndef fetch(params): return pd.DataFrame({'A': [1]})\ndef render(table, params): return table"
        ),
    )
    wf_step = wf.tabs.first().steps.create(
        order=0, slug="step-1", module_id_name="mod"
    )
    cjwstate.modules.init_module_system()

    # Act: run the fetch (it logs, so capture logging to keep output clean).
    fetch_time = datetime.datetime.now()
    with self.assertLogs(level=logging.INFO):
        self.run_with_async_db(
            fetch.fetch(workflow_id=wf.id, step_id=wf_step.id, now=fetch_time)
        )

    # Assert: the stored object holds the fetched table.
    wf_step.refresh_from_db()
    stored = wf_step.stored_objects.get(stored_at=wf_step.stored_data_version)
    with s3.temporarily_download(s3.StoredObjectsBucket, stored.key) as parquet_path:
        # fetch results are stored without a schema. Let's hard-code a
        # schema simply so we can test that the table data is the same.
        result = read_parquet_as_arrow(
            parquet_path, [Column("A", ColumnType.Number())]
        )
        assert_arrow_table_equals(result, make_table(make_column("A", [1])))

    # Assert: downstream side effects fired.
    wf.refresh_from_db()
    queue_render.assert_called_with(wf.id, wf.last_delta_id)
    send_update.assert_called()
def downloaded_file(stored_object: StoredObject, dir=None) -> ContextManager[Path]:
    """Download the StoredObject's file and yield its local path.

    Raise FileNotFoundError if the object is missing.

    Usage:

        try:
            with storedobjects.downloaded_file(stored_object) as path:
                # do something with `path`, a `pathlib.Path`
        except FileNotFoundError:
            # file does not exist....
    """
    if stored_object.size != 0:
        # raises FileNotFoundError
        return s3.temporarily_download(
            s3.StoredObjectsBucket, stored_object.key, dir=dir
        )
    # Some stored objects with size=0 do not have key. These are valid:
    # they represent empty files — hand back an empty temp file instead.
    return tempfile_context(prefix="storedobjects-empty-file", dir=dir)
def test_file_not_found(self):
    """Downloading a key that was never uploaded must raise, not yield."""
    with self.assertRaises(FileNotFoundError), s3.temporarily_download(
        Bucket, Key
    ):
        pass
def test_allows_reading_file(self):
    """A downloaded file's contents match what was uploaded."""
    payload = b"1234"
    _put(payload)
    with s3.temporarily_download(Bucket, Key) as downloaded:
        self.assertEqual(downloaded.read_bytes(), payload)