def render_arrow(
    table, params, tab_name, fetch_result: Optional[FetchResult], output_path: Path
) -> RenderResult:
    """
    Turn a stored fetch result into a RenderResult.

    Dispatch, in order:

    * no fetch result: an empty table;
    * a fetch result whose file carries the Parquet magic number: handed to
      `_render_deprecated_parquet()` together with the stored errors;
    * a fetch result with errors: an empty table plus those errors;
    * anything else: `_render_file()` on the stored path.
    """
    # Must perform header operation here in the event the header checkbox
    # state changes
    if fetch_result is None:
        # Nothing was fetched: empty table.
        return RenderResult(ArrowTable())

    stored_path = fetch_result.path
    if stored_path is not None and parquet.file_has_parquet_magic_number(
        stored_path
    ):
        # Deprecated files: fetch() used to parse and store the result as
        # Parquet. The original file data is gone, but the oldest users
        # still need to be supported.
        #
        # In this deprecated format, parse errors were written as
        # fetch_result.errors.
        return _render_deprecated_parquet(
            stored_path, fetch_result.errors, output_path, params
        )

    if fetch_result.errors:
        # Errors and data have never been stored together, so errors are
        # taken to mean there is no data.
        return RenderResult(ArrowTable(), fetch_result.errors)

    assert not fetch_result.errors  # we've never stored errors+data.
    return _render_file(stored_path, params, output_path)
def render_arrow(
    table: types.ArrowTable,
    params: Dict[str, Any],
    tab_name: str,
    fetch_result: Optional[types.FetchResult],
    output_path: Path,
) -> types.RenderResult:
    """
    Render using `cjwkernel.types` data types.

    This default implementation converts its Arrow inputs to their pandas
    equivalents, delegates to `render_pandas()`, and converts the outcome
    back with `to_arrow(output_path)` -- so any Arrow output is written to
    `output_path`.

    Module authors are encouraged to replace this function: Arrow tables
    are simpler and more memory-efficient than Pandas tables. A "rename
    columns" module, for instance, is an ideal fit for this signature,
    because Arrow can pass data through without consuming excessive RAM.

    The value returned by `render_pandas()` is not validated here.
    """
    pandas_table = __arrow_to_pandas(table)

    # Map each tab-output param's tab slug to its pandas counterpart.
    pandas_input_tabs = {}
    for tab_output in _find_tab_outputs(params):
        slug = tab_output.tab.slug
        pandas_input_tabs[slug] = __arrow_tab_output_to_pandas(tab_output)

    if fetch_result is None:
        pandas_fetch_result = None
    else:
        fetched_path = fetch_result.path
        # A zero-byte file or a file with the Parquet magic number is read
        # as a Parquet table; anything else is passed through untouched.
        is_empty_or_parquet = (
            fetched_path.stat().st_size == 0
            or parquet.file_has_parquet_magic_number(fetched_path)
        )
        if is_empty_or_parquet:
            fetched_table = __parquet_to_pandas(fetched_path)
            pandas_errors = [
                ptypes.ProcessResultError.from_arrow(error)
                for error in fetch_result.errors
            ]
            pandas_fetch_result = ptypes.ProcessResult(fetched_table, pandas_errors)
        else:
            pandas_fetch_result = fetch_result

    input_shape = ptypes.TableShape.from_arrow(table.metadata)
    pandas_params = _arrow_param_to_pandas_param(params)
    pandas_result: ptypes.ProcessResult = render_pandas(
        input_table=pandas_table,
        input_table_shape=input_shape,
        params=pandas_params,
        tab_name=tab_name,
        input_tabs=pandas_input_tabs,
        fetch_result=pandas_fetch_result,
    )
    return pandas_result.to_arrow(output_path)
def test_empty_parquet_file(self):
    # The file produced by parquet_file({}) must still be recognised as
    # Parquet by the magic-number check.
    with parquet_file({}) as path:
        detected = parquet.file_has_parquet_magic_number(path)
        self.assertTrue(detected)
def test_good_magic_numbers_but_too_short_to_be_parquet(self):
    # Parquet has PAR1 at the beginning and end. A file consisting of
    # nothing but "PAR1" is therefore not a Parquet file, and the check
    # must reject it.
    with tempfile_context() as path:
        path.write_bytes(b"PAR1")
        detected = parquet.file_has_parquet_magic_number(path)
        self.assertFalse(detected)
def test_very_short_file(self):
    # Three bytes is shorter than the four-byte "PAR1" marker, so the
    # check must report False.
    with tempfile_context() as path:
        path.write_bytes(b"PAR")
        detected = parquet.file_has_parquet_magic_number(path)
        self.assertFalse(detected)
def test_empty_file(self):
    # Nothing is written to the temp file before checking it (presumably
    # tempfile_context() yields an existing, empty file -- confirm against
    # the helper). The check must report False.
    with tempfile_context() as path:
        detected = parquet.file_has_parquet_magic_number(path)
        self.assertFalse(detected)
def test_parquet_file_has_magic_numbers(self):
    # A file built by parquet_file() from one column with one value must
    # pass the magic-number check.
    with parquet_file({"A": [1]}) as path:
        detected = parquet.file_has_parquet_magic_number(path)
        self.assertTrue(detected)