def test_pydict_zero_rows(self):
    with tempfile_context() as path:
        # ensure at least 1 row group
        parquet.write(
            path,
            pyarrow.table(
                {
                    "A": pyarrow.array([], type=pyarrow.string()),
                    "B": pyarrow.DictionaryArray.from_arrays(
                        pyarrow.array([], type=pyarrow.int32()),
                        pyarrow.array([], type=pyarrow.string()),
                    ),
                    "C": pyarrow.array([], type=pyarrow.timestamp("ns")),
                    "D": pyarrow.array([], type=pyarrow.float64()),
                }
            ),
        )
        self.assertEqual(
            parquet.read_pydict(path, range(4), range(0)),
            {"A": [], "B": [], "C": [], "D": []},
        )
def _test_read_write_table(self, table, expected=None):
    table = arrow_table(table).table
    if expected is None:
        expected = table
    else:
        expected = arrow_table(expected).table
    parquet.write(self.temp_path, table)
    result = parquet.read(self.temp_path)
    assert_arrow_table_equals(result, expected)
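# A minimal usage sketch of the helper above (hypothetical test, not from the
# original suite): with `expected=None` the helper asserts an exact round-trip;
# pass `expected` when the write is lossy, e.g. an encoding Parquet normalizes.
# Column names and values here are illustrative.
def test_read_write_round_trip_sketch(self):
    self._test_read_write_table({"A": ["x", "y"], "B": [1, 2]})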
def test_parquet_same_data_different_bytes(self):
    parquet.write(self.old_path, arrow_table({"A": ["a"]}).table)
    parquet.write(
        self.new_path,
        arrow_table({"A": pyarrow.array(["a"]).dictionary_encode()}).table,
    )
    self.assertTrue(
        are_fetch_results_equal(
            FetchResult(self.old_path), FetchResult(self.new_path)
        )
    )
def cache_render_result(
    workflow: Workflow, wf_module: WfModule, delta_id: int, result: RenderResult
) -> None:
    """
    Save `result` for later viewing.

    Raise AssertionError if `delta_id` is not what we expect.

    Since this alters data, be sure to call it within a lock:

        with workflow.cooperative_lock():
            wf_module.refresh_from_db()  # may change delta_id
            cache_render_result(workflow, wf_module, delta_id, result)
    """
    assert delta_id == wf_module.last_relevant_delta_id
    assert result is not None

    json_bytes = json_encode(result.json).encode("utf-8")
    if not result.table.metadata.columns:
        if result.errors:
            status = "error"
        else:
            status = "unreachable"
    else:
        status = "ok"

    wf_module.cached_render_result_delta_id = delta_id
    wf_module.cached_render_result_errors = result.errors
    wf_module.cached_render_result_error = ""  # DELETEME
    wf_module.cached_render_result_quick_fixes = []  # DELETEME
    wf_module.cached_render_result_status = status
    wf_module.cached_render_result_json = json_bytes
    wf_module.cached_render_result_columns = result.table.metadata.columns
    wf_module.cached_render_result_nrows = result.table.metadata.n_rows

    # Now we get to the part where things can end up inconsistent. Try to
    # err on the side of not-caching when that happens.
    delete_parquet_files_for_wf_module(
        workflow.id, wf_module.id
    )  # makes old cache inconsistent
    wf_module.save(update_fields=WF_MODULE_FIELDS)  # makes new cache inconsistent
    if result.table.metadata.columns:
        # only write non-zero-column tables
        with tempfile_context() as parquet_path:
            parquet.write(parquet_path, result.table.table)
            minio.fput_file(
                BUCKET,
                parquet_key(workflow.id, wf_module.id, delta_id),
                parquet_path,
            )  # makes new cache consistent
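# A concrete caller sketch, mirroring the locking recipe in the docstring above.
# `workflow`, `wf_module`, `delta_id`, and `result` are assumed to be in scope;
# the function name is hypothetical. Refreshing inside the lock means a stale
# `delta_id` trips the AssertionError guard instead of caching a wrong result.
def save_render_result_sketch(workflow, wf_module, delta_id, result):
    with workflow.cooperative_lock():
        wf_module.refresh_from_db()  # may change last_relevant_delta_id
        cache_render_result(workflow, wf_module, delta_id, result)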
def test_slice_zero_rows(self):
    with tempfile_context() as path:
        # ensure at least 1 row group
        parquet.write(
            path,
            pa.table(
                {
                    "A": pa.array([], pa.string()),
                    "B": pa.DictionaryArray.from_arrays(
                        pa.array([], pa.int32()), pa.array([], pa.string())
                    ),
                    "C": pa.array([], pa.timestamp("ns")),
                    "D": pa.array([], pa.float64()),
                }
            ),
        )
        self.assertEqual(
            parquet.read_slice_as_text(path, "csv", range(4), range(0)),
            "A,B,C,D",
        )
        self.assertEqual(
            parquet.read_slice_as_text(path, "json", range(4), range(0)),
            "[]",
        )
def fetch_arrow(
    params: Dict[str, Any],
    secrets: Dict[str, Any],
    last_fetch_result: Optional[types.FetchResult],
    input_table_parquet_path: Optional[Path],
    output_path: Path,
) -> types.FetchResult:
    """
    Render using `cjwkernel.types` data types.

    The result will be encoded as a Parquet file.

    Module authors are encouraged to replace this function, because the
    `fetch()` signature deals in dataframes instead of in raw data.
    """
    pandas_result: Union[ptypes.ProcessResult, types.FetchResult] = fetch_pandas(
        params=_arrow_param_to_pandas_param(params),
        secrets=secrets,
        last_fetch_result=last_fetch_result,
        input_table_parquet_path=input_table_parquet_path,
        output_path=output_path,
    )
    if isinstance(pandas_result, ptypes.ProcessResult):
        pandas_result.truncate_in_place_if_too_big()
        # ProcessResult => FetchResult isn't a thing; but we can hack it using
        # ProcessResult => RenderResult => FetchResult.
        with tempfile_context(suffix=".arrow") as arrow_path:
            hacky_result = pandas_result.to_arrow(arrow_path)
            if hacky_result.table.path:
                parquet.write(output_path, hacky_result.table.table)
            else:
                output_path.write_bytes(b"")
        return types.FetchResult(output_path, hacky_result.errors)
    else:  # it's already a types.FetchResult
        return pandas_result
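# A hypothetical module-author override, per the docstring above: a module that
# deals in raw data can skip the pandas hop entirely and write bytes straight to
# `output_path`. This is a sketch under assumptions: the `"url"` param name is
# illustrative, and `types.FetchResult` is assumed to default to no errors when
# constructed from a path alone.
def fetch_arrow_override_sketch(
    params: Dict[str, Any],
    secrets: Dict[str, Any],
    last_fetch_result: Optional[types.FetchResult],
    input_table_parquet_path: Optional[Path],
    output_path: Path,
) -> types.FetchResult:
    import urllib.request

    with urllib.request.urlopen(params["url"]) as response:
        output_path.write_bytes(response.read())
    return types.FetchResult(output_path)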
def test_parquet_vs_non_parquet(self):
    parquet.write(self.old_path, arrow_table({"A": ["a"]}).table)
    self.new_path.write_bytes(b"12345")
    self.assertFalse(
        are_fetch_results_equal(
            FetchResult(self.old_path), FetchResult(self.new_path)
        )
    )
def test_parquet_different(self):
    parquet.write(self.old_path, arrow_table({"A": [1]}).table)
    parquet.write(self.new_path, arrow_table({"A": [2]}).table)
    self.assertFalse(
        are_fetch_results_equal(
            FetchResult(self.old_path), FetchResult(self.new_path)
        )
    )