def fetch_arrow(
    params: Dict[str, Any],
    secrets: Dict[str, Any],
    last_fetch_result: Optional[types.FetchResult],
    input_table_parquet_path: Optional[Path],
    output_path: Path,
) -> types.FetchResult:
    """Fetch using `cjwkernel.types` data types.

    The result will be encoded as a Parquet file.

    Module authors are encouraged to replace this function, because the
    `fetch()` signature deals in dataframes instead of in raw data.
    """
    pandas_result: Union[ptypes.ProcessResult, types.FetchResult] = fetch_pandas(
        params=__arrow_param_to_pandas_param(params),
        secrets=secrets,
        last_fetch_result=last_fetch_result,
        input_table_parquet_path=input_table_parquet_path,
        output_path=output_path,
    )
    if isinstance(pandas_result, ptypes.ProcessResult):
        pandas_result.truncate_in_place_if_too_big()
        # ProcessResult => FetchResult isn't a thing; but we can hack it using
        # ProcessResult => RenderResult => FetchResult.
        with tempfile_context(suffix=".arrow") as arrow_path:
            hacky_result = pandas_result.to_arrow(arrow_path)
            if hacky_result.table.path:
                # Convert to Parquet while the backing .arrow file still exists.
                cjwparquet.write(output_path, hacky_result.table.table)
            else:
                output_path.write_bytes(b"")
        return types.FetchResult(output_path, hacky_result.errors)
    else:  # it's already a types.FetchResult
        return pandas_result
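# The docstring above encourages module authors to replace fetch_arrow(). A
# minimal sketch of what such a replacement's underlying fetch() might look
# like, relying on the fact that call_fetch() (below) accepts a Path return
# value. The "url" parameter name and the streaming logic are hypothetical,
# not part of this codebase.
from pathlib import Path
from typing import Any, Dict
from urllib.request import urlopen


def fetch(params: Dict[str, Any], *, output_path: Path) -> Path:
    # Stream raw bytes straight to output_path -- no DataFrame round-trip,
    # so there is nothing to truncate or convert.
    with urlopen(params["url"]) as response, output_path.open("wb") as f:
        while chunk := response.read(64 * 1024):
            f.write(chunk)
    return output_path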
def test_parquet_same_data_different_bytes(self):
    cjwparquet.write(self.old_path, make_table(make_column("A", ["a"])))
    cjwparquet.write(
        self.new_path, make_table(make_column("A", ["a"], dictionary=True))
    )
    self.assertTrue(
        are_fetch_results_equal(
            FetchResult(self.old_path), FetchResult(self.new_path)
        )
    )
def test_slice_zero_rows(self):
    with tempfile_context() as path:
        # ensure at least 1 row group
        cjwparquet.write(
            path,
            pa.table(
                {
                    "A": pa.array([], pa.string()),
                    "B": pa.DictionaryArray.from_arrays(
                        pa.array([], pa.int32()), pa.array([], pa.string())
                    ),
                    "C": pa.array([], pa.timestamp("ns")),
                    "D": pa.array([], pa.float64()),
                }
            ),
        )
        self.assertEqual(
            cjwparquet.read_slice_as_text(
                path, format="csv", only_columns=range(4), only_rows=range(0)
            ),
            "A,B,C,D",
        )
        self.assertEqual(
            cjwparquet.read_slice_as_text(
                path, format="json", only_columns=range(4), only_rows=range(0)
            ),
            "[]",
        )
def _test_read_write_table(self, table, expected=None):
    if isinstance(table, dict):
        table = pa.table(table)
    if expected is None:
        expected = table
    with tempfile_context(prefix="parquet-text") as temp_path:
        cjwparquet.write(temp_path, table)
        result = cjwparquet.read(temp_path)
        # Compare against `expected` (not `table`), so callers can pass a
        # different expected table -- otherwise the parameter is dead code.
        assert_arrow_table_equals(result, expected)
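# Hypothetical usage of the helper above: round-trip a plain dict, which the
# helper converts to a pyarrow.Table before writing.
def test_read_write_int64(self):
    self._test_read_write_table({"A": [1, 2, 3]})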
def test_parquet_same_data_different_bytes(self):
    cjwparquet.write(self.old_path, arrow_table({"A": ["a"]}).table)
    cjwparquet.write(
        self.new_path,
        arrow_table({"A": pyarrow.array(["a"]).dictionary_encode()}).table,
    )
    self.assertTrue(
        are_fetch_results_equal(
            FetchResult(self.old_path), FetchResult(self.new_path)
        )
    )
def cache_render_result(
    workflow: Workflow, wf_module: WfModule, delta_id: int, result: RenderResult
) -> None:
    """Save `result` for later viewing.

    Raise AssertionError if `delta_id` is not what we expect.

    Since this alters data, be sure to call it within a lock:

        with workflow.cooperative_lock():
            wf_module.refresh_from_db()  # may change delta_id
            cache_render_result(workflow, wf_module, delta_id, result)
    """
    assert delta_id == wf_module.last_relevant_delta_id
    assert result is not None

    json_bytes = json_encode(result.json).encode("utf-8")
    if not result.table.metadata.columns:
        if result.errors:
            status = "error"
        else:
            status = "unreachable"
    else:
        status = "ok"

    wf_module.cached_render_result_delta_id = delta_id
    wf_module.cached_render_result_errors = result.errors
    wf_module.cached_render_result_status = status
    wf_module.cached_render_result_json = json_bytes
    wf_module.cached_render_result_columns = result.table.metadata.columns
    wf_module.cached_render_result_nrows = result.table.metadata.n_rows

    # Now we get to the part where things can end up inconsistent. Try to
    # err on the side of not-caching when that happens.
    delete_parquet_files_for_wf_module(
        workflow.id, wf_module.id
    )  # makes old cache inconsistent
    wf_module.save(update_fields=WF_MODULE_FIELDS)  # makes new cache inconsistent

    if result.table.metadata.columns:
        # only write non-zero-column tables
        with tempfile_context() as parquet_path:
            cjwparquet.write(parquet_path, result.table.table)
            minio.fput_file(
                BUCKET, parquet_key(workflow.id, wf_module.id, delta_id), parquet_path
            )  # makes new cache consistent
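# Caller-side sketch following the lock discipline in the docstring above.
# The function name save_result is hypothetical; the lock-refresh-cache
# sequence is taken from the docstring itself.
def save_result(workflow: Workflow, wf_module: WfModule, result: RenderResult) -> None:
    with workflow.cooperative_lock():
        wf_module.refresh_from_db()  # may change last_relevant_delta_id
        cache_render_result(
            workflow, wf_module, wf_module.last_relevant_delta_id, result
        )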
def parquet_file(d):
    # Build an Arrow table from `d` and yield a Path to a temporary Parquet
    # file holding it; the file is deleted when the generator finishes.
    arrow_table = pyarrow.table(d)
    with tempfile.NamedTemporaryFile(suffix=".parquet") as tf:
        path = Path(tf.name)
        cjwparquet.write(path, arrow_table)
        yield path
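# Usage sketch: parquet_file() is a generator, so it can serve as a context
# manager. This assumes the excerpt shows the full, undecorated function; the
# original source may instead carry a @contextlib.contextmanager or
# @pytest.fixture decorator, in which case the explicit wrap below is
# unnecessary.
import contextlib

with contextlib.contextmanager(parquet_file)({"A": [1, 2], "B": ["x", "y"]}) as path:
    table = cjwparquet.read(path)  # the file exists only inside this block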
def call_fetch(fetch: Callable, request: ttypes.FetchRequest) -> ttypes.FetchResult:
    """Call `fetch()` and validate the result.

    Module code may contain errors. This function and `fetch()` should strive
    to raise developer-friendly errors in the case of bugs -- including
    unexpected input.
    """
    # thrift => pandas
    basedir = Path(request.basedir)
    params: Dict[str, Any] = thrift_json_object_to_pydict(request.params)
    output_path = basedir / request.output_filename

    spec = inspect.getfullargspec(fetch)
    kwargs = {}
    varkw = bool(spec.varkw)  # if True, function accepts **kwargs
    kwonlyargs = spec.kwonlyargs
    if varkw or "secrets" in kwonlyargs:
        kwargs["secrets"] = thrift_json_object_to_pydict(request.secrets)
    if varkw or "settings" in kwonlyargs:
        kwargs["settings"] = settings
    if varkw or "get_input_dataframe" in kwonlyargs:

        async def get_input_dataframe():
            if request.input_table_parquet_filename is None:
                return None
            else:
                return _parquet_to_pandas(
                    basedir / request.input_table_parquet_filename
                )

        kwargs["get_input_dataframe"] = get_input_dataframe

    if varkw or "output_path" in kwonlyargs:
        kwargs["output_path"] = output_path

    result = fetch(params, **kwargs)
    if asyncio.iscoroutine(result):
        result = asyncio.run(result)

    if (
        isinstance(result, tuple)
        and len(result) == 2
        and isinstance(result[0], Path)
    ):
        errors = ptypes.coerce_RenderError_list(result[1])
    elif isinstance(result, Path):
        errors = []
    elif isinstance(result, list):
        errors = ptypes.coerce_RenderError_list(result)
    else:
        pandas_result = ptypes.ProcessResult.coerce(result)
        pandas_result.truncate_in_place_if_too_big()
        # ProcessResult => FetchResult isn't a thing; but we can hack it using
        # ProcessResult => RenderResult => FetchResult.
        with tempfile_context(suffix=".arrow") as arrow_path:
            if pandas_result.columns:
                hacky_result = pandas_result.to_arrow(arrow_path)
                table = load_trusted_arrow_file(arrow_path)
                cjwparquet.write(output_path, table)
                errors = hacky_result.errors
            else:
                output_path.write_bytes(b"")
                errors = pandas_result.errors

    return ttypes.FetchResult(
        filename=request.output_filename,
        errors=[arrow_render_error_to_thrift(e) for e in errors],
    )
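# Standalone illustration of the kwargs-introspection pattern call_fetch()
# uses: pass a keyword argument only when the target function declares it (or
# accepts **kwargs). All names below are hypothetical.
import inspect


def call_with_optional_kwargs(func, available_kwargs):
    spec = inspect.getfullargspec(func)
    if spec.varkw:  # func accepts **kwargs: pass everything
        return func(**available_kwargs)
    # Otherwise pass only the keyword-only args func actually declares.
    return func(**{k: v for k, v in available_kwargs.items() if k in spec.kwonlyargs})


def example_fetch(*, secrets=None):
    return secrets


# "settings" is silently dropped because example_fetch() does not declare it.
assert call_with_optional_kwargs(example_fetch, {"secrets": {}, "settings": None}) == {}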
def test_parquet_vs_non_parquet(self):
    cjwparquet.write(self.old_path, arrow_table({"A": ["a"]}).table)
    self.new_path.write_bytes(b"12345")
    self.assertFalse(
        are_fetch_results_equal(
            FetchResult(self.old_path), FetchResult(self.new_path)
        )
    )
def test_parquet_different(self):
    cjwparquet.write(self.old_path, arrow_table({"A": [1]}).table)
    cjwparquet.write(self.new_path, arrow_table({"A": [2]}).table)
    self.assertFalse(
        are_fetch_results_equal(
            FetchResult(self.old_path), FetchResult(self.new_path)
        )
    )
def test_parquet_vs_non_parquet(self):
    cjwparquet.write(self.old_path, make_table(make_column("A", ["a"])))
    self.new_path.write_bytes(b"12345")
    self.assertFalse(
        are_fetch_results_equal(
            FetchResult(self.old_path), FetchResult(self.new_path)
        )
    )
def test_parquet_different(self):
    cjwparquet.write(self.old_path, make_table(make_column("A", [1])))
    cjwparquet.write(self.new_path, make_table(make_column("A", [2])))
    self.assertFalse(
        are_fetch_results_equal(
            FetchResult(self.old_path), FetchResult(self.new_path)
        )
    )