def test_date_unit_year_bad(self):
    """date(1900, 4, 1) is not January 1, so unit="year" raises DateValueHasWrongUnit."""
    year_field = pa.field("A", pa.date32(), metadata={b"unit": b"year"})
    table = pa.table([pa.array([date(1900, 4, 1)])], pa.schema([year_field]))
    with self.assertRaises(DateValueHasWrongUnit):
        read_columns(table)
def test_date_metadata_invalid_unit(self):
    """A date32 field with unit=b"days" (unrecognized value) raises FieldMetadataNotAllowed."""
    bad_field = pa.field("A", pa.date32(), metadata={b"unit": b"days"})
    table = pa.table([pa.array([date(2021, 4, 4)])], pa.schema([bad_field]))
    with self.assertRaises(FieldMetadataNotAllowed):
        read_columns(table)
def test_text_metadata_not_none(self):
    """A text field carrying "unit" metadata raises FieldMetadataNotAllowed."""
    text_field = pa.field("A", pa.string(), metadata={b"unit": b"year"})
    table = pa.table([pa.array(["x"])], pa.schema([text_field]))
    with self.assertRaises(FieldMetadataNotAllowed):
        read_columns(table)
def test_timestamp_metadata_non_null(self):
    """A timestamp field carrying any metadata raises FieldMetadataNotAllowed."""
    ts_field = pa.field("A", pa.timestamp("ns"), metadata={b"foo": b"bar"})
    column = pa.array([123123123], pa.timestamp("ns"))
    table = pa.table([column], pa.schema([ts_field]))
    with self.assertRaises(FieldMetadataNotAllowed):
        read_columns(table)
def test_number_metadata_format_invalid_utf8(self):
    """A "format" metadata value that is not valid UTF-8 raises InvalidNumberFormat."""
    # b"\xe2" alone is an incomplete UTF-8 sequence.
    bad_field = pa.field("A", pa.int64(), metadata={b"format": b"\xe2{:,.2f}"})
    table = pa.table([pa.array([123])], pa.schema([bad_field]))
    with self.assertRaises(InvalidNumberFormat):
        read_columns(table)
def test_duplicate_column_names(self):
    """Two columns named "A" raise DuplicateColumnName, identifying both positions."""
    fields = [pa.field("A", pa.string()), pa.field("A", pa.string())]
    table = pa.table([pa.array(["x"]), pa.array(["x"])], pa.schema(fields))
    with self.assertRaisesRegex(
        DuplicateColumnName,
        "Table has two columns named 'A': column 0 and column 1",
    ):
        read_columns(table)
def test_date_metadata_too_many_keys(self):
    """A date field with extra metadata beyond "unit" raises FieldMetadataNotAllowed."""
    metadata = {b"unit": b"day", b"foo": b"bar"}
    date_field = pa.field("A", pa.date32(), metadata=metadata)
    table = pa.table([pa.array([date(2021, 4, 4)])], pa.schema([date_field]))
    with self.assertRaises(FieldMetadataNotAllowed):
        read_columns(table)
def test_number_metadata_too_many_keys(self):
    """A number field with extra metadata beyond "format" raises FieldMetadataNotAllowed."""
    metadata = {b"format": b"{:,}", b"foo": b"bar"}
    number_field = pa.field("A", pa.int64(), metadata=metadata)
    table = pa.table([pa.array([123])], pa.schema([number_field]))
    with self.assertRaises(FieldMetadataNotAllowed):
        read_columns(table)
def test_date_unit_day_ok(self):
    """A date32 field with unit=b"day" reads as ColumnType.Date(unit="day")."""
    day_field = pa.field("A", pa.date32(), metadata={b"unit": b"day"})
    table = pa.table([pa.array([date(2021, 4, 4)])], pa.schema([day_field]))
    self.assertEqual(
        read_columns(table), [Column("A", ColumnType.Date(unit="day"))]
    )
def call_render(module_spec: ModuleSpec, render: Callable,
                request: ttypes.RenderRequest) -> ttypes.RenderResult:
    """Invoke a pandas-style module ``render()`` and return a Thrift RenderResult.

    Loads the Arrow input table from ``request``, converts it to a pandas
    DataFrame, prepares params/tab outputs, and inspects ``render``'s
    signature to decide which optional kwargs (``fetch_result``, ``settings``,
    ``tab_name``, ``input_columns``) to pass. The (coerced, truncated) result
    is written to ``request.output_filename`` and converted back to Thrift.

    Raises ValueError if ``render`` returns something ProcessResult.coerce
    cannot handle.
    """
    basedir = Path(request.basedir)
    input_path = basedir / request.input_filename
    # Input is trusted (already validated upstream of this process).
    table = load_trusted_arrow_file(input_path)
    dataframe = cjwpandasmodule.convert.arrow_table_to_pandas_dataframe(table)
    tab_outputs = {
        k: _thrift_tab_output_to_pandas(v, basedir)
        for k, v in request.tab_outputs.items()
    }
    params = _prepare_params(module_spec,
                             thrift_json_object_to_pydict(request.params),
                             basedir, tab_outputs)
    # Only pass kwargs the module's render() actually declares (or **kwargs).
    spec = inspect.getfullargspec(render)
    kwargs = {}
    varkw = bool(spec.varkw)  # if True, function accepts **kwargs
    kwonlyargs = spec.kwonlyargs
    if varkw or "fetch_result" in kwonlyargs:
        if request.fetch_result is None:
            fetch_result = None
        else:
            fetch_result_path = basedir / request.fetch_result.filename
            errors = [
                # Data comes in as FetchError and we return RenderError.
                RenderError(thrift_i18n_message_to_arrow(e.message))
                for e in request.fetch_result.errors
            ]
            # Empty file or Parquet file => expose as a pandas ProcessResult;
            # anything else => raw FetchResult pointing at the file on disk.
            if (fetch_result_path.stat().st_size == 0
                    or cjwparquet.file_has_parquet_magic_number(
                        fetch_result_path)):
                fetch_result = ptypes.ProcessResult(
                    dataframe=_parquet_to_pandas(fetch_result_path),
                    errors=errors,
                    # infer columns -- the fetch interface doesn't handle formats
                    # (TODO nix pandas_v0 fetching altogether by rewriting all modules)
                )
            else:
                # TODO nix pandas Fetch modules. (Do any use files, even?)
                fetch_result = types.FetchResult(path=fetch_result_path,
                                                 errors=errors)
        kwargs["fetch_result"] = fetch_result
    if varkw or "settings" in kwonlyargs:
        kwargs["settings"] = settings
    if varkw or "tab_name" in kwonlyargs:
        kwargs["tab_name"] = request.tab_name
    if varkw or "input_columns" in kwonlyargs:
        kwargs["input_columns"] = arrow_schema_to_render_columns(table.schema)
    input_columns = read_columns(table, full=False)
    raw_result = render(dataframe, params, **kwargs)  # raise ValueError if invalid
    # Fall back to the input table's column types for columns the module
    # passed through unchanged.
    pandas_result = ptypes.ProcessResult.coerce(
        raw_result, try_fallback_columns=input_columns)
    pandas_result.truncate_in_place_if_too_big()
    arrow_result = pandas_result.to_arrow(basedir / request.output_filename)
    return arrow_render_result_to_thrift(arrow_result)
def test_date_unit_month_ok(self):
    """First-of-month dates (and nulls) pass validation for unit=b"month"."""
    values = pa.array([date(1200, 12, 1), date(3199, 2, 1), None])
    month_field = pa.field("A", pa.date32(), metadata={b"unit": b"month"})
    table = pa.table([values], pa.schema([month_field]))
    self.assertEqual(
        read_columns(table), [Column("A", ColumnType.Date(unit="month"))]
    )
def test_number_metadata_utf8_format(self):
    """A valid non-ASCII UTF-8 "format" is decoded into ColumnType.Number."""
    format_bytes = "€{:,.2f}".encode("utf-8")
    number_field = pa.field("A", pa.int64(),
                            metadata={b"format": format_bytes})
    table = pa.table([pa.array([123])], pa.schema([number_field]))
    self.assertEqual(
        read_columns(table),
        [Column("A", ColumnType.Number(format="€{:,.2f}"))],
    )
def test_date_unit_year_ok(self):
    """January-1 dates across the full date range (plus null) pass unit=b"year"."""
    values = pa.array(
        [date(1900, 1, 1), date(1, 1, 1), date(9999, 1, 1), None]
    )
    year_field = pa.field("A", pa.date32(), metadata={b"unit": b"year"})
    table = pa.table([values], pa.schema([year_field]))
    self.assertEqual(
        read_columns(table), [Column("A", ColumnType.Date(unit="year"))]
    )
def write_to_rendercache(
    workflow: Workflow,
    step: Step,
    delta_id: int,
    table: pa.Table,
    errors: "Optional[List[RenderError]]" = None,
    json: "Optional[Dict[str, Any]]" = None,
) -> None:
    """Store ``table`` (with ``errors``/``json``) in ``step``'s render cache.

    Temporarily sets ``step.last_relevant_delta_id`` to the caller-provided
    ``delta_id`` so ``cache_render_result()`` accepts it, then restores the
    old value even if caching fails.

    ``errors`` defaults to no errors; ``json`` defaults to an empty dict.
    """
    # Fix: the originals were mutable default arguments ([] and {}), which
    # are shared across calls. Use None sentinels and build fresh containers.
    # (Annotations are strings so this works even if Optional isn't imported
    # at module level -- NOTE(review): confirm typing imports.)
    if errors is None:
        errors = []
    if json is None:
        json = {}
    with arrow_table_context(table) as (path, table):
        result = LoadedRenderResult(
            path=path,
            table=table,
            columns=read_columns(table, full=False),
            errors=errors,
            json=json,
        )
        # use the caller-provided delta ID: no assertion
        old_last_relevant_delta_id = step.last_relevant_delta_id
        step.last_relevant_delta_id = delta_id
        try:
            cache_render_result(workflow, step, delta_id, result)
        finally:
            # Always restore, even if cache_render_result raised.
            step.last_relevant_delta_id = old_last_relevant_delta_id
def test_timestamp_tz_non_null(self):
    """A timezone-aware timestamp column raises TimestampTimezoneNotAllowed."""
    column = pa.array([12312312314512], pa.timestamp("ns", tz="utc"))
    table = pa.table({"A": column})
    with self.assertRaisesRegex(
        TimestampTimezoneNotAllowed, "Workbench does not support time zones"
    ):
        read_columns(table)
def test_table_too_many_record_batches(self):
    """A column split across two chunks raises TableHasTooManyRecordBatches."""
    chunks = pa.chunked_array([pa.array(["x"]), pa.array(["y"])])
    table = pa.table({"A": chunks})
    with self.assertRaises(TableHasTooManyRecordBatches):
        read_columns(table)
def test_table_has_metadata(self):
    """Non-null schema metadata -- even an empty dict -- raises TableSchemaHasMetadata."""
    # replace_schema_metadata({}) attaches metadata that is empty but non-null
    table_with_metadata = pa.table({"A": ["x"]}).replace_schema_metadata({})
    with self.assertRaises(TableSchemaHasMetadata):
        read_columns(table_with_metadata)
def test_unknown_column_type(self):
    """An unsupported Arrow type (time64) raises WrongColumnType."""
    column = pa.array([1231231], pa.time64("ns"))
    with self.assertRaises(WrongColumnType):
        read_columns(pa.table({"A": column}))
def test_timestamp_unit_not_ns(self):
    """A microsecond-resolution timestamp raises TimestampUnitNotAllowed."""
    column = pa.array([12312312314512], pa.timestamp("us"))
    table = pa.table({"A": column})
    with self.assertRaisesRegex(
        TimestampUnitNotAllowed, "Workbench only supports 'ns'"
    ):
        read_columns(table)
def test_timestamp_ok(self):
    """A metadata-free ns-resolution timestamp reads as ColumnType.Timestamp."""
    column = pa.array([12312312314512], pa.timestamp("ns"))
    self.assertEqual(
        read_columns(pa.table({"A": column})),
        [Column("A", ColumnType.Timestamp())],
    )
def test_number_metadata_none(self):
    """An int64 column with no field metadata raises FieldMetadataNotAllowed."""
    table = pa.table({"A": pa.array([123123123])})
    with self.assertRaises(FieldMetadataNotAllowed):
        read_columns(table)
def test_text_dictionary_ok(self):
    """A dictionary-encoded string column still reads as ColumnType.Text."""
    encoded = pa.array(["x"]).dictionary_encode()
    self.assertEqual(
        read_columns(pa.table({"A": encoded})),
        [Column("A", ColumnType.Text())],
    )
def test_text_ok(self):
    """A plain string column reads as ColumnType.Text."""
    table = pa.table({"A": ["x"]})
    self.assertEqual(read_columns(table), [Column("A", ColumnType.Text())])