def test_metadata_does_not_require_file_read(self):
    """Table metadata of a cached result is readable after its file is removed.

    Caches a one-row, four-column render result, removes the object stored
    under its ``crr_parquet_key``, re-fetches the step from the database and
    checks that ``table_metadata`` still describes one row and the same
    columns.
    """
    number_format = "{:,.2f}"
    columns = [
        Column("A", ColumnType.Number(format=number_format)),
        Column("B", ColumnType.Timestamp()),
        Column("C", ColumnType.Text()),
        Column("D", ColumnType.Date("month")),
    ]
    arrow_columns = (
        make_column("A", [1], format=number_format),
        make_column("B", [datetime.datetime(2021, 4, 13)]),
        make_column("C", ["c"]),
        make_column("D", [datetime.date(2021, 4, 1)], unit="month"),
    )

    with arrow_table_context(*arrow_columns) as (path, table):
        render_result = LoadedRenderResult(
            path=path,
            table=table,
            columns=columns,
            errors=[],
            json={},
        )
        cache_render_result(self.workflow, self.step, 1, render_result)

    # Delete from disk entirely, to prove we did not read.
    parquet_key = crr_parquet_key(self.step.cached_render_result)
    s3.remove(BUCKET, parquet_key)

    # Load _new_ CachedRenderResult -- from DB columns, not memory
    reloaded_step = Step.objects.get(id=self.step.id)
    actual_metadata = reloaded_step.cached_render_result.table_metadata

    self.assertEqual(actual_metadata, TableMetadata(1, columns))
def test_render_with_input_columns(self):
    """``render`` receives an ``input_columns`` dict describing each column.

    The module-side ``render`` asserts that text, number and timestamp
    columns arrive as ``ptypes.RenderColumn`` values keyed by column name.
    """

    def render(table, params, *, input_columns):
        expected = {
            "A": ptypes.RenderColumn("A", "text", None),
            "B": ptypes.RenderColumn("B", "number", "{:,.3f}"),
            "C": ptypes.RenderColumn("C", "timestamp", None),
        }
        self.assertEqual(input_columns, expected)

    table_data = {
        "A": ["x"],
        "B": [1],
        "C": pa.array([datetime.now()], pa.timestamp("ns")),
    }
    table_columns = [
        Column("A", ColumnType.Text()),
        Column("B", ColumnType.Number("{:,.3f}")),
        Column("C", ColumnType.Timestamp()),
    ]

    with arrow_table_context(
        table_data, columns=table_columns, dir=self.basedir
    ) as arrow_table:
        self._test_render(render, arrow_table=arrow_table)
def render(arrow_table, params, output_path, *, columns, **kwargs):
    """Check the ``columns`` kwarg, then write a 12-column table to *output_path*.

    The written schema tags field ``J`` with ``{"unit": "month"}`` metadata.
    Returns an empty list.
    """
    # Test the "columns" kwarg
    #
    # TODO nix this! The only module that uses it is `converttotext`.
    expected_columns = [
        Column("A", ColumnType.Number("{:,.3f}")),
        Column("B", ColumnType.Number("{:,.3f}")),
        Column("C", ColumnType.Number("{:,.3f}")),
        Column("D", ColumnType.Timestamp()),
        Column("E", ColumnType.Timestamp()),
        Column("F", ColumnType.Timestamp()),
        Column("G", ColumnType.Text()),
        Column("H", ColumnType.Text()),
        Column("I", ColumnType.Text()),
        Column("J", ColumnType.Date(unit="day")),
        Column("K", ColumnType.Date(unit="week")),
        Column("L", ColumnType.Text()),
    ]
    self.assertEqual(columns, expected_columns)

    data = {
        "A": [1],
        "B": pa.array([datetime(2020, 3, 8)], pa.timestamp("ns")),
        "C": ["a"],
        "D": [1],
        "E": pa.array([datetime(2020, 3, 8)], pa.timestamp("ns")),
        "F": ["a"],
        "G": [1],
        "H": pa.array([datetime(2020, 3, 8)], pa.timestamp("ns")),
        "I": ["a"],
        "J": pa.array([date(2021, 4, 1)]),
        "K": pa.array([date(2021, 4, 12)]),
        "L": pa.array([date(2021, 4, 1)]),
    }
    table = pa.table(data)

    # Replace field J with a date32 field that carries unit metadata.
    j_index = table.schema.get_field_index("J")
    j_field = pa.field("J", pa.date32(), metadata={"unit": "month"})
    schema = table.schema.set(j_index, j_field)

    output_table = pa.table(table.columns, schema=schema)
    with pa.ipc.RecordBatchFileWriter(output_path, schema) as writer:
        writer.write_table(output_table)

    return []
def _arrow_column_to_column(name: str, column: pyarrow.ChunkedArray) -> Column:
    """Build a ``Column`` named *name* whose type is derived from *column*.

    Floating-point and integer Arrow types become a number column with the
    ``"{:,}"`` format, timestamp types become a timestamp column, and string
    or dictionary types become a text column.

    Raises:
        RuntimeError: if the Arrow type matches none of the above.
    """
    arrow_type = column.type
    types = pyarrow.types

    if types.is_floating(arrow_type) or types.is_integer(arrow_type):
        return Column(name, ColumnType.Number("{:,}"))
    if types.is_timestamp(arrow_type):
        return Column(name, ColumnType.Timestamp())
    if types.is_string(arrow_type) or types.is_dictionary(arrow_type):
        return Column(name, ColumnType.Text())

    raise RuntimeError("Unknown column type %r" % arrow_type)
def test_read_cached_render_result_slice_as_text_timestamp(self):
    """A cached nanosecond timestamp value is emitted as ISO 8601 text in CSV."""
    timestamp_column = make_column(
        "A", [2134213412341232967, None], pa.timestamp("ns")
    )

    with arrow_table_context(timestamp_column) as (path, table):
        render_result = LoadedRenderResult(
            path=path,
            table=table,
            columns=[Column("A", ColumnType.Timestamp())],
            errors=[],
            json={},
        )
        cache_render_result(self.workflow, self.step, 1, render_result)

    crr = self.step.cached_render_result
    actual_text = read_cached_render_result_slice_as_text(
        crr, "csv", range(2), range(3)
    )
    self.assertEqual(actual_text, "A\n2037-08-18T13:03:32.341232967Z\n")
def test_clean_multicolumn_prompting_error_convert_to_text(self):
    """Non-text columns in a text-only multicolumn param raise PromptingError.

    Cleaning ``"A,B"`` against a schema that only accepts text columns, when
    A is a number column and B is a timestamp column, must raise a
    ``PromptingError`` carrying one ``WrongColumnType`` error for both
    columns.
    """
    # TODO make this _automatic_ instead of quick-fix?
    # ... but for now: prompt for a Quick Fix.
    input_shape = TableMetadata(
        3,
        [
            Column("A", ColumnType.Number()),
            Column("B", ColumnType.Timestamp()),
            Column("C", ColumnType.Text()),
        ],
    )
    # Build the schema outside the assertRaises block so that only the call
    # under test can satisfy the expectation; an error raised during setup
    # must fail the test rather than be mistaken for the expected one.
    schema = ParamDType.Multicolumn(column_types=frozenset({"text"}))

    with self.assertRaises(PromptingError) as cm:
        clean_value(schema, "A,B", input_shape)

    self.assertEqual(
        cm.exception.errors,
        [PromptingError.WrongColumnType(["A", "B"], None, frozenset({"text"}))],
    )
def Timestamp(name: str) -> Column:
    """Return a ``Column`` called *name* with a timestamp column type."""
    timestamp_type = ColumnType.Timestamp()
    return Column(name, timestamp_type)
def test_timestamp_ok(self):
    """``read_columns`` reports a nanosecond timestamp array as a Timestamp column."""
    timestamps = pa.array([12312312314512], pa.timestamp("ns"))
    table = pa.table({"A": timestamps})

    expected_columns = [Column("A", ColumnType.Timestamp())]
    self.assertEqual(read_columns(table), expected_columns)
def test_render_arrow_table_infer_output_column_formats_from_input(self):
    """Output column types are recalled from the input or inferred.

    The input has three number, three timestamp and three text columns.
    ``render`` writes a table in which each group of three holds a number, a
    timestamp and a text column. Expected result: a column whose type is
    unchanged keeps its input column type ("recalled"); any other column
    gets a type derived from the written data ("inferred").
    """

    def timestamp_array():
        # Fresh single-value nanosecond timestamp array for each column.
        return pa.array([datetime(2020, 3, 8)], pa.timestamp("ns"))

    input_columns = [
        Column("A", ColumnType.Number("{:,.3f}")),
        Column("B", ColumnType.Number("{:,.3f}")),
        Column("C", ColumnType.Number("{:,.3f}")),
        Column("D", ColumnType.Timestamp()),
        Column("E", ColumnType.Timestamp()),
        Column("F", ColumnType.Timestamp()),
        Column("G", ColumnType.Text()),
        Column("H", ColumnType.Text()),
        Column("I", ColumnType.Text()),
    ]

    # The param name "arrow_table" is a special case
    def render(arrow_table, params, output_path, *, columns, **kwargs):
        # Test the "columns" kwarg
        self.assertEqual(columns, input_columns)

        output_data = {
            "A": [1],
            "B": timestamp_array(),
            "C": ["a"],
            "D": [1],
            "E": timestamp_array(),
            "F": ["a"],
            "G": [1],
            "H": timestamp_array(),
            "I": ["a"],
        }
        output_table = pa.table(output_data)
        with pa.ipc.RecordBatchFileWriter(
            output_path, output_table.schema
        ) as writer:
            writer.write_table(output_table)
        return []

    input_data = {
        "A": [1],
        "B": [1],
        "C": [1],
        "D": timestamp_array(),
        "E": timestamp_array(),
        "F": timestamp_array(),
        "G": ["a"],
        "H": ["a"],
        "I": ["a"],
    }

    with arrow_table_context(
        input_data, columns=input_columns, dir=self.basedir
    ) as arrow_table:
        result = self._test_render(render, arrow_table=arrow_table)
        expected_columns = [
            Column("A", ColumnType.Number("{:,.3f}")),  # recalled
            Column("B", ColumnType.Timestamp()),  # inferred
            Column("C", ColumnType.Text()),  # inferred
            Column("D", ColumnType.Number("{:,}")),  # inferred
            Column("E", ColumnType.Timestamp()),  # recalled
            Column("F", ColumnType.Text()),  # inferred
            Column("G", ColumnType.Number("{:,}")),  # inferred
            Column("H", ColumnType.Timestamp()),  # inferred
            Column("I", ColumnType.Text()),  # recalled
        ]
        self.assertEqual(result.table.metadata.columns, expected_columns)
def TIMESTAMP(name: str):
    """Return a ``Column`` called *name* with a timestamp column type."""
    timestamp_type = ColumnType.Timestamp()
    return Column(name, timestamp_type)