def test_format_float64_array():
    """float64 values format with "{:,}"; NaN, +/-inf and null all come out as null."""
    numbers = pa.array(
        [
            1,
            -1.1,
            2.3,
            -3.123,
            math.nan,
            math.inf,
            -math.inf,
            4_000_000.23123,
            -4_000_000.414213,
            None,
        ],
        pa.float64(),
    )
    formatted = format_number_array(numbers, parse_number_format("{:,}"))
    expected = [
        "1",
        "-1.1",
        "2.3",
        "-3.123",
        None,  # NaN
        None,  # +inf
        None,  # -inf
        "4,000,000.23123",
        "-4,000,000.414213",
        None,  # null input
    ]
    assert formatted.to_pylist() == expected
def _infer_column(
    series: pd.Series, given_format: Optional[str], try_fallback: Optional[Column]
) -> Column:
    """
    Build a valid `Column` for the given Series, or raise `ValueError`.

    The `ColumnType` class is chosen from the pandas/numpy dtype of `series`.
    Within that class, the format/unit is picked in this order:

    1. `given_format`, when not None. An invalid value raises `ValueError`
       because the user asked for something invalid. Timestamp and text
       columns accept no format at all, so any `given_format` is an error.
    2. `try_fallback`, when it is given and its type is of the same
       `ColumnType` class (number and date columns only).
    3. The default: `"{:,}"` for numbers, `"day"` for dates.

    A dtype that matches none of the known classes raises `ValueError`.
    """
    dtype = series.dtype
    name = series.name

    # Number
    if is_numeric_dtype(dtype):
        if given_format is not None:
            parse_number_format(given_format)  # raises ValueError when invalid
            return Column(name, ColumnType.Number(format=given_format))
        if try_fallback is not None and isinstance(
            try_fallback.type, ColumnType.Number
        ):
            return try_fallback
        return Column(name, ColumnType.Number(format="{:,}"))

    # Timestamp
    if is_datetime64_dtype(dtype):
        if given_format is not None:
            raise ValueError(
                '"format" not allowed for column "%s" because it is of type "timestamp"'
                % (name,)
            )
        return Column(name, ColumnType.Timestamp())

    # Date (stored as a daily Period)
    if pd.PeriodDtype(freq="D") == dtype:
        if given_format is not None:
            if given_format not in {"day", "week", "month", "quarter", "year"}:
                raise ValueError(
                    'Unit must be "day", "week", "month", "quarter" or "year"; got %r for column "%s"'
                    % (given_format, name)
                )
            return Column(name, ColumnType.Date(unit=given_format))
        if try_fallback is not None and isinstance(
            try_fallback.type, ColumnType.Date
        ):
            return try_fallback
        return Column(name, ColumnType.Date(unit="day"))

    # Text
    if dtype == object or dtype == "category":
        if given_format is not None:
            raise ValueError(
                '"format" not allowed for column "%s" because it is of type "text"'
                % (name,)
            )
        return Column(name, ColumnType.Text())

    raise ValueError(f"Unknown dtype: {dtype}")
def test_format_int8_array():
    """int8 values format with "{:d}"; nulls stay null."""
    numbers = pa.array([1, -1, 2, -2, 3, -3, 4, -4, None, None, 6, -6], pa.int8())
    formatted = format_number_array(numbers, parse_number_format("{:d}"))
    expected = ["1", "-1", "2", "-2", "3", "-3", "4", "-4", None, None, "6", "-6"]
    assert formatted.to_pylist() == expected
def test_format_int():
    """An integer format ("{:,d}") accepts ints and floats alike."""
    fmt = parse_number_format("{:,d}")
    cases = [
        (1, "1"),
        (1.0, "1"),  # must treat float like int
        (1.6, "1"),  # round towards 0
        (-1.6, "-1"),  # round towards 0
        (1_234_567, "1,234,567"),
        (1125899906842624, "1,125,899,906,842,624"),  # int64
    ]
    for value, expected in cases:
        assert fmt(value) == expected
def test_format_general():
    """The bare "{}" format prints whole floats like ints and avoids exponents."""
    fmt = parse_number_format("{}")
    cases = [
        (1, "1"),
        (1.0, "1"),  # must treat float like int
        (3.2, "3.2"),
        # not "2.3...e+23"
        (234234234233984229834752.0, "234234234233984229834752"),
        (-1234.4, "-1234.4"),
    ]
    for value, expected in cases:
        assert fmt(value) == expected
def test_format_float():
    """A float format ("{:,.2f}") accepts ints and floats alike and rounds."""
    fmt = parse_number_format("{:,.2f}")
    cases = [
        (1, "1.00"),  # must treat int like float
        (1.0, "1.00"),
        (-1.6, "-1.60"),
        (1_234_567, "1,234,567.00"),
        (1125899906842624, "1,125,899,906,842,624.00"),  # int64
        (1.234567, "1.23"),
        (125899906842624.09, "125,899,906,842,624.09"),  # float64
        (1.235, "1.24"),  # round
    ]
    for value, expected in cases:
        assert fmt(value) == expected
def test_format_uint32_array():
    """uint32 values format with "{:,d}"; nulls stay null."""
    numbers = pa.array(
        [1, 1, 2, 2, 3_000, 3_000, 4_000_000, 4_000_000, None, None, 6, 6],
        pa.uint32(),
    )
    formatted = format_number_array(numbers, parse_number_format("{:,d}"))
    expected = [
        "1",
        "1",
        "2",
        "2",
        "3,000",
        "3,000",
        "4,000,000",
        "4,000,000",
        None,
        None,
        "6",
        "6",
    ]
    assert formatted.to_pylist() == expected
def format_chunked_array(chunked_array: pa.ChunkedArray, field: pa.Field) -> pa.ChunkedArray:
    """Render every chunk of `chunked_array` as utf8 text, guided by `field`.

    The per-chunk formatter is chosen from `field.type`:

    * integer or floating: `format_number_array`, using the number format
      stored under the `b"format"` key of `field.metadata`;
    * timestamp: `format_timestamp_array`;
    * date32: `format_date_array`, using the unit stored under the
      `b"unit"` key of `field.metadata`.

    Any other type is returned unchanged.
    """
    column_type = field.type

    if pa.types.is_integer(column_type) or pa.types.is_floating(column_type):
        # Parse once up front; every chunk shares the same formatter.
        number_format = parse_number_format(field.metadata[b"format"].decode("utf-8"))

        def render_chunk(chunk):
            return format_number_array(chunk, number_format)

    elif pa.types.is_timestamp(column_type):
        render_chunk = format_timestamp_array
    elif pa.types.is_date32(column_type):

        def render_chunk(chunk):
            # The unit is looked up at call time, once per chunk.
            return format_date_array(chunk, field.metadata[b"unit"].decode("utf-8"))

    else:
        # pa.utf8() or pa.dictionary() of pa.utf8()
        return chunked_array

    rendered_chunks = [render_chunk(chunk) for chunk in chunked_array.chunks]
    return pa.chunked_array(rendered_chunks, pa.utf8())
def _read_column_type(
    column: pa.ChunkedArray, field: pa.Field, *, full: bool
) -> ColumnType:
    """Read ColumnType from metadata, or raise ValidateError.

    `field.type` selects the column type; `field.metadata` must match it:

    * timestamp: no metadata, no timezone, unit "ns";
    * date32: exactly one metadata key, `unit`, one of
      day/week/month/quarter/year;
    * string, or dictionary with integer indices: no metadata;
    * integer or floating: exactly one metadata key, `format`, holding a
      UTF-8 number format accepted by `parse_number_format`.

    If `full=False`, skip costly checks (scanning date values to confirm
    they agree with their unit). Only pass `full=False` when you can
    guarantee the data has been generated by a source you trust. (In
    particular, module output is not trusted, so it must be read with
    `full=True`.)

    Raises:
        FieldMetadataNotAllowed: metadata is missing, unexpected or malformed.
        TimestampTimezoneNotAllowed: a timestamp field has a timezone.
        TimestampUnitNotAllowed: a timestamp field's unit is not "ns".
        DateValueHasWrongUnit: `full` is set and a date value is not the
            first day of its week/month/quarter/year.
        InvalidNumberFormat: the number format is not valid UTF-8 or is
            rejected by `parse_number_format`.
        WrongColumnType: `field.type` is none of the supported types.
    """
    if pa.types.is_timestamp(field.type):
        if field.metadata is not None:
            raise FieldMetadataNotAllowed(field.name, "None", field.metadata)
        if field.type.tz is not None:
            raise TimestampTimezoneNotAllowed(field.name, column.type)
        if field.type.unit != "ns":
            raise TimestampUnitNotAllowed(field.name, column.type)
        return ColumnType.Timestamp()

    if pa.types.is_date32(field.type):
        if (
            field.metadata is None
            or len(field.metadata) != 1
            or (
                field.metadata.get(b"unit")
                not in {b"day", b"week", b"month", b"quarter", b"year"}
            )
        ):
            raise FieldMetadataNotAllowed(
                field.name, "'unit' of day/week/month/quarter/year", field.metadata
            )

        unit = field.metadata[b"unit"].decode("ascii")
        if full:
            # unit == "day" needs no value check: it falls through both branches.
            if unit == "week":
                # Only Mondays (ISO weekday 1) are valid
                for chunk in column.chunks:
                    # 1970-01-01 (date32=0) was Thursday. Shift such that
                    # date32=0 is Monday. If chunk == -3, monday0_i64 == 0.
                    #
                    # We use i64 to avoid overflow
                    monday0_i64 = pa.compute.add(
                        chunk.view(pa.int32()).cast(pa.int64()), 3
                    )
                    # divide+multiply. A value survives the round trip
                    # unchanged only when it is a multiple of 7 -- that is,
                    # when the original date is a Monday.
                    all_mondays = pa.compute.multiply(
                        pa.compute.divide(monday0_i64, 7), 7
                    )
                    if pa.compute.any(
                        pa.compute.not_equal(monday0_i64, all_mondays)
                    ).as_py():
                        raise DateValueHasWrongUnit(field.name, "week")
            elif unit != "day":
                # The metadata check above leaves only month/quarter/year here.
                is_valid = {
                    "month": lambda st: st.tm_mday == 1,
                    "quarter": lambda st: st.tm_mday == 1 and st.tm_mon % 3 == 1,
                    "year": lambda st: st.tm_mon == 1 and st.tm_mday == 1,
                }[unit]
                for chunk in column.chunks:
                    # date32 counts days; multiply by 86400 to get seconds
                    # for time.gmtime(). i64 avoids overflow.
                    unix_timestamps = pa.compute.multiply(
                        chunk.view(pa.int32()).cast(pa.int64()), 86400
                    )
                    for unix_timestamp in unix_timestamps:
                        if unix_timestamp.is_valid:  # nulls are always allowed
                            struct_time = time.gmtime(unix_timestamp.as_py())
                            if not is_valid(struct_time):
                                raise DateValueHasWrongUnit(field.name, unit)
        return ColumnType.Date(unit=unit)

    if pa.types.is_string(field.type) or (
        pa.types.is_dictionary(field.type)
        and pa.types.is_integer(field.type.index_type)
    ):
        if field.metadata is not None:
            raise FieldMetadataNotAllowed(field.name, "None", field.metadata)
        return ColumnType.Text()

    if pa.types.is_integer(field.type) or pa.types.is_floating(field.type):
        if (
            field.metadata is None
            or len(field.metadata) != 1
            or b"format" not in field.metadata
        ):
            raise FieldMetadataNotAllowed(
                field.name, "'format' in metadata", field.metadata
            )

        raw_format = field.metadata[b"format"]
        try:
            number_format = raw_format.decode("utf-8")
        except ValueError as err:
            # Not valid UTF-8: report the bytes decoded as latin1, which
            # cannot fail, so the error can still show what was received.
            raise InvalidNumberFormat(
                field.name, raw_format.decode("latin1")
            ) from err

        try:
            parse_number_format(number_format)
        except ValueError as err:
            raise InvalidNumberFormat(field.name, number_format) from err
        return ColumnType.Number(format=number_format)

    raise WrongColumnType(field.name, field.type)
def __post_init__(self):
    """Validate `self.format` as soon as the instance is constructed."""
    # The parsed result is discarded; the call is made only so that an
    # invalid format raises ValueError here.
    parse_number_format(self.format)
def test_format_int8_array_no_validity_buffer():
    """An int8 array rebuilt without a validity buffer still formats every value."""
    source = pa.array([1, 2, 30, 4], pa.int8())
    _validity_buf, data_buf = source.buffers()
    number_format = parse_number_format("{:d}")
    # Same data, but with the validity buffer slot left as None.
    without_validity = pa.Array.from_buffers(source.type, 4, [None, data_buf])
    formatted = format_number_array(without_validity, number_format)
    assert formatted.to_pylist() == ["1", "2", "30", "4"]
def test_parse_disallow_too_many_arguments():
    """A format string with two replacement fields is rejected."""
    two_fields = "{:d}{:f}"
    with pytest.raises(ValueError, match="Can only format one number"):
        parse_number_format(two_fields)
def test_format_suffix():
    """Literal text after the replacement field is appended to the number."""
    fmt = parse_number_format("{:,d} cows")
    for value, expected in [(2, "2 cows"), (1234, "1,234 cows")]:
        assert fmt(value) == expected
def test_format_float64_create_validity_buffer_when_missing():
    """inf and NaN become null even when the input has no validity buffer."""
    source = pa.array([1, math.inf, math.nan, 4], pa.float64())
    _validity_buf, data_buf = source.buffers()
    number_format = parse_number_format("{:d}")
    # Same data, but with the validity buffer slot left as None.
    without_validity = pa.Array.from_buffers(source.type, 4, [None, data_buf])
    formatted = format_number_array(without_validity, number_format)
    assert formatted.to_pylist() == ["1", None, None, "4"]
def test_format_typeerror():
    """Passing bytes instead of str raises TypeError."""
    bytes_format = b"{:,}"
    with pytest.raises(TypeError):
        parse_number_format(bytes_format)
def test_parse_disallow_non_format_is_valueerror():
    """A printf-style string without "{...}" is rejected with ValueError."""
    printf_style = "%d"
    with pytest.raises(ValueError, match='Format must look like "{:...}"'):
        parse_number_format(printf_style)
def test_disallow_field_name():
    """A named replacement field is rejected."""
    named_field = "{value:f}"
    with pytest.raises(ValueError, match="Field names or numbers are not allowed"):
        parse_number_format(named_field)
def test_disallow_field_converter():
    """A conversion flag such as "!r" is rejected."""
    with_converter = "{!r:f}"
    with pytest.raises(ValueError, match="Field converters are not allowed"):
        parse_number_format(with_converter)
def test_disallow_invalid_type():
    """An unknown presentation type is rejected."""
    unknown_type = "{:T}"
    with pytest.raises(ValueError, match="Unknown format code 'T'"):
        parse_number_format(unknown_type)