Beispiel #1
0
def test_format_float64_array():
    """Format a float64 array with "{:,}"; NaN, +/-inf and null all come out as null."""
    values = pa.array(
        [
            1,
            -1.1,
            2.3,
            -3.123,
            math.nan,
            math.inf,
            -math.inf,
            4_000_000.23123,
            -4_000_000.414213,
            None,
        ],
        pa.float64(),
    )
    number_format = parse_number_format("{:,}")

    formatted = format_number_array(values, number_format)

    # Non-finite inputs and nulls have no textual form: they map to None.
    expected = [
        "1",
        "-1.1",
        "2.3",
        "-3.123",
        None,
        None,
        None,
        "4,000,000.23123",
        "-4,000,000.414213",
        None,
    ]
    assert formatted.to_pylist() == expected
Beispiel #2
0
def _infer_column(
    series: pd.Series, given_format: Optional[str], try_fallback: Optional[Column]
) -> Column:
    """
    Pick the `Column` (name plus `ColumnType`) that describes `series`.

    The `ColumnType` class is chosen from the pandas/numpy dtype of `series`:
    numeric -> Number, datetime64 -> Timestamp, daily Period -> Date,
    object/category -> Text. Any other dtype raises `ValueError`.

    For Number and Date columns the format/unit is resolved in this order:

    1. `given_format`, if not None. It is validated, and an invalid value
       raises `ValueError` because the caller explicitly asked for it.
    2. `try_fallback`, if it is not None and its type is of the same
       `ColumnType` class. The fallback `Column` object itself is returned.
    3. The default: format "{:,}" for Number, unit "day" for Date.

    Timestamp and Text columns accept no format at all: passing a non-None
    `given_format` for them raises `ValueError`.
    """
    dtype = series.dtype
    name = series.name
    format_was_given = given_format is not None

    if is_numeric_dtype(dtype):
        if format_was_given:
            # Called only for validation: raises ValueError on a bad format.
            parse_number_format(given_format)
            return Column(name, ColumnType.Number(format=given_format))
        if try_fallback is not None and isinstance(
            try_fallback.type, ColumnType.Number
        ):
            return try_fallback
        return Column(name, ColumnType.Number(format="{:,}"))

    if is_datetime64_dtype(dtype):
        if format_was_given:
            raise ValueError(
                '"format" not allowed for column "%s" because it is of type "timestamp"'
                % (name,)
            )
        return Column(name, ColumnType.Timestamp())

    if pd.PeriodDtype(freq="D") == dtype:
        if format_was_given:
            if given_format not in {"day", "week", "month", "quarter", "year"}:
                raise ValueError(
                    'Unit must be "day", "week", "month", "quarter" or "year"; got %r for column "%s"'
                    % (given_format, name)
                )
            return Column(name, ColumnType.Date(unit=given_format))
        if try_fallback is not None and isinstance(
            try_fallback.type, ColumnType.Date
        ):
            return try_fallback
        return Column(name, ColumnType.Date(unit="day"))

    if dtype == object or dtype == "category":
        if format_was_given:
            raise ValueError(
                '"format" not allowed for column "%s" because it is of type "text"'
                % (name,)
            )
        return Column(name, ColumnType.Text())

    raise ValueError(f"Unknown dtype: {dtype}")
Beispiel #3
0
def test_format_int8_array():
    """Format an int8 array with "{:d}"; nulls stay null."""
    values = pa.array([1, -1, 2, -2, 3, -3, 4, -4, None, None, 6, -6], pa.int8())
    number_format = parse_number_format("{:d}")

    formatted = format_number_array(values, number_format)

    expected = ["1", "-1", "2", "-2", "3", "-3", "4", "-4", None, None, "6", "-6"]
    assert formatted.to_pylist() == expected
Beispiel #4
0
def test_format_int():
    """The "{:,d}" format renders integers with thousands separators."""
    to_text = parse_number_format("{:,d}")
    cases = [
        (1, "1"),
        (1.0, "1"),  # must treat float like int
        (1.6, "1"),  # round towards 0
        (-1.6, "-1"),  # round towards 0
        (1_234_567, "1,234,567"),
        (1125899906842624, "1,125,899,906,842,624"),  # int64
    ]
    for value, expected in cases:
        assert to_text(value) == expected
Beispiel #5
0
def test_format_general():
    """The bare "{}" format renders numbers without separators or exponents."""
    to_text = parse_number_format("{}")
    cases = [
        (1, "1"),
        (1.0, "1"),  # must treat float like int
        (3.2, "3.2"),
        # not "2.3...e+23"
        (234234234233984229834752.0, "234234234233984229834752"),
        (-1234.4, "-1234.4"),
    ]
    for value, expected in cases:
        assert to_text(value) == expected
Beispiel #6
0
def test_format_float():
    """The "{:,.2f}" format renders two decimals with thousands separators."""
    to_text = parse_number_format("{:,.2f}")
    cases = [
        (1, "1.00"),  # must treat int like float
        (1.0, "1.00"),
        (-1.6, "-1.60"),
        (1_234_567, "1,234,567.00"),
        (1125899906842624, "1,125,899,906,842,624.00"),  # int64
        (1.234567, "1.23"),
        (125899906842624.09, "125,899,906,842,624.09"),  # float64
        (1.235, "1.24"),  # round
    ]
    for value, expected in cases:
        assert to_text(value) == expected
Beispiel #7
0
def test_format_uint32_array():
    """Format a uint32 array with "{:,d}"; nulls stay null."""
    values = pa.array(
        [1, 1, 2, 2, 3_000, 3_000, 4_000_000, 4_000_000, None, None, 6, 6],
        pa.uint32(),
    )
    number_format = parse_number_format("{:,d}")

    formatted = format_number_array(values, number_format)

    expected = [
        "1",
        "1",
        "2",
        "2",
        "3,000",
        "3,000",
        "4,000,000",
        "4,000,000",
        None,
        None,
        "6",
        "6",
    ]
    assert formatted.to_pylist() == expected
def format_chunked_array(chunked_array: pa.ChunkedArray,
                         field: pa.Field) -> pa.ChunkedArray:
    """Render every chunk of a column as text, according to `field`.

    The per-chunk formatter is chosen from `field.type`:

    * integer or floating: `format_number_array`, using the number format
      stored under the b"format" metadata key;
    * timestamp: `format_timestamp_array`;
    * date32: `format_date_array`, using the unit stored under the b"unit"
      metadata key.

    Any other type is returned as-is, without copying. The result for the
    formatted cases is a new chunked array of type `pa.utf8()`.
    """
    column_type = field.type

    if pa.types.is_integer(column_type) or pa.types.is_floating(column_type):
        # Parse the format once, up front; every chunk shares the result.
        number_format = parse_number_format(
            field.metadata[b"format"].decode("utf-8"))

        def render_chunk(chunk):
            return format_number_array(chunk, number_format)
    elif pa.types.is_timestamp(column_type):
        render_chunk = format_timestamp_array
    elif pa.types.is_date32(column_type):
        def render_chunk(chunk):
            # The unit is looked up when a chunk is rendered, not before, so
            # a column with no chunks never touches the metadata.
            return format_date_array(
                chunk, field.metadata[b"unit"].decode("utf-8"))
    else:
        return chunked_array  # pa.utf8() or pa.dictionary() of pa.utf8()

    rendered_chunks = [render_chunk(chunk) for chunk in chunked_array.chunks]
    return pa.chunked_array(rendered_chunks, pa.utf8())
Beispiel #9
0
def _read_column_type(
    column: pa.ChunkedArray, field: pa.Field, *, full: bool
) -> ColumnType:
    """Read ColumnType from `field`'s type and metadata, or raise a validation error.

    The Arrow type of `field` selects the `ColumnType`:

    * timestamp -> `ColumnType.Timestamp` (no metadata, no timezone, unit "ns")
    * date32 -> `ColumnType.Date` (metadata is exactly {b"unit": ...})
    * string, or dictionary with integer indices -> `ColumnType.Text`
      (no metadata)
    * integer or floating -> `ColumnType.Number` (metadata is exactly
      {b"format": ...}, and the format must parse)

    Raises `FieldMetadataNotAllowed`, `TimestampTimezoneNotAllowed`,
    `TimestampUnitNotAllowed`, `DateValueHasWrongUnit`, `InvalidNumberFormat`
    or `WrongColumnType` (the original docstring calls these `ValidateError`;
    presumably they share that base class -- confirm where they are defined).

    If `full=False`, skip costly checks: in this function that means the
    per-value check that week/month/quarter/year dates fall on the first day
    of their unit. Only pass `full=False` when you can guarantee the data has
    been generated by a source you trust. (In particular, module output is not
    trusted and must be checked with `full=True`.)

    NOTE(review): `full` is keyword-only and has NO default in this signature,
    so every caller must pass it explicitly.
    """
    if pa.types.is_timestamp(field.type):
        # Timestamps carry no metadata and must be timezone-naive nanoseconds.
        if field.metadata is not None:
            raise FieldMetadataNotAllowed(field.name, "None", field.metadata)
        if field.type.tz is not None:
            raise TimestampTimezoneNotAllowed(field.name, column.type)
        if field.type.unit != "ns":
            raise TimestampUnitNotAllowed(field.name, column.type)
        return ColumnType.Timestamp()

    if pa.types.is_date32(field.type):
        # Dates need exactly one metadata entry, b"unit", holding one of the
        # five allowed unit names.
        if (
            field.metadata is None
            or len(field.metadata) != 1
            or (
                field.metadata.get(b"unit")
                not in {b"day", b"week", b"month", b"quarter", b"year"}
            )
        ):
            raise FieldMetadataNotAllowed(
                field.name, "'unit' of day/week/month/quarter/year", field.metadata
            )
        # Safe to decode as ASCII: the value was just checked against the
        # ASCII-only set above.
        unit = field.metadata[b"unit"].decode("ascii")
        if full:
            if unit == "day":
                # Every date is a valid "day" value: nothing to check.
                pass
            elif unit == "week":
                # Only Mondays are valid (weekday 0 in Python's
                # `date.weekday()` numbering).
                for chunk in column.chunks:
                    # 1970-01-01 (date32=0) was Thursday. Shift such that
                    # Mondays land on multiples of 7: date32 == -3
                    # (1969-12-29, a Monday) becomes monday0_i64 == 0.
                    #
                    # We use i64 to avoid overflow
                    monday0_i64 = pa.compute.add(
                        chunk.view(pa.int32()).cast(pa.int64()), 3
                    )
                    # divide+multiply. For each date in monday0_i64,
                    # all_mondays will be the monday of that week
                    all_mondays = pa.compute.multiply(
                        pa.compute.divide(monday0_i64, 7), 7
                    )
                    # Any value that changed in the divide+multiply round trip
                    # was not a multiple of 7, i.e. not a Monday.
                    if pa.compute.any(
                        pa.compute.not_equal(monday0_i64, all_mondays)
                    ).as_py():
                        raise DateValueHasWrongUnit(field.name, "week")
                return ColumnType.Date(unit="week")
            else:
                # month / quarter / year: pick the predicate that tells
                # whether a UTC `struct_time` is the first day of the unit.
                is_valid = {
                    "month": lambda st: st.tm_mday == 1,
                    "quarter": lambda st: st.tm_mday == 1 and st.tm_mon % 3 == 1,
                    "year": lambda st: st.tm_mon == 1 and st.tm_mday == 1,
                }[unit]
                for chunk in column.chunks:
                    # date32 counts days since the Unix epoch; 86400 seconds
                    # per day turns that into a Unix timestamp (as int64).
                    unix_timestamps = pa.compute.multiply(
                        chunk.view(pa.int32()).cast(pa.int64()), 86400
                    )
                    # NOTE(review): this is a Python-level loop over every
                    # value. Also, `time.gmtime()` can raise OverflowError or
                    # OSError for timestamps outside the platform's range --
                    # confirm whether such extreme dates can reach this code.
                    for unix_timestamp in unix_timestamps:
                        if unix_timestamp.is_valid:  # nulls are skipped
                            struct_time = time.gmtime(unix_timestamp.as_py())
                            if not is_valid(struct_time):
                                raise DateValueHasWrongUnit(field.name, unit)

        return ColumnType.Date(unit=unit)

    # Text: plain strings, or dictionary-encoded with integer indices.
    # NOTE(review): the dictionary's value type is not checked in this block.
    if pa.types.is_string(field.type) or (
        pa.types.is_dictionary(field.type)
        and pa.types.is_integer(field.type.index_type)
    ):
        if field.metadata is not None:
            raise FieldMetadataNotAllowed(field.name, "None", field.metadata)
        return ColumnType.Text()

    if pa.types.is_integer(field.type) or pa.types.is_floating(field.type):
        # Numbers need exactly one metadata entry, b"format".
        if (
            field.metadata is None
            or len(field.metadata) != 1
            or b"format" not in field.metadata
        ):
            raise FieldMetadataNotAllowed(
                field.name, "'format' in metadata", field.metadata
            )

        try:
            format = field.metadata[b"format"].decode("utf-8")
        except ValueError:
            # UnicodeDecodeError is a ValueError subclass. latin1 maps every
            # byte to a character, so it can always render the invalid bytes
            # for the error message.
            raise InvalidNumberFormat(
                field.name, field.metadata[b"format"].decode("latin1")
            )

        try:
            # Called only for validation; the parsed formatter is discarded.
            parse_number_format(format)
        except ValueError:
            raise InvalidNumberFormat(field.name, format)

        return ColumnType.Number(format=format)

    # No supported Arrow type matched.
    raise WrongColumnType(field.name, field.type)
Beispiel #10
0
 def __post_init__(self):
     """Validate `self.format` by parsing it; the parsed result is discarded."""
     parse_number_format(self.format)  # raises ValueError on an invalid format
Beispiel #11
0
def test_format_int8_array_no_validity_buffer():
    """An int8 array rebuilt without a validity buffer still formats every value."""
    source = pa.array([1, 2, 30, 4], pa.int8())
    _validity, data_buffer = source.buffers()
    number_format = parse_number_format("{:d}")

    # Rebuild the same array, passing None in place of the validity buffer.
    no_validity_array = pa.Array.from_buffers(source.type, 4, [None, data_buffer])

    formatted = format_number_array(no_validity_array, number_format)
    assert formatted.to_pylist() == ["1", "2", "30", "4"]
Beispiel #12
0
def test_parse_disallow_too_many_arguments():
    """A format string with two replacement fields is rejected with ValueError."""
    two_fields = "{:d}{:f}"
    with pytest.raises(ValueError, match="Can only format one number"):
        parse_number_format(two_fields)
Beispiel #13
0
def test_format_suffix():
    """Literal text after the replacement field is appended to the output."""
    to_text = parse_number_format("{:,d} cows")
    for value, expected in [(2, "2 cows"), (1234, "1,234 cows")]:
        assert to_text(value) == expected
Beispiel #14
0
def test_format_float64_create_validity_buffer_when_missing():
    """Without a validity buffer on the input, inf and NaN still become null."""
    source = pa.array([1, math.inf, math.nan, 4], pa.float64())
    _validity, data_buffer = source.buffers()
    number_format = parse_number_format("{:d}")

    # Rebuild the same array, passing None in place of the validity buffer.
    no_validity_array = pa.Array.from_buffers(source.type, 4, [None, data_buffer])

    formatted = format_number_array(no_validity_array, number_format)
    assert formatted.to_pylist() == ["1", None, None, "4"]
Beispiel #15
0
def test_format_typeerror():
    """Passing bytes instead of str as the format raises TypeError."""
    bytes_format = b"{:,}"
    with pytest.raises(TypeError):
        parse_number_format(bytes_format)
Beispiel #16
0
def test_parse_disallow_non_format_is_valueerror():
    """A printf-style string with no "{...}" field is rejected with ValueError."""
    printf_style = "%d"
    with pytest.raises(ValueError, match='Format must look like "{:...}"'):
        parse_number_format(printf_style)
Beispiel #17
0
def test_disallow_field_name():
    """A named replacement field is rejected with ValueError."""
    named_field = "{value:f}"
    with pytest.raises(ValueError, match="Field names or numbers are not allowed"):
        parse_number_format(named_field)
Beispiel #18
0
def test_disallow_field_converter():
    """A replacement field with a "!r" conversion is rejected with ValueError."""
    with_converter = "{!r:f}"
    with pytest.raises(ValueError, match="Field converters are not allowed"):
        parse_number_format(with_converter)
Beispiel #19
0
def test_disallow_invalid_type():
    """An unknown presentation type ("T") is rejected with ValueError."""
    unknown_type = "{:T}"
    with pytest.raises(ValueError, match="Unknown format code 'T'"):
        parse_number_format(unknown_type)