Example #1
    def _coerce_decimal(cls, vector: pa.Array, field: pa.Field) -> pa.Array:

        # Loss of precision is allowed, but loss of data is not
        # Arrow will raise an error if cast() results in loss of data

        try:

            # For decimal values, arrow will raise an error on loss of precision
            # Round explicitly to the required scale so there is no loss of precision in cast()
            if pa.types.is_decimal(vector.type):
                rounded = pc.round(vector, ndigits=field.type.scale)  # noqa
                return pc.cast(rounded, field.type)

            # Floats and integers can always be coerced to decimal, so long as there is no data loss
            elif pa.types.is_floating(vector.type) or pa.types.is_integer(
                    vector.type):
                return pc.cast(vector, field.type)

            else:
                error_message = cls._format_error(cls.__E_WRONG_DATA_TYPE,
                                                  vector, field)
                cls.__log.error(error_message)
                raise _ex.EDataConformance(error_message)

        except pa.ArrowInvalid as e:

            error_message = cls._format_error(cls.__E_DATA_LOSS_DID_OCCUR,
                                              vector, field, e)
            cls.__log.error(error_message)
            raise _ex.EDataConformance(error_message) from e
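The behaviour this method relies on is easy to reproduce in isolation. A minimal sketch, assuming pyarrow 7+ and an illustrative decimal128 target (the values are made up):

from decimal import Decimal
import pyarrow as pa
import pyarrow.compute as pc

src = pa.array([Decimal("1.239"), Decimal("2.500")], pa.decimal128(10, 3))
tgt = pa.decimal128(10, 2)

# A direct cast raises ArrowInvalid, because rescaling 1.239 would drop a digit
try:
    pc.cast(src, tgt)
except pa.ArrowInvalid as e:
    print(e)

# Rounding to the target scale first makes the rescale lossless
print(pc.cast(pc.round(src, ndigits=2), tgt))  # -> [1.24, 2.50]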
Example #2
    def _coerce_timestamp_windows(cls, vector: pa.Array, field: pa.Field):

        scaling_map = {"s": 1, "ms": 1000, "us": 1000000, "ns": 1000000000}
        src_scale = scaling_map.get(vector.type.unit)
        tgt_scale = scaling_map.get(field.type.unit)

        if src_scale is None or tgt_scale is None:
            raise _ex.EUnexpected()  # Invalid timestamp type

        int64_vector: pa.IntegerArray = pc.cast(vector, pa.int64())

        if src_scale > tgt_scale:

            scaling = src_scale // tgt_scale  # ratios are exact, use integer division
            scaling_vector = pa.array([scaling for _ in range(len(vector))],
                                      pa.int64())
            scaled_vector = pc.divide_checked(int64_vector,
                                              scaling_vector)  # noqa

        else:

            scaling = tgt_scale // src_scale  # ratios are exact, use integer division
            scaling_vector = pa.array([scaling for _ in range(len(vector))],
                                      pa.int64())
            scaled_vector = pc.multiply_checked(int64_vector,
                                                scaling_vector)  # noqa

        return pc.cast(scaled_vector, field.type)
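The checked kernels are what make this workaround safe: unlike plain arithmetic, they raise instead of silently wrapping on int64 overflow. A minimal sketch of that behaviour (the values are illustrative):

import pyarrow as pa
import pyarrow.compute as pc

big = pa.array([2 ** 62], pa.int64())
factor = pa.array([1000], pa.int64())

try:
    pc.multiply_checked(big, factor)   # would overflow int64
except pa.ArrowInvalid as e:
    print(e)                           # reported as an error, not wrapped

Note that the compute kernels also accept plain Python scalars, so in recent pyarrow versions the scaling_vector could likely be replaced by pc.multiply_checked(int64_vector, scaling).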
Example #3
from datetime import datetime

import pytest
import pyarrow as pa
import pyarrow.compute as pc


def test_cast():
    arr = pa.array([2**63 - 1], type='int64')

    with pytest.raises(pa.ArrowInvalid):
        pc.cast(arr, 'int32')

    assert pc.cast(arr, 'int32', safe=False) == pa.array([-1], type='int32')

    arr = pa.array([datetime(2010, 1, 1), datetime(2015, 1, 1)])
    expected = pa.array([1262304000000, 1420070400000], type='timestamp[ms]')
    assert pc.cast(arr, 'timestamp[ms]') == expected
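The unsafe cast can also be spelled with CastOptions, which is convenient when the options are built once and reused. A sketch, assuming a pyarrow version that provides pc.CastOptions.unsafe (recent releases do):

import pyarrow as pa
import pyarrow.compute as pc

arr = pa.array([2 ** 63 - 1], type='int64')

# Equivalent to pc.cast(arr, 'int32', safe=False)
opts = pc.CastOptions.unsafe(pa.int32())
print(pc.cast(arr, options=opts))  # -> [-1], the value wraps instead of raising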
Example #4
    def _coerce_float(cls, vector: pa.Array,
                      field: pa.Field) -> pa.FloatingPointArray:

        try:

            # Coercing between float types
            if pa.types.is_floating(vector.type):

                # Casting floats to a wider type is allowed
                # Casting to a narrower type does not raise an exception when values do not fit
                # So we need an explicit check on which casts are allowed

                source_bit_width = vector.type.bit_width
                target_bit_width = field.type.bit_width

                if source_bit_width == target_bit_width:
                    return vector  # noqa

                # cast() is available for float32 -> float64, but not for float16 -> float32/float64
                elif source_bit_width == 32 and target_bit_width == 64:
                    return pc.cast(vector, field.type)

                elif source_bit_width > target_bit_width:
                    error_message = cls._format_error(
                        cls.__E_DATA_LOSS_WILL_OCCUR, vector, field)
                    cls.__log.error(error_message)
                    raise _ex.EDataConformance(error_message)

            # All integer types can be coerced to float32 or float64
            if pa.types.is_integer(
                    vector.type) and not pa.types.is_float16(field.type):
                return pc.cast(vector, field.type)

            if pa.types.is_integer(
                    vector.type) and vector.type.bit_width <= 16:
                return pc.cast(vector, field.type)

            error_message = cls._format_error(cls.__E_WRONG_DATA_TYPE, vector,
                                              field)
            cls.__log.error(error_message)
            raise _ex.EDataConformance(error_message)

        except pa.ArrowInvalid as e:

            error_message = cls._format_error(cls.__E_DATA_LOSS_DID_OCCUR,
                                              vector, field, e)
            cls.__log.error(error_message)
            raise _ex.EDataConformance(error_message) from e
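The explicit bit-width check exists because, as the comments above note, Arrow's default "safe" cast does not treat float narrowing overflow as data loss. A short sketch of that behaviour (the value is illustrative):

import pyarrow as pa
import pyarrow.compute as pc

v = pa.array([1e308], pa.float64())

# No ArrowInvalid here: the value silently overflows to inf in float32,
# which is why the method refuses wide -> narrow float casts up front
print(pc.cast(v, pa.float32()))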
Example #5
    def _coerce_timestamp(cls, vector: pa.Array, field: pa.Field) -> pa.Array:

        try:

            if pa.types.is_timestamp(vector.type):

                if not isinstance(field.type, pa.TimestampType):
                    raise _ex.EUnexpected()

                if vector.type.tz != field.type.tz:
                    error_message = cls._format_error(
                        cls.__E_TIMEZONE_DOES_NOT_MATCH, vector, field)
                    cls.__log.error(error_message)
                    raise _ex.EDataConformance(error_message)

                # On Windows, cast() applied to timestamps does not correctly detect overflows / underflows
                # To get consistent behavior, this custom implementation casts to int64, the underlying type,
                # then performs the required scaling on the int64 vector, which does throw on overflow
                # The bug exists in Arrow 7.0.0 as of May 2022

                # This also avoids the need for a timezone lookup on Windows
                # Although zone conversion is not supported, a tz database would still be required
                # when casting timestamps whose source and target types are in the same zone

                if platform.system().lower().startswith("win"):
                    return cls._coerce_timestamp_windows(vector, field)

                if field.type.unit == "s":
                    rounding_unit = "second"
                elif field.type.unit == "ms":
                    rounding_unit = "millisecond"
                elif field.type.unit == "us":
                    rounding_unit = "microsecond"
                elif field.type.unit == "ns":
                    rounding_unit = "nanosecond"
                else:
                    raise _ex.EUnexpected()

                # Loss of precision is allowed, loss of data is not
                # Rounding will prevent errors in cast() due to loss of precision
                # cast() will fail if the source value is outside the range of the target type

                rounded_vector = pc.round_temporal(vector,
                                                   unit=rounding_unit)  # noqa
                return pc.cast(rounded_vector, field.type)

            else:
                error_message = cls._format_error(cls.__E_WRONG_DATA_TYPE,
                                                  vector, field)
                cls.__log.error(error_message)
                raise _ex.EDataConformance(error_message)

        except pa.ArrowInvalid as e:

            error_message = cls._format_error(cls.__E_DATA_LOSS_DID_OCCUR,
                                              vector, field, e)
            cls.__log.error(error_message)
            raise _ex.EDataConformance(error_message) from e
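On non-Windows platforms the round-then-cast sequence looks like this in isolation. A minimal sketch, assuming a pyarrow version that ships pc.round_temporal (as the method itself requires):

from datetime import datetime
import pyarrow as pa
import pyarrow.compute as pc

v = pa.array([datetime(2022, 5, 1, 12, 30, 0, 123456)], pa.timestamp('us'))

# Round to the target unit first, so the cast itself never loses precision
rounded = pc.round_temporal(v, unit='millisecond')
print(pc.cast(rounded, pa.timestamp('ms')))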
Example #6
    def _coerce_date(cls,
                     vector: pa.Array,
                     field: pa.Field,
                     pandas_type=None) -> pa.Array:

        # Allow casting date32 -> date64, both range and precision are greater so there is no data loss
        if pa.types.is_date(vector.type):
            if field.type.bit_width >= vector.type.bit_width:
                return pc.cast(vector, field.type)

        # Special handling for Pandas/NumPy date values
        # These are encoded as np.datetime64[ns] in Pandas -> pa.timestamp('ns') in Arrow
        # Only allow this conversion if the vector is coming from Pandas with datetime type
        if pandas_type == DataMapping.pandas_datetime_type():
            if pa.types.is_timestamp(vector.type) and vector.type.unit == "ns":
                return pc.cast(vector, field.type)

        error_message = cls._format_error(cls.__E_WRONG_DATA_TYPE, vector,
                                          field)
        cls.__log.error(error_message)

        raise _ex.EDataConformance(error_message)
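Both accepted paths can be verified directly. A small sketch (the dates are arbitrary):

from datetime import date, datetime
import pyarrow as pa
import pyarrow.compute as pc

# date32 -> date64: wider range and precision, so the cast is always safe
d32 = pa.array([date(2020, 1, 1)], pa.date32())
print(pc.cast(d32, pa.date64()))

# The Pandas path: a nanosecond timestamp holding a pure date casts cleanly
ts = pa.array([datetime(2020, 1, 1)], pa.timestamp('ns'))
print(pc.cast(ts, pa.date32()))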
Example #7
    def _coerce_string(cls, vector: pa.Array, field: pa.Field) -> pa.Array:

        if pa.types.is_string(field.type):
            if pa.types.is_string(vector.type):
                return vector

        if pa.types.is_large_string(field.type):
            if pa.types.is_large_string(vector.type):
                return vector
            # Allow up-casting string -> large_string
            if pa.types.is_string(vector.type):
                return pc.cast(vector, field.type)

        error_message = cls._format_error(cls.__E_WRONG_DATA_TYPE, vector,
                                          field)
        cls.__log.error(error_message)

        raise _ex.EDataConformance(error_message)
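The up-cast allowed here only widens the offsets from 32 to 64 bits, so it can never lose data. A quick sketch:

import pyarrow as pa
import pyarrow.compute as pc

s = pa.array(["alpha", "beta"], pa.string())
print(pc.cast(s, pa.large_string()).type)  # large_string

The reverse direction is rejected by the method, presumably because a large_string column can exceed the 2 GB offset limit of plain string, even though Arrow could cast small columns without loss.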
Example #8
    def _coerce_integer(cls, vector: pa.Array,
                        field: pa.Field) -> pa.IntegerArray:

        try:

            if pa.types.is_integer(vector.type):
                return pc.cast(vector, field.type)

            else:
                error_message = cls._format_error(cls.__E_WRONG_DATA_TYPE,
                                                  vector, field)
                cls.__log.error(error_message)
                raise _ex.EDataConformance(error_message)

        except pa.ArrowInvalid as e:

            error_message = cls._format_error(cls.__E_DATA_LOSS_DID_OCCUR,
                                              vector, field, e)
            cls.__log.error(error_message)
            raise _ex.EDataConformance(error_message) from e
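Integer casts are checked by default, which is why this method can simply attempt the cast and translate any ArrowInvalid into EDataConformance. A minimal sketch (the values are illustrative):

import pyarrow as pa
import pyarrow.compute as pc

print(pc.cast(pa.array([1, 2, 3], pa.int64()), pa.int16()))  # fits, succeeds

try:
    pc.cast(pa.array([70000], pa.int64()), pa.int16())  # out of range
except pa.ArrowInvalid as e:
    print(e)  # genuine data loss is reported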
Example #9
table = table.select(["ix", "x", "y", "title", "first_author_name", "date", "language"])

# truncate the title after 101 characters (matching display logic)
truncated_title = pc.utf8_replace_slice(table.column("title"), start=101, stop=1000, replacement="")
table = table.set_column(table.schema.get_field_index("title"), "title", truncated_title)

# ensure all dictionaries in the file use the same key/value mappings
table = table.unify_dictionaries()

# filter out non-numeric dates (e.g. null, "1850-1853")
# matches the hack in index.js:37
mask = pc.invert(pc.is_null(table.column("date")))
table = table.filter(mask)

# sorting by the date improves the loading aesthetics
# comment this out to exactly match the original appearance
indices = pc.sort_indices(table, sort_keys=[("date", "ascending")])
table = pc.take(table, indices)

# after sorting, replace ix with an accurate row index
# (the table is already in date order, so these indices are simply 0..n-1)
indices = pc.sort_indices(table, sort_keys=[("date", "ascending")])
table = table.set_column(table.schema.get_field_index("ix"), "ix", pc.cast(indices, pa.uint32()))

temp_path.unlink()

local = fs.LocalFileSystem()

with local.open_output_stream(str(target_path)) as file:
    with pa.RecordBatchStreamWriter(file, table.schema) as writer:
        writer.write_table(table, 10000)
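To sanity-check the output, the stream can be read back the same way it was written. A sketch, reusing the (assumed) target_path from the snippet above:

import pyarrow as pa
from pyarrow import fs

local = fs.LocalFileSystem()

# target_path is assumed to be the same path the writer used above
with local.open_input_stream(str(target_path)) as file:
    reader = pa.ipc.open_stream(file)
    round_trip = reader.read_all()

print(round_trip.num_rows)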
Example #10
import pyarrow.compute as pc


def my_string_length(arr, **kwargs):
    # arr is a pyarrow.StringArray
    return pc.cast(pc.multiply(pc.utf8_length(arr), 2), target_type='int64')
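A quick call shows the shape of the result; each UTF-8 length is doubled and widened to int64:

import pyarrow as pa

arr = pa.array(["ab", "xyz"])
print(my_string_length(arr))  # -> [4, 6]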