def _coerce_decimal(cls, vector: pa.Array, field: pa.Field) -> pa.Array:

    # Loss of precision is allowed, but loss of data is not
    # Arrow will raise an error if cast() results in loss of data

    try:

        # For decimal values, Arrow will raise an error on loss of precision
        # Round explicitly to the required scale so there is no loss of precision in cast()
        if pa.types.is_decimal(vector.type):
            rounded = pc.round(vector, ndigits=field.type.scale)  # noqa
            return pc.cast(rounded, field.type)

        # Floats and integers can always be coerced to decimal, so long as there is no data loss
        elif pa.types.is_floating(vector.type) or pa.types.is_integer(vector.type):
            return pc.cast(vector, field.type)

        else:
            error_message = cls._format_error(cls.__E_WRONG_DATA_TYPE, vector, field)
            cls.__log.error(error_message)
            raise _ex.EDataConformance(error_message)

    except pa.ArrowInvalid as e:
        error_message = cls._format_error(cls.__E_DATA_LOSS_DID_OCCUR, vector, field, e)
        cls.__log.error(error_message)
        raise _ex.EDataConformance(error_message) from e
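# A minimal standalone sketch (not part of the original source) of the
# round-then-cast pattern above, assuming pc.round accepts decimal inputs
# as the function above relies on: casting decimal128(10, 4) straight down
# to scale 2 can raise ArrowInvalid on loss of precision, while rounding
# first makes the cast exact.
import decimal

import pyarrow as pa
import pyarrow.compute as pc

vector = pa.array([decimal.Decimal("1.2345")], pa.decimal128(10, 4))
rounded = pc.round(vector, ndigits=2)             # 1.2345 -> 1.2300
coerced = pc.cast(rounded, pa.decimal128(10, 2))  # now exact, no ArrowInvalid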
def _coerce_timestamp_windows(cls, vector: pa.Array, field: pa.Field):

    scaling_map = {"s": 1, "ms": 1000, "us": 1000000, "ns": 1000000000}
    src_scale = scaling_map.get(vector.type.unit)
    tgt_scale = scaling_map.get(field.type.unit)

    if src_scale is None or tgt_scale is None:
        raise _ex.EUnexpected()  # Invalid timestamp type

    int64_vector: pa.IntegerArray = pc.cast(vector, pa.int64())

    if src_scale > tgt_scale:
        # Integer division keeps the scaling factor an int so it fits the int64 array
        # (the scales are powers of 1000, so the division is always exact)
        scaling = src_scale // tgt_scale
        scaling_vector = pa.array([scaling for _ in range(len(vector))], pa.int64())
        scaled_vector = pc.divide_checked(int64_vector, scaling_vector)  # noqa
    else:
        scaling = tgt_scale // src_scale
        scaling_vector = pa.array([scaling for _ in range(len(vector))], pa.int64())
        scaled_vector = pc.multiply_checked(int64_vector, scaling_vector)  # noqa

    return pc.cast(scaled_vector, field.type)
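# A hedged illustration (not from the original source) of why the checked
# kernels are used above: multiply_checked raises ArrowInvalid on int64
# overflow, whereas the plain timestamp cast on Windows in Arrow 7.0.0
# could wrap silently.
import pyarrow as pa
import pyarrow.compute as pc

big = pa.array([2 ** 62], pa.int64())   # e.g. a large microsecond timestamp value
scale = pa.array([1000], pa.int64())    # us -> ns scaling factor
try:
    pc.multiply_checked(big, scale)     # 2**62 * 1000 overflows int64
except pa.ArrowInvalid:
    print("overflow detected instead of silent wrap-around")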
def test_cast():
    arr = pa.array([2**63 - 1], type='int64')

    with pytest.raises(pa.ArrowInvalid):
        pc.cast(arr, 'int32')

    assert pc.cast(arr, 'int32', safe=False) == pa.array([-1], type='int32')

    arr = pa.array([datetime(2010, 1, 1), datetime(2015, 1, 1)])
    expected = pa.array([1262304000000, 1420070400000], type='timestamp[ms]')
    assert pc.cast(arr, 'timestamp[ms]') == expected
def _coerce_float(cls, vector: pa.Array, field: pa.Field) -> pa.FloatingPointArray:

    try:

        # Coercing between float types
        if pa.types.is_floating(vector.type):

            # Casting floats to a wider type is allowed
            # Casting to a less wide type does not raise exceptions when values do not fit
            # So we need an explicit check on which casts are allowed

            source_bit_width = vector.type.bit_width
            target_bit_width = field.type.bit_width

            if source_bit_width == target_bit_width:
                return vector  # noqa

            # cast() is available for float32 -> float64, but not for float16 -> float32/float64
            elif source_bit_width == 32 and target_bit_width == 64:
                return pc.cast(vector, field.type)

            elif source_bit_width > target_bit_width:
                error_message = cls._format_error(cls.__E_DATA_LOSS_WILL_OCCUR, vector, field)
                cls.__log.error(error_message)
                raise _ex.EDataConformance(error_message)

        # All integer types can be coerced to float32 or float64
        if pa.types.is_integer(vector.type) and not pa.types.is_float16(field.type):
            return pc.cast(vector, field.type)

        # Integer types up to 16 bits can also be coerced to float16
        if pa.types.is_integer(vector.type) and vector.type.bit_width <= 16:
            return pc.cast(vector, field.type)

        error_message = cls._format_error(cls.__E_WRONG_DATA_TYPE, vector, field)
        cls.__log.error(error_message)
        raise _ex.EDataConformance(error_message)

    except pa.ArrowInvalid as e:
        error_message = cls._format_error(cls.__E_DATA_LOSS_DID_OCCUR, vector, field, e)
        cls.__log.error(error_message)
        raise _ex.EDataConformance(error_message) from e
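# A short demonstration (assumed, not from the original source) of the
# behaviour the explicit bit-width check guards against: a default (safe)
# cast from float64 to float32 does not raise when the value cannot fit,
# it silently overflows to infinity.
import pyarrow as pa
import pyarrow.compute as pc

wide = pa.array([1e300], pa.float64())
narrow = pc.cast(wide, pa.float32())   # no ArrowInvalid; value becomes inf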
def _coerce_timestamp(cls, vector: pa.Array, field: pa.Field) -> pa.Array:

    try:

        if pa.types.is_timestamp(vector.type):

            if not isinstance(field.type, pa.TimestampType):
                raise _ex.EUnexpected()

            if vector.type.tz != field.type.tz:
                error_message = cls._format_error(cls.__E_TIMEZONE_DOES_NOT_MATCH, vector, field)
                cls.__log.error(error_message)
                raise _ex.EDataConformance(error_message)

            # The cast() function applied to timestamp on Windows does not correctly detect overflows / underflows
            # To get consistent behavior, this custom implementation casts to int64, the underlying type
            # Then performs the required scaling on the int64 vector, which does throw for overflows
            # Bug exists in Arrow 7.0.0 as of May 2022

            # This also avoids the need for timezone lookup on Windows
            # Although zone conversion is not supported, a tz database is still required
            # When casting timestamps with source and target type in the same zone

            if platform.system().lower().startswith("win"):
                return cls._coerce_timestamp_windows(vector, field)

            if field.type.unit == "s":
                rounding_unit = "second"
            elif field.type.unit == "ms":
                rounding_unit = "millisecond"
            elif field.type.unit == "us":
                rounding_unit = "microsecond"
            elif field.type.unit == "ns":
                rounding_unit = "nanosecond"
            else:
                raise _ex.EUnexpected()

            # Loss of precision is allowed, loss of data is not
            # Rounding will prevent errors in cast() due to loss of precision
            # cast() will fail if the source value is outside the range of the target type
            rounded_vector = pc.round_temporal(vector, unit=rounding_unit)  # noqa
            return pc.cast(rounded_vector, field.type)

        else:
            error_message = cls._format_error(cls.__E_WRONG_DATA_TYPE, vector, field)
            cls.__log.error(error_message)
            raise _ex.EDataConformance(error_message)

    except pa.ArrowInvalid as e:
        error_message = cls._format_error(cls.__E_DATA_LOSS_DID_OCCUR, vector, field, e)
        cls.__log.error(error_message)
        raise _ex.EDataConformance(error_message) from e
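# A minimal sketch (not from the original source) of the round-then-cast
# sequence for timestamps: a microsecond value that is not a whole number
# of milliseconds would make cast() raise on loss of precision, so it is
# rounded to the target unit first.
import pyarrow as pa
import pyarrow.compute as pc
from datetime import datetime

vector = pa.array([datetime(2022, 5, 1, 12, 0, 0, 1700)], pa.timestamp("us"))
rounded = pc.round_temporal(vector, unit="millisecond")  # 1.7 ms -> 2 ms
coerced = pc.cast(rounded, pa.timestamp("ms"))           # now a lossless cast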
def _coerce_date(cls, vector: pa.Array, field: pa.Field, pandas_type=None) -> pa.Array:

    # Allow casting date32 -> date64, both range and precision are greater so there is no data loss
    if pa.types.is_date(vector.type):
        if field.type.bit_width >= vector.type.bit_width:
            return pc.cast(vector, field.type)

    # Special handling for Pandas/NumPy date values
    # These are encoded as np.datetime64[ns] in Pandas -> pa.timestamp("ns") in Arrow
    # Only allow this conversion if the vector is coming from Pandas with datetime type
    if pandas_type == DataMapping.pandas_datetime_type():
        if pa.types.is_timestamp(vector.type) and vector.type.unit == "ns":
            return pc.cast(vector, field.type)

    error_message = cls._format_error(cls.__E_WRONG_DATA_TYPE, vector, field)
    cls.__log.error(error_message)
    raise _ex.EDataConformance(error_message)
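# A hedged example (not from the original source) of the Pandas special case:
# a midnight np.datetime64[ns] column arrives in Arrow as timestamp[ns] and
# can be cast down to a date32 field without data loss.
import pyarrow as pa
import pyarrow.compute as pc
from datetime import datetime

ns_vector = pa.array([datetime(2022, 5, 1)], pa.timestamp("ns"))
dates = pc.cast(ns_vector, pa.date32())   # -> [2022-05-01]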
def _coerce_string(cls, vector: pa.Array, field: pa.Field) -> pa.Array:

    if pa.types.is_string(field.type):
        if pa.types.is_string(vector.type):
            return vector

    if pa.types.is_large_string(field.type):
        if pa.types.is_large_string(vector.type):
            return vector
        # Allow up-casting string -> large_string
        if pa.types.is_string(vector.type):
            return pc.cast(vector, field.type)

    error_message = cls._format_error(cls.__E_WRONG_DATA_TYPE, vector, field)
    cls.__log.error(error_message)
    raise _ex.EDataConformance(error_message)
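# A quick demonstration (assumed, not from the original source) of the one
# widening this function permits: string -> large_string only changes the
# offset width from 32 to 64 bits, so no values can be lost.
import pyarrow as pa
import pyarrow.compute as pc

small = pa.array(["a", "bc"], pa.string())
large = pc.cast(small, pa.large_string())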
def _coerce_integer(cls, vector: pa.Array, field: pa.Field) -> pa.IntegerArray:

    try:

        if pa.types.is_integer(vector.type):
            return pc.cast(vector, field.type)

        else:
            error_message = cls._format_error(cls.__E_WRONG_DATA_TYPE, vector, field)
            cls.__log.error(error_message)
            raise _ex.EDataConformance(error_message)

    except pa.ArrowInvalid as e:
        error_message = cls._format_error(cls.__E_DATA_LOSS_DID_OCCUR, vector, field, e)
        cls.__log.error(error_message)
        raise _ex.EDataConformance(error_message) from e
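# A minimal sketch (not from the original source) of the behaviour the
# try/except above relies on: a safe integer narrowing cast raises
# ArrowInvalid when any value is out of range for the target type.
import pyarrow as pa
import pyarrow.compute as pc

values = pa.array([1, 40000], pa.int64())
try:
    pc.cast(values, pa.int16())    # 40000 does not fit in int16
except pa.ArrowInvalid:
    print("data loss detected")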
table = table.select(["ix", "x", "y", "title", "first_author_name", "date", "language"])

# truncate the title after 101 characters (matching display logic)
truncated_title = pc.utf8_replace_slice(table.column("title"), start=101, stop=1000, replacement="")
table = table.set_column(table.schema.get_field_index("title"), "title", truncated_title)

# ensure all dictionaries in the file use the same key/value mappings
table = table.unify_dictionaries()

# filter out non-numeric dates (e.g. null, "1850-1853")
# matches the hack in index.js:37
mask = pc.invert(pc.is_null(table.column("date")))
table = table.filter(mask)

# sorting by the date improves the loading aesthetics
# comment this out to exactly match the original appearance
indices = pc.sort_indices(table, sort_keys=[("date", "ascending")])
table = pc.take(table, indices)

# after sorting replace ix with an accurate row index
indices = pc.sort_indices(table, sort_keys=[("date", "ascending")])
table = table.set_column(table.schema.get_field_index("ix"), "ix", pc.cast(indices, pa.uint32()))

temp_path.unlink()

local = fs.LocalFileSystem()
with local.open_output_stream(str(target_path)) as file:
    with pa.RecordBatchStreamWriter(file, table.schema) as writer:
        writer.write_table(table, 10000)
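# A small illustration (not from the original script) of the truncation trick
# used above: utf8_replace_slice with an empty replacement drops everything
# from `start` onward and leaves shorter strings untouched.
import pyarrow as pa
import pyarrow.compute as pc

titles = pa.array(["short title", "x" * 200])
truncated = pc.utf8_replace_slice(titles, start=101, stop=1000, replacement="")
# -> ["short title", "xxx...x" (first 101 chars only)]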
def my_string_length(arr, **kwargs):
    # arr is a pyarrow.StringArray
    return pc.cast(pc.multiply(pc.utf8_length(arr), 2), target_type='int64')
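# A hedged usage example (not from the original source): calling the helper
# directly on a string array returns the doubled lengths as int64.
import pyarrow as pa

arr = pa.array(["ab", "xyz"])
lengths = my_string_length(arr)   # -> [4, 6]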