def arrow_array_to_array_of_proto(
        arrow_type: pa.DataType,
        arrow_array: pa.Array) -> List[Value_pb2.Value]:
    """Convert every element of an Arrow array into a ``Value_pb2.Value``.

    Args:
        arrow_type: Arrow type used to pick the proto field. For list types
            the lookup is keyed on the list's value type.
        arrow_array: Arrow array holding the data to convert.

    Returns:
        One ``Value_pb2.Value`` per element of ``arrow_array``, in order.

    Raises:
        KeyError: if the (value) type has no entry in the lookup tables.
    """
    if isinstance(arrow_type, pa.ListType):
        element_type = arrow_type.value_type
        list_wrapper = ARROW_LIST_TYPE_TO_PROTO_LIST_CLASS[element_type]
        field_name = ARROW_LIST_TYPE_TO_PROTO_FIELD[element_type]

        # Timestamps are handed to the proto as plain 64-bit integers.
        if element_type == PA_TIMESTAMP_TYPE:
            arrow_array = arrow_array.cast(pa.list_(pa.int64()))

        return [
            Value_pb2.Value(**{field_name: list_wrapper(val=item)})
            for item in arrow_array.tolist()
        ]

    field_name = ARROW_TYPE_TO_PROTO_FIELD[arrow_type]

    # Timestamps are handed to the proto as plain 64-bit integers.
    if arrow_type == PA_TIMESTAMP_TYPE:
        arrow_array = arrow_array.cast(pa.int64())

    return [
        Value_pb2.Value(**{field_name: item})
        for item in arrow_array.tolist()
    ]
def _downcast_array(array: pa.Array) -> pa.Array:
    """Cast ``array`` to a narrower type where one is defined.

    float64 becomes float32 and int64 becomes uint16; string and bool arrays
    are returned untouched.

    NOTE(review): int64 -> uint16 also drops the sign; presumably the callers
    only pass small non-negative integers -- confirm.

    Raises:
        Exception: if the array has any other type.
    """
    source_type = array.type

    if source_type == pa.float64():
        return array.cast(pa.float32())

    if source_type == pa.int64():
        return array.cast(pa.uint16())

    # Nothing narrower to cast to for these; hand the input back as-is.
    if source_type == pa.string() or source_type == pa.bool_():
        return array

    raise Exception(f"Did not downcast array with type '{array.type}'.")
def reencode_dictionary_array(array: pa.Array) -> pa.Array:
    """Drop unused dictionary entries, or the dictionary encoding itself.

    Args:
        array: a dictionary-encoded array (it must expose ``indices`` and
            ``dictionary``).

    Returns:
        * a plain utf8 array when there are no more indices than dictionary
          entries, since the encoding buys nothing then;
        * ``array`` itself when every dictionary entry is referenced;
        * otherwise a freshly dictionary-encoded copy.
    """
    index_count = len(array.indices)
    dictionary_size = len(array.dictionary)

    if index_count <= dictionary_size:
        # After a group-by there may be very few values left; a dictionary
        # at least as large as the data is pure overhead.
        return array.cast(pa.utf8())

    # Flag each dictionary slot that at least one index points at.
    is_referenced = np.zeros(dictionary_size, np.bool_)
    is_referenced[array.indices] = True

    if is_referenced.all():
        return array  # no edit

    return array.cast(pa.utf8()).dictionary_encode()  # TODO optimize
def sum(*, array: pa.Array, group_splits: np.array, **kwargs) -> pa.Array:
    """Sum ``array`` group by group, never producing a null.

    Integer input is widened to int64 first; any other input keeps its type.
    A group whose Arrow sum is null contributes zero instead.

    Args:
        array: values to aggregate.
        group_splits: group boundaries, passed to ``iter_splits`` together
            with ``len(array)``; presumably it yields one ``(begin, end)``
            pair per group -- verify against ``iter_splits``.
        **kwargs: accepted and ignored.

    Returns:
        An array with one sum per group, typed like the (possibly widened)
        input.
    """
    is_integer_input = pa.types.is_integer(array.type)
    if is_integer_input:
        array = array.cast(pa.int64())
    fallback = 0 if is_integer_input else 0.0

    def group_total(begin, end):
        # A null sum scalar is replaced by the type-appropriate zero.
        total = pa.compute.sum(array[begin:end])
        return total.as_py() if total.is_valid else fallback

    totals = (
        group_total(begin, end)
        for begin, end in iter_splits(group_splits, len(array))
    )
    return pa.array(totals, array.type)
def array_cast(array: pa.Array, pa_type: pa.DataType, allow_number_to_str=True):
    """Improved version of pa.Array.cast

    It supports casting pa.StructArray objects to re-order the fields, and it
    recurses into list and fixed-size-list arrays so their values are cast
    with the same rules. It also lets you control certain aspects of the
    casting, e.g. whether to allow casting numbers (floats or ints) to
    strings.

    Extension arrays are unwrapped to their storage first. If the target is
    an extension type, the (unwrapped) array is wrapped with
    ``pa_type.wrap_array`` and returned without further casting.

    Args:
        array (pa.Array): PyArrow array to cast
        pa_type (pa.DataType): target PyArrow type
        allow_number_to_str (bool, default ``True``): Whether to allow casting
            numbers to strings. Defaults to True.

    Raises:
        pa.ArrowInvalid: if the arrow data casting fails
        TypeError: if the target type is not supported, e.g.

            - if a struct field is missing
            - if a list cannot be reshaped into the requested fixed size
            - if casting from numbers to strings and allow_number_to_str is
              False

    Returns:
        pa.Array: the casted array
    """
    # Recursive calls inherit the caller's allow_number_to_str setting.
    _c = partial(array_cast, allow_number_to_str=allow_number_to_str)

    if isinstance(array, pa.ExtensionArray):
        array = array.storage

    if isinstance(pa_type, pa.ExtensionType):
        return pa_type.wrap_array(array)
    elif pa.types.is_struct(array.type):
        # Only the field *names* must match; iterating the target type below
        # is what re-orders the fields.
        if pa.types.is_struct(pa_type) and (
                {field.name for field in pa_type}
                == {field.name for field in array.type}):
            arrays = [
                _c(array.field(field.name), field.type) for field in pa_type
            ]
            return pa.StructArray.from_arrays(arrays, fields=list(pa_type))
    elif pa.types.is_list(array.type):
        if pa.types.is_fixed_size_list(pa_type):
            # Only possible when the values divide exactly into rows of
            # list_size; otherwise fall through to the TypeError below.
            if pa_type.list_size * len(array) == len(array.values):
                return pa.FixedSizeListArray.from_arrays(
                    _c(array.values, pa_type.value_type),
                    pa_type.list_size,
                )
        elif pa.types.is_list(pa_type):
            return pa.ListArray.from_arrays(
                array.offsets, _c(array.values, pa_type.value_type))
    elif pa.types.is_fixed_size_list(array.type):
        if pa.types.is_fixed_size_list(pa_type):
            return pa.FixedSizeListArray.from_arrays(
                _c(array.values, pa_type.value_type),
                pa_type.list_size,
            )
        elif pa.types.is_list(pa_type):
            # Row i of a fixed-size list occupies values
            # [i * list_size, (i + 1) * list_size), so the offsets must step
            # by list_size. Stepping by 1 would keep one value per row and
            # drop the rest.
            # NOTE(review): this assumes array.values starts at the first
            # row, i.e. `array` is not a slice with a non-zero offset --
            # confirm against the pyarrow version in use.
            list_size = array.type.list_size
            offsets_arr = pa.array(
                [row * list_size for row in range(len(array) + 1)],
                pa.int32(),
            )
            return pa.ListArray.from_arrays(
                offsets_arr, _c(array.values, pa_type.value_type))
    else:
        if (not allow_number_to_str and pa.types.is_string(pa_type)
                and (pa.types.is_floating(array.type)
                     or pa.types.is_integer(array.type))):
            raise TypeError(
                f"Couldn't cast array of type {array.type} to {pa_type} since allow_number_to_str is set to {allow_number_to_str}"
            )
        return array.cast(pa_type)

    # Reached when a nested source type has no compatible target branch.
    raise TypeError(
        f"Couldn't cast array of type\n{array.type}\nto\n{pa_type}")