Beispiel #1
0
def arrow_array_to_array_of_proto(
        arrow_type: pa.DataType,
        arrow_array: pa.Array) -> List[Value_pb2.Value]:
    """Build one ``Value_pb2.Value`` message per element of an Arrow array.

    Args:
        arrow_type: Arrow type used to select which proto field is populated
            (looked up by ``value_type`` when it is a list type).
        arrow_array: Arrow array whose elements are converted.

    Returns:
        A list of ``Value_pb2.Value`` messages, one per item of
        ``arrow_array.tolist()``, in the same order.
    """
    if not isinstance(arrow_type, pa.ListType):
        field_name = ARROW_TYPE_TO_PROTO_FIELD[arrow_type]
        # Timestamp arrays are emitted as their 64-bit integer values.
        if arrow_type == PA_TIMESTAMP_TYPE:
            arrow_array = arrow_array.cast(pa.int64())
        return [
            Value_pb2.Value(**{field_name: item})
            for item in arrow_array.tolist()
        ]

    element_type = arrow_type.value_type
    list_cls = ARROW_LIST_TYPE_TO_PROTO_LIST_CLASS[element_type]
    field_name = ARROW_LIST_TYPE_TO_PROTO_FIELD[element_type]

    # Lists of timestamps are emitted as lists of 64-bit integers.
    if element_type == PA_TIMESTAMP_TYPE:
        arrow_array = arrow_array.cast(pa.list_(pa.int64()))

    return [
        Value_pb2.Value(**{field_name: list_cls(val=items)})
        for items in arrow_array.tolist()
    ]
def _downcast_array(array: pa.Array) -> pa.Array:
    """Cast an array to a narrower type where a rule is defined for its type.

    ``float64`` arrays are cast to ``float32`` and ``int64`` arrays to
    ``uint16``; ``string`` and ``bool`` arrays are returned unchanged.

    Args:
        array: Arrow array to downcast.

    Returns:
        The cast array, or ``array`` itself for string/bool input.

    Raises:
        Exception: if the array has any other type.
    """
    source_type = array.type
    if source_type == pa.float64():
        return array.cast(pa.float32())
    if source_type == pa.int64():
        return array.cast(pa.uint16())
    if source_type in (pa.string(), pa.bool_()):
        # Nothing narrower to cast to; hand the input back untouched.
        return array
    raise Exception(f"Did not downcast array with type '{array.type}'.")
Beispiel #3
0
def reencode_dictionary_array(array: pa.Array) -> pa.Array:
    """Drop or rebuild an array's dictionary encoding when it is wasteful.

    Args:
        array: Arrow array exposing ``indices`` and ``dictionary``.

    Returns:
        * a plain utf8 array when there are no more indices than dictionary
          entries;
        * ``array`` unchanged when every dictionary entry is referenced;
        * otherwise a freshly dictionary-encoded copy of the utf8 values.
    """
    indices = array.indices
    dictionary = array.dictionary

    if len(indices) <= len(dictionary):
        # Groupby often reduces the number of values considerably. Let's shy
        # away from dictionary when it gives us literally nothing.
        return array.cast(pa.utf8())

    # Mark every dictionary slot that at least one index points at.
    seen = np.zeros(len(dictionary), np.bool_)
    seen[indices] = True

    if not np.all(seen):
        # Some dictionary entries are unused: rebuild the dictionary.
        return array.cast(pa.utf8()).dictionary_encode()  # TODO optimize

    return array  # no edit
Beispiel #4
0
def sum(*, array: pa.Array, group_splits: np.array, **kwargs) -> pa.Array:
    """Sum ``array`` over each ``(begin, end)`` slice produced by ``iter_splits``.

    Integer arrays are cast to ``int64`` before summing. A slice whose Arrow
    sum is not valid (null) contributes zero instead of null, so the result
    never contains nulls produced by the summation itself.

    Args:
        array: Arrow array to aggregate.
        group_splits: Passed to ``iter_splits`` together with ``len(array)``
            to obtain the slice boundaries.
        **kwargs: Accepted and ignored.

    Returns:
        An Arrow array with one total per slice, typed like ``array`` after
        the optional ``int64`` cast.
    """
    integer_input = pa.types.is_integer(array.type)
    if integer_input:
        array = array.cast(pa.int64())
    # Fallback total for slices whose sum is null; matches the result type.
    empty_total = 0 if integer_input else 0.0

    def slice_total(begin, end):
        total = pa.compute.sum(array[begin:end])
        return total.as_py() if total.is_valid else empty_total

    boundaries = iter_splits(group_splits, len(array))
    totals = (slice_total(begin, end) for begin, end in boundaries)
    return pa.array(totals, array.type)
Beispiel #5
0
def array_cast(array: pa.Array,
               pa_type: pa.DataType,
               allow_number_to_str=True):
    """Improved version of pa.Array.cast

    It supports casting pa.StructArray objects to re-order the fields.
    It also lets you control certain aspects of the casting, e.g. whether
    to disable casting numbers (floats or ints) to strings.

    Args:
        array (pa.Array): PyArrow array to cast
        pa_type (pa.DataType): target PyArrow type
        allow_number_to_str (bool, default ``True``): Whether to allow casting numbers to strings.
            Defaults to True.

    Raises:
        pa.ArrowInvalid: if the arrow data casting fails
        TypeError: if the target type is not supported, e.g.

            - if a field is missing
            - if casting from numbers to strings and allow_number_to_str is False

    Returns:
        pa.Array: the casted array
    """
    # Recursive calls on child arrays inherit the caller's setting.
    _c = partial(array_cast, allow_number_to_str=allow_number_to_str)
    if isinstance(array, pa.ExtensionArray):
        array = array.storage
    if isinstance(pa_type, pa.ExtensionType):
        return pa_type.wrap_array(array)
    elif pa.types.is_struct(array.type):
        # Struct -> struct is only supported when both have the same field
        # names; the result follows the field order of the target type.
        if pa.types.is_struct(pa_type) and (
                {field.name for field in pa_type}
                == {field.name for field in array.type}):
            arrays = [
                _c(array.field(field.name), field.type) for field in pa_type
            ]
            return pa.StructArray.from_arrays(arrays, fields=list(pa_type))
    elif pa.types.is_list(array.type):
        if pa.types.is_fixed_size_list(pa_type):
            # Only possible when the flattened values fill every fixed-size
            # slot exactly.
            if pa_type.list_size * len(array) == len(array.values):
                return pa.FixedSizeListArray.from_arrays(
                    _c(array.values, pa_type.value_type),
                    pa_type.list_size,
                )
        elif pa.types.is_list(pa_type):
            return pa.ListArray.from_arrays(
                array.offsets, _c(array.values, pa_type.value_type))
    elif pa.types.is_fixed_size_list(array.type):
        if pa.types.is_fixed_size_list(pa_type):
            return pa.FixedSizeListArray.from_arrays(
                _c(array.values, pa_type.value_type),
                pa_type.list_size,
            )
        elif pa.types.is_list(pa_type):
            # Row i of a fixed-size list spans values
            # [i * list_size, (i + 1) * list_size), so the variable-size
            # offsets must be multiples of list_size (not 0, 1, 2, ...).
            list_size = array.type.list_size
            offsets_arr = pa.array(
                [i * list_size for i in range(len(array) + 1)], pa.int32())
            return pa.ListArray.from_arrays(
                offsets_arr, _c(array.values, pa_type.value_type))
    else:
        if (not allow_number_to_str and pa.types.is_string(pa_type)
                and (pa.types.is_floating(array.type)
                     or pa.types.is_integer(array.type))):
            raise TypeError(
                f"Couldn't cast array of type {array.type} to {pa_type} since allow_number_to_str is set to {allow_number_to_str}"
            )
        return array.cast(pa_type)
    # Reached when a nested source type has no supported conversion to the
    # requested target type.
    raise TypeError(
        f"Couldn't cast array of type\n{array.type}\nto\n{pa_type}")