def cast_array_to_feature(array: pa.Array, feature: "FeatureType", allow_number_to_str=True):
    """Cast an array to the arrow type that corresponds to the requested feature type.

    For custom features like Audio or Image, it takes into account the "cast_storage"
    methods they defined to enable casting from other arrow types.

    Args:
        array (pa.Array): the PyArrow array to cast
        feature (FeatureType): the target feature type
        allow_number_to_str (bool, default ``True``): Whether to allow casting numbers
            to strings. Defaults to True.

    Raises:
        pa.ArrowInvalid: if the arrow data casting fails
        TypeError: if the target type is not supported, e.g.

            - if a field is missing
            - if casting from numbers to strings and allow_number_to_str is False

    Returns:
        pa.Array: the casted array
    """
    from .features import Sequence, get_nested_type

    # Recursive calls always propagate the caller's allow_number_to_str setting.
    _c = partial(cast_array_to_feature, allow_number_to_str=allow_number_to_str)

    if isinstance(array, pa.ExtensionArray):
        # Work on the underlying storage of extension arrays.
        array = array.storage
    if hasattr(feature, "cast_storage"):
        # Custom features define their own casting logic.
        return feature.cast_storage(array)
    elif pa.types.is_struct(array.type):
        # feature must be a dict or Sequence(subfeatures_dict)
        if isinstance(feature, Sequence) and isinstance(feature.feature, dict):
            # A Sequence of a dict is handled as a dict of Sequences.
            feature = {
                name: Sequence(subfeature, length=feature.length)
                for name, subfeature in feature.feature.items()
            }
        if isinstance(feature, dict) and {field.name for field in array.type} == set(feature):
            # Fields are rebuilt in the order given by the feature dict.
            arrays = [_c(array.field(name), subfeature) for name, subfeature in feature.items()]
            return pa.StructArray.from_arrays(arrays, names=list(feature))
    elif pa.types.is_list(array.type):
        # feature must be either [subfeature] or Sequence(subfeature)
        if isinstance(feature, list):
            return pa.ListArray.from_arrays(array.offsets, _c(array.values, feature[0]))
        elif isinstance(feature, Sequence):
            if feature.length > -1:
                # Only possible when the values split exactly into lists of the requested
                # length; otherwise fall through to the final TypeError.
                if feature.length * len(array) == len(array.values):
                    return pa.FixedSizeListArray.from_arrays(_c(array.values, feature.feature), feature.length)
            else:
                return pa.ListArray.from_arrays(array.offsets, _c(array.values, feature.feature))
    elif pa.types.is_fixed_size_list(array.type):
        # feature must be either [subfeature] or Sequence(subfeature)
        if isinstance(feature, list):
            # Fixed-size list arrays expose no `offsets`; every list spans exactly
            # `list_size` values, so the offsets are the multiples of `list_size`.
            list_size = array.type.list_size
            offsets_arr = pa.array([i * list_size for i in range(len(array) + 1)], pa.int32())
            return pa.ListArray.from_arrays(offsets_arr, _c(array.values, feature[0]))
        elif isinstance(feature, Sequence):
            if feature.length > -1:
                if feature.length * len(array) == len(array.values):
                    return pa.FixedSizeListArray.from_arrays(_c(array.values, feature.feature), feature.length)
            else:
                # Variable-length target: offsets advance by `list_size` per list.
                list_size = array.type.list_size
                offsets_arr = pa.array([i * list_size for i in range(len(array) + 1)], pa.int32())
                return pa.ListArray.from_arrays(offsets_arr, _c(array.values, feature.feature))
    if pa.types.is_null(array.type):
        # A null-typed array can be cast to the arrow type of any feature.
        return array_cast(array, get_nested_type(feature), allow_number_to_str=allow_number_to_str)
    elif not isinstance(feature, (Sequence, dict, list, tuple)):
        # Remaining non-nested features are called to obtain their arrow type.
        return array_cast(array, feature(), allow_number_to_str=allow_number_to_str)
    raise TypeError(f"Couldn't cast array of type\n{array.type}\nto\n{feature}")
def array_cast(array: pa.Array, pa_type: pa.DataType, allow_number_to_str=True):
    """Improved version of pa.Array.cast

    It supports casting pa.StructArray objects to re-order the fields.
    It also lets you control certain aspects of the casting, e.g. whether
    to disable casting numbers (floats or ints) to strings.

    Args:
        array (pa.Array): PyArrow array to cast
        pa_type (pa.DataType): target PyArrow type
        allow_number_to_str (bool, default ``True``): Whether to allow casting numbers
            to strings. Defaults to True.

    Raises:
        pa.ArrowInvalid: if the arrow data casting fails
        TypeError: if the target type is not supported, e.g.

            - if a field is missing
            - if casting from numbers to strings and allow_number_to_str is False

    Returns:
        pa.Array: the casted array
    """
    # Recursive calls always propagate the caller's allow_number_to_str setting.
    _c = partial(array_cast, allow_number_to_str=allow_number_to_str)

    if isinstance(array, pa.ExtensionArray):
        # Work on the underlying storage of extension arrays.
        array = array.storage
    if isinstance(pa_type, pa.ExtensionType):
        return pa_type.wrap_array(array)
    elif pa.types.is_struct(array.type):
        # Struct fields are matched by name and rebuilt in the target type's order.
        if pa.types.is_struct(pa_type) and (
            {field.name for field in pa_type} == {field.name for field in array.type}
        ):
            arrays = [_c(array.field(field.name), field.type) for field in pa_type]
            return pa.StructArray.from_arrays(arrays, fields=list(pa_type))
    elif pa.types.is_list(array.type):
        if pa.types.is_fixed_size_list(pa_type):
            # Only possible when the values split exactly into lists of the target size;
            # otherwise fall through to the final TypeError.
            if pa_type.list_size * len(array) == len(array.values):
                return pa.FixedSizeListArray.from_arrays(
                    _c(array.values, pa_type.value_type),
                    pa_type.list_size,
                )
        elif pa.types.is_list(pa_type):
            return pa.ListArray.from_arrays(array.offsets, _c(array.values, pa_type.value_type))
    elif pa.types.is_fixed_size_list(array.type):
        if pa.types.is_fixed_size_list(pa_type):
            # Same consistency check as the list -> fixed-size-list case: a different
            # list size would otherwise silently change the number of rows.
            if pa_type.list_size * len(array) == len(array.values):
                return pa.FixedSizeListArray.from_arrays(
                    _c(array.values, pa_type.value_type),
                    pa_type.list_size,
                )
        elif pa.types.is_list(pa_type):
            # Every list spans exactly `list_size` values, so the offsets of the
            # variable-length result are the multiples of `list_size`.
            list_size = array.type.list_size
            offsets_arr = pa.array([i * list_size for i in range(len(array) + 1)], pa.int32())
            return pa.ListArray.from_arrays(offsets_arr, _c(array.values, pa_type.value_type))
    else:
        if (
            not allow_number_to_str
            and pa.types.is_string(pa_type)
            and (pa.types.is_floating(array.type) or pa.types.is_integer(array.type))
        ):
            raise TypeError(
                f"Couldn't cast array of type {array.type} to {pa_type} since allow_number_to_str is set to {allow_number_to_str}"
            )
        return array.cast(pa_type)
    raise TypeError(f"Couldn't cast array of type\n{array.type}\nto\n{pa_type}")