import datetime
from typing import Any, Callable, Dict, List, Tuple

import numpy as np
import pandas as pd
import pyarrow
import pyarrow as pa
import pyarrow.compute

# Note: module-private helpers and constants referenced below
# (_combined_in_chunk_offsets, dispatch_chunked_binary_map, np_ufunc_op,
# _text_cat, _recode, settings, DateGranularity, StartofColumnResult,
# TimestampUnits, _NS_PER_UNIT) are assumed to be defined in the modules
# these functions were taken from.


def _1(a: pa.ChunkedArray, b: Any, ops: Dict[str, Callable]):
    """Apply a NumPy ufunc where at least one of the arguments is an Arrow structure."""
    if isinstance(b, pa.ChunkedArray):
        if len(a) != len(b):
            raise ValueError("Inputs don't have the same length.")
        in_a_offsets, in_b_offsets = _combined_in_chunk_offsets(a, b)
        new_chunks: List[pa.Array] = []
        for a_offset, b_offset in zip(in_a_offsets, in_b_offsets):
            a_slice = a.chunk(a_offset[0])[a_offset[1]:a_offset[1] + a_offset[2]]
            b_slice = b.chunk(b_offset[0])[b_offset[1]:b_offset[1] + b_offset[2]]
            new_chunks.append(dispatch_chunked_binary_map(a_slice, b_slice, ops))
        return pa.chunked_array(new_chunks)
    elif np.isscalar(b):
        new_chunks = []
        for chunk in a.iterchunks():
            new_chunks.append(dispatch_chunked_binary_map(chunk, b, ops))
        return pa.chunked_array(new_chunks)
    else:
        if len(a) != len(b):
            raise ValueError("Inputs don't have the same length.")
        new_chunks = []
        offsets = _calculate_chunk_offsets(a)
        for chunk, offset in zip(a.iterchunks(), offsets):
            new_chunks.append(
                dispatch_chunked_binary_map(chunk, b[offset:offset + len(chunk)], ops))
        return pa.chunked_array(new_chunks)
def _1(a: pa.ChunkedArray, b: Any, op: Callable):
    """Apply a NumPy ufunc where at least one of the arguments is an Arrow structure."""
    if isinstance(b, pa.ChunkedArray):
        in_a_offsets, in_b_offsets = _combined_in_chunk_offsets(a, b)
        new_chunks: List[pa.Array] = []
        for a_offset, b_offset in zip(in_a_offsets, in_b_offsets):
            a_slice = a.chunk(a_offset[0])[a_offset[1]:a_offset[1] + a_offset[2]]
            b_slice = b.chunk(b_offset[0])[b_offset[1]:b_offset[1] + b_offset[2]]
            new_chunks.append(np_ufunc_op(a_slice, b_slice, op))
        return pa.chunked_array(new_chunks)
    elif np.isscalar(b):
        new_chunks = []
        for chunk in a.iterchunks():
            new_chunks.append(np_ufunc_op(chunk, b, op))
        return pa.chunked_array(new_chunks)
    else:
        new_chunks = []
        offsets = _calculate_chunk_offsets(a)
        for chunk, offset in zip(a.iterchunks(), offsets):
            new_chunks.append(np_ufunc_op(chunk, b[offset:offset + len(chunk)], op))
        return pa.chunked_array(new_chunks)
def _maybe_dictionary_encode_column(
        data: pyarrow.ChunkedArray) -> pyarrow.ChunkedArray:
    if data.null_count == len(data):
        return data
    if data.chunk(0).offset > 0:
        # https://issues.apache.org/jira/browse/ARROW-7266
        assert len(data.chunks) == 1
        data_copy = pyarrow.chunked_array(
            [pyarrow.serialize(data.chunk(0)).deserialize()])
        encoded = data_copy.dictionary_encode()
    else:
        encoded = data.dictionary_encode()
    new_cost = _string_array_pylist_n_bytes(encoded.chunk(0).dictionary)
    if new_cost > settings.MAX_DICTIONARY_PYLIST_N_BYTES:
        # abort! abort! dictionary is too large
        return data
    old_cost = _string_array_pylist_n_bytes(data.chunk(0))
    if old_cost / new_cost >= settings.MIN_DICTIONARY_COMPRESSION_RATIO_PYLIST_N_BYTES:
        return encoded
    else:
        return data
def _text_cat_chunked_1(a: pa.ChunkedArray, b: pa.ChunkedArray) -> pa.ChunkedArray:
    in_a_offsets, in_b_offsets = _combined_in_chunk_offsets(a, b)

    new_chunks: List[pa.Array] = []
    for a_offset, b_offset in zip(in_a_offsets, in_b_offsets):
        a_slice = a.chunk(a_offset[0])[a_offset[1] : a_offset[1] + a_offset[2]]
        b_slice = b.chunk(b_offset[0])[b_offset[1] : b_offset[1] + b_offset[2]]
        new_chunks.append(_text_cat(a_slice, b_slice))
    return pa.chunked_array(new_chunks)
def _in_chunk_offsets(arr: pa.ChunkedArray,
                      offsets: List[int]) -> List[Tuple[int, int, int]]:
    """Calculate the access ranges for a given list of offsets.

    All chunk start indices must be included as offsets and the offsets must be
    unique.

    Returns a list of tuples that contain:
     * The index of the given chunk
     * The position inside the chunk
     * The length of the current range
    """
    new_offsets = []
    pos = 0
    chunk = 0
    chunk_pos = 0
    for offset, offset_next in zip(offsets, offsets[1:] + [len(arr)]):
        diff = offset - pos
        chunk_remains = len(arr.chunk(chunk)) - chunk_pos
        step = offset_next - offset
        if diff == 0:  # The first offset
            new_offsets.append((chunk, chunk_pos, step))
        elif diff == chunk_remains:
            chunk += 1
            chunk_pos = 0
            pos += chunk_remains
            new_offsets.append((chunk, chunk_pos, step))
        else:  # diff < chunk_remains
            chunk_pos += diff
            pos += diff
            new_offsets.append((chunk, chunk_pos, step))
    return new_offsets
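# Illustrative sketch (added; not part of the original source): for a
# two-chunk array and offsets that include every chunk start plus an extra
# split at index 3, the helper above yields
# (chunk index, position in chunk, range length) triples:
#
# >>> arr = pa.chunked_array([["a", "b"], ["c", "d", "e"]])
# >>> _in_chunk_offsets(arr, [0, 2, 3])
# [(0, 0, 2), (1, 0, 1), (1, 1, 2)]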
def _text_cat_chunked_mixed(a: pa.ChunkedArray, b: pa.Array) -> pa.ChunkedArray:
    new_chunks = []
    offsets = _calculate_chunk_offsets(a)
    for chunk, offset in zip(a.iterchunks(), offsets):
        new_chunks.append(_text_cat(chunk, b[offset:offset + len(chunk)]))
    return pa.chunked_array(new_chunks)
def _calculate_chunk_offsets(chunked_array: pa.ChunkedArray) -> np.ndarray:
    """Return an array holding the indices pointing to the first element of each chunk."""
    offset = 0
    offsets = []
    for chunk in chunked_array.iterchunks():
        offsets.append(offset)
        offset += len(chunk)
    return np.array(offsets)
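# Illustrative sketch (added; not part of the original source):
#
# >>> _calculate_chunk_offsets(pa.chunked_array([[1, 2], [3, 4, 5], [6]]))
# array([0, 2, 5])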
def cast_for_truediv(arrow_array: pa.ChunkedArray,
                     pa_object: pa.Array | pa.Scalar) -> pa.ChunkedArray:
    # Ensure int / int -> float mirroring Python/Numpy behavior
    # as pc.divide_checked(int, int) -> int
    if pa.types.is_integer(arrow_array.type) and pa.types.is_integer(pa_object.type):
        return arrow_array.cast(pa.float64())
    return arrow_array
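# Illustrative sketch (added; not part of the original source): Arrow's divide
# kernel performs integer division on integer inputs, so the cast above is what
# restores Python-style true division:
#
# >>> ints = pa.chunked_array([[1, 3]])
# >>> divisor = pa.array([2, 2])
# >>> pa.compute.divide(ints, divisor).to_pylist()
# [0, 1]
# >>> pa.compute.divide(cast_for_truediv(ints, divisor), divisor).to_pylist()
# [0.5, 1.5]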
def _timestamp_is_rounded(column: pa.ChunkedArray,
                          granularity: DateGranularity) -> bool:
    factor = {
        DateGranularity.SECOND: 1_000_000_000,
        DateGranularity.MINUTE: 1_000_000_000 * 60,
        DateGranularity.HOUR: 1_000_000_000 * 60 * 60,
    }[granularity]
    ints = column.cast(pa.int64())
    return pa.compute.all(
        pa.compute.equal(
            ints,
            pa.compute.multiply(pa.compute.divide(ints, factor), factor))).as_py()
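# Illustrative sketch (added; not part of the original source), assuming
# DateGranularity is the enum referenced above: a column is "rounded" when
# every value is an exact multiple of the granularity in nanoseconds.
#
# >>> col = pa.chunked_array([pa.array([0, 60_000_000_000], pa.timestamp("ns"))])
# >>> _timestamp_is_rounded(col, DateGranularity.MINUTE)
# True
# >>> _timestamp_is_rounded(col, DateGranularity.HOUR)
# False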
def _string_array_pylist_n_bytes(data: pyarrow.StringArray) -> int:
    text_buf = data.buffers()[-1]
    if text_buf is None:
        # All values are ""
        n_text_bytes = 0
    else:
        n_text_bytes = text_buf.size
    return (
        # 8 bytes per value (each value is a 64-bit pointer)
        (8 * len(data))
        # 50 bytes of overhead per string (heuristic) -- experiment with
        # sys.getsizeof() if you disbelieve.
        + (50 * (len(data) - data.null_count))
        # ... and then count the actual bytes of data
        + n_text_bytes)
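# Worked example of the heuristic (added; not from the original source): for
# pyarrow.array(["a", "bc", None]) the estimate is 8 * 3 slot pointers (24)
# + 50 * 2 non-null string objects (100) + the text bytes themselves (3),
# i.e. roughly 127 bytes for the equivalent Python list.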
def recode_or_decode_dictionary(
        chunked_array: pa.ChunkedArray) -> pa.ChunkedArray:
    """Remove unused/duplicate dictionary values from -- or cast to pa.utf8().

    Workbench disallows unused/duplicate values. Call this function after
    filtering or modifying dictionary values: it returns a valid Workbench
    column given a valid Arrow column.

    Convert to utf8() if dictionary encoding is "bad". ("Bad" currently means,
    "each value is only used once;" but the meaning may change between minor
    versions.)

    Return `chunked_array` if it is already Workbench-valid and dictionary
    encoding is not "bad".
    """
    if chunked_array.num_chunks == 0:
        return pa.chunked_array([], pa.utf8())

    # if chunked_array.num_chunks != 1:
    #     chunked_array = chunked_array.unify_dictionaries()

    if len(chunked_array) - chunked_array.null_count <= len(
            chunked_array.chunks[0].dictionary):
        return chunked_array.cast(pa.utf8())

    dictionary = chunked_array.chunks[0].dictionary

    used = np.zeros(len(dictionary), dtype=bool)
    for chunk in chunked_array.chunks:
        used[pa.compute.filter(
            chunk.indices, pa.compute.is_valid(chunk.indices)).to_numpy()] = True

    if not np.all(used):
        # Nix unused values; then scan for dups
        mapping = dictionary.filter(pa.array(used, pa.bool_())).dictionary_encode()
        need_recode = True
    else:
        # Scan for dups
        mapping = dictionary.dictionary_encode()
        need_recode = len(mapping.dictionary) < len(dictionary)

    if need_recode:
        chunks = [_recode(chunk, mapping) for chunk in chunked_array.chunks]
        return pa.chunked_array(chunks)

    return chunked_array
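# Illustrative sketch (added; not from the original source): a dictionary in
# which each value appears only once is "bad", so it is decoded back to utf8.
#
# >>> col = pa.chunked_array([pa.array(["a", "b", "c"]).dictionary_encode()])
# >>> recode_or_decode_dictionary(col).type
# DataType(string)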
def _autocast_column(data: pyarrow.ChunkedArray) -> pyarrow.ChunkedArray:
    """
    Convert `data` to float64 or int(64|32|16|8); as fallback, return `data`.

    Assume `data` is of type `utf8` or a dictionary of utf8.

    *Implementation wart*: this may choose float64 when integers would seem a
    better choice, because we use Pandas and Pandas does not support nulls in
    integer columns.
    """
    series: pd.Series = data.to_pandas()
    null = series.isnull()
    empty = series == ""
    if empty.any() and (null | empty).all():
        # All-empty (and all-null) columns stay text
        return data
    try:
        # Try to cast to numbers
        number_values = pd.to_numeric(series).values
    except (ValueError, TypeError):
        return data
    # pd.to_numeric("") gives np.nan. We want None. Use from_pandas=True.
    number_array = pyarrow.array(number_values, from_pandas=True)
    numbers = pyarrow.chunked_array([number_array])

    # Downcast integers, when possible.
    #
    # We even downcast float to int. Workbench semantics say a Number is a
    # Number; so we might as well store it efficiently.
    try:
        # Shrink as far as we can, until pyarrow complains.
        #
        # pyarrow will error "Floating point value truncated" if a conversion
        # from float to int would be lossy.
        #
        # We'll return the last _successful_ `numbers` result.
        numbers = numbers.cast(pyarrow.int32())
        numbers = numbers.cast(pyarrow.int16())
        numbers = numbers.cast(pyarrow.int8())
    except pyarrow.ArrowInvalid:
        pass

    return numbers
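# Illustrative sketch (added; not from the original source): the downcast loop
# relies on pyarrow's safe-cast behavior, which raises ArrowInvalid when a
# float-to-int conversion would lose information.
#
# >>> _autocast_column(pyarrow.chunked_array([["1", "2", "3"]])).type
# DataType(int8)
# >>> pyarrow.chunked_array([[1.5]]).cast(pyarrow.int8())
# ... raises pyarrow.lib.ArrowInvalid (lossy float-to-int conversion)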
def _read_pylist(column: pyarrow.ChunkedArray) -> List[Any]:
    dtype = column.type

    pylist = column.to_pylist()
    if pyarrow.types.is_timestamp(dtype) and dtype.unit == "ns":
        # pyarrow returns timestamps as pandas.Timestamp values (because
        # that has higher resolution than datetime.datetime). But we want
        # datetime.datetime. We'll truncate to microseconds.
        #
        # If someone complains, then we should change our API to pass int64
        # instead of datetime.datetime.
        pylist = [None if v is None else v.to_pydatetime() for v in pylist]
    elif pyarrow.types.is_floating(dtype):
        # Pandas does not differentiate between NaN and None; so in effect,
        # neither do we. Numeric tables can have NaN and never None;
        # timestamp and String columns can have None and never NaT; int
        # columns cannot have NaN or None.
        nan = float("nan")
        pylist = [nan if v is None else v for v in pylist]
    return pylist
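# Illustrative sketch (added; not from the original source): nulls in float
# columns come back as NaN, matching the Pandas convention described above.
#
# >>> _read_pylist(pyarrow.chunked_array([[1.0, None]]))
# [1.0, nan]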
def _arrow_array_to_json_list(array: pyarrow.ChunkedArray) -> List[Any]:
    """
    Convert `array` to a JSON-encodable List.

    Strings become Strings; Numbers become int/float; Datetimes become
    ISO8601-encoded Strings.
    """
    if isinstance(array.type, pyarrow.TimestampType):
        multiplier = 1.0 / TimestampUnits[array.type.unit]
        return [
            (
                None
                if v is pyarrow.NULL
                else (
                    datetime.datetime.utcfromtimestamp(
                        v.value * multiplier).isoformat()
                    + "Z"
                )
            )
            for v in array
        ]
    else:
        return array.to_pylist()
def _startof(column: pa.ChunkedArray, unit: str) -> StartofColumnResult:
    factor = pa.scalar(_NS_PER_UNIT[unit], pa.int64())
    timestamp_ints = column.cast(pa.int64())

    # In two's complement, truncation rounds _up_. Subtract before truncating.
    #
    # In decimal, if we're truncating to the nearest 10:
    #
    # 0 => 0
    # -1 => -10
    # -9 => -10
    # -10 => -10
    # -11 => -20
    #
    # ... rule is: subtract 9 from all negative numbers, then truncate.
    negative = pa.compute.less(timestamp_ints, pa.scalar(0, pa.int64()))
    # "offset": -9 for negatives, 0 for others
    offset = pa.compute.multiply(
        negative.cast(pa.int64()),
        pa.scalar(-1 * _NS_PER_UNIT[unit] + 1, pa.int64()))
    # to_truncate may overflow; in that case, to_truncate > timestamp_ints
    to_truncate = pa.compute.add(timestamp_ints, offset)
    truncated = pa.compute.multiply(pa.compute.divide(to_truncate, factor), factor)

    # Mask of [True, None, True, True, None]
    safe_or_null = pa.compute.or_kleene(
        pa.compute.less_equal(to_truncate, timestamp_ints),
        pa.scalar(None, pa.bool_()))

    truncated_or_null = truncated.filter(safe_or_null,
                                         null_selection_behavior="emit_null")

    return StartofColumnResult(
        column=truncated_or_null.cast(pa.timestamp("ns")),
        truncated=(truncated_or_null.null_count > column.null_count),
    )
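# Illustrative sketch (added; not from the original source): Arrow's integer
# divide truncates toward zero, so a naive divide-then-multiply rounds negative
# timestamps the wrong way; the offset computed above (shown here inlined as
# [-9, -9, -9, 0, 0] for a factor of 10) fixes that.
#
# >>> ints = pa.array([-11, -10, -1, 0, 11], pa.int64())
# >>> ten = pa.scalar(10, pa.int64())
# >>> pa.compute.multiply(pa.compute.divide(ints, ten), ten).to_pylist()
# [-10, -10, 0, 0, 10]
# >>> shifted = pa.compute.add(ints, pa.array([-9, -9, -9, 0, 0], pa.int64()))
# >>> pa.compute.multiply(pa.compute.divide(shifted, ten), ten).to_pylist()
# [-20, -10, -10, 0, 10]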
def _text_cat_chunked_2(a: pa.Array, b: pa.ChunkedArray) -> pa.ChunkedArray:
    new_chunks = []
    offsets = _calculate_chunk_offsets(b)
    for chunk, offset in zip(b.iterchunks(), offsets):
        new_chunks.append(_text_cat(a[offset:offset + len(chunk)], chunk))
    return pa.chunked_array(new_chunks)