def _maybe_dictionary_encode_column(data: pyarrow.ChunkedArray) -> pyarrow.ChunkedArray:
    """Return a dictionary-encoded version of `data` if it looks cheaper, else `data`.

    The column is returned unchanged when it is all-null, when the encoded
    dictionary's pylist byte size would exceed
    `settings.MAX_DICTIONARY_PYLIST_N_BYTES`, or when the old-vs-new cost
    ratio falls below
    `settings.MIN_DICTIONARY_COMPRESSION_RATIO_PYLIST_N_BYTES`.
    """
    # An all-null column gains nothing from dictionary encoding.
    if data.null_count == len(data):
        return data
    if data.chunk(0).offset > 0:
        # Work around https://issues.apache.org/jira/browse/ARROW-7266#
        # dictionary_encode() misbehaves on sliced (offset > 0) arrays, so
        # round-trip through serialization to obtain an offset-0 copy first.
        # NOTE(review): pyarrow.serialize() is deprecated and removed in
        # modern pyarrow releases — confirm the pinned pyarrow version
        # still provides it.
        assert len(data.chunks) == 1
        data_copy = pyarrow.chunked_array(
            [pyarrow.serialize(data.chunk(0)).deserialize()]
        )
        encoded = data_copy.dictionary_encode()
    else:
        encoded = data.dictionary_encode()
    # Cost model: byte size of the Python-list representation, as computed
    # by the project helper _string_array_pylist_n_bytes().
    new_cost = _string_array_pylist_n_bytes(encoded.chunk(0).dictionary)
    if new_cost > settings.MAX_DICTIONARY_PYLIST_N_BYTES:
        # abort! abort! dictionary is too large
        return data
    # NOTE(review): only chunk 0 is measured on both sides; if `data` has
    # several chunks the old cost is under-counted — presumably callers
    # pass single-chunk columns. TODO confirm.
    old_cost = _string_array_pylist_n_bytes(data.chunk(0))
    if old_cost / new_cost >= settings.MIN_DICTIONARY_COMPRESSION_RATIO_PYLIST_N_BYTES:
        return encoded
    else:
        return data
def _text_cat_chunked_1(a: pa.ChunkedArray, b: pa.ChunkedArray) -> pa.ChunkedArray:
    """Concatenate two ChunkedArrays element-wise via `_text_cat`.

    The two inputs are re-sliced along their combined chunk boundaries so
    that `_text_cat` always receives aligned, equal-length pieces.
    """
    offsets_a, offsets_b = _combined_in_chunk_offsets(a, b)
    result_chunks: List[pa.Array] = []
    for (chunk_a, start_a, len_a), (chunk_b, start_b, len_b) in zip(offsets_a, offsets_b):
        left = a.chunk(chunk_a)[start_a : start_a + len_a]
        right = b.chunk(chunk_b)[start_b : start_b + len_b]
        result_chunks.append(_text_cat(left, right))
    return pa.chunked_array(result_chunks)
def _in_chunk_offsets(arr: pa.ChunkedArray, offsets: List[int]) -> List[Tuple[int, int, int]]: """Calculate the access ranges for a given list of offsets. All chunk start indices must be included as offsets and the offsets must be unique. Returns a list of tuples that contain: * The index of the given chunk * The position inside the chunk * The length of the current range """ new_offsets = [] pos = 0 chunk = 0 chunk_pos = 0 for offset, offset_next in zip(offsets, offsets[1:] + [len(arr)]): diff = offset - pos chunk_remains = len(arr.chunk(chunk)) - chunk_pos step = offset_next - offset if diff == 0: # The first offset new_offsets.append((chunk, chunk_pos, step)) elif diff == chunk_remains: chunk += 1 chunk_pos = 0 pos += chunk_remains new_offsets.append((chunk, chunk_pos, step)) else: # diff < chunk_remains chunk_pos += diff pos += diff new_offsets.append((chunk, chunk_pos, step)) return new_offsets
def _1(a: pa.ChunkedArray, b: Any, ops: Dict[str, Callable]):
    """Apply a NumPy ufunc where at least one of the arguments is an Arrow structure."""
    if isinstance(b, pa.ChunkedArray):
        # Two chunked arrays: realign them on their combined chunk
        # boundaries and dispatch piece by piece.
        if len(a) != len(b):
            raise ValueError("Inputs don't have the same length.")
        offsets_a, offsets_b = _combined_in_chunk_offsets(a, b)
        pieces: List[pa.Array] = []
        for (chunk_a, start_a, len_a), (chunk_b, start_b, len_b) in zip(offsets_a, offsets_b):
            left = a.chunk(chunk_a)[start_a : start_a + len_a]
            right = b.chunk(chunk_b)[start_b : start_b + len_b]
            pieces.append(dispatch_chunked_binary_map(left, right, ops))
        return pa.chunked_array(pieces)
    if np.isscalar(b):
        # Scalar: broadcast against every chunk independently.
        return pa.chunked_array(
            [dispatch_chunked_binary_map(chunk, b, ops) for chunk in a.iterchunks()]
        )
    # Plain sequence: slice it to mirror a's chunk layout.
    if len(a) != len(b):
        raise ValueError("Inputs don't have the same length.")
    starts = _calculate_chunk_offsets(a)
    pieces = [
        dispatch_chunked_binary_map(chunk, b[start : start + len(chunk)], ops)
        for chunk, start in zip(a.iterchunks(), starts)
    ]
    return pa.chunked_array(pieces)
def _1(a: pa.ChunkedArray, b: Any, op: Callable):
    """Apply a NumPy ufunc where at least one of the arguments is an Arrow structure.

    `b` may be another ChunkedArray (element-wise, realigned on combined
    chunk boundaries), a scalar (broadcast over every chunk), or a plain
    sequence (sliced to mirror `a`'s chunk layout).

    Raises ValueError when `b` is a non-scalar whose length differs from
    `a` — previously mismatched lengths failed deep inside the offset
    math (ChunkedArray case) or were silently truncated by `zip` in the
    sequence case; the sibling ops-dict dispatcher already validates this.
    """
    if isinstance(b, pa.ChunkedArray):
        if len(a) != len(b):
            raise ValueError("Inputs don't have the same length.")
        in_a_offsets, in_b_offsets = _combined_in_chunk_offsets(a, b)
        new_chunks: List[pa.Array] = []
        for a_offset, b_offset in zip(in_a_offsets, in_b_offsets):
            # Cut aligned, equal-length pieces out of both inputs.
            a_slice = a.chunk(a_offset[0])[a_offset[1]:a_offset[1] + a_offset[2]]
            b_slice = b.chunk(b_offset[0])[b_offset[1]:b_offset[1] + b_offset[2]]
            new_chunks.append(np_ufunc_op(a_slice, b_slice, op))
        return pa.chunked_array(new_chunks)
    elif np.isscalar(b):
        # Scalar: broadcast against each chunk independently.
        new_chunks = []
        for chunk in a.iterchunks():
            new_chunks.append(np_ufunc_op(chunk, b, op))
        return pa.chunked_array(new_chunks)
    else:
        if len(a) != len(b):
            raise ValueError("Inputs don't have the same length.")
        # Plain sequence: slice it to follow a's chunk boundaries.
        new_chunks = []
        offsets = _calculate_chunk_offsets(a)
        for chunk, offset in zip(a.iterchunks(), offsets):
            new_chunks.append(
                np_ufunc_op(chunk, b[offset:offset + len(chunk)], op))
        return pa.chunked_array(new_chunks)