Beispiel #1
0
def _maybe_dictionary_encode_column(
        data: pyarrow.ChunkedArray) -> pyarrow.ChunkedArray:
    if data.null_count == len(data):
        return data

    if data.chunk(0).offset > 0:
        # https://issues.apache.org/jira/browse/ARROW-7266#
        assert len(data.chunks) == 1
        data_copy = pyarrow.chunked_array(
            [pyarrow.serialize(data.chunk(0)).deserialize()])
        encoded = data_copy.dictionary_encode()
    else:
        encoded = data.dictionary_encode()

    new_cost = _string_array_pylist_n_bytes(encoded.chunk(0).dictionary)

    if new_cost > settings.MAX_DICTIONARY_PYLIST_N_BYTES:
        # abort! abort! dictionary is too large
        return data

    old_cost = _string_array_pylist_n_bytes(data.chunk(0))

    if old_cost / new_cost >= settings.MIN_DICTIONARY_COMPRESSION_RATIO_PYLIST_N_BYTES:
        return encoded
    else:
        return data
Beispiel #2
0
def _text_cat_chunked_1(a: pa.ChunkedArray, b: pa.ChunkedArray) -> pa.ChunkedArray:
    in_a_offsets, in_b_offsets = _combined_in_chunk_offsets(a, b)

    new_chunks: List[pa.Array] = []
    for a_offset, b_offset in zip(in_a_offsets, in_b_offsets):
        a_slice = a.chunk(a_offset[0])[a_offset[1] : a_offset[1] + a_offset[2]]
        b_slice = b.chunk(b_offset[0])[b_offset[1] : b_offset[1] + b_offset[2]]
        new_chunks.append(_text_cat(a_slice, b_slice))
    return pa.chunked_array(new_chunks)
Beispiel #3
0
def _in_chunk_offsets(arr: pa.ChunkedArray,
                      offsets: List[int]) -> List[Tuple[int, int, int]]:
    """Calculate the access ranges for a given list of offsets.

    All chunk start indices must be included as offsets and the offsets must be
    unique.

    Returns a list of tuples that contain:
     * The index of the given chunk
     * The position inside the chunk
     * The length of the current range
    """
    new_offsets = []
    pos = 0
    chunk = 0
    chunk_pos = 0
    for offset, offset_next in zip(offsets, offsets[1:] + [len(arr)]):
        diff = offset - pos
        chunk_remains = len(arr.chunk(chunk)) - chunk_pos
        step = offset_next - offset
        if diff == 0:  # The first offset
            new_offsets.append((chunk, chunk_pos, step))
        elif diff == chunk_remains:
            chunk += 1
            chunk_pos = 0
            pos += chunk_remains
            new_offsets.append((chunk, chunk_pos, step))
        else:  # diff < chunk_remains
            chunk_pos += diff
            pos += diff
            new_offsets.append((chunk, chunk_pos, step))
    return new_offsets
Beispiel #4
0
def _1(a: pa.ChunkedArray, b: Any, ops: Dict[str, Callable]):
    """Apply a NumPy ufunc where at least one of the arguments is an Arrow structure."""
    if isinstance(b, pa.ChunkedArray):
        if len(a) != len(b):
            raise ValueError("Inputs don't have the same length.")
        in_a_offsets, in_b_offsets = _combined_in_chunk_offsets(a, b)

        new_chunks: List[pa.Array] = []
        for a_offset, b_offset in zip(in_a_offsets, in_b_offsets):
            a_slice = a.chunk(a_offset[0])[a_offset[1]:a_offset[1] +
                                           a_offset[2]]
            b_slice = b.chunk(b_offset[0])[b_offset[1]:b_offset[1] +
                                           b_offset[2]]
            new_chunks.append(
                dispatch_chunked_binary_map(a_slice, b_slice, ops))
        return pa.chunked_array(new_chunks)
    elif np.isscalar(b):
        new_chunks = []
        for chunk in a.iterchunks():
            new_chunks.append(dispatch_chunked_binary_map(chunk, b, ops))
        return pa.chunked_array(new_chunks)
    else:
        if len(a) != len(b):
            raise ValueError("Inputs don't have the same length.")
        new_chunks = []
        offsets = _calculate_chunk_offsets(a)
        for chunk, offset in zip(a.iterchunks(), offsets):
            new_chunks.append(
                dispatch_chunked_binary_map(chunk,
                                            b[offset:offset + len(chunk)],
                                            ops))
        return pa.chunked_array(new_chunks)
Beispiel #5
0
def _1(a: pa.ChunkedArray, b: Any, op: Callable):
    """Apply a NumPy ufunc where at least one of the arguments is an Arrow structure."""
    if isinstance(b, pa.ChunkedArray):
        in_a_offsets, in_b_offsets = _combined_in_chunk_offsets(a, b)

        new_chunks: List[pa.Array] = []
        for a_offset, b_offset in zip(in_a_offsets, in_b_offsets):
            a_slice = a.chunk(a_offset[0])[a_offset[1]:a_offset[1] +
                                           a_offset[2]]
            b_slice = b.chunk(b_offset[0])[b_offset[1]:b_offset[1] +
                                           b_offset[2]]
            new_chunks.append(np_ufunc_op(a_slice, b_slice, op))
        return pa.chunked_array(new_chunks)
    elif np.isscalar(b):
        new_chunks = []
        for chunk in a.iterchunks():
            new_chunks.append(np_ufunc_op(chunk, b, op))
        return pa.chunked_array(new_chunks)
    else:
        new_chunks = []
        offsets = _calculate_chunk_offsets(a)
        for chunk, offset in zip(a.iterchunks(), offsets):
            new_chunks.append(
                np_ufunc_op(chunk, b[offset:offset + len(chunk)], op))
        return pa.chunked_array(new_chunks)