Beispiel #1
def _1(a: pa.ChunkedArray, b: Any, ops: Dict[str, Callable]):
    """Apply a NumPy ufunc where at least one of the arguments is an Arrow structure.

    The work is delegated chunk by chunk to ``dispatch_chunked_binary_map``;
    how ``b`` is paired with the chunks of ``a`` depends on its type:

    * ``pa.ChunkedArray``: both inputs are cut into aligned ranges (as
      returned by ``_combined_in_chunk_offsets``) so each call sees slices
      of equal length.
    * scalar (``np.isscalar``): the scalar is passed along with every chunk.
    * anything else: treated as a sliceable sequence of the same length as
      ``a`` and cut at the chunk boundaries of ``a``.

    Parameters
    ----------
    a
        The chunked left-hand operand.
    b
        The right-hand operand (chunked array, scalar or sliceable sequence).
    ops
        Mapping of callables forwarded unchanged to
        ``dispatch_chunked_binary_map``.

    Returns
    -------
    pa.ChunkedArray
        One result chunk per processed range.

    Raises
    ------
    ValueError
        If ``b`` is not a scalar and its length differs from ``len(a)``.
    """
    if isinstance(b, pa.ChunkedArray):
        if len(a) != len(b):
            raise ValueError("Inputs don't have the same length.")
        in_a_offsets, in_b_offsets = _combined_in_chunk_offsets(a, b)

        new_chunks: List[pa.Array] = []
        for a_offset, b_offset in zip(in_a_offsets, in_b_offsets):
            # Each offset entry is (chunk index, start in chunk, length).
            a_slice = a.chunk(a_offset[0])[a_offset[1]:a_offset[1] +
                                           a_offset[2]]
            b_slice = b.chunk(b_offset[0])[b_offset[1]:b_offset[1] +
                                           b_offset[2]]
            new_chunks.append(
                dispatch_chunked_binary_map(a_slice, b_slice, ops))
        return pa.chunked_array(new_chunks)
    elif np.isscalar(b):
        new_chunks = []
        for chunk in a.iterchunks():
            new_chunks.append(dispatch_chunked_binary_map(chunk, b, ops))
        return pa.chunked_array(new_chunks)
    else:
        if len(a) != len(b):
            raise ValueError("Inputs don't have the same length.")
        new_chunks = []
        offsets = _calculate_chunk_offsets(a)
        for chunk, offset in zip(a.iterchunks(), offsets):
            # Take the part of ``b`` that lines up with this chunk of ``a``.
            new_chunks.append(
                dispatch_chunked_binary_map(chunk,
                                            b[offset:offset + len(chunk)],
                                            ops))
        return pa.chunked_array(new_chunks)
Beispiel #2
def _1(a: pa.ChunkedArray, b: Any, op: Callable):
    """Apply a NumPy ufunc where at least one of the arguments is an Arrow structure.

    The work is delegated chunk by chunk to ``np_ufunc_op``; how ``b`` is
    paired with the chunks of ``a`` depends on its type:

    * ``pa.ChunkedArray``: both inputs are cut into aligned ranges (as
      returned by ``_combined_in_chunk_offsets``).
    * scalar (``np.isscalar``): the scalar is passed along with every chunk.
    * anything else: treated as a sliceable sequence and cut at the chunk
      boundaries of ``a``.

    Parameters
    ----------
    a
        The chunked left-hand operand.
    b
        The right-hand operand (chunked array, scalar or sliceable sequence).
    op
        Callable forwarded unchanged to ``np_ufunc_op``.

    Returns
    -------
    pa.ChunkedArray
        One result chunk per processed range.

    Raises
    ------
    ValueError
        If ``b`` is not a scalar and its length differs from ``len(a)``.
    """
    if isinstance(b, pa.ChunkedArray):
        # Mismatched inputs cannot be aligned range by range; fail early.
        if len(a) != len(b):
            raise ValueError("Inputs don't have the same length.")
        in_a_offsets, in_b_offsets = _combined_in_chunk_offsets(a, b)

        new_chunks: List[pa.Array] = []
        for a_offset, b_offset in zip(in_a_offsets, in_b_offsets):
            # Each offset entry is (chunk index, start in chunk, length).
            a_slice = a.chunk(a_offset[0])[a_offset[1]:a_offset[1] +
                                           a_offset[2]]
            b_slice = b.chunk(b_offset[0])[b_offset[1]:b_offset[1] +
                                           b_offset[2]]
            new_chunks.append(np_ufunc_op(a_slice, b_slice, op))
        return pa.chunked_array(new_chunks)
    elif np.isscalar(b):
        new_chunks = []
        for chunk in a.iterchunks():
            new_chunks.append(np_ufunc_op(chunk, b, op))
        return pa.chunked_array(new_chunks)
    else:
        if len(a) != len(b):
            raise ValueError("Inputs don't have the same length.")
        new_chunks = []
        offsets = _calculate_chunk_offsets(a)
        for chunk, offset in zip(a.iterchunks(), offsets):
            # Take the part of ``b`` that lines up with this chunk of ``a``.
            new_chunks.append(
                np_ufunc_op(chunk, b[offset:offset + len(chunk)], op))
        return pa.chunked_array(new_chunks)
Beispiel #3
def _maybe_dictionary_encode_column(
        data: pyarrow.ChunkedArray) -> pyarrow.ChunkedArray:
    """Return `data` dictionary-encoded if that is judged worthwhile.

    The decision compares the estimated Python-list cost
    (`_string_array_pylist_n_bytes`) of the dictionary against the cost of
    the plain values in the first chunk:

    * all-null (or empty) input is returned unchanged;
    * if the dictionary alone would exceed
      `settings.MAX_DICTIONARY_PYLIST_N_BYTES`, `data` is returned;
    * if the ratio old/new cost reaches
      `settings.MIN_DICTIONARY_COMPRESSION_RATIO_PYLIST_N_BYTES`, the encoded
      column is returned; otherwise `data` is returned.

    Only chunk 0 is inspected when estimating costs.
    """
    if data.null_count == len(data):
        return data

    if data.chunk(0).offset > 0:
        # A sliced first chunk is copied before encoding; this path only
        # supports single-chunk input.
        assert len(data.chunks) == 1
        # NOTE(review): the original argument to chunked_array() was lost.
        # concat_arrays() copies its input into a new array, which is
        # assumed to be what the copy was for — confirm against upstream.
        data_copy = pyarrow.chunked_array(
            [pyarrow.concat_arrays([data.chunk(0)])])
        encoded = data_copy.dictionary_encode()
    else:
        encoded = data.dictionary_encode()

    new_cost = _string_array_pylist_n_bytes(encoded.chunk(0).dictionary)

    if new_cost > settings.MAX_DICTIONARY_PYLIST_N_BYTES:
        # abort! abort! dictionary is too large
        return data

    old_cost = _string_array_pylist_n_bytes(data.chunk(0))

    if old_cost / new_cost >= settings.MIN_DICTIONARY_COMPRESSION_RATIO_PYLIST_N_BYTES:
        return encoded
    else:
        return data
Beispiel #4
def _text_cat_chunked_1(a: pa.ChunkedArray, b: pa.ChunkedArray) -> pa.ChunkedArray:
    """Concatenate two chunked arrays element-wise via ``_text_cat``.

    Both inputs are cut into matching ranges (as computed by
    ``_combined_in_chunk_offsets``); each pair of ranges yields one chunk of
    the result.
    """
    ranges_a, ranges_b = _combined_in_chunk_offsets(a, b)

    def _cut(source: pa.ChunkedArray, span) -> pa.Array:
        # span is (chunk index, start inside the chunk, length)
        start = span[1]
        return source.chunk(span[0])[start : start + span[2]]

    pieces: List[pa.Array] = [
        _text_cat(_cut(a, span_a), _cut(b, span_b))
        for span_a, span_b in zip(ranges_a, ranges_b)
    ]
    return pa.chunked_array(pieces)
Beispiel #5
def _in_chunk_offsets(arr: pa.ChunkedArray,
                      offsets: List[int]) -> List[Tuple[int, int, int]]:
    """Calculate the access ranges for a given list of offsets.

    All chunk start indices must be included as offsets and the offsets must be

    Returns a list of tuples that contain:
     * The index of the given chunk
     * The position inside the chunk
     * The length of the current range
    new_offsets = []
    pos = 0
    chunk = 0
    chunk_pos = 0
    for offset, offset_next in zip(offsets, offsets[1:] + [len(arr)]):
        diff = offset - pos
        chunk_remains = len(arr.chunk(chunk)) - chunk_pos
        step = offset_next - offset
        if diff == 0:  # The first offset
            new_offsets.append((chunk, chunk_pos, step))
        elif diff == chunk_remains:
            chunk += 1
            chunk_pos = 0
            pos += chunk_remains
            new_offsets.append((chunk, chunk_pos, step))
        else:  # diff < chunk_remains
            chunk_pos += diff
            pos += diff
            new_offsets.append((chunk, chunk_pos, step))
    return new_offsets
Beispiel #6
def _text_cat_chunked_mixed(a: pa.ChunkedArray,
                            b: pa.Array) -> pa.ChunkedArray:
    """Concatenate chunked ``a`` with contiguous ``b`` via ``_text_cat``.

    ``b`` is sliced at the chunk boundaries of ``a`` so each chunk of ``a``
    is paired with the part of ``b`` covering the same positions.
    """
    starts = _calculate_chunk_offsets(a)
    return pa.chunked_array([
        _text_cat(piece, b[start:start + len(piece)])
        for piece, start in zip(a.iterchunks(), starts)
    ])
Beispiel #7
def _calculate_chunk_offsets(chunked_array: pa.ChunkedArray) -> np.ndarray:
    """Return an array holding the indices pointing to the first element of each chunk."""
    offset = 0
    offsets = []
    for chunk in chunked_array.iterchunks():
        offset += len(chunk)
    return np.array(offsets)
Beispiel #8
 def cast_for_truediv(arrow_array: pa.ChunkedArray,
                      pa_object: pa.Array | pa.Scalar) -> pa.ChunkedArray:
     """Prepare the left operand of a true division.

     Returns ``arrow_array`` cast to float64 when both operands have an
     integer type; otherwise returns ``arrow_array`` unchanged.
     """
     # Ensure int / int -> float mirroring Python/Numpy behavior
     # as pc.divide_checked(int, int) -> int
     if pa.types.is_integer(arrow_array.type) and pa.types.is_integer(
             pa_object.type):
         return arrow_array.cast(pa.float64())
     return arrow_array
Beispiel #9
def _timestamp_is_rounded(column: pa.ChunkedArray,
                          granularity: DateGranularity) -> bool:
    """Tell whether every timestamp in `column` is a whole multiple of `granularity`.

    The column is cast to int64 and each value is compared with itself after
    an integer divide-then-multiply by the granularity factor; a value that
    survives unchanged has no remainder.

    NOTE(review): the factors look like nanoseconds per second/minute/hour,
    which assumes a nanosecond timestamp column — confirm against callers.
    NOTE(review): the tail of this function was truncated in the source; the
    `equal(...)` comparison and the `.as_py()` conversion were reconstructed
    from the function name and the `bool` return annotation.

    Raises KeyError for a granularity that has no factor.
    """
    factor = {
        DateGranularity.SECOND: 1_000_000_000,
        DateGranularity.MINUTE: 1_000_000_000 * 60,
        DateGranularity.HOUR: 1_000_000_000 * 60 * 60,
    }[granularity]
    ints = column.cast(pa.int64())
    return pa.compute.all(
        pa.compute.equal(
            ints, pa.compute.multiply(pa.compute.divide(ints, factor),
                                      factor))).as_py()
Beispiel #10
def _string_array_pylist_n_bytes(data: pyarrow.ChunkedArray) -> int:
    text_buf = data.buffers()[-1]
    if text_buf is None:
        # All values are ""
        n_text_bytes = 0
        n_text_bytes = text_buf.size

    return (
        # 8 bytes per value (each value is a 64-bit pointer)
        (8 * len(data))
        # 50 bytes of overhead per string (heuristic) -- experiment with
        # sys.getsizeof() if you disbelieve.
        + (50 * (len(data) - data.null_count))
        # ... and then count the actual bytes of data
        + n_text_bytes)
Beispiel #11
def recode_or_decode_dictionary(
        chunked_array: pa.ChunkedArray) -> pa.ChunkedArray:
    """Remove unused/duplicate dictionary values from -- or cast to pa.utf8().

    Workbench disallows unused/duplicate values. Call this function after
    filtering or modifying dictionary values: it returns a valid Workbench
    column given a valid Arrow column.

    Convert to utf8() if dictionary encoding is "bad". ("Bad" currently means,
    "each value is only used once;" but the meaning may change between minor
    versions.)

    Return `chunked_array` if it is already Workbench-valid and dictionary
    encoding is not "bad".

    Only the dictionary of the first chunk is inspected.
    NOTE(review): this presumably relies on all chunks sharing one
    dictionary (see the commented-out `unify_dictionaries()` call) — confirm.
    NOTE(review): the condition of the utf8 cast and the tail of the
    `dictionary.filter(...)` line were truncated in the source and have been
    reconstructed from the surrounding code; verify against upstream.
    """
    if chunked_array.num_chunks == 0:
        return pa.chunked_array([], pa.utf8())

    # if chunked_array.num_chunks != 1:
    #     chunked_array = chunked_array.unify_dictionaries()

    # No more non-null values than dictionary entries: encoding is "bad".
    if len(chunked_array) - chunked_array.null_count <= len(
            chunked_array.chunks[0].dictionary):
        return chunked_array.cast(pa.utf8())

    dictionary = chunked_array.chunks[0].dictionary

    # Mark every dictionary slot that some non-null index refers to.
    used = np.zeros(len(dictionary), dtype=bool)
    for chunk in chunked_array.chunks:
        used[pa.compute.filter(chunk.indices, pa.compute.is_valid(
            chunk.indices)).to_numpy()] = True

    if not np.all(used):
        # Nix unused values; then scan for dups
        mapping = dictionary.filter(pa.array(used,
                                             pa.bool_())).dictionary_encode()
        need_recode = True
    else:
        # Scan for dups
        mapping = dictionary.dictionary_encode()
        need_recode = len(mapping.dictionary) < len(dictionary)

    if need_recode:
        chunks = [_recode(chunk, mapping) for chunk in chunked_array.chunks]
        return pa.chunked_array(chunks)

    return chunked_array
Beispiel #12
def _autocast_column(data: pyarrow.ChunkedArray) -> pyarrow.ChunkedArray:
    """
    Convert `data` to float64 or int(64|32|16|8); as fallback, return `data`.

    Assume `data` is of type `utf8` or a dictionary of utf8.

    *Implementation wart*: this may choose float64 when integers would seem a
    better choice, because we use Pandas and Pandas does not support nulls
    in integer columns.

    NOTE(review): no explicit int64 cast is visible below even though the
    summary mentions int64; an int64 result can only come straight from
    `pd.to_numeric` — confirm against upstream.
    """
    series: pd.Series = data.to_pandas()
    null = series.isnull()
    empty = series == ""
    if empty.any() and (null | empty).all():
        # All-empty (and all-null) columns stay text
        return data

    try:
        # Try to cast to numbers
        number_values = pd.to_numeric(series).values
    except (ValueError, TypeError):
        return data

    # pd.to_numeric("") gives np.nan. We want None. Use from_pandas=True.
    number_array = pyarrow.array(number_values, from_pandas=True)
    numbers = pyarrow.chunked_array([number_array])

    # Downcast integers, when possible.
    # We even downcast float to int. Workbench semantics say a Number is a
    # Number; so we might as well store it efficiently.
    try:
        # Shrink as far as we can, until pyarrow complains.
        # pyarrow will error "Floating point value truncated" if a conversion
        # from float to int would be lossy.
        # We'll return the last _successful_ `numbers` result.
        numbers = numbers.cast(pyarrow.int32())
        numbers = numbers.cast(pyarrow.int16())
        numbers = numbers.cast(pyarrow.int8())
    except pyarrow.ArrowInvalid:
        # Expected: the previous (wider) type is the narrowest that fits.
        pass

    return numbers
Beispiel #13
def _read_pylist(column: pyarrow.ChunkedArray) -> List[Any]:
    """Return the values of `column` as a plain Python list.

    Two column types get post-processing:

    * nanosecond timestamps: every non-null value is converted with
      `to_pydatetime()`;
    * floating-point columns: every null is replaced by NaN.

    All other columns are returned exactly as `to_pylist()` produced them.
    """
    arrow_type = column.type
    values = column.to_pylist()

    if pyarrow.types.is_timestamp(arrow_type) and arrow_type.unit == "ns":
        # pyarrow hands back pandas.Timestamp values here (they carry more
        # resolution than datetime.datetime), but callers expect
        # datetime.datetime, so we truncate to microseconds. Should that
        # ever be a problem, the API ought to pass int64 instead.
        return [v.to_pydatetime() if v is not None else None for v in values]

    if pyarrow.types.is_floating(arrow_type):
        # Pandas treats NaN and None alike, and so, in effect, do we:
        # numeric tables may hold NaN but never None; timestamp and String
        # columns may hold None but never NaT; int columns hold neither.
        not_a_number = float("nan")
        return [v if v is not None else not_a_number for v in values]

    return values
Beispiel #14
def _arrow_array_to_json_list(array: pyarrow.ChunkedArray) -> List[Any]:
    """
    Convert `array` to a JSON-encodable List.

    Strings become Strings; Numbers become int/float; Datetimes become
    ISO8601-encoded Strings.

    For timestamp columns each value is rendered as a naive UTC ISO8601
    string with a trailing "Z"; null entries become None. Every other
    column type is returned via `to_pylist()`.

    NOTE(review): `TimestampUnits[unit]` is presumably the number of ticks
    per second for that unit — confirm where it is defined.
    NOTE(review): nulls are detected by identity with `pyarrow.NULL`;
    whether that singleton exists depends on the pyarrow version in use.
    """
    if isinstance(array.type, pyarrow.TimestampType):
        # Scale the raw integer value before handing it to utcfromtimestamp().
        multiplier = 1.0 / TimestampUnits[array.type.unit]
        return [
            (
                None
                if v is pyarrow.NULL
                else (
                    datetime.datetime.utcfromtimestamp(v.value * multiplier).isoformat()
                    + "Z"
                )
            )
            for v in array
        ]
    else:
        return array.to_pylist()
def _startof(column: pa.ChunkedArray, unit: str) -> StartofColumnResult:
    # Purpose: truncate every timestamp in `column` down to a multiple of
    # `_NS_PER_UNIT[unit]`, working on the int64 representation, and report
    # whether any value became null in the process.
    #
    # NOTE(review): several continuation lines of this function are missing
    # from the source (marked below); the code is left as found because the
    # missing arguments cannot be derived from what is visible.
    # NOTE(review): presumably `column` holds nanosecond timestamps, given
    # the `_NS_PER_UNIT` name — confirm.
    factor = pa.scalar(_NS_PER_UNIT[unit], pa.int64())
    timestamp_ints = column.cast(pa.int64())

    # In two's complement, truncation rounds _up_. Subtract before truncating.
    # In decimal, if we're truncating to the nearest 10:
    # 0 => 0
    # -1 => -10
    # -9 => -10
    # -10 => -10
    # -11 => -20
    # ... rule is: subtract 9 from all negative numbers, then truncate.

    negative = pa.compute.less(timestamp_ints, pa.scalar(0, pa.int64()))
    # "offset": -9 for negatives, 0 for others
    # NOTE(review): the first operand of multiply() is missing here;
    # presumably it is `negative` converted to 0/1 integers — confirm.
    offset = pa.compute.multiply(
        pa.scalar(-1 * _NS_PER_UNIT[unit] + 1, pa.int64()))
    # to_truncate may overflow; in that case, to_truncate > timestamp_ints
    to_truncate = pa.compute.add(timestamp_ints, offset)
    # NOTE(review): the second operand of multiply() (and the closing
    # parenthesis) is missing here; presumably `factor` — confirm.
    truncated = pa.compute.multiply(pa.compute.divide(to_truncate, factor),

    # Mask of [True, None, True, True, None]
    # (True where the addition did not overflow, null elsewhere.)
    safe_or_null = pa.compute.or_kleene(
        pa.compute.less_equal(to_truncate, timestamp_ints),
        pa.scalar(None, pa.bool_()))

    # NOTE(review): the remaining filter() arguments are missing here;
    # the variable name suggests null mask entries should yield nulls
    # rather than drop rows — confirm.
    truncated_or_null = truncated.filter(safe_or_null,

    # NOTE(review): the first argument(s) and the closing parenthesis of the
    # result constructor are missing here.
    return StartofColumnResult(
        # True when truncation introduced nulls that the input did not have.
        truncated=(truncated_or_null.null_count > column.null_count),
Beispiel #16
def _text_cat_chunked_2(a: pa.Array, b: pa.ChunkedArray) -> pa.ChunkedArray:
    """Concatenate contiguous ``a`` with chunked ``b`` via ``_text_cat``.

    ``a`` is sliced at the chunk boundaries of ``b`` so each chunk of ``b``
    is paired with the part of ``a`` covering the same positions.
    """
    starts = _calculate_chunk_offsets(b)
    return pa.chunked_array([
        _text_cat(a[start:start + len(piece)], piece)
        for piece, start in zip(b.iterchunks(), starts)
    ])