def arrow_to_span(extension_array: pa.ExtensionArray) -> SpanArray:
    """
    Reconstruct a SpanArray from a pyarrow extension array of type
    ArrowSpanType.

    :param extension_array: pyarrow.ExtensionArray with type ArrowSpanType
    :return: SpanArray
    """
    # A ChunkedArray is accepted only when it holds exactly one chunk.
    if isinstance(extension_array, pa.ChunkedArray):
        if extension_array.num_chunks > 1:
            raise ValueError(
                "Only pyarrow.Array with a single chunk is supported")
        extension_array = extension_array.chunk(0)

    assert pa.types.is_struct(extension_array.storage.type)

    storage = extension_array.storage

    # The target text is carried as metadata on the begins field; it may
    # arrive as raw bytes, so decode to str when needed.
    begins_field_meta = storage.type[ArrowSpanType.BEGINS_NAME].metadata
    target_text = begins_field_meta[ArrowSpanType.TARGET_TEXT_KEY]
    target_text = (target_text.decode()
                   if isinstance(target_text, bytes) else target_text)

    # Pull the begin/end offset columns out of the struct storage and
    # convert to numpy without copying.
    begins = storage.field(ArrowSpanType.BEGINS_NAME).to_numpy()
    ends = storage.field(ArrowSpanType.ENDS_NAME).to_numpy()

    return SpanArray(target_text, begins, ends)
# Esempio n. 2
# 0
def arrow_to_span(extension_array: pa.ExtensionArray) -> SpanArray:
    """
    Reconstruct a SpanArray from a pyarrow extension array of type
    ArrowSpanType.

    ..NOTE: Only supported with PyArrow >= 2.0.0

    :param extension_array: pyarrow.ExtensionArray with type ArrowSpanType
    :return: SpanArray
    """
    if LooseVersion(pa.__version__) < LooseVersion("2.0.0"):
        raise NotImplementedError(
            "Arrow serialization for SpanArray is not supported with "
            "PyArrow versions < 2.0.0")

    # A ChunkedArray is accepted only when it holds exactly one chunk.
    if isinstance(extension_array, pa.ChunkedArray):
        if extension_array.num_chunks > 1:
            raise ValueError(
                "Only pyarrow.Array with a single chunk is supported")
        extension_array = extension_array.chunk(0)

    # NOTE: workaround for bug in parquet reading — a plain struct array
    # may come back instead of the extension type, so re-wrap it here.
    if pa.types.is_struct(extension_array.type):
        offsets_dtype = extension_array.field(ArrowSpanType.BEGINS_NAME).type
        text_dict_dtype = extension_array.field(
            ArrowSpanType.TARGET_TEXT_DICT_NAME).type
        extension_array = pa.ExtensionArray.from_storage(
            ArrowSpanType(offsets_dtype, text_dict_dtype), extension_array)

    assert pa.types.is_struct(extension_array.storage.type)

    storage = extension_array.storage

    # Rebuild the StringTable plus per-span text ids from the
    # dictionary-encoded target-text column.
    text_dict_array = storage.field(ArrowSpanType.TARGET_TEXT_DICT_NAME)
    string_table = StringTable.from_things(
        text_dict_array.dictionary.to_pylist())
    text_ids = text_dict_array.indices.to_numpy()

    # Zero-copy conversion of the begin/end offset columns.
    begins = storage.field(ArrowSpanType.BEGINS_NAME).to_numpy()
    ends = storage.field(ArrowSpanType.ENDS_NAME).to_numpy()

    return SpanArray((string_table, text_ids), begins, ends)
# Esempio n. 3
# 0
def arrow_to_token_span(extension_array: pa.ExtensionArray) -> TokenSpanArray:
    """
    Convert a pyarrow.ExtensionArray with type ArrowTokenSpanType to
    a TokenSpanArray.

    :param extension_array: pyarrow.ExtensionArray with type ArrowTokenSpanType
    :return: TokenSpanArray
    """
    # A ChunkedArray is accepted only when it holds exactly one chunk.
    if isinstance(extension_array, pa.ChunkedArray):
        if extension_array.num_chunks > 1:
            raise ValueError("Only pyarrow.Array with a single chunk is supported")
        extension_array = extension_array.chunk(0)

    assert pa.types.is_struct(extension_array.storage.type)

    storage = extension_array.storage

    # Target text lives in the metadata of the token-begins field and may
    # need decoding from bytes.
    field_metadata = storage.type[ArrowTokenSpanType.BEGINS_NAME].metadata
    target_text = field_metadata[ArrowCharSpanType.TARGET_TEXT_KEY]
    if isinstance(target_text, bytes):
        target_text = target_text.decode()

    token_begins_array = storage.field(ArrowTokenSpanType.BEGINS_NAME)
    token_ends_array = storage.field(ArrowTokenSpanType.ENDS_NAME)

    # The char-level begin/end columns may have been split into several
    # numbered fields ("<name>_0", "<name>_1", ...); stitch them back
    # together in order if so.
    num_splits = extension_array.type.num_char_span_splits
    if num_splits > 0:
        char_begins_array = pa.concat_arrays(
            [storage.field(ArrowCharSpanType.BEGINS_NAME + "_{}".format(i))
             for i in range(num_splits)])
        char_ends_array = pa.concat_arrays(
            [storage.field(ArrowCharSpanType.ENDS_NAME + "_{}".format(i))
             for i in range(num_splits)])
    else:
        char_begins_array = storage.field(ArrowCharSpanType.BEGINS_NAME)
        char_ends_array = storage.field(ArrowCharSpanType.ENDS_NAME)

    # Strip trailing null padding (assumes all nulls sit at the end).
    if char_begins_array.null_count > 0:
        char_begins_array = char_begins_array[:-char_begins_array.null_count]
        char_ends_array = char_ends_array[:-char_ends_array.null_count]

    # Zero-copy convert to numpy, then rebuild the CharSpanArray and wrap
    # it in a TokenSpanArray.
    char_span = CharSpanArray(target_text,
                              char_begins_array.to_numpy(),
                              char_ends_array.to_numpy())
    return TokenSpanArray(char_span,
                          token_begins_array.to_numpy(),
                          token_ends_array.to_numpy())
def arrow_to_tensor_array(extension_array: pa.ExtensionArray) -> TensorArray:
    """
    Convert a pyarrow.ExtensionArray with type ArrowTensorType to a
    TensorArray.

    :param extension_array: pyarrow.ExtensionArray with type ArrowTensorType
    :return: TensorArray
    """
    if not isinstance(extension_array, pa.ChunkedArray):
        values = extension_array.to_numpy()
    elif extension_array.num_chunks > 1:
        # TODO: look into removing concat and constructing from list w/ shape
        values = np.concatenate(
            [chunk.to_numpy() for chunk in extension_array.iterchunks()])
    else:
        values = extension_array.chunk(0).to_numpy()

    return TensorArray(values)
# Esempio n. 5
# 0
def arrow_to_token_span(extension_array: pa.ExtensionArray) -> TokenSpanArray:
    """
    Convert a pyarrow.ExtensionArray with type ArrowTokenSpanType to
    a TokenSpanArray.

    Requires PyArrow >= 2.0.0. Only a single-chunk ChunkedArray (or a
    plain Array) is accepted.

    :param extension_array: pyarrow.ExtensionArray with type ArrowTokenSpanType
    :return: TokenSpanArray
    :raises NotImplementedError: if the installed PyArrow is older than 2.0.0
    :raises ValueError: if a ChunkedArray with more than one chunk is given
    """
    if LooseVersion(pa.__version__) < LooseVersion("2.0.0"):
        raise NotImplementedError(
            "Arrow serialization for TokenSpanArray is not supported with "
            "PyArrow versions < 2.0.0")
    if isinstance(extension_array, pa.ChunkedArray):
        if extension_array.num_chunks > 1:
            raise ValueError(
                "Only pyarrow.Array with a single chunk is supported")
        extension_array = extension_array.chunk(0)

    assert pa.types.is_struct(extension_array.storage.type)

    # Get the begins/ends pyarrow arrays
    token_begins_array = extension_array.storage.field(
        ArrowTokenSpanType.BEGINS_NAME)
    token_ends_array = extension_array.storage.field(
        ArrowTokenSpanType.ENDS_NAME)

    # Get the tokens as a dictionary array where indices map to a list of ArrowSpanArrays
    tokens_dict_array = extension_array.storage.field(
        ArrowTokenSpanType.TOKENS_NAME)
    tokens_indices = tokens_dict_array.indices
    arrow_tokens_arrays_array = tokens_dict_array.dictionary

    # Breakup the list of ArrowSpanArrays and convert back to individual SpanArrays.
    # The dictionary values form one flat ListArray; each adjacent pair of
    # entries in `offsets` (n+1 entries for n lists) delimits one
    # serialized SpanArray within `values`.
    tokens_arrays = []
    span_type = None
    for i in range(1, len(arrow_tokens_arrays_array.offsets)):
        start = arrow_tokens_arrays_array.offsets[i - 1].as_py()
        stop = arrow_tokens_arrays_array.offsets[i].as_py()
        arrow_tokens_array = arrow_tokens_arrays_array.values[start:stop]

        # Make an instance of ArrowSpanType (built lazily from the first
        # slice and reused — all slices share the same field types)
        if span_type is None:
            begins_array = arrow_tokens_array.field(ArrowSpanType.BEGINS_NAME)
            target_text_dict_array = arrow_tokens_array.field(
                ArrowSpanType.TARGET_TEXT_DICT_NAME)
            span_type = ArrowSpanType(begins_array.type,
                                      target_text_dict_array.type)

        # Re-make the Arrow extension type to convert back to a SpanArray
        tokens_array = arrow_to_span(
            pa.ExtensionArray.from_storage(span_type, arrow_tokens_array))
        tokens_arrays.append(tokens_array)

    # Map the token indices to the actual token SpanArray for each element in the TokenSpanArray.
    # A null index means "no tokens" and maps to the shared empty SpanArray.
    tokens = [
        _EMPTY_SPAN_ARRAY_SINGLETON if i is None else tokens_arrays[i]
        for i in tokens_indices.to_pylist()
    ]

    # Zero-copy convert arrays to numpy
    token_begins = token_begins_array.to_numpy()
    token_ends = token_ends_array.to_numpy()

    return TokenSpanArray(tokens, token_begins, token_ends)