def arrow_to_span(extension_array: pa.ExtensionArray) -> SpanArray:
    """
    Reconstruct a SpanArray from a pyarrow.ExtensionArray of type
    ArrowSpanType.

    :param extension_array: pyarrow.ExtensionArray with type ArrowSpanType
    :return: SpanArray
    :raises ValueError: if given a ChunkedArray with more than one chunk
    """
    # A ChunkedArray is acceptable only when it holds exactly one chunk;
    # unwrap it to the underlying Array.
    if isinstance(extension_array, pa.ChunkedArray):
        if extension_array.num_chunks > 1:
            raise ValueError(
                "Only pyarrow.Array with a single chunk is supported")
        extension_array = extension_array.chunk(0)

    storage = extension_array.storage
    assert pa.types.is_struct(storage.type)

    # The target text travels as metadata on the "begins" field; Arrow may
    # hand it back as bytes, in which case it is decoded back to str.
    field_metadata = storage.type[ArrowSpanType.BEGINS_NAME].metadata
    text = field_metadata[ArrowSpanType.TARGET_TEXT_KEY]
    if isinstance(text, bytes):
        text = text.decode()

    # View the begin/end offset fields as numpy arrays without copying.
    begins = storage.field(ArrowSpanType.BEGINS_NAME).to_numpy()
    ends = storage.field(ArrowSpanType.ENDS_NAME).to_numpy()

    return SpanArray(text, begins, ends)
def arrow_to_span(extension_array: pa.ExtensionArray) -> SpanArray:
    """
    Reconstruct a SpanArray from a pyarrow.ExtensionArray of type
    ArrowSpanType.

    ..NOTE: Only supported with PyArrow >= 2.0.0

    :param extension_array: pyarrow.ExtensionArray with type ArrowSpanType
    :return: SpanArray
    :raises NotImplementedError: on PyArrow versions older than 2.0.0
    :raises ValueError: if given a ChunkedArray with more than one chunk
    """
    if LooseVersion(pa.__version__) < LooseVersion("2.0.0"):
        raise NotImplementedError(
            "Arrow serialization for SpanArray is not supported with "
            "PyArrow versions < 2.0.0")

    # Accept a ChunkedArray only when it contains a single chunk; unwrap it.
    if isinstance(extension_array, pa.ChunkedArray):
        if extension_array.num_chunks > 1:
            raise ValueError(
                "Only pyarrow.Array with a single chunk is supported")
        extension_array = extension_array.chunk(0)

    # NOTE: workaround for bug in parquet reading — the data can come back as
    # the plain storage struct, so re-wrap it in the extension type first.
    if pa.types.is_struct(extension_array.type):
        begins_type = extension_array.field(ArrowSpanType.BEGINS_NAME).type
        text_dict_type = extension_array.field(
            ArrowSpanType.TARGET_TEXT_DICT_NAME).type
        extension_array = pa.ExtensionArray.from_storage(
            ArrowSpanType(begins_type, text_dict_type), extension_array)

    storage = extension_array.storage
    assert pa.types.is_struct(storage.type)

    # Target texts are dictionary-encoded: rebuild the StringTable from the
    # dictionary values and take the per-span text ids from the indices.
    text_dict = storage.field(ArrowSpanType.TARGET_TEXT_DICT_NAME)
    string_table = StringTable.from_things(text_dict.dictionary.to_pylist())
    text_ids = text_dict.indices.to_numpy()

    # Zero-copy views of the begin/end offsets.
    begins = storage.field(ArrowSpanType.BEGINS_NAME).to_numpy()
    ends = storage.field(ArrowSpanType.ENDS_NAME).to_numpy()

    return SpanArray((string_table, text_ids), begins, ends)
def arrow_to_token_span(extension_array: pa.ExtensionArray) -> TokenSpanArray:
    """
    Convert a pyarrow.ExtensionArray with type ArrowTokenSpanType to a
    TokenSpanArray.

    :param extension_array: pyarrow.ExtensionArray with type ArrowTokenSpanType
    :return: TokenSpanArray
    :raises ValueError: if given a ChunkedArray with more than one chunk
    """
    # A ChunkedArray is acceptable only when it holds exactly one chunk.
    if isinstance(extension_array, pa.ChunkedArray):
        if extension_array.num_chunks > 1:
            raise ValueError("Only pyarrow.Array with a single chunk is supported")
        extension_array = extension_array.chunk(0)

    # The extension storage must be a struct of offset fields.
    assert pa.types.is_struct(extension_array.storage.type)

    # Get target text from the begins field metadata and decode string
    # (Arrow field metadata values may come back as bytes).
    metadata = extension_array.storage.type[ArrowTokenSpanType.BEGINS_NAME].metadata
    target_text = metadata[ArrowCharSpanType.TARGET_TEXT_KEY]
    if isinstance(target_text, bytes):
        target_text = target_text.decode()

    # Get the begins/ends pyarrow arrays (token-index offsets)
    token_begins_array = extension_array.storage.field(ArrowTokenSpanType.BEGINS_NAME)
    token_ends_array = extension_array.storage.field(ArrowTokenSpanType.ENDS_NAME)

    # Check if CharSpans have been split across multiple numbered fields
    # ("<name>_0", "<name>_1", ...); if so, stitch them back together.
    num_char_span_splits = extension_array.type.num_char_span_splits
    if num_char_span_splits > 0:
        char_begins_splits = []
        char_ends_splits = []
        for i in range(num_char_span_splits):
            char_begins_splits.append(
                extension_array.storage.field(ArrowCharSpanType.BEGINS_NAME +
                                              "_{}".format(i)))
            char_ends_splits.append(
                extension_array.storage.field(ArrowCharSpanType.ENDS_NAME +
                                              "_{}".format(i)))
        char_begins_array = pa.concat_arrays(char_begins_splits)
        char_ends_array = pa.concat_arrays(char_ends_splits)
    else:
        char_begins_array = extension_array.storage.field(ArrowCharSpanType.BEGINS_NAME)
        char_ends_array = extension_array.storage.field(ArrowCharSpanType.ENDS_NAME)

    # Remove any trailing padding.
    # NOTE(review): slicing off the last `null_count` entries assumes every
    # null is trailing padding — confirm against the serializer's behavior.
    if char_begins_array.null_count > 0:
        char_begins_array = char_begins_array[:-char_begins_array.null_count]
        char_ends_array = char_ends_array[:-char_ends_array.null_count]

    # Zero-copy convert arrays to numpy
    token_begins = token_begins_array.to_numpy()
    token_ends = token_ends_array.to_numpy()
    char_begins = char_begins_array.to_numpy()
    char_ends = char_ends_array.to_numpy()

    # Create the CharSpanArray, then the TokenSpanArray on top of it
    char_span = CharSpanArray(target_text, char_begins, char_ends)
    return TokenSpanArray(char_span, token_begins, token_ends)
def arrow_to_tensor_array(extension_array: pa.ExtensionArray) -> TensorArray:
    """
    Convert a pyarrow.ExtensionArray with type ArrowTensorType to a
    TensorArray.

    :param extension_array: pyarrow.ExtensionArray with type ArrowTensorType
    :return: TensorArray
    """
    if not isinstance(extension_array, pa.ChunkedArray):
        # Plain Array: convert directly.
        values = extension_array.to_numpy()
    elif extension_array.num_chunks > 1:
        # TODO: look into removing concat and constructing from list w/ shape
        chunks = [chunk.to_numpy() for chunk in extension_array.iterchunks()]
        values = np.concatenate(chunks)
    else:
        # Single-chunk ChunkedArray: unwrap the lone chunk.
        values = extension_array.chunk(0).to_numpy()
    return TensorArray(values)
def arrow_to_token_span(extension_array: pa.ExtensionArray) -> TokenSpanArray:
    """
    Convert a pyarrow.ExtensionArray with type ArrowTokenSpanType to a
    TokenSpanArray.

    ..NOTE: Only supported with PyArrow >= 2.0.0

    :param extension_array: pyarrow.ExtensionArray with type ArrowTokenSpanType
    :return: TokenSpanArray
    :raises NotImplementedError: on PyArrow versions older than 2.0.0
    :raises ValueError: if given a ChunkedArray with more than one chunk
    """
    if LooseVersion(pa.__version__) < LooseVersion("2.0.0"):
        raise NotImplementedError(
            "Arrow serialization for TokenSpanArray is not supported with "
            "PyArrow versions < 2.0.0")

    # A ChunkedArray is acceptable only when it holds exactly one chunk.
    if isinstance(extension_array, pa.ChunkedArray):
        if extension_array.num_chunks > 1:
            raise ValueError(
                "Only pyarrow.Array with a single chunk is supported")
        extension_array = extension_array.chunk(0)

    assert pa.types.is_struct(extension_array.storage.type)

    # Get the begins/ends pyarrow arrays (token-index offsets)
    token_begins_array = extension_array.storage.field(
        ArrowTokenSpanType.BEGINS_NAME)
    token_ends_array = extension_array.storage.field(
        ArrowTokenSpanType.ENDS_NAME)

    # Get the tokens as a dictionary array where indices map to a list of
    # ArrowSpanArrays
    tokens_dict_array = extension_array.storage.field(
        ArrowTokenSpanType.TOKENS_NAME)
    tokens_indices = tokens_dict_array.indices
    arrow_tokens_arrays_array = tokens_dict_array.dictionary

    # Breakup the list of ArrowSpanArrays and convert back to individual
    # SpanArrays; consecutive list offsets delimit each nested span array.
    tokens_arrays = []
    span_type = None
    for i in range(1, len(arrow_tokens_arrays_array.offsets)):
        start = arrow_tokens_arrays_array.offsets[i - 1].as_py()
        stop = arrow_tokens_arrays_array.offsets[i].as_py()
        arrow_tokens_array = arrow_tokens_arrays_array.values[start:stop]

        # Make an instance of ArrowSpanType (built once and reused; the
        # nested span arrays all share the same storage field types)
        if span_type is None:
            begins_array = arrow_tokens_array.field(ArrowSpanType.BEGINS_NAME)
            target_text_dict_array = arrow_tokens_array.field(
                ArrowSpanType.TARGET_TEXT_DICT_NAME)
            span_type = ArrowSpanType(begins_array.type,
                                      target_text_dict_array.type)

        # Re-make the Arrow extension type to convert back to a SpanArray
        tokens_array = arrow_to_span(
            pa.ExtensionArray.from_storage(span_type, arrow_tokens_array))
        tokens_arrays.append(tokens_array)

    # Map the token indices to the actual token SpanArray for each element in
    # the TokenSpanArray; a null index maps to the shared empty SpanArray.
    tokens = [
        _EMPTY_SPAN_ARRAY_SINGLETON if i is None else tokens_arrays[i]
        for i in tokens_indices.to_pylist()
    ]

    # Zero-copy convert arrays to numpy
    token_begins = token_begins_array.to_numpy()
    token_ends = token_ends_array.to_numpy()

    return TokenSpanArray(tokens, token_begins, token_ends)