def make_char_span(location_col, text_col, original_text):
    """
    Convert a column of begin, end pairs to a SpanArray.

    :param location_col: Arrow array containing (begin, end) tuples of character offsets
    :param text_col: Arrow array of strings that should match the target texts of the
        spans in location_col. Used for reconstructing the target text when it is not
        provided.
    :param original_text: Target text for the spans. Optional. If not provided, this
        function will reconstruct the target text from the contents of `text_col`.
    """
    # Replace location columns with char and token spans
    if not (pa.types.is_list(location_col.type) and
            pa.types.is_primitive(location_col.type.value_type)):
        raise ValueError("Expected location column as a list of integers")

    # TODO: assert location is fixed with 2 elements?

    if isinstance(location_col, pa.ChunkedArray):
        location_col = pa.concat_arrays(location_col.iterchunks())

    # Flatten to get a primitive array convertible to numpy
    array = location_col.flatten()
    values = array.to_numpy()
    begins = values[0::2]
    ends = values[1::2]

    if original_text is None:
        warnings.warn(
            "Analyzed text was not provided; attempting to reconstruct it from tokens, "
            "but the result will not be identical to the original analyzed text.")
        if isinstance(text_col, pa.ChunkedArray):
            text_col = pa.concat_arrays(text_col.iterchunks())
        original_text = build_original_text(text_col, begins)

    return SpanArray(original_text, begins, ends)
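# Hypothetical usage sketch (not from the original source): how a list-of-integers
# location column flattens into begin/end offsets with plain pyarrow/numpy, which is
# the core of make_char_span above. SpanArray itself comes from the surrounding
# library and is not shown here.
import numpy as np
import pyarrow as pa

location_col = pa.array([[0, 5], [6, 11], [12, 17]])   # (begin, end) pairs
values = location_col.flatten().to_numpy()             # [0, 5, 6, 11, 12, 17]
begins = values[0::2]                                   # [0, 6, 12]
ends = values[1::2]                                     # [5, 11, 17]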
def arrow_to_token_span(extension_array: pa.ExtensionArray) -> TokenSpanArray:
    """
    Convert a pyarrow.ExtensionArray with type ArrowTokenSpanType to a TokenSpanArray.

    :param extension_array: pyarrow.ExtensionArray with type ArrowTokenSpanType
    :return: TokenSpanArray
    """
    if isinstance(extension_array, pa.ChunkedArray):
        if extension_array.num_chunks > 1:
            raise ValueError("Only pyarrow.Array with a single chunk is supported")
        extension_array = extension_array.chunk(0)

    assert pa.types.is_struct(extension_array.storage.type)

    # Get target text from the begins field metadata and decode string
    metadata = extension_array.storage.type[ArrowTokenSpanType.BEGINS_NAME].metadata
    target_text = metadata[ArrowCharSpanType.TARGET_TEXT_KEY]
    if isinstance(target_text, bytes):
        target_text = target_text.decode()

    # Get the begins/ends pyarrow arrays
    token_begins_array = extension_array.storage.field(ArrowTokenSpanType.BEGINS_NAME)
    token_ends_array = extension_array.storage.field(ArrowTokenSpanType.ENDS_NAME)

    # Check if CharSpans have been split
    num_char_span_splits = extension_array.type.num_char_span_splits
    if num_char_span_splits > 0:
        char_begins_splits = []
        char_ends_splits = []
        for i in range(num_char_span_splits):
            char_begins_splits.append(
                extension_array.storage.field(
                    ArrowCharSpanType.BEGINS_NAME + "_{}".format(i)))
            char_ends_splits.append(
                extension_array.storage.field(
                    ArrowCharSpanType.ENDS_NAME + "_{}".format(i)))
        char_begins_array = pa.concat_arrays(char_begins_splits)
        char_ends_array = pa.concat_arrays(char_ends_splits)
    else:
        char_begins_array = extension_array.storage.field(ArrowCharSpanType.BEGINS_NAME)
        char_ends_array = extension_array.storage.field(ArrowCharSpanType.ENDS_NAME)

    # Remove any trailing padding
    if char_begins_array.null_count > 0:
        char_begins_array = char_begins_array[:-char_begins_array.null_count]
        char_ends_array = char_ends_array[:-char_ends_array.null_count]

    # Zero-copy convert arrays to numpy
    token_begins = token_begins_array.to_numpy()
    token_ends = token_ends_array.to_numpy()
    char_begins = char_begins_array.to_numpy()
    char_ends = char_ends_array.to_numpy()

    # Create the CharSpanArray, then the TokenSpanArray
    char_span = CharSpanArray(target_text, char_begins, char_ends)
    return TokenSpanArray(char_span, token_begins, token_ends)
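# Hedged illustration (not part of the library above): arrow_to_token_span reads the
# target text back out of field-level metadata on the struct storage type. Field
# metadata keys and values round-trip as bytes, hence the decode() in the function.
import pyarrow as pa

storage_type = pa.struct([pa.field("begins", pa.int32(),
                                   metadata={"target_text": "hello world"})])
raw = storage_type["begins"].metadata[b"target_text"]   # b'hello world'
text = raw.decode()                                      # 'hello world'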
def take(self, indices, filtered=True):
    gs = self.trim()
    if gs.filtered and filtered:
        # Translate indices that refer to filtered row positions into
        # indices of the unfiltered rows.
        indices = np.asarray(indices)
        gs._df.count()  # make sure the mask is filled
        max_index = indices.max()
        mask = gs._df._selection_masks['__filter__']
        filtered_indices = mask.first(max_index + 1)
        indices = filtered_indices[indices]
    if len(indices) == 0:
        return GeoSeries(geometry=pa.array([]), crs=gs._crs)
    if isinstance(gs._geometry, pa.ChunkedArray):
        offset = 0
        chunks = []
        for chunk in gs._geometry.chunks:
            size = len(chunk)
            chunk_indices = [x for x in indices if offset <= x < size + offset]
            chunk_indices = pa.array([x - offset for x in chunk_indices])
            if len(chunk_indices) > 0:
                chunks.append(chunk.take(chunk_indices))
            offset += size
        if len(chunks) > 0:
            geometry = pa.concat_arrays(chunks)
        else:
            raise IndexError('ERROR: Out of range')
    elif isinstance(gs._geometry, pa.Array):
        indices = pa.array(indices)
        geometry = gs._geometry.take(indices)
    else:
        geometry = gs._geometry.take(indices)
    return GeoSeries(geometry=geometry, crs=gs._crs)
def __init__(self, expression, values, keep_other=True, other_value=None, sort=False, label=None, df=None):
    self.df = df or expression.df
    self.sort = sort
    self.pre_sort = True
    self.expression = self.df[str(expression)]
    self.label = label or self.expression._label
    self.keep_other = keep_other
    if isinstance(values, pa.ChunkedArray):
        values = pa.concat_arrays(values.chunks)
    if sort:
        indices = pa.compute.sort_indices(values)
        values = pa.compute.take(values, indices)
    if self.keep_other:
        self.bin_values = pa.array(vaex.array_types.tolist(values) + [other_value])
        self.values = self.bin_values.slice(0, len(self.bin_values) - 1)
    else:
        raise NotImplementedError("not supported yet")
        # although we can support this, it will fail with _combine, because of
        # the mapping of the set to -1
        self.bin_values = pa.array(vaex.array_types.tolist(values))
        self.values = self.bin_values
    self.N = len(self.bin_values)
    fp = vaex.cache.fingerprint(values)
    fingerprint = f"set-grouper-fixed-{fp}"
    self.hash_map_unique = vaex.hash.HashMapUnique.from_keys(self.values, fingerprint=fingerprint)
    self.basename = "hash_map_unique_%s" % vaex.utils._python_save_name(
        str(self.expression) + "_" + self.hash_map_unique.fingerprint)
    self.sort_indices = None
    self._promise = vaex.promise.Promise.fulfilled(None)
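# Hypothetical sketch of the chunk-normalisation and sort step used above, with plain
# pyarrow only (the vaex-specific hashing and fingerprinting is omitted).
import pyarrow as pa

values = pa.chunked_array([["b", "c"], ["a"]])
values = pa.concat_arrays(values.chunks)                            # contiguous pa.Array
values = pa.compute.take(values, pa.compute.sort_indices(values))   # ["a", "b", "c"]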
def _concat_same_type(cls, to_concat):
    return cls(
        pa.concat_arrays([ea.data for ea in to_concat]),
        dtype=to_concat[0].dtype,
    )
def token_span_to_arrow(token_span: TokenSpanArray) -> pa.ExtensionArray:
    """
    Convert a TokenSpanArray to a pyarrow.ExtensionArray with a type
    of ArrowTokenSpanType and struct as the storage type. The resulting
    extension array can be serialized and transferred with standard
    Arrow protocols.

    :param token_span: A TokenSpanArray to be converted
    :return: pyarrow.ExtensionArray containing TokenSpan data
    """
    if LooseVersion(pa.__version__) < LooseVersion("2.0.0"):
        raise NotImplementedError(
            "Arrow serialization for TokenSpanArray is not supported with "
            "PyArrow versions < 2.0.0")

    # Create arrays for begins/ends
    token_begins_array = pa.array(token_span.begin_token)
    token_ends_array = pa.array(token_span.end_token)

    # Filter out any empty SpanArrays
    non_null_tokens = token_span.tokens[~token_span.isna()]
    assert len(non_null_tokens) > 0

    # Use a single document as a list, or a list of all documents if there are multiple
    if all([token is non_null_tokens[0] for token in non_null_tokens]):
        tokens_arrays = [non_null_tokens[0]]
        tokens_indices = pa.array([0] * len(token_span.tokens), mask=token_span.isna())
    else:
        raise NotImplementedError(
            "TokenSpan Multi-doc serialization not yet implemented due to "
            "ArrowNotImplementedError: Concat with dictionary unification NYI")
        tokens_arrays = non_null_tokens
        tokens_indices = np.zeros_like(token_span.tokens)
        tokens_indices[~token_span.isna()] = range(len(tokens_arrays))
        tokens_indices = pa.array(tokens_indices, mask=token_span.isna())

    # Convert each token SpanArray to Arrow and get its raw storage
    arrow_tokens_arrays = [span_to_arrow(sa).storage for sa in tokens_arrays]

    # Create a list array where each element is an ArrowSpanArray
    # TODO: pyarrow.lib.ArrowNotImplementedError: ('Sequence converter for type
    #  dictionary<values=string, indices=int8, ordered=0> not implemented',
    #  'Conversion failed for column ts1 with type TokenSpanDtype')
    # arrow_tokens_arrays_array = pa.array(arrow_tokens_arrays, type=pa.list_(arrow_tokens_arrays[0].type))
    offsets = [0] + [len(a) for a in arrow_tokens_arrays]
    values = pa.concat_arrays(arrow_tokens_arrays)  # TODO: can't concat extension arrays?
    arrow_tokens_arrays_array = pa.ListArray.from_arrays(offsets, values)

    # Create a dictionary array mapping each token SpanArray index used to the
    # list of ArrowSpanArrays
    tokens_dict_array = pa.DictionaryArray.from_arrays(
        tokens_indices, arrow_tokens_arrays_array)

    typ = ArrowTokenSpanType(token_begins_array.type, tokens_dict_array.type)
    fields = list(typ.storage_type)

    storage = pa.StructArray.from_arrays(
        [token_begins_array, token_ends_array, tokens_dict_array], fields=fields)

    return pa.ExtensionArray.from_storage(typ, storage)
def test_arrow_array_modifies_data(self, test_array_chunked_nulls):  # noqa: F811
    expected = pa.concat_arrays(test_array_chunked_nulls.data.iterchunks())
    id1 = id(test_array_chunked_nulls.data)
    real = test_array_chunked_nulls.__arrow_array__()
    assert id1 != id(test_array_chunked_nulls.data)
    assert real.equals(expected)
def _collect_as_arrow(self, split_batches: bool = False) -> List["pa.RecordBatch"]:
    """
    Returns all records as a list of Arrow RecordBatches. PyArrow must be installed
    and available on both the driver and worker Python environments.
    This is an experimental feature.

    :param split_batches: split batches such that each column is in its own allocation,
        so that the selfDestruct optimization is effective; default False.

    .. note:: Experimental.
    """
    from pyspark.sql.dataframe import DataFrame

    assert isinstance(self, DataFrame)

    with SCCallSiteSync(self._sc):
        (
            port,
            auth_secret,
            jsocket_auth_server,
        ) = self._jdf.collectAsArrowToPython()

    # Collect list of un-ordered batches where the last element is a list of correct order indices
    try:
        batch_stream = _load_from_socket((port, auth_secret), ArrowCollectSerializer())
        if split_batches:
            # When spark.sql.execution.arrow.pyspark.selfDestruct.enabled, ensure
            # each column in each record batch is contained in its own allocation.
            # Otherwise, selfDestruct does nothing; it frees each column as it's
            # converted, but each column will actually be a list of slices of record
            # batches, and so no memory is actually freed until all columns are
            # converted.
            import pyarrow as pa

            results = []
            for batch_or_indices in batch_stream:
                if isinstance(batch_or_indices, pa.RecordBatch):
                    batch_or_indices = pa.RecordBatch.from_arrays(
                        [
                            # This call actually reallocates the array
                            pa.concat_arrays([array])
                            for array in batch_or_indices
                        ],
                        schema=batch_or_indices.schema,
                    )
                results.append(batch_or_indices)
        else:
            results = list(batch_stream)
    finally:
        # Join serving thread and raise any exceptions from collectAsArrowToPython
        jsocket_auth_server.getResult()

    # Separate RecordBatches from batch order indices in results
    batches = results[:-1]
    batch_order = results[-1]

    # Re-order the batch list using the correct order
    return [batches[i] for i in batch_order]
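# Minimal sketch (made-up data, not from Spark) of the split_batches trick above:
# pa.concat_arrays([array]) reallocates each column, so every column of the rebuilt
# RecordBatch owns its own buffers instead of sharing one IPC allocation.
import pyarrow as pa

batch = pa.record_batch([pa.array([1, 2, 3]), pa.array(["a", "b", "c"])],
                        names=["x", "y"])
rebuilt = pa.RecordBatch.from_arrays(
    [pa.concat_arrays([column]) for column in batch],   # one allocation per column
    schema=batch.schema,
)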
def _set_single_index_in_chunk(chunk: pa.Array, index: int, value: Any) -> pa.Array:
    """Set a single position in a pyarrow array."""
    assert is_scalar(value)
    return pa.concat_arrays([
        chunk[:index],
        pa.array([value], type=pa.string()),
        chunk[index + 1:],
    ])
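# Hypothetical example of the splice-by-concatenation idiom used above: replace one
# position of an array by concatenating the slice before it, the new value, and the
# slice after it.
import pyarrow as pa

chunk = pa.array(["a", "b", "c"])
updated = pa.concat_arrays([
    chunk[:1],
    pa.array(["X"], type=pa.string()),
    chunk[2:],
])                                                       # ["a", "X", "c"]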
def test_group_by(df):
    udaf = f.udaf(Accumulator, pa.float64(), pa.float64(), [pa.float64()])
    df = df.aggregate([f.col("b")], [udaf(f.col("a"))])

    batches = df.collect()
    arrays = [batch.column(1) for batch in batches]
    joined = pa.concat_arrays(arrays)

    assert joined == pa.array([1.0 + 2.0, 3.0])
def pd_nanop(nanop: Callable, arr: Union[pa.ChunkedArray, pa.Array], skipna: bool):
    """Use pandas.core.nanops to provide a reduction."""
    if isinstance(arr, pa.ChunkedArray):
        data = pa.concat_arrays(arr.iterchunks())
    else:
        data = arr
    np_arr = _extract_data_buffer_as_np_array(data)
    mask = extract_isnull_bytemap(data)
    return nanop(np_arr, skipna=skipna, mask=mask)
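# Small illustrative sketch (not from the original module): normalising a ChunkedArray
# to a single contiguous pa.Array, as pd_nanop does before extracting buffers.
# ChunkedArray.combine_chunks() is the built-in equivalent.
import pyarrow as pa

ca = pa.chunked_array([[1, 2], [3, None]])
contiguous = pa.concat_arrays(ca.iterchunks())           # one pa.Array of length 4
assert contiguous.to_pylist() == ca.combine_chunks().to_pylist()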
def _replace_with_indices(
    cls,
    chunk: pa.Array,
    indices: npt.NDArray[np.intp],
    value: npt.NDArray[Any],
) -> pa.Array:
    """
    Replace items selected with a set of positional indices.

    Analogous to pyarrow.compute.replace_with_mask, except that replacement
    positions are identified via indices rather than a mask.

    Parameters
    ----------
    chunk : pa.Array
    indices : npt.NDArray[np.intp]
    value : npt.NDArray[Any]
        Replacement value(s).

    Returns
    -------
    pa.Array
    """
    n = len(indices)

    if n == 0:
        return chunk

    start, stop = indices[[0, -1]]

    if (stop - start) == (n - 1):
        # fast path for a contiguous set of indices
        arrays = [
            chunk[:start],
            pa.array(value, type=chunk.type, from_pandas=True),
            chunk[stop + 1:],
        ]
        arrays = [arr for arr in arrays if len(arr)]
        if len(arrays) == 1:
            return arrays[0]
        return pa.concat_arrays(arrays)

    mask = np.zeros(len(chunk), dtype=np.bool_)
    mask[indices] = True

    if pa_version_under5p0:
        arr = chunk.to_numpy(zero_copy_only=False)
        arr[mask] = value
        return pa.array(arr, type=chunk.type)

    if isna(value).all():
        return pc.if_else(mask, None, chunk)

    return pc.replace_with_mask(chunk, mask, value)
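# Hedged sketch of the non-contiguous path above using pyarrow.compute directly:
# replace_with_mask consumes the replacement values positionally wherever the mask is
# True. The values here are made up for illustration.
import numpy as np
import pyarrow as pa
import pyarrow.compute as pc

chunk = pa.array([10, 20, 30, 40])
mask = np.zeros(len(chunk), dtype=np.bool_)
mask[[1, 3]] = True
result = pc.replace_with_mask(chunk, pa.array(mask), pa.array([99, 77]))  # [10, 99, 30, 77]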
def trim_offsets(offset, length, null_buffer, offsets_buffer, large=False):
    if offset == 0:
        return null_buffer, offsets_buffer
    if large:
        offsets = np.frombuffer(offsets_buffer, np.int64, length + 1 + offset)
    else:
        offsets = np.frombuffer(offsets_buffer, np.int32, length + 1 + offset)
    nulls = pa.BooleanArray.from_buffers(pa.bool_(), length, [None, null_buffer], offset=offset)
    nulls = pa.concat_arrays([nulls])
    assert nulls.offset == 0
    assert len(nulls) == length
    offsets = offsets[offset:] - offsets[offset]
    return nulls.buffers()[1], pa.py_buffer(offsets)
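# Illustrative note (assumed example): pa.concat_arrays with a single sliced input
# copies the data into a fresh array whose offset is 0, which is what the asserts in
# trim_offsets above rely on for realigning the validity bitmap.
import pyarrow as pa

sliced = pa.array([1, 2, 3, 4]).slice(2)   # offset == 2
copy = pa.concat_arrays([sliced])          # offset == 0, values [3, 4]
assert copy.offset == 0 and copy.to_pylist() == [3, 4]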
def test_pa_array(self, array_inhom_chunks):
    npt.assert_array_equal(array_inhom_chunks.offsets, [0, 3, 8])
    expected = pa.concat_arrays(array_inhom_chunks.data.iterchunks())
    real = pa.array(array_inhom_chunks)
    assert isinstance(real, pa.Array)
    assert real.equals(expected)
    if pa.__version__ < "0.15":
        npt.assert_array_equal(array_inhom_chunks.offsets, [0, 3, 8])
    else:
        npt.assert_array_equal(array_inhom_chunks.offsets, [0])
def __init__(self, array, dtype=None, copy=None):
    # Choose a default dtype for empty arrays
    try:
        if len(array) == 0 and dtype is None:
            dtype = 'float64'
    except Exception:
        # len() failed
        pass

    # See if we can determine the arrow array type
    if isinstance(dtype, GeometryDtype):
        # Use arrow type as-is
        arrow_dtype = dtype.arrow_dtype
    elif isinstance(dtype, pa.DataType):
        arrow_dtype = dtype
    elif dtype is not None and dtype != np.dtype('object'):
        # Scalar element dtype
        arrow_dtype = self._arrow_type_from_numpy_element_dtype(dtype)
    else:
        # Let arrow infer type
        arrow_dtype = None

    # Unwrap GeometryList elements to numpy arrays
    if is_array_like(array) or isinstance(array, list):
        array = [_unwrap_geometry(el, self._element_type) for el in array]
        array = pa.array(array, type=arrow_dtype)
    elif isinstance(array, pa.Array):
        # Nothing to do
        pass
    elif isinstance(array, pa.ChunkedArray):
        array = pa.concat_arrays(array.chunks)
    else:
        raise ValueError(
            "Unsupported type passed for {}: {}".format(
                self.__class__.__name__, type(array)
            )
        )

    # Save off the pyarrow array
    self.data = array

    # Compute types
    np_type = self._numpy_element_dtype_from_arrow_type(self.data.type)
    self._numpy_element_type = np.dtype(np_type)
    self._dtype = self._dtype_class(np_type)

    # Initialize backing property for the spatial index
    self._sindex = None
def _get_field(struct_array: pa.StructArray, field: Union[str, int]) -> pa.Array:
    """Returns struct_array.field(field) with null propagation.

    This function is equivalent to struct_array.field() but correctly handles
    null propagation (the parent struct's null values are propagated to children).

    Args:
      struct_array: A struct array which should be queried.
      field: The requested field to retrieve.

    Returns:
      A pa.Array containing the requested field.

    Raises:
      KeyError: If field is not a child field in struct_array.
    """
    child_array = struct_array.field(field)

    # In case all values are present then there's no need for special handling.
    # We can return child_array as is to avoid a performance penalty caused by
    # constructing and flattening the returned array.
    if struct_array.null_count == 0:
        return child_array

    # is_valid returns a BooleanArray with two buffers: the buffer at offset 0
    # is always None and buffer 1 contains the data on which fields are
    # valid/not valid.
    # (https://arrow.apache.org/docs/format/Columnar.html#buffer-listing-for-each-layout)
    validity_bitmap_buffer = struct_array.is_valid().buffers()[1]

    # Construct a new struct array with a single field. Calling flatten() on the
    # new array guarantees validity bitmaps are merged correctly.
    new_type = pa.struct([pa.field(field, child_array.type)])
    if (child_array.null_count == 0 and child_array.offset != 0):
        # TODO(https://issues.apache.org/jira/browse/ARROW-14156): Remove this
        # special handling once flattening a struct that has children that were
        # sliced produces arrays with a correct validity bitmap.
        child_array = pa.concat_arrays([pa.nulls(0, child_array.type), child_array])
    filtered_struct = pa.StructArray.from_buffers(
        new_type,
        len(struct_array),
        [validity_bitmap_buffer],
        null_count=struct_array.null_count,
        children=[child_array])
    return filtered_struct.flatten()[0]
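# Hedged sketch (made-up values) of the ARROW-14156 workaround above: concatenating an
# empty typed null array in front of a sliced child produces an equivalent array whose
# offset is 0, so later flatten()/from_buffers calls see a clean validity bitmap.
import pyarrow as pa

child = pa.array([1, 2, 3, 4]).slice(2)                          # offset == 2
realigned = pa.concat_arrays([pa.nulls(0, child.type), child])   # offset == 0
assert realigned.offset == 0 and realigned.to_pylist() == [3, 4]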
def _concatenate_extension_column(ca: "pyarrow.ChunkedArray") -> "pyarrow.Array":
    """Concatenate chunks of an extension column into a contiguous array.

    This concatenation is required for creating copies and for .take() to work on
    extension arrays.
    See https://issues.apache.org/jira/browse/ARROW-16503.
    """
    if not _is_column_extension_type(ca):
        raise ValueError(f"Chunked array isn't an extension array: {ca}")

    if ca.num_chunks == 0:
        # No-op for no-chunk chunked arrays, since there's nothing to concatenate.
        return ca

    chunk = ca.chunk(0)
    return type(chunk).from_storage(
        chunk.type, pyarrow.concat_arrays([c.storage for c in ca.chunks])
    )
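# Hypothetical, self-contained sketch of the storage-level concatenation used above,
# with a made-up extension type: older pyarrow versions cannot concatenate extension
# arrays directly, but their storage arrays concatenate fine and can be wrapped back
# up with from_storage.
import pyarrow as pa

class _DemoType(pa.ExtensionType):            # made-up extension type for illustration
    def __init__(self):
        super().__init__(pa.int64(), "demo.ext")

    def __arrow_ext_serialize__(self):
        return b""

    @classmethod
    def __arrow_ext_deserialize__(cls, storage_type, serialized):
        return cls()

typ = _DemoType()
chunks = [pa.ExtensionArray.from_storage(typ, pa.array([1, 2])),
          pa.ExtensionArray.from_storage(typ, pa.array([3]))]
contiguous = pa.ExtensionArray.from_storage(
    typ, pa.concat_arrays([c.storage for c in chunks]))   # one extension array of length 3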
def _copy_table(table: "pyarrow.Table") -> "pyarrow.Table":
    """Copy the provided Arrow table."""
    import pyarrow as pa

    # Copy the table by copying each column and constructing a new table with
    # the same schema.
    cols = table.columns
    new_cols = []
    for col in cols:
        if col.num_chunks > 0 and isinstance(col.chunk(0), pa.ExtensionArray):
            # If an extension array, we copy the underlying storage arrays.
            chunk = col.chunk(0)
            arr = type(chunk).from_storage(
                chunk.type, pa.concat_arrays([c.storage for c in col.chunks]))
        else:
            # Otherwise, we copy the top-level chunk arrays.
            arr = col.combine_chunks()
        new_cols.append(arr)
    return pa.Table.from_arrays(new_cols, schema=table.schema)
def __arrow_array__(self, type=None):
    # type: (pa.DataType,) -> pa.Array
    """
    Implement pyarrow array interface (requires pyarrow>=0.15).

    Returns
    -------
    pa.Array
    """
    if self._has_single_chunk:
        data = self.data.chunks[0]
    else:
        data = pa.concat_arrays(self.data.iterchunks())
        self.data = pa.chunked_array([data])  # modify the data pointer in place
    if type is not None and type != data.type:
        return data.cast(type, safe=False)
    else:
        return data
def take(self, indices):
    lz = self.copy()
    if isinstance(lz._obj, ChunkedArray):
        offset = 0
        chunks = []
        for chunk in lz._obj.chunks:
            size = len(chunk)
            chunk_indices = [x for x in indices if offset <= x < size + offset]
            chunk_indices = array([x - offset for x in chunk_indices])
            if len(chunk_indices) > 0:
                chunks.append(chunk.take(chunk_indices))
            offset += size
        if len(chunks) > 0:
            obj = concat_arrays(chunks)
        else:
            raise IndexError('ERROR: Out of range')
    else:
        indices = array(indices)
        obj = lz._obj.take(indices)
    lz._obj = obj
    return lz
def take(self, indices):
    lz = self.copy()
    if isinstance(lz._obj, ChunkedArray):
        offset = 0
        chunks = []
        for chunk in lz._obj.chunks:
            size = len(chunk)
            chunk_indices = list(
                filter(lambda x: offset <= x < size + offset, indices))
            # materialize the map so array() receives a sized sequence
            chunk_indices = array(list(map(lambda x: x - offset, chunk_indices)))
            if len(chunk_indices) > 0:
                chunks.append(chunk.take(chunk_indices))
            offset += size
        if len(chunks) > 0:
            obj = concat_arrays(chunks)
        else:
            raise IndexError('ERROR: Out of range')
    else:
        indices = array(indices)
        obj = lz._obj.take(indices)
    lz._obj = obj
    return lz
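# Hedged sketch (toy data) of the global-to-local index translation that both take()
# variants above perform: indices are shifted into each chunk's local range, taken per
# chunk, and the per-chunk results are concatenated back into one contiguous array.
import pyarrow as pa

ca = pa.chunked_array([[10, 11, 12], [13, 14]])
indices = [1, 3, 4]
offset, pieces = 0, []
for chunk in ca.chunks:
    local = [i - offset for i in indices if offset <= i < offset + len(chunk)]
    if local:
        pieces.append(chunk.take(pa.array(local)))
    offset += len(chunk)
result = pa.concat_arrays(pieces)               # [11, 13, 14]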
def _take_on_chunks(self, indices, limits_idx, cum_lengths, sort_idx=None):
    def take_in_one_chunk(i_chunk):
        indices_chunk = indices[limits_idx[i_chunk]:limits_idx[i_chunk + 1]]
        indices_chunk -= cum_lengths[i_chunk]
        if (self.dtype.is_list
                and self.data.chunk(i_chunk).flatten().null_count == 0
                and self.data.chunk(i_chunk).null_count == 0
                and self.flatten().dtype._is_numeric):
            return take_indices_on_pyarrow_list(self.data.chunk(i_chunk), indices_chunk)
        else:
            return self.data.chunk(i_chunk).take(
                pa.array(indices_chunk))  # this is a pyarrow.Array

    result = [
        take_in_one_chunk(i) for i in range(self.data.num_chunks)
    ]  # we know that self.data.num_chunks > 1

    if sort_idx is None:
        return FletcherArray(
            pa.chunked_array(filter(len, result), type=self.data.type))
    else:
        return FletcherArray(
            pa.concat_arrays(result).take(pa.array(sort_idx)))
def assert_content_equals_array(result, expected):
    """Assert that the result is an Arrow structure and the content matches an array."""
    assert isinstance(result, (pa.Array, pa.ChunkedArray))
    if isinstance(result, pa.ChunkedArray):
        result = pa.concat_arrays(result.iterchunks())
    assert result.equals(expected)
def _make_entity_dataframes(
        entities: List, original_text: str) -> (pd.DataFrame, pd.DataFrame):
    """
    Create the entities and entity_mentions DataFrames.

    :param entities: The "entities" section of a parsed NLU response
    :param original_text: Text of the document. This argument must be provided if there
        are entity mention spans.
    """
    if len(entities) == 0:
        return pd.DataFrame(), pd.DataFrame()

    table = util.make_table(entities)

    # Check if response includes entity mentions
    mention_name_cols = [(name, table.column(name)) for name in table.column_names
                         if name.lower().startswith("mentions")]

    # Make entities and entity mentions (optional) DataFrames
    if len(mention_name_cols) > 0:
        mention_names, mention_cols = zip(*mention_name_cols)

        # Create the entities DataFrame with mention arrays dropped
        table = table.drop(mention_names)
        pdf = table.to_pandas()

        # Flatten the mention arrays to be put in a separate table
        mention_arrays = [pa.concat_arrays(col.iterchunks()) for col in mention_cols]
        flat_mention_arrays = [a.flatten() for a in mention_arrays]
        table_mentions = pa.Table.from_arrays(flat_mention_arrays, names=mention_names)

        # Convert location/text columns to spans
        location_col, location_name = util.find_column(table_mentions, "location")
        text_col, text_name = util.find_column(table_mentions, "text")
        if original_text is None:
            raise ValueError(
                "Unable to construct target text for converting entity mentions to spans")
        char_span = util.make_char_span(location_col, text_col, original_text)
        table_mentions = table_mentions.drop([location_name, text_name])

        # Create the entity_mentions DataFrame
        pdf_mentions = table_mentions.to_pandas()
        pdf_mentions["span"] = char_span

        # Align index of parent entities DataFrame with flattened DataFrame and ffill
        # values
        mention_offsets = mention_arrays[0].offsets.to_numpy()
        pdf_parent = pdf.set_index(mention_offsets[:-1])
        pdf_parent = pdf_parent.reindex(pdf_mentions.index, method="ffill")

        # Add columns from entities parent DataFrame
        pdf_mentions["text"] = pdf_parent["text"]
        pdf_mentions["type"] = pdf_parent["type"]

        # Remove "mentions" from column names
        pdf_mentions.rename(
            columns={c: c.split("mentions.")[-1] for c in pdf_mentions.columns},
            inplace=True)
    else:
        pdf = table.to_pandas()
        pdf_mentions = pd.DataFrame()

    return pdf, pdf_mentions
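# Illustrative sketch (made-up data, util.* helpers omitted) of the flatten/offsets
# idiom above: flattening the "mentions" list column and using its offsets to map each
# flattened row back to its parent entity row.
import pyarrow as pa

mentions = pa.array([["m0", "m1"], ["m2"]])      # one list of mentions per entity
flat = mentions.flatten()                        # ["m0", "m1", "m2"]
offsets = mentions.offsets.to_numpy()            # [0, 2, 3]
# Entity i owns flat[offsets[i]:offsets[i + 1]]; offsets[:-1] gives the flattened row
# where each parent entity starts, which is what the reindex/ffill alignment uses.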
def _make_relations_dataframe_zero_copy(relations):
    if len(relations) == 0:
        return pd.DataFrame()

    table = util.make_table(relations)

    # Separate each argument into a column
    flattened_arguments = []
    drop_cols = []
    for name in table.column_names:
        if name.lower().startswith("arguments"):
            col = pa.concat_arrays(table.column(name).iterchunks())
            assert pa.types.is_list(col.type)
            is_nested_list = pa.types.is_list(col.type.value_type)
            name_split = name.split('.', maxsplit=1)
            first_list = col[0]
            num_arguments = len(first_list)
            null_count = 0

            # Get the flattened raw values
            raw = col
            offset_arrays = []
            while pa.types.is_list(raw.type):
                offset_arrays.append(raw.offsets)
                null_count += raw.null_count
                raw = raw.flatten()

            # TODO handle lists with null values
            if null_count > 0:
                continue

            # Convert values to numpy
            values = raw.to_numpy(zero_copy_only=False)  # string might copy
            offsets_list = [o.to_numpy() for o in offset_arrays]

            # Compute the length of each list in the array
            value_offsets = offsets_list.pop()
            value_lengths = value_offsets[1:] - value_offsets[:-1]

            # Separate the arguments into individual columns
            for i in range(num_arguments):
                arg_name = "{}.{}.{}".format(name_split[0], i, name_split[1])
                arg_lengths = value_lengths[i::num_arguments]

                # Fixed length arrays can be sliced
                if not is_nested_list or len(np.unique(arg_lengths)) == 1:
                    num_elements = len(first_list[i]) if is_nested_list else 1

                    # Only 1 element so leave in primitive array
                    if not is_nested_list or num_elements == 1:
                        arg_values = values[i::num_arguments]
                        arg_array = pa.array(arg_values)
                    # Multiple elements so put back in a list array
                    else:
                        arg_values = values.reshape(
                            [len(col) * num_arguments, num_elements])
                        arg_values = arg_values[i::num_elements]
                        arg_values = arg_values.flatten()
                        arg_offsets = np.cumsum(arg_lengths)
                        arg_offsets = np.insert(arg_offsets, 0, 0)
                        arg_array = pa.ListArray.from_arrays(arg_offsets, arg_values)
                else:
                    # TODO Argument properties with variable length arrays not currently
                    #  supported
                    continue

                flattened_arguments.append((arg_array, arg_name))
            drop_cols.append(name)

    # Add the flattened argument columns
    for arg_array, arg_name in flattened_arguments:
        table = table.append_column(arg_name, arg_array)

    # Drop columns that have been flattened
    table = table.drop(drop_cols)

    return table.to_pandas()
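# Small assumed example of the per-list length computation above: list lengths are the
# differences of consecutive offsets (pa.compute.list_value_length gives the same
# result directly).
import pyarrow as pa

col = pa.array([[1, 2, 3], [4], [5, 6]])
value_offsets = col.offsets.to_numpy()                    # [0, 3, 4, 6]
value_lengths = value_offsets[1:] - value_offsets[:-1]    # [3, 1, 2]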
def _make_relations_dataframe(relations, original_text, sentence_span_series):
    if len(relations) == 0:
        return pd.DataFrame()

    table = util.make_table(relations)

    location_cols = {}  # Type: Dict[int, Tuple[Union[Array, ChunkedArray], str]]

    # Separate each argument into a column
    flattened_arguments = []
    drop_cols = []
    for name in table.column_names:
        if name.lower().startswith("arguments"):
            col = pa.concat_arrays(table.column(name).iterchunks())
            assert pa.types.is_list(col.type)
            name_split = name.split('.', maxsplit=1)
            num_arguments = len(col[0])
            value_series = col.values.to_pandas()

            # Separate the arguments into individual columns
            for i in range(num_arguments):
                arg_name = "{}.{}.{}".format(name_split[0], i, name_split[1])
                arg_series = value_series[i::num_arguments]
                arg_array = pa.array(arg_series)

                # If list array is fixed length with 1 element, it can be flattened
                temp = arg_array
                while pa.types.is_list(temp.type):
                    temp = temp.flatten()
                if len(temp) == len(arg_array):
                    # TODO also need to verify each offset inc by 1?
                    arg_array = temp

                if name.lower().endswith("location"):
                    location_cols[i] = (arg_array, "{}.{}".format(name_split[0], i))

                flattened_arguments.append((arg_array, arg_name))
            drop_cols.append(name)

    # Add the flattened argument columns
    for arg_array, arg_name in flattened_arguments:
        table = table.append_column(arg_name, arg_array)

    # Replace argument location and text columns with spans
    arg_span_cols = {}
    for arg_i, (location_col, arg_prefix) in location_cols.items():
        text_col, text_name = util.find_column(table, "{}.text".format(arg_prefix))
        arg_span_cols["{}.span".format(arg_prefix)] = util.make_char_span(
            location_col, text_col, original_text)
        drop_cols.extend(["{}.location".format(arg_prefix), text_name])
    add_cols = arg_span_cols.copy()

    # Build the sentence span and drop the plain text sentence col
    sentence_col, sentence_name = util.find_column(table, "sentence")
    arg_col_names = list(arg_span_cols.keys())
    if len(arg_col_names) > 0:
        first_arg_span_array = arg_span_cols[arg_col_names[0]]
        sentence_matches = []
        for i, arg_span in enumerate(first_arg_span_array):
            arg_begin = arg_span.begin
            arg_end = arg_span.end
            j = len(sentence_span_series) // 2
            found = False
            while not found:
                sentence_span = sentence_span_series[j]
                if arg_begin >= sentence_span.end:
                    j += 1
                elif arg_end <= sentence_span.begin:
                    j -= 1
                else:
                    contains = [
                        sentence_span.contains(a[i]) for a in arg_span_cols.values()
                    ]
                    if not (all(contains) and
                            sentence_span.covered_text == sentence_col[i].as_py()):
                        msg = f"Mismatched sentence span for: {sentence_span}"
                        if not all(contains):
                            msg += f"\nContains Args: {all(contains)}"
                        if sentence_span.covered_text != sentence_col[i].as_py():
                            msg += f"\nSpanText: '{sentence_span.covered_text}'" \
                                   f"\nSentence: '{sentence_col[i]}'"
                        warnings.warn(msg)
                    sentence_matches.append(j)
                    found = True
        relations_sentence = sentence_span_series[sentence_matches]
        add_cols["sentence_span"] = relations_sentence.reset_index(drop=True)
        drop_cols.append(sentence_name)
    else:
        warnings.warn("Could not make sentence span column for Re")

    # Drop columns that have been flattened or replaced by spans
    table = table.drop(drop_cols)
    df = table.to_pandas()

    # Insert additional columns
    for col_name, col in add_cols.items():
        df[col_name] = col

    return df
def test_concat_array_different_types():
    with pytest.raises(pa.ArrowInvalid):
        pa.concat_arrays([pa.array([1]), pa.array([2.])])
def test_concat_array():
    concatenated = pa.concat_arrays([pa.array([1, 2]), pa.array([3, 4])])
    assert concatenated.equals(pa.array([1, 2, 3, 4]))
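# Hedged follow-up to the two tests above: concat_arrays requires identical types, so
# mixed inputs need an explicit cast to a common type first.
import pyarrow as pa

combined = pa.concat_arrays([pa.array([1]).cast(pa.float64()), pa.array([2.])])
assert combined.equals(pa.array([1.0, 2.0]))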
def list_of_pa_arrays_to_pyarrow_listarray(l_arr: List[pa.Array]) -> pa.ListArray:
    offsets = pa.array(np.cumsum([0] + [len(arr) for arr in l_arr]), type=pa.int32())
    values = pa.concat_arrays(l_arr)
    return pa.ListArray.from_arrays(offsets, values)
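# Assumed usage example for the helper above: cumulative lengths become the list
# offsets, and the concatenated values become the flat child array.
import numpy as np
import pyarrow as pa

parts = [pa.array([1, 2]), pa.array([3]), pa.array([4, 5, 6])]
offsets = pa.array(np.cumsum([0] + [len(a) for a in parts]), type=pa.int32())
nested = pa.ListArray.from_arrays(offsets, pa.concat_arrays(parts))
# nested.to_pylist() == [[1, 2], [3], [4, 5, 6]]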
def __init__(self, expression, values, keep_other=True, other_value=None, sort=False, label=None, df=None):
    self.df = df or expression.df
    self.sort = sort
    self.pre_sort = True
    self.expression = self.df[str(expression)]
    self.label = label or self.expression._label
    self.keep_other = keep_other
    if isinstance(values, pa.ChunkedArray):
        values = pa.concat_arrays(values.chunks)
    if sort:
        indices = pa.compute.sort_indices(values)
        values = pa.compute.take(values, indices)
    if self.keep_other:
        self.bin_values = pa.array(vaex.array_types.tolist(values) + [other_value])
        self.values = self.bin_values.slice(0, len(self.bin_values) - 1)
    else:
        raise NotImplementedError("not supported yet")
        # although we can support this, it will fail with _combine, because of
        # the mapping of the set to -1
        self.bin_values = pa.array(vaex.array_types.tolist(values))
        self.values = self.bin_values
    self.N = len(self.bin_values)
    dtype = vaex.dtype_of(self.values)
    set_type = vaex.hash.ordered_set_type_from_dtype(dtype)
    values_list = self.values.tolist()
    try:
        null_value = values_list.index(None)
        null_count = 1
    except ValueError:
        null_value = -1
        null_count = 0
    if vaex.dtype_of(self.values) == float:
        nancount = np.isnan(self.values).sum()
    else:
        nancount = 0
    fp = vaex.cache.fingerprint(values)
    fingerprint = f"set-grouper-fixed-{fp}"
    if dtype.is_string:
        values = vaex.column.ColumnStringArrow.from_arrow(self.values)
        string_sequence = values.string_sequence
        self.set = set_type(string_sequence, null_value, nancount, null_count, fingerprint)
    else:
        self.set = set_type(self.values, null_value, nancount, null_count, fingerprint)
    self.basename = "set_%s" % vaex.utils._python_save_name(
        str(self.expression) + "_" + self.set.fingerprint)
    self.binby_expression = expression
    self.sort_indices = None
    self._promise = vaex.promise.Promise.fulfilled(None)