Example #1
def make_char_span(location_col, text_col, original_text):
    """
    Convert a column of begin, end pairs to a SpanArray.

    :param location_col: Arrow array containing (begin, end) tuples of character offsets
    :param text_col: Arrow array of strings that should match the target texts of the
      spans in location_col. Used for reconstructing target text when it is not provided.
    :param original_text: Target text for the spans. Optional. If not provided, this
      function will reconstruct the target text from the contents of `text_col`.
    """

    # Replace location columns with char and token spans
    if not (pa.types.is_list(location_col.type)
            and pa.types.is_primitive(location_col.type.value_type)):
        raise ValueError("Expected location column as a list of integers")

    # TODO: assert location is fixed with 2 elements?
    if isinstance(location_col, pa.ChunkedArray):
        location_col = pa.concat_arrays(location_col.iterchunks())

    # Flatten to get primitive array convertible to numpy
    array = location_col.flatten()
    values = array.to_numpy()
    begins = values[0::2]
    ends = values[1::2]

    if original_text is None:
        warnings.warn(
            "Analyzed text was not provided, attempting to reconstruct from tokens, "
            "however it will not be identical to the original analyzed text.")
        if isinstance(text_col, pa.ChunkedArray):
            text_col = pa.concat_arrays(text_col.iterchunks())
        original_text = build_original_text(text_col, begins)

    return SpanArray(original_text, begins, ends)
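
A minimal standalone sketch of the offset-extraction step above, with a made-up location column (the sample values are purely illustrative):

import pyarrow as pa

# Hypothetical location column: one (begin, end) pair per span
location_col = pa.array([[0, 5], [6, 11], [12, 17]], type=pa.list_(pa.int32()))

# Flatten the list array into a primitive array and view it with numpy
values = location_col.flatten().to_numpy()
begins = values[0::2]  # even positions are begin offsets
ends = values[1::2]    # odd positions are end offsets

print(begins.tolist())  # [0, 6, 12]
print(ends.tolist())    # [5, 11, 17]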
Example #2
def arrow_to_token_span(extension_array: pa.ExtensionArray) -> TokenSpanArray:
    """
    Convert a pyarrow.ExtensionArray with type ArrowTokenSpanType to
    a TokenSpanArray.

    :param extension_array: pyarrow.ExtensionArray with type ArrowTokenSpanType
    :return: TokenSpanArray
    """
    if isinstance(extension_array, pa.ChunkedArray):
        if extension_array.num_chunks > 1:
            raise ValueError("Only pyarrow.Array with a single chunk is supported")
        extension_array = extension_array.chunk(0)

    assert pa.types.is_struct(extension_array.storage.type)

    # Get target text from the begins field metadata and decode string
    metadata = extension_array.storage.type[ArrowTokenSpanType.BEGINS_NAME].metadata
    target_text = metadata[ArrowCharSpanType.TARGET_TEXT_KEY]
    if isinstance(target_text, bytes):
        target_text = target_text.decode()

    # Get the begins/ends pyarrow arrays
    token_begins_array = extension_array.storage.field(ArrowTokenSpanType.BEGINS_NAME)
    token_ends_array = extension_array.storage.field(ArrowTokenSpanType.ENDS_NAME)

    # Check if CharSpans have been split
    num_char_span_splits = extension_array.type.num_char_span_splits
    if num_char_span_splits > 0:
        char_begins_splits = []
        char_ends_splits = []
        for i in range(num_char_span_splits):
            char_begins_splits.append(
                extension_array.storage.field(ArrowCharSpanType.BEGINS_NAME + "_{}".format(i)))
            char_ends_splits.append(
                extension_array.storage.field(ArrowCharSpanType.ENDS_NAME + "_{}".format(i)))
        char_begins_array = pa.concat_arrays(char_begins_splits)
        char_ends_array = pa.concat_arrays(char_ends_splits)
    else:
        char_begins_array = extension_array.storage.field(ArrowCharSpanType.BEGINS_NAME)
        char_ends_array = extension_array.storage.field(ArrowCharSpanType.ENDS_NAME)

    # Remove any trailing padding
    if char_begins_array.null_count > 0:
        char_begins_array = char_begins_array[:-char_begins_array.null_count]
        char_ends_array = char_ends_array[:-char_ends_array.null_count]

    # Zero-copy convert arrays to numpy
    token_begins = token_begins_array.to_numpy()
    token_ends = token_ends_array.to_numpy()
    char_begins = char_begins_array.to_numpy()
    char_ends = char_ends_array.to_numpy()

    # Create the CharSpanArray, then the TokenSpanArray
    char_span = CharSpanArray(target_text, char_begins, char_ends)
    return TokenSpanArray(char_span, token_begins, token_ends)
Example #3
 def take(self, indices, filtered=True):
     gs = self.trim()
     if gs.filtered and filtered:
         # translate indices that refer to filtered row positions into
         # indices into the unfiltered rows
         indices = np.asarray(indices)
         gs._df.count()  # make sure the mask is filled
         max_index = indices.max()
         mask = gs._df._selection_masks['__filter__']
         filtered_indices = mask.first(max_index + 1)
         indices = filtered_indices[indices]
     if len(indices) == 0:
         return GeoSeries(geometry=pa.array([]), crs=gs._crs)
     if isinstance(gs._geometry, pa.ChunkedArray):
         offset = 0
         chunks = []
         for chunk in gs._geometry.chunks:
             size = len(chunk)
             chunk_indices = [
                 x for x in indices if offset <= x < size + offset
             ]
             chunk_indices = pa.array([x - offset for x in chunk_indices])
             if len(chunk_indices) > 0:
                 chunks.append(chunk.take(chunk_indices))
             offset += size
         if len(chunks) > 0:
             geometry = pa.concat_arrays(chunks)
         else:
             raise IndexError('ERROR: Out of range')
     elif isinstance(gs._geometry, pa.Array):
         indices = pa.array(indices)
         geometry = gs._geometry.take(indices)
     else:
         geometry = gs._geometry.take(indices)
     return GeoSeries(geometry=geometry, crs=gs._crs)
Example #4
    def __init__(self, expression, values, keep_other=True, other_value=None, sort=False, label=None, df=None):
        self.df = df or expression.df
        self.sort = sort
        self.pre_sort = True
        self.expression = self.df[str(expression)]
        self.label = label or self.expression._label
        self.keep_other = keep_other
        if isinstance(values, pa.ChunkedArray):
            values = pa.concat_arrays(values.chunks)
        if sort:
            indices = pa.compute.sort_indices(values)
            values = pa.compute.take(values, indices)

        if self.keep_other:
            self.bin_values = pa.array(vaex.array_types.tolist(values) + [other_value])
            self.values = self.bin_values.slice(0, len(self.bin_values) - 1)
        else:
            raise NotImplementedError("not supported yet")
            # although we can support this, it will fail with _combine, because of
            # the mapping of the set to -1
            self.bin_values = pa.array(vaex.array_types.tolist(values))
            self.values = self.bin_values
        self.N = len(self.bin_values)
        fp = vaex.cache.fingerprint(values)
        fingerprint = f"set-grouper-fixed-{fp}"
        self.hash_map_unique = vaex.hash.HashMapUnique.from_keys(self.values, fingerprint=fingerprint)

        self.basename = "hash_map_unique_%s" % vaex.utils._python_save_name(str(self.expression) + "_" + self.hash_map_unique.fingerprint)
        self.sort_indices = None
        self._promise = vaex.promise.Promise.fulfilled(None)
Example #5
 def _concat_same_type(cls, to_concat):
     return cls(
         pa.concat_arrays(
             [ea.data for ea in to_concat]
         ),
         dtype=to_concat[0].dtype
     )
Example #6
def token_span_to_arrow(token_span: TokenSpanArray) -> pa.ExtensionArray:
    """
    Convert a TokenSpanArray to a pyarrow.ExtensionArray with a type
    of ArrowTokenSpanType and struct as the storage type. The resulting
    extension array can be serialized and transferred with standard
    Arrow protocols.

    :param token_span: A TokenSpanArray to be converted
    :return: pyarrow.ExtensionArray containing TokenSpan data
    """
    if LooseVersion(pa.__version__) < LooseVersion("2.0.0"):
        raise NotImplementedError(
            "Arrow serialization for TokenSpanArray is not supported with "
            "PyArrow versions < 2.0.0")
    # Create arrays for begins/ends
    token_begins_array = pa.array(token_span.begin_token)
    token_ends_array = pa.array(token_span.end_token)

    # Filter out any empty SpanArrays
    non_null_tokens = token_span.tokens[~token_span.isna()]
    assert len(non_null_tokens) > 0

    # Use the single document as a one-element list, or a list of all documents if there are multiple
    if all([token is non_null_tokens[0] for token in non_null_tokens]):
        tokens_arrays = [non_null_tokens[0]]
        tokens_indices = pa.array([0] * len(token_span.tokens),
                                  mask=token_span.isna())
    else:
        raise NotImplementedError(
            "TokenSpan Multi-doc serialization not yet implemented due to "
            "ArrowNotImplementedError: Concat with dictionary unification NYI")
        tokens_arrays = non_null_tokens
        tokens_indices = np.zeros_like(token_span.tokens)
        tokens_indices[~token_span.isna()] = range(len(tokens_arrays))
        tokens_indices = pa.array(tokens_indices, mask=token_span.isna())

    # Convert each token SpanArray to Arrow and get as raw storage
    arrow_tokens_arrays = [span_to_arrow(sa).storage for sa in tokens_arrays]

    # Create a list array where each element is an ArrowSpanArray
    # TODO: pyarrow.lib.ArrowNotImplementedError: ('Sequence converter for type dictionary<values=string, indices=int8, ordered=0> not implemented', 'Conversion failed for column ts1 with type TokenSpanDtype')
    #arrow_tokens_arrays_array = pa.array(arrow_tokens_arrays, type=pa.list_(arrow_tokens_arrays[0].type))
    offsets = [0] + [len(a) for a in arrow_tokens_arrays]
    values = pa.concat_arrays(
        arrow_tokens_arrays)  # TODO: can't concat extension arrays?
    arrow_tokens_arrays_array = pa.ListArray.from_arrays(offsets, values)

    # Create a dictionary array mapping each token SpanArray index used to the list of ArrowSpanArrays
    tokens_dict_array = pa.DictionaryArray.from_arrays(
        tokens_indices, arrow_tokens_arrays_array)

    typ = ArrowTokenSpanType(token_begins_array.type, tokens_dict_array.type)
    fields = list(typ.storage_type)

    storage = pa.StructArray.from_arrays(
        [token_begins_array, token_ends_array, tokens_dict_array],
        fields=fields)

    return pa.ExtensionArray.from_storage(typ, storage)
Example #7
    def test_arrow_array_modifies_data(self,
                                       test_array_chunked_nulls):  # noqa: F811
        expected = pa.concat_arrays(test_array_chunked_nulls.data.iterchunks())
        id1 = id(test_array_chunked_nulls.data)
        real = test_array_chunked_nulls.__arrow_array__()

        assert id1 != id(test_array_chunked_nulls.data)
        assert real.equals(expected)
Example #8
    def _collect_as_arrow(self, split_batches: bool = False) -> List["pa.RecordBatch"]:
        """
        Returns all records as a list of Arrow RecordBatches; pyarrow must be installed
        and available on both driver and worker Python environments.
        This is an experimental feature.

        :param split_batches: split batches such that each column is in its own allocation, so
            that the selfDestruct optimization is effective; default False.

        .. note:: Experimental.
        """
        from pyspark.sql.dataframe import DataFrame

        assert isinstance(self, DataFrame)

        with SCCallSiteSync(self._sc):
            (
                port,
                auth_secret,
                jsocket_auth_server,
            ) = self._jdf.collectAsArrowToPython()

        # Collect list of un-ordered batches where last element is a list of correct order indices
        try:
            batch_stream = _load_from_socket((port, auth_secret), ArrowCollectSerializer())
            if split_batches:
                # When spark.sql.execution.arrow.pyspark.selfDestruct.enabled, ensure
                # each column in each record batch is contained in its own allocation.
                # Otherwise, selfDestruct does nothing; it frees each column as it is
                # converted, but each column will actually be a list of slices of record
                # batches, and so no memory is actually freed until all columns are
                # converted.
                import pyarrow as pa

                results = []
                for batch_or_indices in batch_stream:
                    if isinstance(batch_or_indices, pa.RecordBatch):
                        batch_or_indices = pa.RecordBatch.from_arrays(
                            [
                                # This call actually reallocates the array
                                pa.concat_arrays([array])
                                for array in batch_or_indices
                            ],
                            schema=batch_or_indices.schema,
                        )
                    results.append(batch_or_indices)
            else:
                results = list(batch_stream)
        finally:
            # Join serving thread and raise any exceptions from collectAsArrowToPython
            jsocket_auth_server.getResult()

        # Separate RecordBatches from batch order indices in results
        batches = results[:-1]
        batch_order = results[-1]

        # Re-order the batch list using the correct order
        return [batches[i] for i in batch_order]
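
The pa.concat_arrays([array]) call above relies on concatenation reallocating the data even for a single input (per the comment in the code), which is what lets selfDestruct free the originals safely. A small standalone sketch with arbitrary sample data:

import pyarrow as pa

arr = pa.array([1, 2, 3, 4])
copied = pa.concat_arrays([arr])

# Same contents, but backed by newly allocated buffers, so releasing the
# original array does not invalidate the copy.
assert copied.equals(arr)
print(arr.buffers()[1].address != copied.buffers()[1].address)  # expected: True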
Example #9
 def _set_single_index_in_chunk(chunk: pa.Array, index: int,
                                value: Any) -> pa.Array:
     """Set a single position in a pyarrow array."""
     assert is_scalar(value)
     return pa.concat_arrays([
         chunk[:index],
         pa.array([value], type=pa.string()),
         chunk[index + 1:],
     ])
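
Since Arrow arrays are immutable, "setting" a position means splicing together a new array. A self-contained sketch of the same slice-and-concat pattern with throwaway data:

import pyarrow as pa

chunk = pa.array(["a", "b", "c"], type=pa.string())
index, value = 1, "B"

# Everything before the index, the replacement value, everything after
updated = pa.concat_arrays([
    chunk[:index],
    pa.array([value], type=pa.string()),
    chunk[index + 1:],
])
print(updated.to_pylist())  # ['a', 'B', 'c']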
Example #10
def test_group_by(df):
    udaf = f.udaf(Accumulator, pa.float64(), pa.float64(), [pa.float64()])

    df = df.aggregate([f.col("b")], [udaf(f.col("a"))])

    batches = df.collect()
    arrays = [batch.column(1) for batch in batches]
    joined = pa.concat_arrays(arrays)
    assert joined == pa.array([1.0 + 2.0, 3.0])
Example #11
def pd_nanop(nanop: Callable, arr: Union[pa.ChunkedArray, pa.Array],
             skipna: bool):
    """Use pandas.core.nanops to provide a reduction."""
    if isinstance(arr, pa.ChunkedArray):
        data = pa.concat_arrays(arr.iterchunks())
    else:
        data = arr
    np_arr = _extract_data_buffer_as_np_array(data)
    mask = extract_isnull_bytemap(data)

    return nanop(np_arr, skipna=skipna, mask=mask)
Example #12
    def _replace_with_indices(
        cls,
        chunk: pa.Array,
        indices: npt.NDArray[np.intp],
        value: npt.NDArray[Any],
    ) -> pa.Array:
        """
        Replace items selected with a set of positional indices.

        Analogous to pyarrow.compute.replace_with_mask, except that replacement
        positions are identified via indices rather than a mask.

        Parameters
        ----------
        chunk : pa.Array
        indices : npt.NDArray[np.intp]
        value : npt.NDArray[Any]
            Replacement value(s).

        Returns
        -------
        pa.Array
        """
        n = len(indices)

        if n == 0:
            return chunk

        start, stop = indices[[0, -1]]

        if (stop - start) == (n - 1):
            # fast path for a contiguous set of indices
            arrays = [
                chunk[:start],
                pa.array(value, type=chunk.type, from_pandas=True),
                chunk[stop + 1:],
            ]
            arrays = [arr for arr in arrays if len(arr)]
            if len(arrays) == 1:
                return arrays[0]
            return pa.concat_arrays(arrays)

        mask = np.zeros(len(chunk), dtype=np.bool_)
        mask[indices] = True

        if pa_version_under5p0:
            arr = chunk.to_numpy(zero_copy_only=False)
            arr[mask] = value
            return pa.array(arr, type=chunk.type)

        if isna(value).all():
            return pc.if_else(mask, None, chunk)

        return pc.replace_with_mask(chunk, mask, value)
Example #13
def trim_offsets(offset, length, null_buffer, offsets_buffer, large=False):
    if offset == 0:
        return null_buffer, offsets_buffer
    if large:
        offsets = np.frombuffer(offsets_buffer, np.int64, length + 1 + offset)
    else:
        offsets = np.frombuffer(offsets_buffer, np.int32, length + 1 + offset)
    nulls = pa.BooleanArray.from_buffers(pa.bool_(), length, [None, null_buffer], offset=offset)
    nulls = pa.concat_arrays([nulls])
    assert nulls.offset == 0
    assert len(nulls) == length
    offsets = offsets[offset:] - offsets[offset]
    return nulls.buffers()[1], pa.py_buffer(offsets)
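
The pa.concat_arrays([nulls]) call above works because concatenation copies the data into a new contiguous array whose offset is 0 (which the asserts check), so its validity buffer can be reused directly. A standalone illustration with throwaway data:

import pyarrow as pa

arr = pa.array([True, False, True, True, False])
sliced = arr.slice(2)                 # zero-copy view, offset is 2
rebased = pa.concat_arrays([sliced])  # copies into new buffers, offset is 0

print(sliced.offset, rebased.offset)  # 2 0
print(rebased.to_pylist())            # [True, True, False]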
Example #14
    def test_pa_array(self, array_inhom_chunks):
        npt.assert_array_equal(array_inhom_chunks.offsets, [0, 3, 8])

        expected = pa.concat_arrays(array_inhom_chunks.data.iterchunks())
        real = pa.array(array_inhom_chunks)
        assert isinstance(real, pa.Array)

        assert real.equals(expected)

        if pa.__version__ < "0.15":
            npt.assert_array_equal(array_inhom_chunks.offsets, [0, 3, 8])
        else:
            npt.assert_array_equal(array_inhom_chunks.offsets, [0])
Example #15
    def __init__(self, array, dtype=None, copy=None):
        # Choose default dtype for empty arrays
        try:
            if len(array) == 0 and dtype is None:
                dtype = 'float64'
        except:
            # len failed
            pass

        # See if we can determine arrow array type
        if isinstance(dtype, GeometryDtype):
            # Use arrow type as-is
            arrow_dtype = dtype.arrow_dtype
        elif isinstance(dtype, pa.DataType):
            arrow_dtype = dtype
        elif dtype is not None and dtype != np.dtype('object'):
            # Scalar element dtype
            arrow_dtype = self._arrow_type_from_numpy_element_dtype(dtype)
        else:
            # Let arrow infer type
            arrow_dtype = None

        # Unwrap GeometryList elements to numpy arrays
        if is_array_like(array) or isinstance(array, list):
            array = [_unwrap_geometry(el, self._element_type) for el in array]
            array = pa.array(array, type=arrow_dtype)
        elif isinstance(array, pa.Array):
            # Nothing to do
            pass
        elif isinstance(array, pa.ChunkedArray):
            array = pa.concat_arrays(array.chunks)
        else:
            raise ValueError(
                "Unsupported type passed for {}: {}".format(
                    self.__class__.__name__, type(array)
                )
            )

        # Save off pyarrow array
        self.data = array

        # Compute types
        np_type = self._numpy_element_dtype_from_arrow_type(self.data.type)
        self._numpy_element_type = np.dtype(np_type)
        self._dtype = self._dtype_class(np_type)

        # Initialize backing property for spatial index
        self._sindex = None
Example #16
def _get_field(struct_array: pa.StructArray, field: Union[str, int]) -> pa.Array:
    """Returns struct_array.field(field) with null propagation.

  This function is equivalent to struct_array.field() but correctly handles
  null propagation (the parent struct's null values are propagated to children).

  Args:
    struct_array: A struct array which should be queried.
    field: The requested field to retrieve.

  Returns:
    A pa.Array containing the requested field.

  Raises:
    KeyError: If field is not a child field in struct_array.
  """
    child_array = struct_array.field(field)

    # In case all values are present then there's no need for special handling.
    # We can return child_array as is to avoid a performance penalty caused by
    # constructing and flattening the returned array.
    if struct_array.null_count == 0:
        return child_array
    # is_valid returns a BooleanArray with two buffers: the buffer at offset
    # 0 is always None, and buffer 1 indicates which fields are valid.
    # (https://arrow.apache.org/docs/format/Columnar.html#buffer-listing-for-each-layout)
    validity_bitmap_buffer = struct_array.is_valid().buffers()[1]

    # Construct a new struct array with a single field.  Calling flatten() on the
    # new array guarantees validity bitmaps are merged correctly.
    new_type = pa.struct([pa.field(field, child_array.type)])
    if (child_array.null_count == 0 and child_array.offset != 0):
        # TODO(https://issues.apache.org/jira/browse/ARROW-14156): Remove this
        # special handling once flattening a struct that has children that were
        # sliced produces arrays with a correct validity bitmap.
        child_array = pa.concat_arrays(
            [pa.nulls(0, child_array.type), child_array])
    filtered_struct = pa.StructArray.from_buffers(
        new_type,
        len(struct_array), [validity_bitmap_buffer],
        null_count=struct_array.null_count,
        children=[child_array])
    return filtered_struct.flatten()[0]
Example #17
def _concatenate_extension_column(ca: "pyarrow.ChunkedArray") -> "pyarrow.Array":
    """Concatenate chunks of an extension column into a contiguous array.

    This concatenation is required for creating copies and for .take() to work on
    extension arrays.
    See https://issues.apache.org/jira/browse/ARROW-16503.
    """
    if not _is_column_extension_type(ca):
        raise ValueError("Chunked array isn't an extension array: {ca}")

    if ca.num_chunks == 0:
        # No-op for no-chunk chunked arrays, since there's nothing to concatenate.
        return ca

    chunk = ca.chunk(0)
    return type(chunk).from_storage(
        chunk.type, pyarrow.concat_arrays([c.storage for c in ca.chunks])
    )
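
A minimal sketch of the same storage-level concatenation with a toy extension type (the UuidType name and 16-byte storage are invented for illustration), showing how chunks of an extension column can be combined into one contiguous extension array:

import pyarrow as pa

class UuidType(pa.ExtensionType):
    """Toy extension type wrapping 16-byte binary values."""
    def __init__(self):
        super().__init__(pa.binary(16), "example.uuid")

    def __arrow_ext_serialize__(self):
        return b""

    @classmethod
    def __arrow_ext_deserialize__(cls, storage_type, serialized):
        return cls()

storage = pa.array([b"x" * 16, b"y" * 16], type=pa.binary(16))
ext_chunk = pa.ExtensionArray.from_storage(UuidType(), storage)
ca = pa.chunked_array([ext_chunk, ext_chunk])

# Concatenate the underlying storage arrays, then rewrap in the extension type
chunk = ca.chunk(0)
combined = type(chunk).from_storage(
    chunk.type, pa.concat_arrays([c.storage for c in ca.chunks]))
print(len(combined))  # 4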
Example #18
def _copy_table(table: "pyarrow.Table") -> "pyarrow.Table":
    """Copy the provided Arrow table."""
    import pyarrow as pa

    # Copy the table by copying each column and constructing a new table with
    # the same schema.
    cols = table.columns
    new_cols = []
    for col in cols:
        if col.num_chunks > 0 and isinstance(col.chunk(0), pa.ExtensionArray):
            # If an extension array, we copy the underlying storage arrays.
            chunk = col.chunk(0)
            arr = type(chunk).from_storage(
                chunk.type, pa.concat_arrays([c.storage for c in col.chunks]))
        else:
            # Otherwise, we copy the top-level chunk arrays.
            arr = col.combine_chunks()
        new_cols.append(arr)
    return pa.Table.from_arrays(new_cols, schema=table.schema)
Example #19
    def __arrow_array__(self, type=None):
        # type: (pa.DataType,) -> pa.Array
        """
        Implement pyarrow array interface (requires pyarrow>=0.15).

        Returns
        -------
        pa.Array

        """
        if self._has_single_chunk:
            data = self.data.chunks[0]
        else:
            data = pa.concat_arrays(self.data.iterchunks())
            self.data = pa.chunked_array([data])  # modify the data pointer in place

        if type is not None and type != data.type:
            return data.cast(type, safe=False)
        else:
            return data
Example #20
 def take(self, indices):
     lz = self.copy()
     if isinstance(lz._obj, ChunkedArray):
         offset = 0
         chunks = []
         for chunk in lz._obj.chunks:
             size = len(chunk)
             chunk_indices = [x for x in indices if offset <= x < size + offset]
             chunk_indices = array([x - offset for x in chunk_indices])
             if len(chunk_indices) > 0:
                 chunks.append(chunk.take(chunk_indices))
             offset += size
         if len(chunks) > 0:
             obj = concat_arrays(chunks)
         else:
             raise IndexError('ERROR: Out of range')
     else:
         indices = array(indices)
         obj = lz._obj.take(indices)
     lz._obj = obj
     return lz
Example #21
 def take(self, indices):
     lz = self.copy()
     if isinstance(lz._obj, ChunkedArray):
         offset = 0
         chunks = []
         for chunk in lz._obj.chunks:
             size = len(chunk)
             chunk_indices = list(
                 filter(lambda x: offset <= x < size + offset, indices))
             chunk_indices = array(map(lambda x: x - offset, chunk_indices))
             if len(chunk_indices) > 0:
                 chunks.append(chunk.take(chunk_indices))
             offset += size
         if len(chunks) > 0:
             obj = concat_arrays(chunks)
         else:
             raise IndexError('ERROR: Out of range')
     else:
         indices = array(indices)
         obj = lz._obj.take(indices)
     lz._obj = obj
     return lz
Example #22
    def _take_on_chunks(self, indices, limits_idx, cum_lengths, sort_idx=None):
        def take_in_one_chunk(i_chunk):
            indices_chunk = indices[limits_idx[i_chunk]:limits_idx[i_chunk + 1]]
            indices_chunk -= cum_lengths[i_chunk]
            if (self.dtype.is_list
                    and self.data.chunk(i_chunk).flatten().null_count == 0
                    and self.data.chunk(i_chunk).null_count == 0
                    and self.flatten().dtype._is_numeric):
                return take_indices_on_pyarrow_list(self.data.chunk(i_chunk),
                                                    indices_chunk)
            else:
                return self.data.chunk(i_chunk).take(pa.array(indices_chunk))
            # this is a pyarrow.Array

        result = [take_in_one_chunk(i) for i in range(self.data.num_chunks)]
        # we know that self.data.num_chunks >1

        if sort_idx is None:
            return FletcherArray(
                pa.chunked_array(filter(len, result), type=self.data.type))
        else:
            return FletcherArray(
                pa.concat_arrays(result).take(pa.array(sort_idx)))
Example #23
def assert_content_equals_array(result, expected):
    """Assert that the result is an Arrow structure and the content matches an array."""
    assert isinstance(result, (pa.Array, pa.ChunkedArray))
    if isinstance(result, pa.ChunkedArray):
        result = pa.concat_arrays(result.iterchunks())
    assert result.equals(expected)
Example #24
def _make_entity_dataframes(
        entities: List, original_text: str) -> (pd.DataFrame, pd.DataFrame):
    """
    Create the entities and entity_mentions DataFrames.

    :param entities: The "entities" section of a parsed NLU response
    :param original_text: Text of the document.  This argument must be provided if there
     are entity mention spans.
    """
    if len(entities) == 0:
        return pd.DataFrame(), pd.DataFrame()

    table = util.make_table(entities)

    # Check if response includes entity mentions
    mention_name_cols = [(name, table.column(name))
                         for name in table.column_names
                         if name.lower().startswith("mentions")]

    # Make entities and entity mentions (optional) DataFrames
    if len(mention_name_cols) > 0:
        mention_names, mention_cols = zip(*mention_name_cols)

        # Create the entities DataFrame with mention arrays dropped
        table = table.drop(mention_names)
        pdf = table.to_pandas()

        # Flatten the mention arrays to be put in separate table
        mention_arrays = [
            pa.concat_arrays(col.iterchunks()) for col in mention_cols
        ]
        flat_mention_arrays = [a.flatten() for a in mention_arrays]
        table_mentions = pa.Table.from_arrays(flat_mention_arrays,
                                              names=mention_names)

        # Convert location/text columns to span
        location_col, location_name = util.find_column(table_mentions,
                                                       "location")
        text_col, text_name = util.find_column(table_mentions, "text")
        if original_text is None:
            raise ValueError(
                "Unable to construct target text for converting entity mentions to spans"
            )

        char_span = util.make_char_span(location_col, text_col, original_text)
        table_mentions = table_mentions.drop([location_name, text_name])

        # Create the entity_mentions DataFrame
        pdf_mentions = table_mentions.to_pandas()
        pdf_mentions["span"] = char_span

        # Align index of parent entities DataFrame with flattened DataFrame and ffill
        # values
        mention_offsets = mention_arrays[0].offsets.to_numpy()
        pdf_parent = pdf.set_index(mention_offsets[:-1])
        pdf_parent = pdf_parent.reindex(pdf_mentions.index, method="ffill")

        # Add columns from entities parent DataFrame
        pdf_mentions["text"] = pdf_parent["text"]
        pdf_mentions["type"] = pdf_parent["type"]

        # Remove "mentions" from column names
        pdf_mentions.rename(columns={
            c: c.split("mentions.")[-1]
            for c in pdf_mentions.columns
        },
                            inplace=True)
    else:
        pdf = table.to_pandas()
        pdf_mentions = pd.DataFrame()

    return pdf, pdf_mentions
Example #25
def _make_relations_dataframe_zero_copy(relations):
    if len(relations) == 0:
        return pd.DataFrame()

    table = util.make_table(relations)

    # Separate each argument into a column
    flattened_arguments = []
    drop_cols = []
    for name in table.column_names:
        if name.lower().startswith("arguments"):
            col = pa.concat_arrays(table.column(name).iterchunks())
            assert pa.types.is_list(col.type)
            is_nested_list = pa.types.is_list(col.type.value_type)

            name_split = name.split('.', maxsplit=1)
            first_list = col[0]
            num_arguments = len(first_list)

            null_count = 0

            # Get the flattened raw values
            raw = col
            offset_arrays = []
            while pa.types.is_list(raw.type):
                offset_arrays.append(raw.offsets)
                null_count += raw.null_count
                raw = raw.flatten()

            # TODO handle lists with null values
            if null_count > 0:
                continue

            # Convert values to numpy
            values = raw.to_numpy(zero_copy_only=False)  # string might copy
            offsets_list = [o.to_numpy() for o in offset_arrays]

            # Compute the length of each list in the array
            value_offsets = offsets_list.pop()
            value_lengths = value_offsets[1:] - value_offsets[:-1]

            # Separate the arguments into individual columns
            for i in range(num_arguments):
                arg_name = "{}.{}.{}".format(name_split[0], i, name_split[1])
                arg_lengths = value_lengths[i::num_arguments]

                # Fixed length arrays can be sliced
                if not is_nested_list or len(np.unique(arg_lengths)) == 1:
                    num_elements = len(first_list[i]) if is_nested_list else 1

                    # Only 1 element so leave in primitive array
                    if not is_nested_list or num_elements == 1:
                        arg_values = values[i::num_arguments]
                        arg_array = pa.array(arg_values)
                    # Multiple elements so put back in a list array
                    else:
                        arg_values = values.reshape(
                            [len(col) * num_arguments, num_elements])
                        arg_values = arg_values[i::num_elements]
                        arg_values = arg_values.flatten()
                        arg_offsets = np.cumsum(arg_lengths)
                        arg_offsets = np.insert(arg_offsets, 0, 0)
                        arg_array = pa.ListArray.from_arrays(
                            arg_offsets, arg_values)
                else:
                    # TODO Argument properties with variable length arrays not currently
                    #  supported
                    continue

                flattened_arguments.append((arg_array, arg_name))
            drop_cols.append(name)

    # Add the flattened argument columns
    for arg_array, arg_name in flattened_arguments:
        table = table.append_column(arg_name, arg_array)

    # Drop columns that have been flattened
    table = table.drop(drop_cols)

    return table.to_pandas()
Example #26
def _make_relations_dataframe(relations, original_text, sentence_span_series):
    if len(relations) == 0:
        return pd.DataFrame()

    table = util.make_table(relations)

    location_cols = {}  # Type: Dict[int, Tuple[Union[Array, ChunkedArray], str]]

    # Separate each argument into a column
    flattened_arguments = []
    drop_cols = []
    for name in table.column_names:
        if name.lower().startswith("arguments"):
            col = pa.concat_arrays(table.column(name).iterchunks())
            assert pa.types.is_list(col.type)

            name_split = name.split('.', maxsplit=1)
            num_arguments = len(col[0])

            value_series = col.values.to_pandas()

            # Separate the arguments into individual columns
            for i in range(num_arguments):
                arg_name = "{}.{}.{}".format(name_split[0], i, name_split[1])
                arg_series = value_series[i::num_arguments]

                arg_array = pa.array(arg_series)

                # If list array is fixed length with 1 element, it can be flattened
                temp = arg_array
                while pa.types.is_list(temp.type):
                    temp = temp.flatten()
                    if len(temp) == len(arg_array):
                        # TODO also need to verify each offset inc by 1?
                        arg_array = temp

                if name.lower().endswith("location"):
                    location_cols[i] = (arg_array,
                                        "{}.{}".format(name_split[0], i))

                flattened_arguments.append((arg_array, arg_name))
            drop_cols.append(name)

    # Add the flattened argument columns
    for arg_array, arg_name in flattened_arguments:
        table = table.append_column(arg_name, arg_array)

    # Replace argument location and text columns with spans
    arg_span_cols = {}
    for arg_i, (location_col, arg_prefix) in location_cols.items():
        text_col, text_name = util.find_column(table,
                                               "{}.text".format(arg_prefix))
        arg_span_cols["{}.span".format(arg_prefix)] = util.make_char_span(
            location_col, text_col, original_text)
        drop_cols.extend(["{}.location".format(arg_prefix), text_name])

    add_cols = arg_span_cols.copy()

    # Build the sentence span and drop plain text sentence col
    sentence_col, sentence_name = util.find_column(table, "sentence")
    arg_col_names = list(arg_span_cols.keys())
    if len(arg_col_names) > 0:
        first_arg_span_array = arg_span_cols[arg_col_names[0]]

        sentence_matches = []
        for i, arg_span in enumerate(first_arg_span_array):
            arg_begin = arg_span.begin
            arg_end = arg_span.end
            j = len(sentence_span_series) // 2
            found = False
            while not found:
                sentence_span = sentence_span_series[j]
                if arg_begin >= sentence_span.end:
                    j += 1
                elif arg_end <= sentence_span.begin:
                    j -= 1
                else:
                    contains = [
                        sentence_span.contains(a[i])
                        for a in arg_span_cols.values()
                    ]
                    if not (all(contains) and sentence_span.covered_text
                            == sentence_col[i].as_py()):
                        msg = f"Mismatched sentence span for: {sentence_span}"
                        if not all(contains):
                            msg += f"\nContains Args: {all(contains)}"
                        if sentence_span.covered_text != sentence_col[i].as_py():
                            msg += f"\nSpanText: '{sentence_span.covered_text}'" \
                                   f"\nSentence: '{sentence_col[i]}'"
                        warnings.warn(msg)
                    sentence_matches.append(j)
                    found = True

        relations_sentence = sentence_span_series[sentence_matches]
        add_cols["sentence_span"] = relations_sentence.reset_index(drop=True)
        drop_cols.append(sentence_name)
    else:
        warnings.warn("Could not make sentence span column for Re")

    # Drop columns that have been flattened or replaced by spans
    table = table.drop(drop_cols)

    df = table.to_pandas()

    # Insert additional columns
    for col_name, col in add_cols.items():
        df[col_name] = col

    return df
Example #27
def test_concat_array_different_types():
    with pytest.raises(pa.ArrowInvalid):
        pa.concat_arrays([pa.array([1]), pa.array([2.])])
Example #28
def test_concat_array():
    concatenated = pa.concat_arrays([pa.array([1, 2]), pa.array([3, 4])])
    assert concatenated.equals(pa.array([1, 2, 3, 4]))
Example #29
def list_of_pa_arrays_to_pyarrow_listarray(
        l_arr: List[pa.Array]) -> pa.ListArray:
    offsets = pa.array(np.cumsum([0] + [len(arr) for arr in l_arr]),
                       type=pa.int32())
    values = pa.concat_arrays(l_arr)
    return pa.ListArray.from_arrays(offsets, values)
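
A standalone walk-through of the same construction with made-up pieces, showing the offsets the helper derives from the cumulative lengths:

import numpy as np
import pyarrow as pa

pieces = [pa.array([1, 2]), pa.array([3]), pa.array([4, 5, 6])]

offsets = pa.array(np.cumsum([0] + [len(arr) for arr in pieces]), type=pa.int32())
values = pa.concat_arrays(pieces)          # flat values: [1, 2, 3, 4, 5, 6]
list_array = pa.ListArray.from_arrays(offsets, values)

print(offsets.to_pylist())                 # [0, 2, 3, 6]
print(list_array.to_pylist())              # [[1, 2], [3], [4, 5, 6]]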
Example #30
    def __init__(self,
                 expression,
                 values,
                 keep_other=True,
                 other_value=None,
                 sort=False,
                 label=None,
                 df=None):
        self.df = df or expression.df
        self.sort = sort
        self.pre_sort = True
        self.expression = self.df[str(expression)]
        self.label = label or self.expression._label
        self.keep_other = keep_other
        if isinstance(values, pa.ChunkedArray):
            values = pa.concat_arrays(values.chunks)
        if sort:
            indices = pa.compute.sort_indices(values)
            values = pa.compute.take(values, indices)

        if self.keep_other:
            self.bin_values = pa.array(
                vaex.array_types.tolist(values) + [other_value])
            self.values = self.bin_values.slice(0, len(self.bin_values) - 1)
        else:
            raise NotImplementedError("not supported yet")
            # although we can support this, it will fail with _combine, because of
            # the mapping of the set to -1
            self.bin_values = pa.array(vaex.array_types.tolist(values))
            self.values = self.bin_values
        self.N = len(self.bin_values)
        dtype = vaex.dtype_of(self.values)
        set_type = vaex.hash.ordered_set_type_from_dtype(dtype)
        values_list = self.values.tolist()
        try:
            null_value = values_list.index(None)
            null_count = 1
        except ValueError:
            null_value = -1
            null_count = 0
        if vaex.dtype_of(self.values) == float:
            nancount = np.isnan(self.values).sum()
        else:
            nancount = 0

        fp = vaex.cache.fingerprint(values)
        fingerprint = f"set-grouper-fixed-{fp}"
        if dtype.is_string:
            values = vaex.column.ColumnStringArrow.from_arrow(self.values)
            string_sequence = values.string_sequence
            self.set = set_type(string_sequence, null_value, nancount,
                                null_count, fingerprint)
        else:
            self.set = set_type(self.values, null_value, nancount, null_count,
                                fingerprint)

        self.basename = "set_%s" % vaex.utils._python_save_name(
            str(self.expression) + "_" + self.set.fingerprint)
        self.binby_expression = expression
        self.sort_indices = None
        self._promise = vaex.promise.Promise.fulfilled(None)