Example #1
def test_chunked_array_asarray():
    data = [
        pa.array([0]),
        pa.array([1, 2, 3])
    ]
    chunked_arr = pa.chunked_array(data)

    np_arr = np.asarray(chunked_arr)
    assert np_arr.tolist() == [0, 1, 2, 3]
    assert np_arr.dtype == np.dtype('int64')

    # An optional type can be specified when calling np.asarray
    np_arr = np.asarray(chunked_arr, dtype='str')
    assert np_arr.tolist() == ['0', '1', '2', '3']

    # The result dtype is promoted (int64 -> float64) when nulls are present
    data = [
        pa.array([1, None]),
        pa.array([1, 2, 3])
    ]
    chunked_arr = pa.chunked_array(data)

    np_arr = np.asarray(chunked_arr)
    elements = np_arr.tolist()
    assert elements[0] == 1.
    assert np.isnan(elements[1])
    assert elements[2:] == [1., 2., 3.]
    assert np_arr.dtype == np.dtype('float64')
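
A quick aside on the behavior tested above: ChunkedArray implements the NumPy array protocol, so np.asarray works on it directly, and nulls force promotion to a float dtype. A minimal sketch (not from the test suite):

import numpy as np
import pyarrow as pa

# Nulls cannot be represented in an int64 NumPy array, so the conversion
# promotes to float64 and renders None as NaN.
chunked = pa.chunked_array([[1, None], [3]])
np_arr = np.asarray(chunked)
assert np_arr.dtype == np.dtype('float64')
assert np.isnan(np_arr[1])
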
Example #2
 def ne(xarrs, yarrs):
     if isinstance(xarrs, pa.ChunkedArray):
         x = xarrs
     else:
         x = pa.chunked_array(xarrs)
     if isinstance(yarrs, pa.ChunkedArray):
         y = yarrs
     else:
         y = pa.chunked_array(yarrs)
     assert not x.equals(y)
     assert not y.equals(x)
Example #3
def test_table_pickle():
    data = [
        pa.chunked_array([[1, 2], [3, 4]], type=pa.uint32()),
        pa.chunked_array([["some", "strings", None, ""]], type=pa.string()),
    ]
    schema = pa.schema([pa.field('ints', pa.uint32()),
                        pa.field('strs', pa.string())],
                       metadata={b'foo': b'bar'})
    table = pa.Table.from_arrays(data, schema=schema)

    result = pickle.loads(pickle.dumps(table))
    result._validate()
    assert result.equals(table)
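
Pickling is not limited to tables; a ChunkedArray round-trips the same way, preserving its chunk layout (Example #11 below relies on this too). A minimal sketch:

import pickle
import pyarrow as pa

# Round-trip a ChunkedArray through pickle; values and chunking survive.
arr = pa.chunked_array([[1, 2], [3]])
restored = pickle.loads(pickle.dumps(arr))
assert restored.equals(arr)
assert restored.num_chunks == arr.num_chunks
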
Example #4
 def eq(xarrs, yarrs):
     if isinstance(xarrs, pa.ChunkedArray):
         x = xarrs
     else:
         x = pa.chunked_array(xarrs)
     if isinstance(yarrs, pa.ChunkedArray):
         y = yarrs
     else:
         y = pa.chunked_array(yarrs)
     assert x.equals(y)
     assert y.equals(x)
     assert x == y
     assert x != str(y)
Example #5
def test_table_from_pydict():
    table = pa.Table.from_pydict({})
    assert table.num_columns == 0
    assert table.num_rows == 0
    assert table.schema == pa.schema([])
    assert table.to_pydict() == {}

    # With arrays as values
    data = OrderedDict([('strs', pa.array([u'', u'foo', u'bar'])),
                        ('floats', pa.array([4.5, 5, None]))])
    schema = pa.schema([('strs', pa.utf8()), ('floats', pa.float64())])
    table = pa.Table.from_pydict(data)
    assert table.num_columns == 2
    assert table.num_rows == 3
    assert table.schema == schema

    # With chunked arrays as values
    data = OrderedDict([('strs', pa.chunked_array([[u''], [u'foo', u'bar']])),
                        ('floats', pa.chunked_array([[4.5], [5, None]]))])
    table = pa.Table.from_pydict(data)
    assert table.num_columns == 2
    assert table.num_rows == 3
    assert table.schema == schema

    # With lists as values
    data = OrderedDict([('strs', [u'', u'foo', u'bar']),
                        ('floats', [4.5, 5, None])])
    table = pa.Table.from_pydict(data)
    assert table.num_columns == 2
    assert table.num_rows == 3
    assert table.schema == schema
    assert table.to_pydict() == data

    # With metadata and inferred schema
    metadata = {b'foo': b'bar'}
    schema = schema.add_metadata(metadata)
    table = pa.Table.from_pydict(data, metadata=metadata)
    assert table.schema == schema
    assert table.schema.metadata == metadata
    assert table.to_pydict() == data

    # With explicit schema
    table = pa.Table.from_pydict(data, schema=schema)
    assert table.schema == schema
    assert table.schema.metadata == metadata
    assert table.to_pydict() == data

    # Cannot pass both schema and metadata
    with pytest.raises(ValueError):
        pa.Table.from_pydict(data, schema=schema, metadata=metadata)
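
As a related sketch: newer pyarrow releases also offer the pa.table() factory, which accepts the same dict-of-columns input, mixing plain lists, Arrays, and ChunkedArrays as values:

import pyarrow as pa

# pa.table() behaves like Table.from_pydict for dict input.
tbl = pa.table({'strs': ['', 'foo', 'bar'],
                'floats': pa.chunked_array([[4.5], [5.0, None]])})
assert tbl.num_rows == 3 and tbl.num_columns == 2
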
Example #6
def test_chunked_array_str():
    data = [
        pa.array([1, 2, 3]),
        pa.array([4, 5, 6])
    ]
    data = pa.chunked_array(data)
    assert str(data) == """[
  [
    1,
    2,
    3
  ],
  [
    4,
    5,
    6
  ]
]"""
Example #7
def test_chunked_array_basics():
    data = pa.chunked_array([], type=pa.string())
    assert data.type == pa.string()
    assert data.to_pylist() == []

    with pytest.raises(ValueError):
        pa.chunked_array([])

    data = pa.chunked_array([
        [1, 2, 3],
        [4, 5, 6],
        [7, 8, 9]
    ])
    assert isinstance(data.chunks, list)
    assert all(isinstance(c, pa.lib.Int64Array) for c in data.chunks)
    assert all(isinstance(c, pa.lib.Int64Array) for c in data.iterchunks())
    assert len(data.chunks) == 3
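
If a single contiguous Array is needed afterwards, the chunks can be flattened back together; a minimal sketch:

import pyarrow as pa

# pa.concat_arrays copies the chunks into one contiguous Array; the values
# are unchanged, only the physical layout differs.
data = pa.chunked_array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
flat = pa.concat_arrays(data.chunks)
assert flat.to_pylist() == data.to_pylist()
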
Example #8
def simple_dicts_table():
    dict_values = pa.array(["foo", "baz", "quux"], type=pa.utf8())
    data = [
        pa.chunked_array([
            pa.DictionaryArray.from_arrays([1, 0, None], dict_values),
            pa.DictionaryArray.from_arrays([2, 1], dict_values)]),
    ]
    return pa.Table.from_arrays(data, names=['some_dicts'])
Example #9
def test_chunked_array_pickle(data, typ):
    arrays = []
    while data:
        arrays.append(pa.array(data[:2], type=typ))
        data = data[2:]
    array = pa.chunked_array(arrays, type=typ)
    result = pickle.loads(pickle.dumps(array))
    assert result.equals(array)
Example #10
def test_chunked_array_equals():
    def eq(xarrs, yarrs):
        if isinstance(xarrs, pa.ChunkedArray):
            x = xarrs
        else:
            x = pa.chunked_array(xarrs)
        if isinstance(yarrs, pa.ChunkedArray):
            y = yarrs
        else:
            y = pa.chunked_array(yarrs)
        assert x.equals(y)
        assert y.equals(x)
        assert x == y
        assert x != str(y)

    def ne(xarrs, yarrs):
        if isinstance(xarrs, pa.ChunkedArray):
            x = xarrs
        else:
            x = pa.chunked_array(xarrs)
        if isinstance(yarrs, pa.ChunkedArray):
            y = yarrs
        else:
            y = pa.chunked_array(yarrs)
        assert not x.equals(y)
        assert not y.equals(x)
        assert x != y

    eq(pa.chunked_array([], type=pa.int32()),
       pa.chunked_array([], type=pa.int32()))
    ne(pa.chunked_array([], type=pa.int32()),
       pa.chunked_array([], type=pa.int64()))

    a = pa.array([0, 2], type=pa.int32())
    b = pa.array([0, 2], type=pa.int64())
    c = pa.array([0, 3], type=pa.int32())
    d = pa.array([0, 2, 0, 3], type=pa.int32())

    eq([a], [a])
    ne([a], [b])
    eq([a, c], [a, c])
    eq([a, c], [d])
    ne([c, a], [a, c])

    assert not pa.chunked_array([], type=pa.int32()).equals(None)
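
Note what eq([a, c], [d]) above establishes: equality is defined over the logical values, not the chunk layout. A minimal sketch of the same point:

import pyarrow as pa

# Two chunks [0, 2] + [0, 3] compare equal to one chunk [0, 2, 0, 3].
x = pa.chunked_array([[0, 2], [0, 3]])
y = pa.chunked_array([[0, 2, 0, 3]])
assert x.equals(y) and x == y
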
Example #11
def test_column_pickle():
    arr = pa.chunked_array([[1, 2], [5, 6, 7]], type=pa.int16())
    field = pa.field("ints", pa.int16()).add_metadata({b"foo": b"bar"})
    col = pa.column(field, arr)

    result = pickle.loads(pickle.dumps(col))
    assert result.equals(col)
    assert result.data.num_chunks == 2
    assert result.field == field
Example #12
def chunked_arrays(draw, type, min_chunks=0, max_chunks=None, chunk_size=None):
    if isinstance(type, st.SearchStrategy):
        type = draw(type)

    # TODO(kszucs): remove it, field metadata is not kept
    h.assume(not pa.types.is_struct(type))

    chunk = arrays(type, size=chunk_size)
    chunks = st.lists(chunk, min_size=min_chunks, max_size=max_chunks)

    return pa.chunked_array(draw(chunks), type=type)
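
A hypothetical use of this hypothesis strategy; the snippet above resembles pyarrow.tests.strategies, so that import path is an assumption:

import pickle

import hypothesis as h
import hypothesis.strategies as st
import pyarrow as pa

# Assumed import path for the composite strategy defined above.
from pyarrow.tests.strategies import chunked_arrays

# Drive a property test with randomly chunked int64 arrays.
@h.given(arr=chunked_arrays(st.just(pa.int64()), max_chunks=3))
def test_pickle_roundtrip(arr):
    assert pickle.loads(pickle.dumps(arr)).equals(arr)
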
Example #13
def test_chunked_array_iter():
    data = [
        pa.array([0]),
        pa.array([1, 2, 3]),
        pa.array([4, 5, 6]),
        pa.array([7, 8, 9])
    ]
    arr = pa.chunked_array(data)

    for i, j in zip(range(10), arr):
        assert i == j

    assert isinstance(arr, Iterable)
Example #14
def test_unique_simple():
    cases = [
        (pa.array([1, 2, 3, 1, 2, 3]), pa.array([1, 2, 3])),
        (pa.array(['foo', None, 'bar', 'foo']),
         pa.array(['foo', 'bar']))
    ]
    for arr, expected in cases:
        result = arr.unique()
        assert result.equals(expected)
        result = pa.column("column", arr).unique()
        assert result.equals(expected)
        result = pa.chunked_array([arr]).unique()
        assert result.equals(expected)
Example #15
def test_dictionary_encode_simple():
    cases = [
        (pa.array([1, 2, 3, None, 1, 2, 3]),
         pa.DictionaryArray.from_arrays(
             pa.array([0, 1, 2, None, 0, 1, 2], type='int32'),
             [1, 2, 3])),
        (pa.array(['foo', None, 'bar', 'foo']),
         pa.DictionaryArray.from_arrays(
             pa.array([0, None, 1, 0], type='int32'),
             ['foo', 'bar']))
    ]
    for arr, expected in cases:
        result = arr.dictionary_encode()
        assert result.equals(expected)
        result = pa.column("column", arr).dictionary_encode()
        assert result.data.chunk(0).equals(expected)
        result = pa.chunked_array([arr]).dictionary_encode()
        assert result.chunk(0).equals(expected)
Example #16
def test_chunked_array_flatten():
    ty = pa.struct([pa.field('x', pa.int16()), pa.field('y', pa.float32())])
    a = pa.array([(1, 2.5), (3, 4.5), (5, 6.5)], type=ty)
    carr = pa.chunked_array(a)
    x, y = carr.flatten()
    assert x.equals(pa.chunked_array(pa.array([1, 3, 5], type=pa.int16())))
    assert y.equals(
        pa.chunked_array(pa.array([2.5, 4.5, 6.5], type=pa.float32())))

    # Empty column
    a = pa.array([], type=ty)
    carr = pa.chunked_array(a)
    x, y = carr.flatten()
    assert x.equals(pa.chunked_array(pa.array([], type=pa.int16())))
    assert y.equals(pa.chunked_array(pa.array([], type=pa.float32())))
Example #17
def _render_minimum_or_maximum(table, colnames, outcolname, fn):
    if not colnames:
        return ArrowRenderResult(table)

    out_np_arrays = []

    num_chunks = table[colnames[0]].num_chunks
    for chunk in range(num_chunks):
        in_np_arrays = [
            table[colname].chunk(chunk).to_numpy(zero_copy_only=False)
            for colname in colnames
        ]
        out_np_array = fn.reduce(in_np_arrays)
        out_np_arrays.append(out_np_array)

    if outcolname in table.column_names:
        table = table.remove_column(table.column_names.index(outcolname))

    table = table.append_column(
        outcolname, pa.chunked_array(out_np_arrays, pa.timestamp("ns")))
    return ArrowRenderResult(table)
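
fn here is presumably a NumPy ufunc such as np.minimum or np.maximum, so fn.reduce folds the per-column arrays element-wise. A minimal sketch of that step:

import numpy as np

# Element-wise minimum across several columns of one chunk.
in_np_arrays = [np.array([3, 1, 4]), np.array([2, 7, 1])]
print(np.minimum.reduce(in_np_arrays))  # -> [2 1 1]
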
Example #18
def test_column_factory_function():
    # ARROW-1575
    arr = pa.array([0, 1, 2, 3, 4])
    arr2 = pa.array([5, 6, 7, 8])

    col1 = pa.Column.from_array('foo', arr)
    col2 = pa.Column.from_array(pa.field('foo', arr.type), arr)

    assert col1.equals(col2)

    col3 = pa.column('foo', [arr, arr2])
    chunked_arr = pa.chunked_array([arr, arr2])
    col4 = pa.column('foo', chunked_arr)
    assert col3.equals(col4)

    col5 = pa.column('foo', arr.to_pandas())
    assert col5.equals(pa.column('foo', arr))

    # Type mismatch
    with pytest.raises(ValueError):
        pa.Column.from_array(pa.field('foo', pa.string()), arr)
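
Worth noting: pa.Column and the pa.column factory were removed around pyarrow 0.15; since then a table column is simply a ChunkedArray. A sketch of the modern equivalent of the factory calls above:

import pyarrow as pa

# Without pa.Column, attach the chunked data to a table directly; the
# column comes back as a ChunkedArray.
chunked = pa.chunked_array([pa.array([0, 1, 2, 3, 4]), pa.array([5, 6, 7, 8])])
table = pa.table({'foo': chunked})
assert table.column('foo').equals(chunked)
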
Example #19
    def __arrow_array__(self, type=None):
        # type: (pa.DataType,) -> pa.Array
        """
        Implement pyarrow array interface (requires pyarrow>=0.15).

        Returns
        -------
        pa.Array

        """
        if self._has_single_chunk:
            data = self.data.chunks[0]
        else:
            data = pa.concat_arrays(self.data.iterchunks())
            # Cache the concatenated result back as a single-chunk array.
            self.data = pa.chunked_array([data])

        if type is not None and type != data.type:
            return data.cast(type, safe=False)
        else:
            return data
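
For context, a sketch of how this protocol is consumed: as the docstring above says, pyarrow >= 0.15 has pa.array() check for __arrow_array__, so such objects convert transparently. The Wrapped class here is a toy illustration, not from the source:

import pyarrow as pa

class Wrapped:
    """Toy container exposing the __arrow_array__ protocol."""
    def __init__(self, values):
        self.data = pa.chunked_array([values])

    def __arrow_array__(self, type=None):
        data = pa.concat_arrays(self.data.iterchunks())
        if type is not None and type != data.type:
            return data.cast(type, safe=False)
        return data

# pa.array() delegates to __arrow_array__ instead of iterating the object.
arr = pa.array(Wrapped(pa.array([1, 2, 3])), type=pa.int32())
assert arr.type == pa.int32() and arr.to_pylist() == [1, 2, 3]
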
Example #21
    def __setitem__(self, key, value):
        if isinstance(value, (pd.Index, pd.Series)):
            value = value.to_numpy()

        key = check_array_indexer(self, key)
        scalar_key = is_scalar(key)

        # validate new items
        if scalar_key:
            if pd.isna(value):
                value = None
            elif not is_list_like(value):
                raise ValueError('Must provide list.')

        if self._use_arrow:
            array = np.asarray(self._arrow_array.to_pandas())
            array[key] = value
            self._arrow_array = pa.chunked_array(
                [pa.array(array, type=self.dtype.arrow_type)])
        else:
            self._ndarray[key] = value
Example #22
 def reader():
     record_batches = []
     for fragment in fragments:
         for scan_task in fragment.scan(batch_size=chunk_size,
                                        use_threads=False,
                                        columns=columns):
             for record_batch in scan_task.execute():
                 record_batches.append((record_batch))
     dict_or_list_of_arrays = collections.defaultdict(list)
     for rb in record_batches:
         for name, array in zip(rb.schema.names, rb.columns):
             dict_or_list_of_arrays[name].append(array)
     chunks = {
         name: pa.chunked_array(arrays)
         for name, arrays in dict_or_list_of_arrays.items()
     }
     for name, chunk in chunks.items():
         assert len(chunk) == rows_planned, (
             f'Oops, got a chunk ({name}) of length {len(chunk)} '
             f'while it is expected to be of length {rows_planned}')
     return chunks
Example #23
    def fillna(self, value=None, method=None, limit=None):
        cls = type(self)

        if pa is None or self._force_use_pandas:
            # pyarrow not installed
            return cls(
                pd.Series(self.to_numpy()).fillna(value=value,
                                                  method=method,
                                                  limit=limit))

        chunks = []
        for chunk_array in self._arrow_array.chunks:
            array = chunk_array.to_pandas()
            if method is None:
                result_array = self._array_fillna(array, value)
            else:
                result_array = array.fillna(value=value,
                                            method=method,
                                            limit=limit)
            chunks.append(pa.array(result_array, from_pandas=True))
        return cls(pa.chunked_array(chunks), dtype=self._dtype)
Example #24
 def test_arrow_chunked_struct(self):
     if pyarrow is None:
         pytest.skip("unable to import pyarrow")
     else:
         a = pyarrow.chunked_array([
             pyarrow.array([{"x": 1, "y": 1.1}, {"x": 2, "y": 2.2},
                            {"x": 3, "y": 3.3}]),
             pyarrow.array([]),
             pyarrow.array([{"x": 4, "y": 4.4}, {"x": 5, "y": 5.5}])
         ])
         assert awkward.arrow.fromarrow(a).tolist() == [
             {"x": 1, "y": 1.1}, {"x": 2, "y": 2.2}, {"x": 3, "y": 3.3},
             {"x": 4, "y": 4.4}, {"x": 5, "y": 5.5}]
Example #25
def test_chunked_array_getitem():
    data = [pa.array([1, 2, 3]), pa.array([4, 5, 6])]
    data = pa.chunked_array(data)
    assert data[1].as_py() == 2
    assert data[-1].as_py() == 6
    assert data[-6].as_py() == 1
    with pytest.raises(IndexError):
        data[6]
    with pytest.raises(IndexError):
        data[-7]
    # Ensure this works with numpy scalars
    assert data[np.int32(1)].as_py() == 2

    data_slice = data[2:4]
    assert data_slice.to_pylist() == [3, 4]

    data_slice = data[4:-1]
    assert data_slice.to_pylist() == [5]

    data_slice = data[99:99]
    assert data_slice.type == data.type
    assert data_slice.to_pylist() == []
Example #26
def test_chunked_array_slice():
    data = [
        pa.array([1, 2, 3]),
        pa.array([4, 5, 6])
    ]
    data = pa.chunked_array(data)

    data_slice = data.slice(len(data))
    assert data_slice.type == data.type
    assert data_slice.to_pylist() == []

    data_slice = data.slice(len(data) + 10)
    assert data_slice.type == data.type
    assert data_slice.to_pylist() == []

    table = pa.Table.from_arrays([data], names=["a"])
    table_slice = table.slice(len(table))
    assert len(table_slice) == 0

    table = pa.Table.from_arrays([data], names=["a"])
    table_slice = table.slice(len(table) + 10)
    assert len(table_slice) == 0
Example #27
 def __getitem__(self, item):
     if isinstance(item, slice):
         chunks = []
         ds = self.ds.__getitem__(item)
         for chunk_start, chunk_end, reader in ds.chunk_iterator(
             [self.name]):
             ar = reader()[self.name]
             if isinstance(ar, pa.ChunkedArray):
                 chunks.extend(ar.chunks)
             else:
                 chunks.append(ar)
         if len(chunks) == 1:
             return chunks[0]
         if any(isinstance(k, vaex.array_types.supported_arrow_array_types)
                for k in chunks):
             return pa.chunked_array(chunks)
         else:
             return np.concatenate(chunks)
     else:
         raise NotImplementedError
Example #28
    def _set_via_chunk_iteration(self, indices: npt.NDArray[np.intp],
                                 value: npt.NDArray[Any]) -> pa.ChunkedArray:
        """
        Loop through the array chunks and set the new values while
        leaving the chunking layout unchanged.
        """

        chunk_indices = self._within_chunk_indices(indices)
        new_data = []

        for i, chunk in enumerate(self._data.iterchunks()):

            c_ind = chunk_indices[i]
            n = len(c_ind)
            c_value, value = value[:n], value[n:]

            if n == 1:
                # fast path
                chunk = self._set_single_index_in_chunk(
                    chunk, c_ind[0], c_value[0])
            elif n > 0:
                mask = np.zeros(len(chunk), dtype=np.bool_)
                mask[c_ind] = True
                if not pa_version_under5p0:
                    if c_value is None or isna(np.array(c_value)).all():
                        chunk = pc.if_else(mask, None, chunk)
                    else:
                        chunk = pc.replace_with_mask(chunk, mask, c_value)
                else:
                    # The pyarrow compute functions were added in
                    # version 5.0. For prior versions we implement
                    # our own by converting to numpy and back.
                    chunk = chunk.to_numpy(zero_copy_only=False)
                    chunk[mask] = c_value
                    chunk = pa.array(chunk, type=pa.string())

            new_data.append(chunk)

        return pa.chunked_array(new_data)
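
A minimal sketch of the pyarrow >= 5.0 fast path used above: replace_with_mask writes one replacement value per True in the mask and passes the other slots through unchanged.

import pyarrow as pa
import pyarrow.compute as pc

chunk = pa.array([1, 2, 3])
mask = pa.array([False, True, False])
out = pc.replace_with_mask(chunk, mask, pa.array([20]))
assert out.to_pylist() == [1, 20, 3]
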
Example #29
def _2(a: pa.Array, b: Any, op: Callable):
    """Apply a NumPy ufunc where at least one of the arguments is an Arrow structure."""
    if isinstance(b, pa.ChunkedArray):
        new_chunks = []
        offsets = _calculate_chunk_offsets(b)
        for chunk, offset in zip(b.iterchunks(), offsets):
            new_chunks.append(
                np_ufunc_op(a[offset:offset + len(chunk)], chunk, op))
        return pa.chunked_array(new_chunks)
    elif isinstance(b, pa.Array):
        np_arr_a = _extract_data_buffer_as_np_array(a)
        np_arr_b = _extract_data_buffer_as_np_array(b)
        if a.null_count > 0 and b.null_count > 0:
            # TODO: Combine them before extracting
            mask_a = extract_isnull_bytemap(a)
            mask_b = extract_isnull_bytemap(b)
            mask = mask_a | mask_b
        elif a.null_count > 0:
            mask = extract_isnull_bytemap(a)
        elif b.null_count > 0:
            mask = extract_isnull_bytemap(b)
        else:
            mask = None

        new_arr = op(np_arr_a, np_arr_b)
        # Don't set type as we might have valid casts like int->float in truediv
        return pa.array(new_arr, mask=mask)
    else:
        # b is non-masked, either array-like or scalar
        # numpy can handle all types of b from here
        np_arr = _extract_data_buffer_as_np_array(a)
        if a.null_count > 0:
            mask = extract_isnull_bytemap(a)
        else:
            mask = None
        new_arr = op(np_arr, b)
        # Don't set type as we might have valid casts like int->float in truediv
        return pa.array(new_arr, mask=mask)
Example #30
def convert(x, type, default_type="numpy"):
    import vaex.column
    if type == "numpy":
        if isinstance(x, (list, tuple)):
            return concat([convert(k, type) for k in x])
        else:
            return to_numpy(x, strict=True)
    if type == "numpy-arrow":  # used internally, numpy if possible, otherwise arrow
        if isinstance(x, (list, tuple)):
            return concat([convert(k, type) for k in x])
        else:
            return to_numpy(x, strict=False)

    elif type == "arrow":
        if isinstance(x, (list, tuple)):
            return pa.chunked_array([convert(k, type) for k in x])
        else:
            return to_arrow(x)
    elif type == "xarray":
        return to_xarray(x)
    elif type in ['list', 'python']:
        return convert(x, 'numpy').tolist()
    elif type is None:
        if isinstance(x, (list, tuple)):
            chunks = [convert(k, type) for k in x]
            if isinstance(chunks[0], (pa.Array, pa.ChunkedArray,
                                      vaex.column.ColumnStringArrow)):
                return convert(chunks, "arrow")
            elif isinstance(chunks[0], np.ndarray):
                return convert(chunks, "numpy")
            else:
                raise ValueError("Unknown type: %r" % chunks[0])
        else:
            # return convert(x, default_type)
            return x
    else:
        raise ValueError("Unknown type: %r" % type)
Example #31
 def test_arrow_chunked_struct(self):
     if pyarrow is not None:
         a = pyarrow.chunked_array([
             pyarrow.array([{"x": 1, "y": 1.1}, {"x": 2, "y": 2.2},
                            {"x": 3, "y": 3.3}]),
             pyarrow.array([]),
             pyarrow.array([{"x": 4, "y": 4.4}, {"x": 5, "y": 5.5}])
         ])
         assert awkward.arrow.view(a).tolist() == [
             {"x": 1, "y": 1.1}, {"x": 2, "y": 2.2}, {"x": 3, "y": 3.3},
             {"x": 4, "y": 4.4}, {"x": 5, "y": 5.5}]
Example #32
def test_cast_kernel_on_extension_arrays():
    # test array casting
    storage = pa.array([1, 2, 3, 4], pa.int64())
    arr = pa.ExtensionArray.from_storage(IntegerType(), storage)

    # test that no allocation happens during identity cast
    allocated_before_cast = pa.total_allocated_bytes()
    casted = arr.cast(pa.int64())
    assert pa.total_allocated_bytes() == allocated_before_cast

    cases = [(pa.int64(), pa.Int64Array), (pa.int32(), pa.Int32Array),
             (pa.int16(), pa.Int16Array), (pa.uint64(), pa.UInt64Array),
             (pa.uint32(), pa.UInt32Array), (pa.uint16(), pa.UInt16Array)]
    for typ, klass in cases:
        casted = arr.cast(typ)
        assert casted.type == typ
        assert isinstance(casted, klass)

    # test chunked array casting
    arr = pa.chunked_array([arr, arr])
    casted = arr.cast(pa.int16())
    assert casted.type == pa.int16()
    assert isinstance(casted, pa.ChunkedArray)
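
As the last assertions show, casting a ChunkedArray yields a ChunkedArray again; the cast is applied chunk-wise, so the layout should carry over. A minimal sketch:

import pyarrow as pa

arr = pa.chunked_array([[1, 2], [3, 4]])
casted = arr.cast(pa.int16())
assert isinstance(casted, pa.ChunkedArray)
assert casted.type == pa.int16()
assert casted.num_chunks == arr.num_chunks
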
Example #33
def test_chunked_array_getitem():
    data = [
        pa.array([1, 2, 3]),
        pa.array([4, 5, 6])
    ]
    data = pa.chunked_array(data)
    assert data[1].as_py() == 2
    assert data[-1].as_py() == 6
    assert data[-6].as_py() == 1
    with pytest.raises(IndexError):
        data[6]
    with pytest.raises(IndexError):
        data[-7]

    data_slice = data[2:4]
    assert data_slice.to_pylist() == [3, 4]

    data_slice = data[4:-1]
    assert data_slice.to_pylist() == [5]

    data_slice = data[99:99]
    assert data_slice.type == data.type
    assert data_slice.to_pylist() == []
Example #34
def _2(a: pa.Array, b: Any, ops: Dict[str, Callable]):
    """Apply a NumPy ufunc where at least one of the arguments is an Arrow structure."""
    if isinstance(b, pa.ChunkedArray):
        if len(a) != len(b):
            raise ValueError("Inputs don't have the same length.")
        new_chunks = []
        offsets = _calculate_chunk_offsets(b)
        for chunk, offset in zip(b.iterchunks(), offsets):
            new_chunks.append(
                dispatch_chunked_binary_map(a[offset:offset + len(chunk)],
                                            chunk, ops))
        return pa.chunked_array(new_chunks)
    elif isinstance(b, pa.Array):
        if len(a) != len(b):
            raise ValueError("Inputs don't have the same length.")
        return ops.get("array_array", _not_implemented_path)(a, b)
    else:
        if np.isscalar(b):
            return ops.get("array_scalar", _not_implemented_path)(a, b)
        else:
            if len(a) != len(b):
                raise ValueError("Inputs don't have the same length.")
            return ops.get("array_nparray", _not_implemented_path)(a, b)
Example #35
def _render_difference(table, colname1, colname2, unit, outcolname):
    if not colname1 or not colname2:
        return ArrowRenderResult(table)

    out_arrays = []
    if unit == "nanosecond":
        out_type = pa.int64()
        out_metadata = {"format": "{:,d}"}
    else:
        out_type = pa.float64()
        out_metadata = {"format": "{:,}"}
    num_chunks = table[colname1].num_chunks
    for chunk in range(num_chunks):
        chunk1 = table[colname1].chunk(chunk).cast(pa.int64())
        chunk2 = table[colname2].chunk(chunk).cast(pa.int64())
        # TODO subtract_checked and report error
        difference_in_ns = pa.compute.subtract(chunk2, chunk1)

        if unit == "nanosecond":
            # Nanosecond differences are integers
            out_array = difference_in_ns
        else:
            out_array = pa.compute.divide(
                difference_in_ns.cast(pa.float64(), safe=False),
                pa.scalar(_NS_PER_UNIT[unit], pa.float64()),
            )
        out_arrays.append(out_array)

    if outcolname in table.column_names:
        table = table.remove_column(table.column_names.index(outcolname))

    table = table.append_column(
        pa.field(outcolname, out_type, metadata=out_metadata),
        pa.chunked_array(out_arrays, out_type),
    )

    return ArrowRenderResult(table)
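
Regarding the TODO above, a sketch: pa.compute.subtract_checked raises ArrowInvalid on integer overflow instead of silently wrapping, which would let this step surface an error to the caller.

import pyarrow as pa
import pyarrow.compute as pc

# int64 max minus -1 overflows; the checked kernel raises instead of wrapping.
big = pa.array([2**63 - 1], type=pa.int64())
try:
    pc.subtract_checked(big, pa.array([-1], type=pa.int64()))
except pa.ArrowInvalid as exc:
    print('overflow detected:', exc)
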
Example #36
def test_combined_in_chunk_offsets():
    a = pa.chunked_array([[]])
    b = pa.chunked_array([[]])
    in_a_offsets, in_b_offsets = _combined_in_chunk_offsets(a, b)
    assert in_a_offsets == [(0, 0, 0)]
    assert in_b_offsets == [(0, 0, 0)]

    a = pa.chunked_array([[1]])
    b = pa.chunked_array([[2]])
    in_a_offsets, in_b_offsets = _combined_in_chunk_offsets(a, b)
    assert in_a_offsets == [(0, 0, 1)]
    assert in_b_offsets == [(0, 0, 1)]

    a = pa.chunked_array([[1, 2], [3, 4, 5]])
    b = pa.chunked_array([[1], [2, 3], [4, 5]])
    in_a_offsets, in_b_offsets = _combined_in_chunk_offsets(a, b)
    assert in_a_offsets == [(0, 0, 1), (0, 1, 1), (1, 0, 1), (1, 1, 2)]
    assert in_b_offsets == [(0, 0, 1), (1, 0, 1), (1, 1, 1), (2, 0, 2)]
Example #37
def test_reduce_op_no_identity(data, skipna, op, pandas_op):
    arrow = pa.array(data, type=pa.float64(), from_pandas=True)
    pandas = pd.Series(data, dtype=float)
    should_raise = arrow.null_count == len(arrow) and (skipna or len(arrow) == 0)

    if should_raise:
        with pytest.raises(ValueError):
            assert_allclose_na(op(arrow, skipna), pandas_op(pandas, skipna=skipna))
    else:
        assert_allclose_na(op(arrow, skipna), pandas_op(pandas, skipna=skipna))

    # Split in the middle and check whether this still works
    if len(data) > 2:
        arrow = pa.chunked_array(
            [
                pa.array(data[: len(data) // 2], type=pa.float64(), from_pandas=True),
                pa.array(data[len(data) // 2 :], type=pa.float64(), from_pandas=True),
            ]
        )
        if should_raise:
            with pytest.raises(ValueError):
                assert_allclose_na(op(arrow, skipna), pandas_op(pandas, skipna=skipna))
        else:
            assert_allclose_na(op(arrow, skipna), pandas_op(pandas, skipna=skipna))
Example #38
    def _take_on_chunks(self, indices, limits_idx, cum_lengths, sort_idx=None):
        def take_in_one_chunk(i_chunk):
            indices_chunk = indices[limits_idx[i_chunk]:limits_idx[i_chunk + 1]]
            indices_chunk -= cum_lengths[i_chunk]
            if (self.dtype.is_list
                    and self.data.chunk(i_chunk).flatten().null_count == 0
                    and self.data.chunk(i_chunk).null_count == 0
                    and self.flatten().dtype._is_numeric):
                return take_indices_on_pyarrow_list(self.data.chunk(i_chunk),
                                                    indices_chunk)
            else:
                return self.data.chunk(i_chunk).take(pa.array(indices_chunk))
            # this is a pyarrow.Array

        result = [take_in_one_chunk(i) for i in range(self.data.num_chunks)]
        # we know that self.data.num_chunks > 1

        if sort_idx is None:
            return FletcherArray(
                pa.chunked_array(filter(len, result), type=self.data.type))
        else:
            return FletcherArray(
                pa.concat_arrays(result).take(pa.array(sort_idx)))
Example #39
def test_dictionary_array_automatically_read(use_legacy_dataset):
    # ARROW-3246

    # Make a large dictionary, a little over 4MB of data
    dict_length = 4000
    dict_values = pa.array([('x' * 1000 + '_{}'.format(i))
                            for i in range(dict_length)])

    num_chunks = 10
    chunk_size = 100
    chunks = []
    for i in range(num_chunks):
        indices = np.random.randint(0, dict_length,
                                    size=chunk_size).astype(np.int32)
        chunks.append(
            pa.DictionaryArray.from_arrays(pa.array(indices), dict_values))

    table = pa.table([pa.chunked_array(chunks)], names=['f0'])
    result = _simple_table_write_read(table, use_legacy_dataset)

    assert result.equals(table)

    # The only key in the metadata was the Arrow schema key
    assert result.schema.metadata is None
Example #40
def test_chunked_array_basics():
    data = pa.chunked_array([], type=pa.string())
    assert data.to_pylist() == []

    with pytest.raises(ValueError):
        pa.chunked_array([])
Example #41
def test_chunked_array_str():
    data = [pa.array([1, 2, 3]), pa.array([4, 5, 6])]
    data = pa.chunked_array(data)
    assert str(data) == """[
  [
    1,
    2,
    3
  ],
  [
    4,
    5,
    6
  ]
]"""
Example #42
def test_chunked_array_mismatch_types():
    with pytest.raises(pa.ArrowInvalid):
        pa.chunked_array([pa.array([1, 2]), pa.array(['foo', 'bar'])])
Example #43
 def wrapper(array):
     if isinstance(array, pa.ChunkedArray):
         return pa.chunked_array(
             [func(chunk) for chunk in array.chunks])
     else:
         return func(array)
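
A hypothetical use of this wrapper pattern, lifting a per-Array function to also accept ChunkedArray (the per_chunk and double names here are illustrative, not from the source):

import pyarrow as pa
import pyarrow.compute as pc

def per_chunk(func):
    # Same pattern as above: apply func chunk-wise, rebuild the ChunkedArray.
    def wrapper(array):
        if isinstance(array, pa.ChunkedArray):
            return pa.chunked_array([func(chunk) for chunk in array.chunks])
        return func(array)
    return wrapper

double = per_chunk(lambda arr: pc.multiply(arr, 2))
assert double(pa.chunked_array([[1], [2, 3]])).to_pylist() == [2, 4, 6]
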
Example #44
    def __init__(self, dfs, name=None):
        from vaex.column import ColumnConcatenatedLazy
        crs = np.array([df.geometry.crs.srs for df in dfs])
        crs = np.unique(crs)
        if len(crs) > 1:
            raise ValueError(
                'Concatenating dataframes where different crs not supported.')
        else:
            crs = crs[0] if len(crs) == 1 else None
        metadata = dfs[0]._metadata
        geoms = []
        for df in dfs:
            if isinstance(df.geometry._geometry, pa.Array):
                geoms.append(df.geometry._geometry)
            elif isinstance(df.geometry._geometry, pa.ChunkedArray):
                for chunk in df.geometry._geometry.chunks:
                    geoms.append(chunk)
            else:
                geoms.append(pa.array(df.geometry._geometry))
        geometry = pa.chunked_array(geoms)

        super(GeoDataFrameConcatenated, self).__init__(geometry,
                                                       crs=crs,
                                                       metadata=metadata)

        self.dfs = dfs = [df.extract() for df in dfs]
        self.name = name or "-".join(df.name for df in self.dfs)
        self.path = "-".join(df.path for df in self.dfs)
        first, tail = dfs[0], dfs[1:]
        for column_name in first.get_column_names(virtual=False,
                                                  hidden=True,
                                                  alias=False):
            if all([
                    column_name in df.get_column_names(virtual=False,
                                                       hidden=True,
                                                       alias=False)
                    for df in tail
            ]):
                self.column_names.append(column_name)
        self.columns = {}
        for column_name in self.get_column_names(virtual=False,
                                                 hidden=True,
                                                 alias=False):
            self.columns[column_name] = ColumnConcatenatedLazy(
                [df[column_name] for df in dfs])
            self._save_assign_expression(column_name)

        for name in list(first.virtual_columns.keys()):
            if all([
                    first.virtual_columns[name] == df.virtual_columns.get(
                        name, None) for df in tail
            ]):
                self.add_virtual_column(name, first.virtual_columns[name])
            else:
                self.columns[name] = ColumnConcatenatedLazy(
                    [df[name] for df in dfs])
                self.column_names.append(name)
            self._save_assign_expression(name)

        for df in tail:
            if first._column_aliases != df._column_aliases:
                raise ValueError(
                    f'Concatenating dataframes where different column aliases not supported: {first._column_aliases} != {df._column_aliases}'
                )
        self._column_aliases = first._column_aliases.copy()

        for df in dfs[:1]:
            for name, value in list(df.variables.items()):
                if name not in self.variables:
                    self.set_variable(name, value, write=False)
        # self.write_virtual_meta()

        self._length_unfiltered = sum(len(ds) for ds in self.dfs)
        self._length_original = self._length_unfiltered
        self._index_end = self._length_unfiltered
Example #45
    def __setitem__(self, key: int | slice | np.ndarray, value: Any) -> None:
        """Set one or more values inplace.

        Parameters
        ----------
        key : int, ndarray, or slice
            When called from, e.g. ``Series.__setitem__``, ``key`` will be
            one of

            * scalar int
            * ndarray of integers.
            * boolean ndarray
            * slice object

        value : ExtensionDtype.type, Sequence[ExtensionDtype.type], or object
            value or values to be set at ``key``.

        Returns
        -------
        None
        """
        key = check_array_indexer(self, key)

        if is_integer(key):
            key = cast(int, key)

            if not is_scalar(value):
                raise ValueError("Must pass scalars with scalar indexer")
            elif isna(value):
                value = None
            elif not isinstance(value, str):
                raise ValueError("Scalar must be NA or str")

            # Slice data and insert in-between
            new_data = [
                *self._data[0:key].chunks,
                pa.array([value], type=pa.string()),
                *self._data[(key + 1):].chunks,
            ]
            self._data = pa.chunked_array(new_data)
        else:
            # Convert to integer indices and iteratively assign.
            # TODO: Make a faster variant of this in Arrow upstream.
            #       This is probably extremely slow.

            # Convert all possible input key types to an array of integers
            if isinstance(key, slice):
                key_array = np.array(range(len(self))[key])
            elif is_bool_dtype(key):
                # TODO(ARROW-9430): Directly support setitem(booleans)
                key_array = np.argwhere(key).flatten()
            else:
                # TODO(ARROW-9431): Directly support setitem(integers)
                key_array = np.asanyarray(key)

            if is_scalar(value):
                value = np.broadcast_to(value, len(key_array))
            else:
                value = np.asarray(value)

            if len(key_array) != len(value):
                raise ValueError("Length of indexer and values mismatch")

            for k, v in zip(key_array, value):
                self[k] = v
Example #46
 def wrapper(arr: Union[pa.Array, pa.ChunkedArray], *args, **kwargs):
     if isinstance(arr, pa.ChunkedArray):
         return pa.chunked_array(
             [func(chunk, *args, **kwargs) for chunk in arr.chunks])
     else:
         return func(arr, *args, **kwargs)
Example #47
 def _concat_same_type(cls, to_concat):
     chunks = list(itertools.chain.from_iterable(x._data.chunks
                                                 for x in to_concat))
     arr = pa.chunked_array(chunks)
     return cls(arr)
Example #48
def test_np_ufunc_op_chunked_scalar():
    a = pa.chunked_array([[1, 2], [3, None]])
    b = 4
    expected = pa.array([5, 6, 7, None])
    check_np_ufunc(a, b, expected)
Example #49
 def from_scalars(cls, values):
     arr = pa.chunked_array([pa.array(np.asarray(values))])
     return cls(arr)
Example #50
 def from_array(cls, arr):
     assert isinstance(arr, pa.Array)
     return cls(pa.chunked_array([arr]))