def test_arrow_array(vineyard_client):
    arr = pa.array([1, 2, None, 3])
    object_id = vineyard_client.put(arr)
    assert arr.equals(vineyard_client.get(object_id))

    arr = pa.array([1, 2.0, None, 3.0])
    object_id = vineyard_client.put(arr)
    assert arr.equals(vineyard_client.get(object_id))

    arr = pa.array([None, None, None, None])
    object_id = vineyard_client.put(arr)
    assert arr.equals(vineyard_client.get(object_id))

    arr = pa.array(["a", None, None, None])
    object_id = vineyard_client.put(arr)
    assert arr.cast(pa.large_string()).equals(vineyard_client.get(object_id))

    arr = pa.array(["a", "bb", "ccc", "dddd"])
    object_id = vineyard_client.put(arr)
    assert arr.cast(pa.large_string()).equals(vineyard_client.get(object_id))

    arr = pa.array([True, False, True, False])
    object_id = vineyard_client.put(arr)
    assert arr.equals(vineyard_client.get(object_id))

    arr = pa.array([True, False, None, None])
    object_id = vineyard_client.put(arr)
    assert arr.equals(vineyard_client.get(object_id))

    nested_arr = pa.array([[], None, [1, 2], [None, 1]])
    object_id = vineyard_client.put(nested_arr)
    assert vineyard_client.get(object_id).values.equals(nested_arr.values)

def __arrow_array__(self, type=None):
    offsets = self.indices
    type = type or self.dtype
    if type == pa.string() and self.dtype == pa.large_string():
        offsets = offsets.astype(np.int32)  # downcast
    elif type == pa.large_string() and self.dtype == pa.string():
        type = pa.string()  # upcast
    # this code is very similar to vaex.arrow.convert.trim_buffers
    new_offset = self.offset % 8
    remove_offset = (self.offset // 8) * 8
    offsets = _asnumpy(_trim(offsets, remove_offset, self.offset + self.length + 1))
    first_offset = offsets[0]
    last_offset = offsets[new_offset + self.length]
    new_offsets = offsets - first_offset
    if self.null_bitmap is not None:
        null_bitmap, null_offset = _trim_bits(self.null_bitmap, self.offset, self.offset + self.length)
        assert null_offset == new_offset
        null_bitmap = pa.py_buffer(_asnumpy(null_bitmap))
    else:
        null_bitmap = None
    new_offsets_buffer = pa.py_buffer(new_offsets)
    bytes = pa.py_buffer(_asnumpy(_trim(self.bytes, first_offset, last_offset)))
    return pa.Array.from_buffers(type, self.length,
                                 [null_bitmap, new_offsets_buffer, bytes],
                                 offset=new_offset)

def test_where_large():
    df = vaex.from_arrays(s=pa.array(['a', 'b', None, 'd'], type=pa.large_string()))
    assert (df['s'] + df['s']).dtype.internal == pa.large_string()
    expr = df.func.where(df['s'] == 'a', 'A', df['s'])
    assert expr.tolist() == ['A', 'b', None, 'd']
    assert expr.dtype.is_string

def test_large_string_type(self, duckdb_cursor):
    if not can_run:
        return
    schema = pa.schema([("data", pa.large_string())])
    inputs = [pa.array(["foo", "baaaar", "b"], type=pa.large_string())]
    arrow_table = pa.Table.from_arrays(inputs, schema=schema)
    rel = duckdb.from_arrow(arrow_table)
    res = rel.execute().fetchall()
    assert res == [('foo',), ('baaaar',), ('b',)]

def __arrow_array__(self, type=None):
    indices = self.indices
    type = type or self.dtype
    if type == pa.string() and self.dtype == pa.large_string():
        indices = indices.astype(np.int32)  # downcast
    elif type == pa.large_string() and self.dtype == pa.string():
        type = pa.string()  # upcast
    # TODO: we dealloc the memory in the C++ extension, so we need to copy for now
    buffers = [
        None,
        pa.py_buffer(_asnumpy(indices).copy() - self.offset),
        pa.py_buffer(_asnumpy(self.bytes).view(np.uint8).copy()),
    ]
    if self.null_bitmap is not None:
        assert self.null_offset == 0  # self.offset
        buffers[0] = pa.py_buffer(self.null_bitmap.copy())
    arrow_array = pa.Array.from_buffers(type, self.length, buffers=buffers)
    return arrow_array

def testIsBinaryLike(self):
    for t in (pa.binary(), pa.large_binary(), pa.string(), pa.large_string()):
        self.assertTrue(arrow_util.is_binary_like(t))
    for t in (pa.list_(pa.binary()), pa.large_list(pa.string())):
        self.assertFalse(arrow_util.is_binary_like(t))

def decode(encoding, type_spec):
    if type_spec == 'string':
        return pa.string()
    if type_spec == 'large_string':
        return pa.large_string()
    else:
        return np.dtype(type_spec)

def decode(encoding, type_spec):
    if isinstance(type_spec, dict):
        if type_spec['type'] == 'duration':
            return DataType(pa.duration(type_spec['unit']))
        elif type_spec['type'] == 'timestamp':
            return DataType(pa.timestamp(type_spec['unit']))
        elif type_spec['type'] == 'list':
            sub = encoding.decode('dtype', type_spec['value_type']).arrow
            return DataType(pa.list_(sub))
        elif type_spec['type'] == 'dict':
            value_type = encoding.decode('dtype', type_spec["value_type"]).arrow
            index_type = encoding.decode('dtype', type_spec["index_type"]).arrow
            bool_ordered = type_spec["ordered"]
            return DataType(pa.dictionary(index_type, value_type, bool_ordered))
        else:
            raise ValueError(f'Do not understand type {type_spec}')
    if type_spec == 'string':
        return DataType(pa.string())
    if type_spec == 'large_string':
        return DataType(pa.large_string())
    # TODO: find a proper way to support all arrow types
    if type_spec == 'timestamp[ms]':
        return DataType(pa.timestamp('ms'))
    else:
        return DataType(np.dtype(type_spec))

def _get_binary_like_byte_size_test_cases():
    result = []
    for array_type, sizeof_offsets in [
        (pa.binary(), 4),
        (pa.string(), 4),
        (pa.large_binary(), 8),
        (pa.large_string(), 8),
    ]:
        result.append(
            dict(
                testcase_name=str(array_type),
                array=pa.array([
                    "a", "bb", "ccc", "dddd", "eeeee", "ffffff", "ggggggg",
                    "hhhhhhhh", "iiiiiiiii"
                ], type=array_type),
                slice_offset=1,
                slice_length=3,
                # contents: 45
                # offsets: 10 * sizeof_offsets
                # null bitmap: 2
                expected_size=(45 + sizeof_offsets * 10 + 2),
                # contents: 9
                # offsets: 4 * sizeof_offsets
                # null bitmap: 1
                expected_sliced_size=(9 + sizeof_offsets * 4 + 1)))
    return result

def get_many_types():
    # returning them from a function is required because the pa.dictionary
    # type holds a pyarrow array, and test_array.py::test_total_bytes_allocated
    # checks that the default memory pool has zero allocated bytes
    return (pa.null(), pa.bool_(), pa.int32(), pa.time32('s'), pa.time64('us'),
            pa.date32(), pa.timestamp('us'), pa.timestamp('us', tz='UTC'),
            pa.timestamp('us', tz='Europe/Paris'), pa.float16(), pa.float32(),
            pa.float64(), pa.decimal128(19, 4), pa.string(), pa.binary(),
            pa.binary(10), pa.large_string(), pa.large_binary(),
            pa.list_(pa.int32()), pa.large_list(pa.uint16()),
            pa.struct([
                pa.field('a', pa.int32()),
                pa.field('b', pa.int8()),
                pa.field('c', pa.string())
            ]),
            pa.struct([
                pa.field('a', pa.int32(), nullable=False),
                pa.field('b', pa.int8(), nullable=False),
                pa.field('c', pa.string())
            ]),
            pa.union([pa.field('a', pa.binary(10)), pa.field('b', pa.string())],
                     mode=pa.lib.UnionMode_DENSE),
            pa.union([pa.field('a', pa.binary(10)), pa.field('b', pa.string())],
                     mode=pa.lib.UnionMode_SPARSE),
            pa.union([
                pa.field('a', pa.binary(10), nullable=False),
                pa.field('b', pa.string())
            ], mode=pa.lib.UnionMode_SPARSE),
            pa.dictionary(pa.int32(), pa.string()))

def to_arrow(self, type=None):
    values = [e.values for e in self.expressions]
    chunks = [
        value if isinstance(value, pa.Array) else pa.array(value, type=type)
        for value in values
    ]
    types = [chunk.type for chunk in chunks]
    # upcast if mixed types
    if pa.string() in types and pa.large_string() in types:
        def _arrow_string_upcast(array):
            if array.type == pa.large_string():
                return array
            if array.type == pa.string():
                import vaex.arrow.convert
                column = vaex.arrow.convert.column_from_arrow_array(array)
                column.indices = column.indices.astype(np.int64)
                return pa.array(column)
            else:
                raise ValueError('Not a string type: %r' % array)
        chunks = [_arrow_string_upcast(chunk) for chunk in chunks]
    return pa.chunked_array(chunks)

def test_large_binary():
    data = [b'foo', b'bar'] * 50
    for type in [pa.large_binary(), pa.large_string()]:
        arr = pa.array(data, type=type)
        table = pa.Table.from_arrays([arr], names=['strs'])
        for use_dictionary in [False, True]:
            _check_roundtrip(table, use_dictionary=use_dictionary)

def __init__(self, indices, bytes, length=None, offset=0, string_sequence=None, null_bitmap=None):
    self._string_sequence = string_sequence
    self.indices = indices
    self.offset = offset  # to avoid memory copies in trim
    self.bytes = bytes
    self.length = length if length is not None else len(indices) - 1
    if indices.dtype.kind == 'i' and indices.dtype.itemsize == 8:
        self.dtype = pa.large_string()
    elif indices.dtype.kind == 'i' and indices.dtype.itemsize == 4:
        self.dtype = pa.string()
    else:
        raise ValueError('unsupported index type: ' + str(indices.dtype))
    self.shape = (self.__len__(),)
    self.nbytes = self.bytes.nbytes + self.indices.nbytes
    self.null_bitmap = null_bitmap
    if not (self.indices.dtype.kind == 'i' and self.indices.dtype.itemsize in [4, 8]):
        raise ValueError('unsupported index type: ' + str(self.indices.dtype))

def column_from_arrow_array(arrow_array):
    # TODO: we may be able to pass chunked arrays
    arrow_array = ensure_not_chunked(arrow_array)
    arrow_type = arrow_array.type
    buffers = arrow_array.buffers()
    if len(buffers) == 2:
        return numpy_array_from_arrow_array(arrow_array)
    elif len(buffers) == 3 and arrow_array.type in [pyarrow.string(), pyarrow.large_string()]:
        bitmap_buffer, offsets, string_bytes = arrow_array.buffers()
        if arrow_array.null_count == 0:
            null_bitmap = None  # we drop any null_bitmap when the null count is zero
        else:
            null_bitmap = np.frombuffer(bitmap_buffer, 'uint8', len(bitmap_buffer))
        if arrow_array.type == pyarrow.string():
            offsets = np.frombuffer(offsets, np.int32, len(offsets) // 4)
        else:
            offsets = np.frombuffer(offsets, np.int64, len(offsets) // 8)
        if string_bytes is None:
            string_bytes = np.array([], dtype='S1')
        else:
            string_bytes = np.frombuffer(string_bytes, 'S1', len(string_bytes))
        offset = arrow_array.offset
        column = vaex.column.ColumnStringArrow(offsets, string_bytes, len(arrow_array), offset, null_bitmap=null_bitmap)
        return column
    else:
        raise TypeError('type unsupported: %r' % arrow_type)

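# A minimal sketch (not from the original source) of the buffer layout that
# column_from_arrow_array relies on: a pa.string() array carries three buffers
# (validity bitmap, int32 offsets, utf8 data), while pa.large_string() uses
# int64 offsets instead.
def _show_string_buffers():
    a = pa.array(["a", None, "ccc"])
    validity, offsets, data = a.buffers()
    # offsets are int32 for pa.string(); the null slot contributes no bytes,
    # so the offsets read [0, 1, 1, 4]
    assert np.frombuffer(offsets, np.int32, len(a) + 1).tolist() == [0, 1, 1, 4]
    assert data.to_pybytes() == b"accc"
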
def trim_buffers(ar):
    # There are cases where memcopies or modifications are made
    # (large_string_to_string). In those cases we don't want to work on the
    # full array, so we get rid of the offset if possible.
    if ar.type == pa.string() or ar.type == pa.large_string():
        if isinstance(ar, pa.ChunkedArray):
            return ar  # let's assume chunked arrays are fine
        null_bitmap, offsets_buffer, bytes = ar.buffers()
        if ar.type == pa.string():
            offsets = np.frombuffer(offsets_buffer, np.int32, len(ar) + 1 + ar.offset)
        else:
            offsets = np.frombuffer(offsets_buffer, np.int64, len(ar) + 1 + ar.offset)
        # because it is difficult to slice bits
        new_offset = ar.offset % 8
        remove_offset = (ar.offset // 8) * 8
        first_offset = offsets[remove_offset]
        new_offsets = offsets[remove_offset:] - first_offset
        if null_bitmap:
            null_bitmap = null_bitmap.slice(ar.offset // 8)
        new_offsets_buffer = pa.py_buffer(new_offsets)
        bytes = bytes.slice(first_offset)
        ar = pa.Array.from_buffers(ar.type, len(ar),
                                   [null_bitmap, new_offsets_buffer, bytes],
                                   offset=new_offset)
    return ar

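# Hedged usage sketch for trim_buffers (assumes the function above is in
# scope): slicing leaves ar.offset arbitrary, but because validity bitmaps are
# bit-packed, buffers can only be cut on byte boundaries, so the result keeps
# a residual offset of ar.offset % 8 (always < 8).
def _check_trim_buffers():
    s = pa.array(['aap', 'noot', None, 'mies'] * 4).slice(11)
    t = trim_buffers(s)
    t.validate()
    assert t.offset == 11 % 8 == 3
    assert t.to_pylist() == s.to_pylist()
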
def arrow_string_array_from_buffers(bytes, offsets, null_bitmap):
    if offsets.dtype == np.int32:
        type = pa.string()
    elif offsets.dtype == np.int64:
        type = pa.large_string()
    else:
        raise ValueError(f'Unsupported dtype {offsets.dtype} for string offsets')
    return _arrow_binary_array_from_buffers(bytes, offsets, null_bitmap, type)

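# Illustrative sketch only: _arrow_binary_array_from_buffers is not shown
# above, but the dtype dispatch maps int32 offsets to pa.string() and int64
# offsets to pa.large_string(), which pa.Array.from_buffers can reproduce
# directly:
def _large_string_from_numpy_buffers():
    offsets = np.array([0, 1, 3, 6], dtype=np.int64)  # int64 -> large_string
    data = np.frombuffer(b'abbccc', dtype=np.uint8)
    arr = pa.Array.from_buffers(
        pa.large_string(), 3,
        [None, pa.py_buffer(offsets), pa.py_buffer(data)])
    assert arr.to_pylist() == ['a', 'bb', 'ccc']
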
def same_type(*arrays):
    types = [ar.type for ar in arrays]
    if any(types[0] != type for type in types):
        if vaex.dtype(types[0]) == str:
            # we have mixed large and normal string
            return [large_string_to_string(ar) if ar.type == pa.large_string() else ar
                    for ar in arrays]
        else:
            raise NotImplementedError
    return arrays

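# Hedged usage sketch for same_type, assuming vaex is installed and that
# large_string_to_string (from vaex.arrow.convert, as in the snippets above)
# is in scope: mixing offset widths downcasts the large_string arrays so all
# results share one type.
def _check_same_type():
    a = pa.array(['x', 'y'])                          # int32 offsets
    b = pa.array(['z', 'w'], type=pa.large_string())  # int64 offsets
    a2, b2 = same_type(a, b)
    assert a2.type == b2.type == pa.string()
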
def test_large_binary_huge():
    s = b'xy' * 997
    data = [s] * ((1 << 33) // len(s))
    for type in [pa.large_binary(), pa.large_string()]:
        arr = pa.array(data, type=type)
        table = pa.Table.from_arrays([arr], names=['strs'])
        for use_dictionary in [False, True]:
            _check_roundtrip(table, use_dictionary=use_dictionary)
        del arr, table

def test_query_with_all_supported_types(self):
    record_batch = pa.RecordBatch.from_arrays([
        pa.array([[1], [2]], type=pa.list_(pa.int32())),
        pa.array([[10], [20]], type=pa.list_(pa.int64())),
        pa.array([[1.1], [2.2]], type=pa.list_(pa.float32())),
        pa.array([[10.1], [20.2]], type=pa.list_(pa.float64())),
        pa.array([['a'], ['b']], type=pa.list_(pa.string())),
        pa.array([['a+'], ['b+']], type=pa.list_(pa.large_string())),
        pa.array([[b'a_bytes'], [b'b_bytes']], type=pa.list_(pa.binary())),
        pa.array([[b'a_bytes+'], [b'b_bytes+']], type=pa.list_(pa.large_binary())),
    ], [
        'int32_list',
        'int64_list',
        'float32_list',
        'float64_list',
        'string_list',
        'large_string_list',
        'binary_list',
        'large_binary_list',
    ])
    sql = """
      SELECT
        ARRAY(
          SELECT
            STRUCT(int32_list, int64_list, float32_list, float64_list,
                   string_list, large_string_list, binary_list,
                   large_binary_list)
          FROM
            example.int32_list, example.int64_list, example.float32_list,
            example.float64_list, example.string_list,
            example.large_string_list, example.binary_list,
            example.large_binary_list
        ) as slice_key
      FROM Examples as example;"""
    query = sql_util.RecordBatchSQLSliceQuery(sql, record_batch.schema)
    slices = query.Execute(record_batch)
    self.assertEqual(slices, [[[('int32_list', '1'), ('int64_list', '10'),
                                ('float32_list', '1.1'), ('float64_list', '10.1'),
                                ('string_list', 'a'), ('large_string_list', 'a+'),
                                ('binary_list', 'a_bytes'),
                                ('large_binary_list', 'a_bytes+')]],
                              [[('int32_list', '2'), ('int64_list', '20'),
                                ('float32_list', '2.2'), ('float64_list', '20.2'),
                                ('string_list', 'b'), ('large_string_list', 'b+'),
                                ('binary_list', 'b_bytes'),
                                ('large_binary_list', 'b_bytes+')]]])

def string_array_resolver(obj):
    meta = obj.meta
    buffer_data = as_arrow_buffer(obj.member('buffer_data_'))
    buffer_offsets = as_arrow_buffer(obj.member('buffer_offsets_'))
    null_bitmap = as_arrow_buffer(obj.member('null_bitmap_'))
    length = int(meta['length_'])
    null_count = int(meta['null_count_'])
    offset = int(meta['offset_'])
    return pa.lib.Array.from_buffers(
        pa.large_string(), length,
        [null_bitmap, buffer_offsets, buffer_data],
        null_count, offset)

def decode(encoding, type_spec):
    if type_spec == 'string':
        return DataType(pa.string())
    if type_spec == 'large_string':
        return DataType(pa.large_string())
    # TODO: find a proper way to support all arrow types
    if type_spec == 'timestamp[ms]':
        return DataType(pa.timestamp('ms'))
    else:
        return DataType(np.dtype(type_spec))

def _arrow_string_upcast(array):
    if array.type == pa.large_string():
        return array
    if array.type == pa.string():
        import vaex.arrow.convert
        column = vaex.arrow.convert.column_from_arrow_array(array)
        column.indices = column.indices.astype(np.int64)
        return pa.array(column)
    else:
        raise ValueError('Not a string type: %r' % array)

def test_large_string_to_string(offset):
    s = pa.array(['aap', 'noot', None, 'mies'] * 3, type=pa.large_string())
    ns = convert.large_string_to_string(s)
    assert s.type != ns.type
    assert s.tolist() == ns.tolist()

    s = s.slice(offset)
    ns = convert.large_string_to_string(s)
    assert ns.offset < 8
    assert s.tolist() == ns.tolist()

def _resize_arrow_type(t):
    if t == pa.string():
        return pa.large_string()
    if t == pa.utf8():
        return pa.large_utf8()
    if t == pa.binary():
        return pa.large_binary()
    if isinstance(t, pa.lib.ListType):
        return pa.large_list(t.value_type)
    return t

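# A quick check of _resize_arrow_type's behaviour (not from the original
# source). Note that pa.utf8() is an alias for pa.string(), so the second
# branch is shadowed by the first (harmlessly, since pa.large_utf8() equals
# pa.large_string()), and list element types are not widened recursively:
assert _resize_arrow_type(pa.string()) == pa.large_string()
assert _resize_arrow_type(pa.binary()) == pa.large_binary()
assert _resize_arrow_type(pa.list_(pa.string())) == pa.large_list(pa.string())
assert _resize_arrow_type(pa.int64()) == pa.int64()  # non-resizable types pass through
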
def test_large_string_to_string(offset, chunked):
    s = pa.array(['aap', 'noot', None, 'mies'] * 3, type=pa.large_string())
    if chunked:
        s = pa.chunked_array([s.slice(0, 5), s.slice(5)])
    ns = convert.large_string_to_string(s)
    ns.validate()
    assert s.type != ns.type
    assert s.to_pylist() == ns.to_pylist()

    s = s.slice(offset)
    ns = convert.large_string_to_string(s)
    if not chunked:
        assert ns.offset < 8
    assert s.to_pylist() == ns.to_pylist()

def _map_arrow_type(arrow_type):
    arrow_to_dh = {
        pa.null(): '',
        pa.bool_(): '',
        pa.int8(): 'byte',
        pa.int16(): 'short',
        pa.int32(): 'int',
        pa.int64(): 'long',
        pa.uint8(): '',
        pa.uint16(): 'char',
        pa.uint32(): '',
        pa.uint64(): '',
        pa.float16(): '',
        pa.float32(): 'float',
        pa.float64(): 'double',
        pa.time32('s'): '',
        pa.time32('ms'): '',
        pa.time64('us'): '',
        pa.time64('ns'): 'io.deephaven.time.DateTime',
        pa.timestamp('us', tz=None): '',
        pa.timestamp('ns', tz=None): '',
        pa.date32(): 'java.time.LocalDate',
        pa.date64(): 'java.time.LocalDate',
        pa.binary(): '',
        pa.string(): 'java.lang.String',
        pa.utf8(): 'java.lang.String',
        pa.large_binary(): '',
        pa.large_string(): '',
        pa.large_utf8(): '',
        # decimal128(int precision, int scale=0)
        # list_(value_type, int list_size=-1)
        # large_list(value_type)
        # map_(key_type, item_type[, keys_sorted])
        # struct(fields)
        # dictionary(index_type, value_type, …)
        # field(name, type, bool nullable = True[, metadata])
        # schema(fields[, metadata])
        # from_numpy_dtype(dtype)
    }

    dh_type = arrow_to_dh.get(arrow_type)
    if not dh_type:
        # if this is a case of timestamp with tz specified
        if isinstance(arrow_type, pa.TimestampType):
            dh_type = "io.deephaven.time.DateTime"

    if not dh_type:
        raise DHError(f'unsupported arrow data type : {arrow_type}')

    return {"deephaven:type": dh_type}

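# Hedged usage sketch for _map_arrow_type (assumes Deephaven's DHError is
# importable, as in the module above). Entries mapped to '' are falsy, so they
# fall through to the TimestampType check and otherwise raise DHError; note
# that pa.large_string() is therefore rejected even though pa.string() maps
# to java.lang.String.
assert _map_arrow_type(pa.int32()) == {"deephaven:type": "int"}
assert _map_arrow_type(pa.timestamp('ns', tz='UTC')) == {
    "deephaven:type": "io.deephaven.time.DateTime"}
try:
    _map_arrow_type(pa.large_string())  # mapped to '' -> unsupported
except DHError:
    pass
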
def test_is_binary_string():
    assert types.is_binary(pa.binary())
    assert not types.is_binary(pa.string())
    assert not types.is_binary(pa.large_binary())
    assert not types.is_binary(pa.large_string())

    assert types.is_string(pa.string())
    assert types.is_unicode(pa.string())
    assert not types.is_string(pa.binary())
    assert not types.is_string(pa.large_string())
    assert not types.is_string(pa.large_binary())

    assert types.is_large_binary(pa.large_binary())
    assert not types.is_large_binary(pa.large_string())
    assert not types.is_large_binary(pa.binary())
    assert not types.is_large_binary(pa.string())

    assert types.is_large_string(pa.large_string())
    assert not types.is_large_string(pa.large_binary())
    assert not types.is_large_string(pa.string())
    assert not types.is_large_string(pa.binary())

    assert types.is_fixed_size_binary(pa.binary(5))
    assert not types.is_fixed_size_binary(pa.binary())

def decode(encoding, type_spec):
    if isinstance(type_spec, dict):
        if type_spec['type'] == 'list':
            sub = encoding.decode('dtype', type_spec['value_type']).arrow
            return DataType(pa.list_(sub))
        else:
            raise ValueError(f'Do not understand type {type_spec}')
    if type_spec == 'string':
        return DataType(pa.string())
    if type_spec == 'large_string':
        return DataType(pa.large_string())
    # TODO: find a proper way to support all arrow types
    if type_spec == 'timestamp[ms]':
        return DataType(pa.timestamp('ms'))
    else:
        return DataType(np.dtype(type_spec))

def json_type_to_pyarrow_type(
        typ: str,
        reverse: bool = False,
        logger: AirbyteLogger = AirbyteLogger()) -> str:
    """
    Converts a JSON type to a PyArrow type (or the other way around if reverse=True).

    :param typ: JSON type if reverse is False, else PyArrow type
    :param reverse: switch to True for PyArrow type -> JSON type, defaults to False
    :param logger: defaults to AirbyteLogger()
    :return: PyArrow type if reverse is False, else JSON type
    """
    str_typ = str(typ)
    # This is a map of Airbyte types to PyArrow types. The first list element
    # of the PyArrow types should be the one to use where required.
    map = {
        "boolean": ("bool_", "bool"),
        "integer": ("int64", "int8", "int16", "int32", "uint8", "uint16", "uint32", "uint64"),
        "number": ("float64", "float16", "float32", "decimal128", "decimal256", "halffloat", "float", "double"),
        "string": ("large_string", "string"),
        # TODO: support object type rather than coercing to string
        "object": ("large_string",),
        # TODO: support array type rather than coercing to string
        "array": ("large_string",),
        "null": ("large_string",),
    }
    if not reverse:
        for json_type, pyarrow_types in map.items():
            if str_typ.lower() == json_type:
                # a better way might be necessary when we decide to handle more type complexity
                return str(getattr(pa, pyarrow_types[0]).__call__())
        logger.debug(f"JSON type '{str_typ}' is not mapped, falling back to default conversion to large_string")
        return str(pa.large_string())
    else:
        for json_type, pyarrow_types in map.items():
            if any(str_typ.startswith(pa_type) for pa_type in pyarrow_types):
                return json_type
        logger.debug(f"PyArrow type '{str_typ}' is not mapped, falling back to default conversion to string")
        return "string"  # default type if unspecified in map

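# Hedged usage sketch for json_type_to_pyarrow_type (assumes the AirbyteLogger
# import used by the function above): the function trades in type *names*,
# preferring the first PyArrow spelling in each tuple, and falls back to
# large_string for unmapped JSON types.
assert json_type_to_pyarrow_type("string") == "large_string"
assert json_type_to_pyarrow_type("integer") == "int64"
assert json_type_to_pyarrow_type("int32", reverse=True) == "integer"
assert json_type_to_pyarrow_type("not-a-type") == "large_string"  # fallback
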
def test_large_string_unicode(self):
    arr = pa.array([u'foo', None, u'mañana'], type=pa.large_string())

    v = arr[0]
    assert isinstance(v, pa.LargeStringValue)
    assert v.as_py() == u'foo'
    assert repr(v) == repr(u"foo")
    assert str(v) == str(u"foo")
    assert v == u'foo'
    # Assert that newly created values are equal to the previously created one.
    assert v == arr[0]

    assert arr[1] is pa.NA

    v = arr[2].as_py()
    assert v == u'mañana'
    assert isinstance(v, unicode_type)