def test_arrow_array(vineyard_client):
    arr = pa.array([1, 2, None, 3])
    object_id = vineyard_client.put(arr)
    assert arr.equals(vineyard_client.get(object_id))

    arr = pa.array([1, 2.0, None, 3.0])
    object_id = vineyard_client.put(arr)
    assert arr.equals(vineyard_client.get(object_id))

    arr = pa.array([None, None, None, None])
    object_id = vineyard_client.put(arr)
    assert arr.equals(vineyard_client.get(object_id))

    arr = pa.array(["a", None, None, None])
    object_id = vineyard_client.put(arr)
    assert arr.cast(pa.large_string()).equals(vineyard_client.get(object_id))

    arr = pa.array(["a", "bb", "ccc", "dddd"])
    object_id = vineyard_client.put(arr)
    assert arr.cast(pa.large_string()).equals(vineyard_client.get(object_id))

    arr = pa.array([True, False, True, False])
    object_id = vineyard_client.put(arr)
    assert arr.equals(vineyard_client.get(object_id))

    arr = pa.array([True, False, None, None])
    object_id = vineyard_client.put(arr)
    assert arr.equals(vineyard_client.get(object_id))

    nested_arr = pa.array([[], None, [1, 2], [None, 1]])
    object_id = vineyard_client.put(nested_arr)
    assert vineyard_client.get(object_id).values.equals(nested_arr.values)

def __arrow_array__(self, type=None):
    offsets = self.indices
    type = type or self.dtype
    if type == pa.string() and self.dtype == pa.large_string():
        offsets = offsets.astype(np.int32)  # downcast
    elif type == pa.large_string() and self.dtype == pa.string():
        type = pa.string()  # upcast
    # this code is very similar to vaex.arrow.convert.trim_buffers
    new_offset = self.offset % 8
    remove_offset = (self.offset // 8) * 8
    offsets = _asnumpy(_trim(offsets, remove_offset, self.offset + self.length + 1))
    first_offset = offsets[0]
    last_offset = offsets[new_offset + self.length]
    new_offsets = offsets - first_offset
    if self.null_bitmap is not None:
        null_bitmap, null_offset = _trim_bits(self.null_bitmap, self.offset, self.offset + self.length)
        assert null_offset == new_offset
        null_bitmap = pa.py_buffer(_asnumpy(null_bitmap))
    else:
        null_bitmap = None
    new_offsets_buffer = pa.py_buffer(new_offsets)
    bytes = pa.py_buffer(_asnumpy(_trim(self.bytes, first_offset, last_offset)))
    return pa.Array.from_buffers(type, self.length,
                                 [null_bitmap, new_offsets_buffer, bytes],
                                 offset=new_offset)

def test_where_large():
    df = vaex.from_arrays(s=pa.array(['a', 'b', None, 'd'], type=pa.large_string()))
    assert (df['s'] + df['s']).dtype.internal == pa.large_string()
    expr = df.func.where(df['s'] == 'a', 'A', df['s'])
    assert expr.tolist() == ['A', 'b', None, 'd']
    assert expr.dtype.is_string

def test_large_string_type(self, duckdb_cursor):
    if not can_run:
        return
    schema = pa.schema([("data", pa.large_string())])
    inputs = [pa.array(["foo", "baaaar", "b"], type=pa.large_string())]
    arrow_table = pa.Table.from_arrays(inputs, schema=schema)
    rel = duckdb.from_arrow(arrow_table)
    res = rel.execute().fetchall()
    assert res == [('foo',), ('baaaar',), ('b',)]

def __arrow_array__(self, type=None):
    indices = self.indices
    type = type or self.dtype
    if type == pa.string() and self.dtype == pa.large_string():
        indices = indices.astype(np.int32)  # downcast
    elif type == pa.large_string() and self.dtype == pa.string():
        type = pa.string()  # upcast
    # TODO: we dealloc the memory in the C++ extension, so we need to copy for now
    buffers = [
        None,
        pa.py_buffer(_asnumpy(indices).copy() - self.offset),
        pa.py_buffer(_asnumpy(self.bytes).view(np.uint8).copy()),
    ]
    if self.null_bitmap is not None:
        assert self.null_offset == 0  # self.offset
        buffers[0] = pa.py_buffer(self.null_bitmap.copy())
    arrow_array = pa.Array.from_buffers(type, self.length, buffers=buffers)
    return arrow_array

def testIsBinaryLike(self):
    for t in (pa.binary(), pa.large_binary(), pa.string(), pa.large_string()):
        self.assertTrue(arrow_util.is_binary_like(t))
    for t in (pa.list_(pa.binary()), pa.large_list(pa.string())):
        self.assertFalse(arrow_util.is_binary_like(t))

def decode(encoding, type_spec):
    if type_spec == 'string':
        return pa.string()
    if type_spec == 'large_string':
        return pa.large_string()
    else:
        return np.dtype(type_spec)

def decode(encoding, type_spec):
    if isinstance(type_spec, dict):
        if type_spec['type'] == 'duration':
            return DataType(pa.duration(type_spec['unit']))
        elif type_spec['type'] == 'timestamp':
            return DataType(pa.timestamp(type_spec['unit']))
        elif type_spec['type'] == 'list':
            sub = encoding.decode('dtype', type_spec['value_type']).arrow
            return DataType(pa.list_(sub))
        elif type_spec['type'] == 'dict':
            value_type = encoding.decode('dtype', type_spec["value_type"]).arrow
            index_type = encoding.decode('dtype', type_spec["index_type"]).arrow
            bool_ordered = type_spec["ordered"]
            return DataType(pa.dictionary(index_type, value_type, bool_ordered))
        else:
            raise ValueError(f'Do not understand type {type_spec}')
    if type_spec == 'string':
        return DataType(pa.string())
    if type_spec == 'large_string':
        return DataType(pa.large_string())
    # TODO: find a proper way to support all arrow types
    if type_spec == 'timestamp[ms]':
        return DataType(pa.timestamp('ms'))
    else:
        return DataType(np.dtype(type_spec))

def _get_binary_like_byte_size_test_cases():
    result = []
    for array_type, sizeof_offsets in [
        (pa.binary(), 4),
        (pa.string(), 4),
        (pa.large_binary(), 8),
        (pa.large_string(), 8),
    ]:
        result.append(
            dict(
                testcase_name=str(array_type),
                array=pa.array([
                    "a", "bb", "ccc", "dddd", "eeeee", "ffffff", "ggggggg",
                    "hhhhhhhh", "iiiiiiiii"
                ], type=array_type),
                slice_offset=1,
                slice_length=3,
                # contents: 45
                # offsets: 10 * sizeof_offsets
                # null bitmap: 2
                expected_size=(45 + sizeof_offsets * 10 + 2),
                # contents: 9
                # offsets: 4 * sizeof_offsets
                # null bitmap: 1
                expected_sliced_size=(9 + sizeof_offsets * 4 + 1)))
    return result

def get_many_types():
    # returning them from a function is required because the pa.dictionary
    # type holds a pyarrow array, and test_array.py::test_total_bytes_allocated
    # checks that the default memory pool has zero allocated bytes
    return (pa.null(), pa.bool_(), pa.int32(), pa.time32('s'), pa.time64('us'),
            pa.date32(), pa.timestamp('us'), pa.timestamp('us', tz='UTC'),
            pa.timestamp('us', tz='Europe/Paris'), pa.float16(), pa.float32(),
            pa.float64(), pa.decimal128(19, 4), pa.string(), pa.binary(),
            pa.binary(10), pa.large_string(), pa.large_binary(),
            pa.list_(pa.int32()), pa.large_list(pa.uint16()),
            pa.struct([
                pa.field('a', pa.int32()),
                pa.field('b', pa.int8()),
                pa.field('c', pa.string())
            ]),
            pa.struct([
                pa.field('a', pa.int32(), nullable=False),
                pa.field('b', pa.int8(), nullable=False),
                pa.field('c', pa.string())
            ]),
            pa.union([pa.field('a', pa.binary(10)), pa.field('b', pa.string())],
                     mode=pa.lib.UnionMode_DENSE),
            pa.union([pa.field('a', pa.binary(10)), pa.field('b', pa.string())],
                     mode=pa.lib.UnionMode_SPARSE),
            pa.union([
                pa.field('a', pa.binary(10), nullable=False),
                pa.field('b', pa.string())
            ], mode=pa.lib.UnionMode_SPARSE),
            pa.dictionary(pa.int32(), pa.string()))

def to_arrow(self, type=None):
    values = [e.values for e in self.expressions]
    chunks = [
        value if isinstance(value, pa.Array) else pa.array(value, type=type)
        for value in values
    ]
    types = [chunk.type for chunk in chunks]
    # upcast if mixed types
    if pa.string() in types and pa.large_string() in types:
        def _arrow_string_upcast(array):
            if array.type == pa.large_string():
                return array
            if array.type == pa.string():
                import vaex.arrow.convert
                column = vaex.arrow.convert.column_from_arrow_array(array)
                column.indices = column.indices.astype(np.int64)
                return pa.array(column)
            else:
                raise ValueError('Not a string type: %r' % array)
        chunks = [_arrow_string_upcast(chunk) for chunk in chunks]
    return pa.chunked_array(chunks)

def test_large_binary():
    data = [b'foo', b'bar'] * 50
    for type in [pa.large_binary(), pa.large_string()]:
        arr = pa.array(data, type=type)
        table = pa.Table.from_arrays([arr], names=['strs'])
        for use_dictionary in [False, True]:
            _check_roundtrip(table, use_dictionary=use_dictionary)

def __init__(self, indices, bytes, length=None, offset=0, string_sequence=None, null_bitmap=None):
    self._string_sequence = string_sequence
    self.indices = indices
    self.offset = offset  # to avoid memory copies in trim
    self.bytes = bytes
    self.length = length if length is not None else len(indices) - 1
    if indices.dtype.kind == 'i' and indices.dtype.itemsize == 8:
        self.dtype = pa.large_string()
    elif indices.dtype.kind == 'i' and indices.dtype.itemsize == 4:
        self.dtype = pa.string()
    else:
        raise ValueError('unsupported index type: ' + str(indices.dtype))
    self.shape = (self.__len__(),)
    self.nbytes = self.bytes.nbytes + self.indices.nbytes
    self.null_bitmap = null_bitmap
    if not (self.indices.dtype.kind == 'i' and self.indices.dtype.itemsize in [4, 8]):
        raise ValueError('unsupported index type: ' + str(self.indices.dtype))

def column_from_arrow_array(arrow_array):
    # TODO: we may be able to pass chunked arrays
    arrow_array = ensure_not_chunked(arrow_array)
    arrow_type = arrow_array.type
    buffers = arrow_array.buffers()
    if len(buffers) == 2:
        return numpy_array_from_arrow_array(arrow_array)
    elif len(buffers) == 3 and arrow_array.type in [pyarrow.string(), pyarrow.large_string()]:
        bitmap_buffer, offsets, string_bytes = arrow_array.buffers()
        if arrow_array.null_count == 0:
            null_bitmap = None  # we drop any null_bitmap when the null count is zero
        else:
            null_bitmap = np.frombuffer(bitmap_buffer, 'uint8', len(bitmap_buffer))
        if arrow_array.type == pyarrow.string():
            offsets = np.frombuffer(offsets, np.int32, len(offsets) // 4)
        else:
            offsets = np.frombuffer(offsets, np.int64, len(offsets) // 8)
        if string_bytes is None:
            string_bytes = np.array([], dtype='S1')
        else:
            string_bytes = np.frombuffer(string_bytes, 'S1', len(string_bytes))
        offset = arrow_array.offset
        column = vaex.column.ColumnStringArrow(offsets, string_bytes, len(arrow_array), offset, null_bitmap=null_bitmap)
        return column
    else:
        raise TypeError('type unsupported: %r' % arrow_type)

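# A minimal sketch (not from the original source) of the buffer layout that
# column_from_arrow_array relies on: a pa.string() array carries three buffers
# (validity bitmap, int32 offsets, utf8 data), while pa.large_string() uses
# int64 offsets instead.
def _show_string_buffers():
    a = pa.array(["a", None, "ccc"])
    validity, offsets, data = a.buffers()
    # offsets are int32 for pa.string(); the null slot contributes no bytes,
    # so the offsets read [0, 1, 1, 4]
    assert np.frombuffer(offsets, np.int32, len(a) + 1).tolist() == [0, 1, 1, 4]
    assert data.to_pybytes() == b"accc"
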
def trim_buffers(ar):
    # There are cases where memcopies or modifications are made
    # (large_string_to_string). In those cases we don't want to work on the
    # full array, so we get rid of the offset if possible.
    if ar.type == pa.string() or ar.type == pa.large_string():
        if isinstance(ar, pa.ChunkedArray):
            return ar  # let's assume chunked arrays are fine
        null_bitmap, offsets_buffer, bytes = ar.buffers()
        if ar.type == pa.string():
            offsets = np.frombuffer(offsets_buffer, np.int32, len(ar) + 1 + ar.offset)
        else:
            offsets = np.frombuffer(offsets_buffer, np.int64, len(ar) + 1 + ar.offset)
        # because it is difficult to slice bits
        new_offset = ar.offset % 8
        remove_offset = (ar.offset // 8) * 8
        first_offset = offsets[remove_offset]
        new_offsets = offsets[remove_offset:] - first_offset
        if null_bitmap:
            null_bitmap = null_bitmap.slice(ar.offset // 8)
        new_offsets_buffer = pa.py_buffer(new_offsets)
        bytes = bytes.slice(first_offset)
        ar = pa.Array.from_buffers(ar.type, len(ar),
                                   [null_bitmap, new_offsets_buffer, bytes],
                                   offset=new_offset)
    return ar

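# Hedged usage sketch for trim_buffers (assumes the function above is in
# scope): slicing leaves ar.offset arbitrary, but because validity bitmaps are
# bit-packed, buffers can only be cut on byte boundaries, so the result keeps
# a residual offset of ar.offset % 8 (always < 8).
def _check_trim_buffers():
    s = pa.array(['aap', 'noot', None, 'mies'] * 4).slice(11)
    t = trim_buffers(s)
    t.validate()
    assert t.offset == 11 % 8 == 3
    assert t.to_pylist() == s.to_pylist()
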
def arrow_string_array_from_buffers(bytes, offsets, null_bitmap):
    if offsets.dtype == np.int32:
        type = pa.string()
    elif offsets.dtype == np.int64:
        type = pa.large_string()
    else:
        raise ValueError(f'Unsupported dtype {offsets.dtype} for string offsets')
    return _arrow_binary_array_from_buffers(bytes, offsets, null_bitmap, type)

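# Illustrative sketch only: _arrow_binary_array_from_buffers is not shown
# above, but the dtype dispatch maps int32 offsets to pa.string() and int64
# offsets to pa.large_string(), which pa.Array.from_buffers can reproduce
# directly:
def _large_string_from_numpy_buffers():
    offsets = np.array([0, 1, 3, 6], dtype=np.int64)  # int64 -> large_string
    data = np.frombuffer(b'abbccc', dtype=np.uint8)
    arr = pa.Array.from_buffers(
        pa.large_string(), 3,
        [None, pa.py_buffer(offsets), pa.py_buffer(data)])
    assert arr.to_pylist() == ['a', 'bb', 'ccc']
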
def same_type(*arrays):
    types = [ar.type for ar in arrays]
    if any(types[0] != type for type in types):
        if vaex.dtype(types[0]) == str:
            # we have mixed large and normal string
            return [large_string_to_string(ar) if ar.type == pa.large_string() else ar
                    for ar in arrays]
        else:
            raise NotImplementedError
    return arrays

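# Hedged usage sketch for same_type, assuming vaex is installed and that
# large_string_to_string (from vaex.arrow.convert, as in the snippets above)
# is in scope: mixing offset widths downcasts the large_string arrays so all
# results share one type.
def _check_same_type():
    a = pa.array(['x', 'y'])                          # int32 offsets
    b = pa.array(['z', 'w'], type=pa.large_string())  # int64 offsets
    a2, b2 = same_type(a, b)
    assert a2.type == b2.type == pa.string()
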
def test_large_binary_huge():
    s = b'xy' * 997
    data = [s] * ((1 << 33) // len(s))
    for type in [pa.large_binary(), pa.large_string()]:
        arr = pa.array(data, type=type)
        table = pa.Table.from_arrays([arr], names=['strs'])
        for use_dictionary in [False, True]:
            _check_roundtrip(table, use_dictionary=use_dictionary)
        del arr, table

def test_query_with_all_supported_types(self):
    record_batch = pa.RecordBatch.from_arrays([
        pa.array([[1], [2]], type=pa.list_(pa.int32())),
        pa.array([[10], [20]], type=pa.list_(pa.int64())),
        pa.array([[1.1], [2.2]], type=pa.list_(pa.float32())),
        pa.array([[10.1], [20.2]], type=pa.list_(pa.float64())),
        pa.array([['a'], ['b']], type=pa.list_(pa.string())),
        pa.array([['a+'], ['b+']], type=pa.list_(pa.large_string())),
        pa.array([[b'a_bytes'], [b'b_bytes']], type=pa.list_(pa.binary())),
        pa.array([[b'a_bytes+'], [b'b_bytes+']], type=pa.list_(pa.large_binary())),
    ], [
        'int32_list',
        'int64_list',
        'float32_list',
        'float64_list',
        'string_list',
        'large_string_list',
        'binary_list',
        'large_binary_list',
    ])
    sql = """
      SELECT
        ARRAY(
          SELECT
            STRUCT(int32_list, int64_list, float32_list, float64_list,
                   string_list, large_string_list, binary_list,
                   large_binary_list)
          FROM
            example.int32_list, example.int64_list, example.float32_list,
            example.float64_list, example.string_list,
            example.large_string_list, example.binary_list,
            example.large_binary_list
        ) as slice_key
      FROM Examples as example;"""
    query = sql_util.RecordBatchSQLSliceQuery(sql, record_batch.schema)
    slices = query.Execute(record_batch)
    self.assertEqual(slices, [[[('int32_list', '1'), ('int64_list', '10'),
                                ('float32_list', '1.1'), ('float64_list', '10.1'),
                                ('string_list', 'a'), ('large_string_list', 'a+'),
                                ('binary_list', 'a_bytes'),
                                ('large_binary_list', 'a_bytes+')]],
                              [[('int32_list', '2'), ('int64_list', '20'),
                                ('float32_list', '2.2'), ('float64_list', '20.2'),
                                ('string_list', 'b'), ('large_string_list', 'b+'),
                                ('binary_list', 'b_bytes'),
                                ('large_binary_list', 'b_bytes+')]]])

def string_array_resolver(obj):
    meta = obj.meta
    buffer_data = as_arrow_buffer(obj.member('buffer_data_'))
    buffer_offsets = as_arrow_buffer(obj.member('buffer_offsets_'))
    null_bitmap = as_arrow_buffer(obj.member('null_bitmap_'))
    length = int(meta['length_'])
    null_count = int(meta['null_count_'])
    offset = int(meta['offset_'])
    return pa.lib.Array.from_buffers(
        pa.large_string(), length,
        [null_bitmap, buffer_offsets, buffer_data],
        null_count, offset)

def decode(encoding, type_spec):
    if type_spec == 'string':
        return DataType(pa.string())
    if type_spec == 'large_string':
        return DataType(pa.large_string())
    # TODO: find a proper way to support all arrow types
    if type_spec == 'timestamp[ms]':
        return DataType(pa.timestamp('ms'))
    else:
        return DataType(np.dtype(type_spec))

def _arrow_string_upcast(array):
    if array.type == pa.large_string():
        return array
    if array.type == pa.string():
        import vaex.arrow.convert
        column = vaex.arrow.convert.column_from_arrow_array(array)
        column.indices = column.indices.astype(np.int64)
        return pa.array(column)
    else:
        raise ValueError('Not a string type: %r' % array)

def test_large_string_to_string(offset):
    s = pa.array(['aap', 'noot', None, 'mies'] * 3, type=pa.large_string())
    ns = convert.large_string_to_string(s)
    assert s.type != ns.type
    assert s.tolist() == ns.tolist()

    s = s.slice(offset)
    ns = convert.large_string_to_string(s)
    assert ns.offset < 8
    assert s.tolist() == ns.tolist()

def _resize_arrow_type(t):
    if t == pa.string():
        return pa.large_string()
    if t == pa.utf8():
        return pa.large_utf8()
    if t == pa.binary():
        return pa.large_binary()
    if isinstance(t, pa.lib.ListType):
        return pa.large_list(t.value_type)
    return t

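# A quick check of _resize_arrow_type's behaviour (not from the original
# source). Note that pa.utf8() is an alias for pa.string(), so the second
# branch is shadowed by the first (harmlessly, since pa.large_utf8() equals
# pa.large_string()), and list element types are not widened recursively:
assert _resize_arrow_type(pa.string()) == pa.large_string()
assert _resize_arrow_type(pa.binary()) == pa.large_binary()
assert _resize_arrow_type(pa.list_(pa.string())) == pa.large_list(pa.string())
assert _resize_arrow_type(pa.int64()) == pa.int64()  # non-resizable types pass through
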
def test_large_string_to_string(offset, chunked):
    s = pa.array(['aap', 'noot', None, 'mies'] * 3, type=pa.large_string())
    if chunked:
        s = pa.chunked_array([s.slice(0, 5), s.slice(5)])
    ns = convert.large_string_to_string(s)
    ns.validate()
    assert s.type != ns.type
    assert s.to_pylist() == ns.to_pylist()

    s = s.slice(offset)
    ns = convert.large_string_to_string(s)
    if not chunked:
        assert ns.offset < 8
    assert s.to_pylist() == ns.to_pylist()

def _map_arrow_type(arrow_type):
    arrow_to_dh = {
        pa.null(): '',
        pa.bool_(): '',
        pa.int8(): 'byte',
        pa.int16(): 'short',
        pa.int32(): 'int',
        pa.int64(): 'long',
        pa.uint8(): '',
        pa.uint16(): 'char',
        pa.uint32(): '',
        pa.uint64(): '',
        pa.float16(): '',
        pa.float32(): 'float',
        pa.float64(): 'double',
        pa.time32('s'): '',
        pa.time32('ms'): '',
        pa.time64('us'): '',
        pa.time64('ns'): 'io.deephaven.time.DateTime',
        pa.timestamp('us', tz=None): '',
        pa.timestamp('ns', tz=None): '',
        pa.date32(): 'java.time.LocalDate',
        pa.date64(): 'java.time.LocalDate',
        pa.binary(): '',
        pa.string(): 'java.lang.String',
        pa.utf8(): 'java.lang.String',
        pa.large_binary(): '',
        pa.large_string(): '',
        pa.large_utf8(): '',
        # decimal128(int precision, int scale=0)
        # list_(value_type, int list_size=-1)
        # large_list(value_type)
        # map_(key_type, item_type[, keys_sorted])
        # struct(fields)
        # dictionary(index_type, value_type, …)
        # field(name, type, bool nullable = True[, metadata])
        # schema(fields[, metadata])
        # from_numpy_dtype(dtype)
    }

    dh_type = arrow_to_dh.get(arrow_type)
    if not dh_type:
        # if this is a case of timestamp with tz specified
        if isinstance(arrow_type, pa.TimestampType):
            dh_type = "io.deephaven.time.DateTime"

    if not dh_type:
        raise DHError(f'unsupported arrow data type : {arrow_type}')

    return {"deephaven:type": dh_type}

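# Hedged usage sketch for _map_arrow_type (assumes Deephaven's DHError is
# importable, as in the module above). Entries mapped to '' are falsy, so they
# fall through to the TimestampType check and otherwise raise DHError; note
# that pa.large_string() is therefore rejected even though pa.string() maps
# to java.lang.String.
assert _map_arrow_type(pa.int32()) == {"deephaven:type": "int"}
assert _map_arrow_type(pa.timestamp('ns', tz='UTC')) == {
    "deephaven:type": "io.deephaven.time.DateTime"}
try:
    _map_arrow_type(pa.large_string())  # mapped to '' -> unsupported
except DHError:
    pass
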
def test_is_binary_string():
    assert types.is_binary(pa.binary())
    assert not types.is_binary(pa.string())
    assert not types.is_binary(pa.large_binary())
    assert not types.is_binary(pa.large_string())

    assert types.is_string(pa.string())
    assert types.is_unicode(pa.string())
    assert not types.is_string(pa.binary())
    assert not types.is_string(pa.large_string())
    assert not types.is_string(pa.large_binary())

    assert types.is_large_binary(pa.large_binary())
    assert not types.is_large_binary(pa.large_string())
    assert not types.is_large_binary(pa.binary())
    assert not types.is_large_binary(pa.string())

    assert types.is_large_string(pa.large_string())
    assert not types.is_large_string(pa.large_binary())
    assert not types.is_large_string(pa.string())
    assert not types.is_large_string(pa.binary())

    assert types.is_fixed_size_binary(pa.binary(5))
    assert not types.is_fixed_size_binary(pa.binary())

def decode(encoding, type_spec):
    if isinstance(type_spec, dict):
        if type_spec['type'] == 'list':
            sub = encoding.decode('dtype', type_spec['value_type']).arrow
            return DataType(pa.list_(sub))
        else:
            raise ValueError(f'Do not understand type {type_spec}')
    if type_spec == 'string':
        return DataType(pa.string())
    if type_spec == 'large_string':
        return DataType(pa.large_string())
    # TODO: find a proper way to support all arrow types
    if type_spec == 'timestamp[ms]':
        return DataType(pa.timestamp('ms'))
    else:
        return DataType(np.dtype(type_spec))

def json_type_to_pyarrow_type(
        typ: str,
        reverse: bool = False,
        logger: AirbyteLogger = AirbyteLogger()) -> str:
    """
    Converts a JSON type to a PyArrow type (or the other way around if reverse=True).

    :param typ: JSON type if reverse is False, else PyArrow type
    :param reverse: switch to True for PyArrow type -> JSON type, defaults to False
    :param logger: defaults to AirbyteLogger()
    :return: PyArrow type if reverse is False, else JSON type
    """
    str_typ = str(typ)
    # This is a map of Airbyte types to PyArrow types. The first list element
    # of the PyArrow types should be the one to use where required.
    map = {
        "boolean": ("bool_", "bool"),
        "integer": ("int64", "int8", "int16", "int32", "uint8", "uint16", "uint32", "uint64"),
        "number": ("float64", "float16", "float32", "decimal128", "decimal256", "halffloat", "float", "double"),
        "string": ("large_string", "string"),
        # TODO: support object type rather than coercing to string
        "object": ("large_string",),
        # TODO: support array type rather than coercing to string
        "array": ("large_string",),
        "null": ("large_string",),
    }
    if not reverse:
        for json_type, pyarrow_types in map.items():
            if str_typ.lower() == json_type:
                # a better way might be necessary when we decide to handle more type complexity
                return str(getattr(pa, pyarrow_types[0]).__call__())
        logger.debug(f"JSON type '{str_typ}' is not mapped, falling back to default conversion to large_string")
        return str(pa.large_string())
    else:
        for json_type, pyarrow_types in map.items():
            if any(str_typ.startswith(pa_type) for pa_type in pyarrow_types):
                return json_type
        logger.debug(f"PyArrow type '{str_typ}' is not mapped, falling back to default conversion to string")
        return "string"  # default type if unspecified in map

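# Hedged usage sketch for json_type_to_pyarrow_type (assumes the AirbyteLogger
# import used by the function above): the function trades in type *names*,
# preferring the first PyArrow spelling in each tuple, and falls back to
# large_string for unmapped JSON types.
assert json_type_to_pyarrow_type("string") == "large_string"
assert json_type_to_pyarrow_type("integer") == "int64"
assert json_type_to_pyarrow_type("int32", reverse=True) == "integer"
assert json_type_to_pyarrow_type("not-a-type") == "large_string"  # fallback
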
def test_large_string_unicode(self):
    arr = pa.array([u'foo', None, u'mañana'], type=pa.large_string())

    v = arr[0]
    assert isinstance(v, pa.LargeStringValue)
    assert v.as_py() == u'foo'
    assert repr(v) == repr(u"foo")
    assert str(v) == str(u"foo")
    assert v == u'foo'
    # Assert that newly created values are equal to the previously created one.
    assert v == arr[0]

    assert arr[1] is pa.NA

    v = arr[2].as_py()
    assert v == u'mañana'
    assert isinstance(v, unicode_type)