def test_type_comparisons():
    val = pa.int32()
    assert val == pa.int32()
    assert val == 'int32'

    with pytest.raises(TypeError):
        val == 5
def test_schema():
    fields = [
        pa.field('foo', pa.int32()),
        pa.field('bar', pa.string()),
        pa.field('baz', pa.list_(pa.int8()))
    ]
    sch = pa.schema(fields)

    assert sch.names == ['foo', 'bar', 'baz']
    assert sch.types == [pa.int32(), pa.string(), pa.list_(pa.int8())]

    assert len(sch) == 3
    assert sch[0].name == 'foo'
    assert sch[0].type == fields[0].type
    assert sch.field_by_name('foo').name == 'foo'
    assert sch.field_by_name('foo').type == fields[0].type

    assert repr(sch) == """\
foo: int32
bar: string
baz: list<item: int8>
  child 0, item: int8"""

    with pytest.raises(TypeError):
        pa.schema([None])
def test_field_add_remove_metadata():
    import collections

    f0 = pa.field('foo', pa.int32())

    assert f0.metadata is None

    metadata = {b'foo': b'bar', b'pandas': b'badger'}
    metadata2 = collections.OrderedDict([
        (b'a', b'alpha'),
        (b'b', b'beta')
    ])

    f1 = f0.add_metadata(metadata)
    assert f1.metadata == metadata

    f2 = f0.add_metadata(metadata2)
    assert f2.metadata == metadata2

    with pytest.raises(TypeError):
        f0.add_metadata([1, 2, 3])

    f3 = f1.remove_metadata()
    assert f3.metadata is None

    # idempotent
    f4 = f3.remove_metadata()
    assert f4.metadata is None

    f5 = pa.field('foo', pa.int32(), True, metadata)
    f6 = f0.add_metadata(metadata)
    assert f5.equals(f6)
def test_table_unsafe_casting():
    data = [
        pa.array(range(5), type=pa.int64()),
        pa.array([-10, -5, 0, 5, 10], type=pa.int32()),
        pa.array([1.1, 2.2, 3.3, 4.4, 5.5], type=pa.float64()),
        pa.array(['ab', 'bc', 'cd', 'de', 'ef'], type=pa.string())
    ]
    table = pa.Table.from_arrays(data, names=tuple('abcd'))

    expected_data = [
        pa.array(range(5), type=pa.int32()),
        pa.array([-10, -5, 0, 5, 10], type=pa.int16()),
        pa.array([1, 2, 3, 4, 5], type=pa.int64()),
        pa.array(['ab', 'bc', 'cd', 'de', 'ef'], type=pa.string())
    ]
    expected_table = pa.Table.from_arrays(expected_data, names=tuple('abcd'))

    target_schema = pa.schema([
        pa.field('a', pa.int32()),
        pa.field('b', pa.int16()),
        pa.field('c', pa.int64()),
        pa.field('d', pa.string())
    ])

    with pytest.raises(pa.ArrowInvalid,
                       match='Floating point value truncated'):
        table.cast(target_schema)

    casted_table = table.cast(target_schema, safe=False)
    assert casted_table.equals(expected_table)
def test_table_safe_casting():
    data = [
        pa.array(range(5), type=pa.int64()),
        pa.array([-10, -5, 0, 5, 10], type=pa.int32()),
        pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float64()),
        pa.array(['ab', 'bc', 'cd', 'de', 'ef'], type=pa.string())
    ]
    table = pa.Table.from_arrays(data, names=tuple('abcd'))

    expected_data = [
        pa.array(range(5), type=pa.int32()),
        pa.array([-10, -5, 0, 5, 10], type=pa.int16()),
        pa.array([1, 2, 3, 4, 5], type=pa.int64()),
        pa.array(['ab', 'bc', 'cd', 'de', 'ef'], type=pa.string())
    ]
    expected_table = pa.Table.from_arrays(expected_data, names=tuple('abcd'))

    target_schema = pa.schema([
        pa.field('a', pa.int32()),
        pa.field('b', pa.int16()),
        pa.field('c', pa.int64()),
        pa.field('d', pa.string())
    ])

    casted_table = table.cast(target_schema)
    assert casted_table.equals(expected_table)
def test_cast_from_null():
    in_data = [None] * 3
    in_type = pa.null()
    out_types = [
        pa.null(),
        pa.uint8(),
        pa.float16(),
        pa.utf8(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.int16()),
        pa.decimal128(19, 4),
        pa.timestamp('us'),
        pa.timestamp('us', tz='UTC'),
        pa.timestamp('us', tz='Europe/Paris'),
        pa.struct([pa.field('a', pa.int32()),
                   pa.field('b', pa.list_(pa.int8())),
                   pa.field('c', pa.string())]),
    ]
    for out_type in out_types:
        _check_cast_case((in_data, in_type, in_data, out_type))

    out_types = [
        pa.dictionary(pa.int32(), pa.string()),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())],
                 mode=pa.lib.UnionMode_DENSE),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())],
                 mode=pa.lib.UnionMode_SPARSE),
    ]
    in_arr = pa.array(in_data, type=pa.null())
    for out_type in out_types:
        with pytest.raises(NotImplementedError):
            in_arr.cast(out_type)
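# `_check_cast_case` is referenced throughout the cast tests in this excerpt
# but its definition is not shown. Below is a minimal sketch of what such a
# helper might look like, assuming the (in_data, in_type, out_data, out_type)
# tuple convention used above -- an illustration, not the actual test helper:
def _check_cast_case(case, safe=True):
    in_data, in_type, expected_data, out_type = case
    in_arr = pa.array(in_data, type=in_type)
    # Cast and compare against an array built directly with the target type
    casted = in_arr.cast(out_type, safe=safe)
    assert casted.equals(pa.array(expected_data, type=out_type))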
def test_is_union():
    for mode in [pa.lib.UnionMode_SPARSE, pa.lib.UnionMode_DENSE]:
        assert types.is_union(pa.union([pa.field('a', pa.int32()),
                                        pa.field('b', pa.int8()),
                                        pa.field('c', pa.string())],
                                       mode=mode))
    assert not types.is_union(pa.list_(pa.int32()))
def test_orcfile_empty():
    from pyarrow import orc
    f = orc.ORCFile(path_for_orc_example('TestOrcFile.emptyFile'))
    table = f.read()
    assert table.num_rows == 0
    schema = table.schema
    expected_schema = pa.schema([
        ('boolean1', pa.bool_()),
        ('byte1', pa.int8()),
        ('short1', pa.int16()),
        ('int1', pa.int32()),
        ('long1', pa.int64()),
        ('float1', pa.float32()),
        ('double1', pa.float64()),
        ('bytes1', pa.binary()),
        ('string1', pa.string()),
        ('middle', pa.struct([
            ('list', pa.list_(pa.struct([
                ('int1', pa.int32()),
                ('string1', pa.string()),
            ]))),
        ])),
        ('list', pa.list_(pa.struct([
            ('int1', pa.int32()),
            ('string1', pa.string()),
        ]))),
        ('map', pa.list_(pa.struct([
            ('key', pa.string()),
            ('value', pa.struct([
                ('int1', pa.int32()),
                ('string1', pa.string()),
            ])),
        ]))),
    ])
    assert schema == expected_schema
def test_array_eq_raises():
    # ARROW-2150: we are raising when comparing arrays until we define the
    # behavior to either be elementwise comparisons or data equality
    arr1 = pa.array([1, 2, 3], type=pa.int32())
    arr2 = pa.array([1, 2, 3], type=pa.int32())

    with pytest.raises(NotImplementedError):
        arr1 == arr2
def test_fields_hashable():
    in_dict = {}
    fields = [pa.field('a', pa.int64()),
              pa.field('a', pa.int32()),
              pa.field('b', pa.int32())]
    for i, field in enumerate(fields):
        in_dict[field] = i
    assert len(in_dict) == len(fields)
    for i, field in enumerate(fields):
        assert in_dict[field] == i
def test_convert_options():
    cls = ConvertOptions
    opts = cls()

    assert opts.check_utf8 is True
    opts.check_utf8 = False
    assert opts.check_utf8 is False

    assert opts.strings_can_be_null is False
    opts.strings_can_be_null = True
    assert opts.strings_can_be_null is True

    assert opts.column_types == {}
    # Pass column_types as mapping
    opts.column_types = {'b': pa.int16(), 'c': pa.float32()}
    assert opts.column_types == {'b': pa.int16(), 'c': pa.float32()}
    opts.column_types = {'v': 'int16', 'w': 'null'}
    assert opts.column_types == {'v': pa.int16(), 'w': pa.null()}
    # Pass column_types as schema
    schema = pa.schema([('a', pa.int32()), ('b', pa.string())])
    opts.column_types = schema
    assert opts.column_types == {'a': pa.int32(), 'b': pa.string()}
    # Pass column_types as sequence
    opts.column_types = [('x', pa.binary())]
    assert opts.column_types == {'x': pa.binary()}

    with pytest.raises(TypeError, match='DataType expected'):
        opts.column_types = {'a': None}
    with pytest.raises(TypeError):
        opts.column_types = 0

    assert isinstance(opts.null_values, list)
    assert '' in opts.null_values
    assert 'N/A' in opts.null_values
    opts.null_values = ['xxx', 'yyy']
    assert opts.null_values == ['xxx', 'yyy']

    assert isinstance(opts.true_values, list)
    opts.true_values = ['xxx', 'yyy']
    assert opts.true_values == ['xxx', 'yyy']

    assert isinstance(opts.false_values, list)
    opts.false_values = ['xxx', 'yyy']
    assert opts.false_values == ['xxx', 'yyy']

    opts = cls(check_utf8=False, column_types={'a': pa.null()},
               null_values=['N', 'nn'], true_values=['T', 'tt'],
               false_values=['F', 'ff'], strings_can_be_null=True)
    assert opts.check_utf8 is False
    assert opts.column_types == {'a': pa.null()}
    assert opts.null_values == ['N', 'nn']
    assert opts.false_values == ['F', 'ff']
    assert opts.true_values == ['T', 'tt']
    assert opts.strings_can_be_null is True
def test_struct_type():
    fields = [pa.field('a', pa.int64()),
              pa.field('a', pa.int32()),
              pa.field('b', pa.int32())]
    ty = pa.struct(fields)

    assert len(ty) == ty.num_children == 3
    assert list(ty) == fields
    for a, b in zip(ty, fields):
        assert a == b
def test_floating_point_truncate_safe():
    safe_cases = [
        (np.array([1.0, 2.0, 3.0], dtype='float32'), 'float32',
         np.array([1, 2, 3], dtype='i4'), pa.int32()),
        (np.array([1.0, 2.0, 3.0], dtype='float64'), 'float64',
         np.array([1, 2, 3], dtype='i4'), pa.int32()),
        (np.array([-10.0, 20.0, -30.0], dtype='float64'), 'float64',
         np.array([-10, 20, -30], dtype='i4'), pa.int32()),
    ]
    for case in safe_cases:
        _check_cast_case(case, safe=True)
def test_is_nested_or_struct():
    struct_ex = pa.struct([pa.field('a', pa.int32()),
                           pa.field('b', pa.int8()),
                           pa.field('c', pa.string())])

    assert types.is_struct(struct_ex)
    assert not types.is_struct(pa.list_(pa.int32()))

    assert types.is_nested(struct_ex)
    assert types.is_nested(pa.list_(pa.int32()))
    assert not types.is_nested(pa.int32())
def test_dictionary_type():
    ty0 = pa.dictionary(pa.int32(), pa.array(['a', 'b', 'c']))
    assert ty0.index_type == pa.int32()
    assert isinstance(ty0.dictionary, pa.Array)
    assert ty0.dictionary.to_pylist() == ['a', 'b', 'c']
    assert ty0.ordered is False

    ty1 = pa.dictionary(pa.int8(), pa.array([1.0, 2.0]), ordered=True)
    assert ty1.index_type == pa.int8()
    assert isinstance(ty1.dictionary, pa.Array)
    assert ty1.dictionary.to_pylist() == [1.0, 2.0]
    assert ty1.ordered is True
def test_schema_equals_propagates_check_metadata():
    # ARROW-4088
    schema1 = pa.schema([
        pa.field('foo', pa.int32()),
        pa.field('bar', pa.string())
    ])
    schema2 = pa.schema([
        pa.field('foo', pa.int32()),
        pa.field('bar', pa.string(), metadata={'a': 'alpha'}),
    ])
    assert not schema1.equals(schema2)
    assert schema1.equals(schema2, check_metadata=False)
def test_nested_lists(seq):
    data = [[], [1, 2], None]
    arr = pa.array(seq(data))
    assert len(arr) == 3
    assert arr.null_count == 1
    assert arr.type == pa.list_(pa.int64())
    assert arr.to_pylist() == data
    # With explicit type
    arr = pa.array(seq(data), type=pa.list_(pa.int32()))
    assert len(arr) == 3
    assert arr.null_count == 1
    assert arr.type == pa.list_(pa.int32())
    assert arr.to_pylist() == data
def test_type_to_pandas_dtype():
    M8_ns = np.dtype('datetime64[ns]')
    cases = [
        (pa.null(), np.float64),
        (pa.bool_(), np.bool_),
        (pa.int8(), np.int8),
        (pa.int16(), np.int16),
        (pa.int32(), np.int32),
        (pa.int64(), np.int64),
        (pa.uint8(), np.uint8),
        (pa.uint16(), np.uint16),
        (pa.uint32(), np.uint32),
        (pa.uint64(), np.uint64),
        (pa.float16(), np.float16),
        (pa.float32(), np.float32),
        (pa.float64(), np.float64),
        (pa.date32(), M8_ns),
        (pa.date64(), M8_ns),
        (pa.timestamp('ms'), M8_ns),
        (pa.binary(), np.object_),
        (pa.binary(12), np.object_),
        (pa.string(), np.object_),
        (pa.list_(pa.int8()), np.object_),
    ]
    for arrow_type, numpy_type in cases:
        assert arrow_type.to_pandas_dtype() == numpy_type
def test_dictionary_type():
    ty0 = pa.dictionary(pa.int32(), pa.string())
    assert ty0.index_type == pa.int32()
    assert ty0.value_type == pa.string()
    assert ty0.ordered is False

    ty1 = pa.dictionary(pa.int8(), pa.float64(), ordered=True)
    assert ty1.index_type == pa.int8()
    assert ty1.value_type == pa.float64()
    assert ty1.ordered is True

    # construct from non-arrow objects
    ty2 = pa.dictionary('int8', 'string')
    assert ty2.index_type == pa.int8()
    assert ty2.value_type == pa.string()
    assert ty2.ordered is False
def test_struct_from_tuples():
    ty = pa.struct([pa.field('a', pa.int32()),
                    pa.field('b', pa.string()),
                    pa.field('c', pa.bool_())])

    data = [(5, 'foo', True), (6, 'bar', False)]
    expected = [{'a': 5, 'b': 'foo', 'c': True},
                {'a': 6, 'b': 'bar', 'c': False}]
    arr = pa.array(data, type=ty)

    data_as_ndarray = np.empty(len(data), dtype=object)
    data_as_ndarray[:] = data
    arr2 = pa.array(data_as_ndarray, type=ty)

    assert arr.to_pylist() == expected
    assert arr.equals(arr2)

    # With omitted values
    data = [(5, 'foo', None), None, (6, None, False)]
    expected = [{'a': 5, 'b': 'foo', 'c': None},
                None,
                {'a': 6, 'b': None, 'c': False}]
    arr = pa.array(data, type=ty)
    assert arr.to_pylist() == expected

    # Invalid tuple size
    for tup in [(5, 'foo'), (), ('5', 'foo', True, None)]:
        with pytest.raises(ValueError, match="(?i)tuple size"):
            pa.array([tup], type=ty)
def test_empty_cast():
    types = [
        pa.null(),
        pa.bool_(),
        pa.int8(), pa.int16(), pa.int32(), pa.int64(),
        pa.uint8(), pa.uint16(), pa.uint32(), pa.uint64(),
        pa.float16(), pa.float32(), pa.float64(),
        pa.date32(), pa.date64(),
        pa.binary(), pa.binary(length=4),
        pa.string(),
    ]
    for (t1, t2) in itertools.product(types, types):
        try:
            # ARROW-4766: Ensure that casts between supported types
            # don't segfault on empty arrays of common types
            pa.array([], type=t1).cast(t2)
        except pa.lib.ArrowNotImplementedError:
            continue
def test_recordbatch_basics():
    data = [
        pa.array(range(5)),
        pa.array([-10, -5, 0, 5, 10])
    ]

    batch = pa.RecordBatch.from_arrays(data, ['c0', 'c1'])
    assert not batch.schema.metadata

    assert len(batch) == 5
    assert batch.num_rows == 5
    assert batch.num_columns == len(data)
    assert batch.to_pydict() == OrderedDict([
        ('c0', [0, 1, 2, 3, 4]),
        ('c1', [-10, -5, 0, 5, 10])
    ])

    with pytest.raises(IndexError):
        # bounds checking
        batch[2]

    # Schema passed explicitly
    schema = pa.schema([pa.field('c0', pa.int16()),
                        pa.field('c1', pa.int32())],
                       metadata={b'foo': b'bar'})
    batch = pa.RecordBatch.from_arrays(data, schema)
    assert batch.schema == schema
def test_cast_integers_safe():
    safe_cases = [
        (np.array([0, 1, 2, 3], dtype='i1'), 'int8',
         np.array([0, 1, 2, 3], dtype='i4'), pa.int32()),
        (np.array([0, 1, 2, 3], dtype='i1'), 'int8',
         np.array([0, 1, 2, 3], dtype='u4'), pa.uint16()),
        (np.array([0, 1, 2, 3], dtype='i1'), 'int8',
         np.array([0, 1, 2, 3], dtype='u1'), pa.uint8()),
        (np.array([0, 1, 2, 3], dtype='i1'), 'int8',
         np.array([0, 1, 2, 3], dtype='f8'), pa.float64())
    ]
    for case in safe_cases:
        _check_cast_case(case)

    unsafe_cases = [
        (np.array([50000], dtype='i4'), 'int32', 'int16'),
        (np.array([70000], dtype='i4'), 'int32', 'uint16'),
        (np.array([-1], dtype='i4'), 'int32', 'uint16'),
        (np.array([50000], dtype='u2'), 'uint16', 'int16')
    ]
    for in_data, in_type, out_type in unsafe_cases:
        in_arr = pa.array(in_data, type=in_type)
        with pytest.raises(pa.ArrowInvalid):
            in_arr.cast(out_type)
def test_buffer_lifetime(self):
    # ARROW-2195
    arr = pa.array([1, 12, 23, 3, 34], pa.int32())
    batch = pa.RecordBatch.from_arrays([arr], ['field1'])

    # Serialize RecordBatch into Plasma store
    sink = pa.MockOutputStream()
    writer = pa.RecordBatchStreamWriter(sink, batch.schema)
    writer.write_batch(batch)
    writer.close()

    object_id = random_object_id()
    data_buffer = self.plasma_client.create(object_id, sink.size())
    stream = pa.FixedSizeBufferWriter(data_buffer)
    writer = pa.RecordBatchStreamWriter(stream, batch.schema)
    writer.write_batch(batch)
    writer.close()
    self.plasma_client.seal(object_id)
    del data_buffer

    # Deserialize RecordBatch from Plasma store
    [data_buffer] = self.plasma_client2.get_buffers([object_id])
    reader = pa.RecordBatchStreamReader(data_buffer)
    read_batch = reader.read_next_batch()
    # Lose reference to returned buffer.  The RecordBatch must still
    # be backed by valid memory.
    del data_buffer, reader

    assert read_batch.equals(batch)
def _from_jvm_int_type(jvm_type):
    """
    Convert a JVM int type to its Python equivalent.

    Parameters
    ----------
    jvm_type : org.apache.arrow.vector.types.pojo.ArrowType$Int

    Returns
    -------
    typ : pyarrow.DataType
    """
    if jvm_type.isSigned:
        if jvm_type.bitWidth == 8:
            return pa.int8()
        elif jvm_type.bitWidth == 16:
            return pa.int16()
        elif jvm_type.bitWidth == 32:
            return pa.int32()
        elif jvm_type.bitWidth == 64:
            return pa.int64()
    else:
        if jvm_type.bitWidth == 8:
            return pa.uint8()
        elif jvm_type.bitWidth == 16:
            return pa.uint16()
        elif jvm_type.bitWidth == 32:
            return pa.uint32()
        elif jvm_type.bitWidth == 64:
            return pa.uint64()
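# A hedged usage sketch for `_from_jvm_int_type`: the real argument is a JVM
# POJO reached through jpype, but any object exposing `isSigned` and
# `bitWidth` attributes exercises the same branches. `FakeJvmInt` is a
# made-up stand-in for illustration only.
class FakeJvmInt:
    def __init__(self, bit_width, is_signed):
        self.bitWidth = bit_width
        self.isSigned = is_signed


assert _from_jvm_int_type(FakeJvmInt(32, True)) == pa.int32()
assert _from_jvm_int_type(FakeJvmInt(16, False)) == pa.uint16()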
def test_floating_point_truncate_unsafe():
    unsafe_cases = [
        (np.array([1.1, 2.2, 3.3], dtype='float32'), 'float32',
         np.array([1, 2, 3], dtype='i4'), pa.int32()),
        (np.array([1.1, 2.2, 3.3], dtype='float64'), 'float64',
         np.array([1, 2, 3], dtype='i4'), pa.int32()),
        (np.array([-10.1, 20.2, -30.3], dtype='float64'), 'float64',
         np.array([-10, 20, -30], dtype='i4'), pa.int32()),
    ]
    for case in unsafe_cases:
        # test safe casting raises
        with pytest.raises(pa.ArrowInvalid,
                           match='Floating point value truncated'):
            _check_cast_case(case, safe=True)

        # test unsafe casting truncates
        _check_cast_case(case, safe=False)
def test_type_list():
    value_type = pa.int32()
    list_type = pa.list_(value_type)
    assert str(list_type) == 'list<item: int32>'

    field = pa.field('my_item', pa.string())
    l2 = pa.list_(field)
    assert str(l2) == 'list<my_item: string>'
def test_struct_from_mixed_sequence():
    # It is forbidden to mix dicts and tuples when initializing a struct array
    ty = pa.struct([pa.field('a', pa.int32()),
                    pa.field('b', pa.string()),
                    pa.field('c', pa.bool_())])
    data = [(5, 'foo', True), {'a': 6, 'b': 'bar', 'c': False}]
    with pytest.raises(TypeError):
        pa.array(data, type=ty)
def do_get(self, ticket):
    data1 = [pa.array([-10, -5, 0, 5, 10], type=pa.int32())]
    data2 = [pa.array([-10.0, -5.0, 0.0, 5.0, 10.0], type=pa.float64())]
    assert data1[0].type != data2[0].type
    table1 = pa.Table.from_arrays(data1, names=['a'])
    table2 = pa.Table.from_arrays(data2, names=['a'])
    assert table1.schema == self.schema

    return flight.GeneratorStream(self.schema, [table1, table2])
def test_field_flatten():
    f0 = pa.field('foo', pa.int32()).add_metadata({b'foo': b'bar'})
    assert f0.flatten() == [f0]

    f1 = pa.field('bar', pa.float64(), nullable=False)
    ff = pa.field('ff', pa.struct([f0, f1]), nullable=False)
    assert ff.flatten() == [
        pa.field('ff.foo', pa.int32()).add_metadata({b'foo': b'bar'}),
        pa.field('ff.bar', pa.float64(), nullable=False)]  # XXX

    # Nullable parent makes flattened child nullable
    ff = pa.field('ff', pa.struct([f0, f1]))
    assert ff.flatten() == [
        pa.field('ff.foo', pa.int32()).add_metadata({b'foo': b'bar'}),
        pa.field('ff.bar', pa.float64())]

    fff = pa.field('fff', pa.struct([ff]))
    assert fff.flatten() == [pa.field('fff.ff', pa.struct([f0, f1]))]
def test_is_floating():
    for t in [pa.float16(), pa.float32(), pa.float64()]:
        assert types.is_floating(t)
    assert not types.is_floating(pa.int32())
    pq.write_table(table, tempdir / 'test_metadata_segfault.parquet')
    parquet_file = pq.ParquetFile(tempdir / 'test_metadata_segfault.parquet')
    parquet_file.metadata.row_group(0).column(0).statistics


@pytest.mark.pandas
@pytest.mark.parametrize(
    ('data', 'type', 'physical_type', 'min_value', 'max_value',
     'null_count', 'num_values', 'distinct_count'),
    [
        ([1, 2, 2, None, 4], pa.uint8(), 'INT32', 1, 4, 1, 4, 0),
        ([1, 2, 2, None, 4], pa.uint16(), 'INT32', 1, 4, 1, 4, 0),
        ([1, 2, 2, None, 4], pa.uint32(), 'INT32', 1, 4, 1, 4, 0),
        ([1, 2, 2, None, 4], pa.uint64(), 'INT64', 1, 4, 1, 4, 0),
        ([-1, 2, 2, None, 4], pa.int8(), 'INT32', -1, 4, 1, 4, 0),
        ([-1, 2, 2, None, 4], pa.int16(), 'INT32', -1, 4, 1, 4, 0),
        ([-1, 2, 2, None, 4], pa.int32(), 'INT32', -1, 4, 1, 4, 0),
        ([-1, 2, 2, None, 4], pa.int64(), 'INT64', -1, 4, 1, 4, 0),
        ([-1.1, 2.2, 2.3, None, 4.4], pa.float32(), 'FLOAT',
         -1.1, 4.4, 1, 4, 0),
        ([-1.1, 2.2, 2.3, None, 4.4], pa.float64(), 'DOUBLE',
         -1.1, 4.4, 1, 4, 0),
        (['', 'b', chr(1000), None, 'aaa'], pa.binary(), 'BYTE_ARRAY',
         b'', chr(1000).encode('utf-8'), 1, 4, 0),
        ([True, False, False, True, True], pa.bool_(), 'BOOLEAN',
         False, True, 0, 5, 0),
        ([b'\x00', b'b', b'12', None, b'aaa'], pa.binary(), 'BYTE_ARRAY',
         b'\x00', b'b', 1, 4, 0),
    ])
def test_parquet_column_statistics_api(data, type, physical_type, min_value,
                                       max_value, null_count, num_values,
                                       distinct_count):
def pyarrow_types_from_pandas(df: pd.DataFrame,
                              index: bool,
                              ignore_cols: Optional[List[str]] = None,
                              index_left: bool = False
                              ) -> Dict[str, pa.DataType]:
    """Extract the related Pyarrow data types from any Pandas DataFrame."""
    # Handle exception data types (e.g. Int64, Int32, string)
    ignore_cols = [] if ignore_cols is None else ignore_cols
    cols: List[str] = []
    cols_dtypes: Dict[str, Optional[pa.DataType]] = {}
    for name, dtype in df.dtypes.to_dict().items():
        dtype = str(dtype)
        if name in ignore_cols:
            cols_dtypes[name] = None
        elif dtype == "Int8":
            cols_dtypes[name] = pa.int8()
        elif dtype == "Int16":
            cols_dtypes[name] = pa.int16()
        elif dtype == "Int32":
            cols_dtypes[name] = pa.int32()
        elif dtype == "Int64":
            cols_dtypes[name] = pa.int64()
        elif dtype == "string":
            cols_dtypes[name] = pa.string()
        else:
            cols.append(name)

    # Filling cols_dtypes
    for col in cols:
        _logger.debug("Inferring PyArrow type from column: %s", col)
        try:
            schema: pa.Schema = pa.Schema.from_pandas(df=df[[col]],
                                                      preserve_index=False)
        except pa.ArrowInvalid as ex:
            cols_dtypes[col] = process_not_inferred_dtype(ex)
        except TypeError as ex:
            msg = str(ex)
            if " is required (got type " in msg:
                raise TypeError(
                    f"The {col} column has a too generic data type "
                    f"({df[col].dtype}) and seems to have mixed data types "
                    f"({msg}). Please, cast this column with a more "
                    "deterministic data type "
                    f"(e.g. df['{col}'] = df['{col}'].astype('string')) or "
                    "pass the column schema as argument for AWS Data Wrangler "
                    f"(e.g. dtype={{'{col}': 'string'}})") from ex
            raise
        else:
            cols_dtypes[col] = schema.field(col).type

    # Filling indexes
    indexes: List[str] = []
    if index is True:
        for field in pa.Schema.from_pandas(df=df[[]], preserve_index=True):
            name = str(field.name)
            _logger.debug("Inferring PyArrow type from index: %s", name)
            cols_dtypes[name] = field.type
            indexes.append(name)

    # Merging Index
    sorted_cols: List[str] = (indexes + list(df.columns)
                              if index_left is True
                              else list(df.columns) + indexes)

    # Filling schema
    columns_types: Dict[str, pa.DataType]
    columns_types = {n: cols_dtypes[n] for n in sorted_cols}
    _logger.debug("columns_types: %s", columns_types)
    return columns_types
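# A hedged usage sketch of `pyarrow_types_from_pandas`: nullable extension
# dtypes are mapped directly, everything else goes through
# `pa.Schema.from_pandas`. The mapping shown in the trailing comment follows
# from the branches above, not from a captured run.
df_example = pd.DataFrame({
    "a": pd.array([1, 2], dtype="Int32"),       # extension dtype branch
    "b": pd.array(["x", "y"], dtype="string"),  # extension dtype branch
    "c": [1.5, 2.5],                            # inferred via from_pandas
})
types_example = pyarrow_types_from_pandas(df=df_example, index=False)
# -> {'a': int32, 'b': string, 'c': double}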
def test_type_comparisons():
    val = pa.int32()
    assert val == pa.int32()
    assert val == 'int32'
    assert val != 5
def test_field_id_metadata():
    # ARROW-7080
    field_id = b'PARQUET:field_id'
    inner = pa.field('inner', pa.int32(), metadata={field_id: b'100'})
    middle = pa.field('middle', pa.struct([inner]),
                      metadata={field_id: b'101'})
    fields = [
        pa.field('basic', pa.int32(),
                 metadata={b'other': b'abc', field_id: b'1'}),
        pa.field('list',
                 pa.list_(pa.field('list-inner', pa.int32(),
                                   metadata={field_id: b'10'})),
                 metadata={field_id: b'11'}),
        pa.field('struct', pa.struct([middle]), metadata={field_id: b'102'}),
        pa.field('no-metadata', pa.int32()),
        pa.field('non-integral-field-id', pa.int32(),
                 metadata={field_id: b'xyz'}),
        pa.field('negative-field-id', pa.int32(),
                 metadata={field_id: b'-1000'})
    ]
    arrs = [[] for _ in fields]
    table = pa.table(arrs, schema=pa.schema(fields))

    bio = pa.BufferOutputStream()
    pq.write_table(table, bio)
    contents = bio.getvalue()

    pf = pq.ParquetFile(pa.BufferReader(contents))
    schema = pf.schema_arrow

    assert schema[0].metadata[field_id] == b'1'
    assert schema[0].metadata[b'other'] == b'abc'

    list_field = schema[1]
    assert list_field.metadata[field_id] == b'11'

    list_item_field = list_field.type.value_field
    assert list_item_field.metadata[field_id] == b'10'

    struct_field = schema[2]
    assert struct_field.metadata[field_id] == b'102'

    struct_middle_field = struct_field.type[0]
    assert struct_middle_field.metadata[field_id] == b'101'

    struct_inner_field = struct_middle_field.type[0]
    assert struct_inner_field.metadata[field_id] == b'100'

    assert schema[3].metadata is None
    # Invalid input is passed through (ok) but does not
    # have field_id in parquet (not tested)
    assert schema[4].metadata[field_id] == b'xyz'
    assert schema[5].metadata[field_id] == b'-1000'
def test_is_dictionary():
    assert types.is_dictionary(pa.dictionary(pa.int32(), pa.string()))
    assert not types.is_dictionary(pa.int32())
def test_schema_pyarrow_types():
    field_name = "column1"
    metadata = {b"metadata_k": b"metadata_v"}
    pyarrow_field = pyarrow_field_from_dict({
        "name": field_name,
        "nullable": False,
        "metadata": metadata,
        "type": {"name": "int", "bitWidth": 8, "isSigned": True},
    })
    assert pyarrow_field.name == field_name
    assert pyarrow_field.type == pyarrow.int8()
    assert dict(pyarrow_field.metadata) == metadata
    assert pyarrow_field.nullable is False

    field_name = "column_timestamp_no_unit"
    metadata = {b"metadata_k": b"metadata_v"}
    pyarrow_field = pyarrow_field_from_dict({
        "name": field_name,
        "nullable": False,
        "metadata": metadata,
        "type": {"name": "timestamp"},
    })
    assert pyarrow_field.name == field_name
    assert pyarrow_field.type == pyarrow.timestamp("ns")
    assert dict(pyarrow_field.metadata) == metadata
    assert pyarrow_field.nullable is False

    field_name = "column_timestamp_with_unit"
    metadata = {b"metadata_k": b"metadata_v"}
    pyarrow_field = pyarrow_field_from_dict({
        "name": field_name,
        "nullable": False,
        "metadata": metadata,
        "type": {"name": "timestamp", "unit": "MICROSECOND"},
    })
    assert pyarrow_field.name == field_name
    assert pyarrow_field.type == pyarrow.timestamp("us")
    assert dict(pyarrow_field.metadata) == metadata
    assert pyarrow_field.nullable is False

    field_name = "date_with_day_unit"
    metadata = {b"metadata_k": b"metadata_v"}
    pyarrow_field = pyarrow_field_from_dict({
        "name": field_name,
        "nullable": False,
        "metadata": metadata,
        "type": {"name": "date", "unit": "DAY"},
    })
    assert pyarrow_field.name == field_name
    assert pyarrow_field.type == pyarrow.date32()
    assert dict(pyarrow_field.metadata) == metadata
    assert pyarrow_field.nullable is False

    field_name = "simple_list"
    pyarrow_field = pyarrow_field_from_dict({
        "name": field_name,
        "nullable": False,
        "metadata": metadata,
        "type": {"name": "list"},
        "children": [
            {"type": {"name": "int", "bitWidth": 32, "isSigned": True}}
        ],
    })
    assert pyarrow_field.name == field_name
    assert pyarrow_field.type == pyarrow.list_(
        pyarrow.field("element", pyarrow.int32()))
    assert pyarrow_field.metadata == metadata
    assert pyarrow_field.nullable is False

    field_name = "dictionary"
    pyarrow_field = pyarrow_field_from_dict({
        "name": field_name,
        "nullable": False,
        "metadata": metadata,
        "type": {"name": "int", "bitWidth": 32, "isSigned": True},
        "children": [],
        "dictionary": {
            "id": 0,
            "indexType": {"name": "int", "bitWidth": 16, "isSigned": True},
        },
    })
    assert pyarrow_field.name == field_name
    assert pyarrow_field.type == pyarrow.map_(pyarrow.int16(),
                                              pyarrow.int32())
    assert pyarrow_field.metadata == metadata
    assert pyarrow_field.nullable is False

    field_name = "struct_array"
    pyarrow_field = pyarrow_field_from_dict({
        "name": field_name,
        "nullable": False,
        "metadata": metadata,
        "type": {"name": "list"},
        "children": [],
        "dictionary": {
            "id": 0,
            "indexType": {"name": "int", "bitWidth": 32, "isSigned": True},
        },
    })
    assert pyarrow_field.name == field_name
    assert pyarrow_field.type == pyarrow.map_(
        pyarrow.int32(),
        pyarrow.list_(
            pyarrow.field(
                "element",
                pyarrow.struct(
                    [pyarrow.field("val", pyarrow.int32(), False, metadata)]),
            )),
    )
    assert pyarrow_field.metadata == metadata
    assert pyarrow_field.nullable is False

    field_name = "simple_dictionary"
    pyarrow_field = pyarrow_field_from_dict({
        "name": field_name,
        "metadata": {"metadata_k": "metadata_v"},
        "nullable": False,
        "type": {"name": "dictionary"},
        "dictionary": {
            "indexType": {"type": {"name": "int", "bitWidth": 8}}
        },
        "children": [{"type": {"name": "int", "bitWidth": 32}}],
    })
    assert pyarrow_field.name == field_name
    assert pyarrow_field.type == pyarrow.map_(pyarrow.int8(),
                                              pyarrow.int32())
    assert pyarrow_field.metadata == metadata
    assert pyarrow_field.nullable is False

    pyarrow_field = pyarrow_field_from_dict({
        "name": field_name,
        "type": {"name": "struct"},
        "children": [{
            "name": "x",
            "type": {"name": "int", "bitWidth": 64},
            "nullable": True,
            "metadata": {},
        }],
        "metadata": {"metadata_k": "metadata_v"},
        "nullable": False,
    })
    assert pyarrow_field.name == field_name
    assert pyarrow_field.type == pyarrow.struct(
        [pyarrow.field("x", pyarrow.int64(), True, {})])
    assert pyarrow_field.metadata == metadata
    assert pyarrow_field.nullable is False
"instrument_id": pa.dictionary(pa.int64(), pa.string()), "bid": pa.string(), "ask": pa.string(), "bid_size": pa.string(), "ask_size": pa.string(), "last": pa.string(), "ts_event": pa.int64(), "ts_init": pa.int64(), }), BinanceBar: pa.schema({ "bar_type": pa.dictionary(pa.int8(), pa.string()), "instrument_id": pa.dictionary(pa.int64(), pa.string()), "open": pa.string(), "high": pa.string(), "low": pa.string(), "close": pa.string(), "volume": pa.string(), "quote_volume": pa.string(), "count": pa.int32(), "taker_buy_base_volume": pa.string(), "taker_buy_quote_volume": pa.string(), "ts_event": pa.int64(), "ts_init": pa.int64(), }), } # default schemas for cls, schema in NAUTILUS_PARQUET_SCHEMA.items(): register_parquet(cls, schema=schema)
              pa.field('b', pa.int32(), nullable=False)]
    for i, field in enumerate(fields):
        in_dict[field] = i
    assert len(in_dict) == len(fields)
    for i, field in enumerate(fields):
        assert in_dict[field] == i


@pytest.mark.parametrize('t,check_func', [
    (pa.date32(), types.is_date32),
    (pa.date64(), types.is_date64),
    (pa.time32('s'), types.is_time32),
    (pa.time64('ns'), types.is_time64),
    (pa.int8(), types.is_int8),
    (pa.int16(), types.is_int16),
    (pa.int32(), types.is_int32),
    (pa.int64(), types.is_int64),
    (pa.uint8(), types.is_uint8),
    (pa.uint16(), types.is_uint16),
    (pa.uint32(), types.is_uint32),
    (pa.uint64(), types.is_uint64),
    (pa.float16(), types.is_float16),
    (pa.float32(), types.is_float32),
    (pa.float64(), types.is_float64)])
def test_exact_primitive_types(t, check_func):
    assert check_func(t)


def test_type_id():
    # enum values are not exposed publicly
    for ty in get_many_types():
def test_sql(parameters, db_type):
    df = get_df()
    if db_type == "redshift":
        df.drop(["binary"], axis=1, inplace=True)
    engine = wr.catalog.get_engine(connection=f"aws-data-wrangler-{db_type}")
    wr.db.to_sql(
        df=df,
        con=engine,
        name="test_sql",
        schema=parameters[db_type]["schema"],
        if_exists="replace",
        index=False,
        index_label=None,
        chunksize=None,
        method=None,
        dtype={"iint32": sqlalchemy.types.Integer},
    )
    df = wr.db.read_sql_query(
        sql=f"SELECT * FROM {parameters[db_type]['schema']}.test_sql",
        con=engine)
    ensure_data_types(df, has_list=False)
    engine = wr.db.get_engine(
        db_type=db_type,
        host=parameters[db_type]["host"],
        port=parameters[db_type]["port"],
        database=parameters[db_type]["database"],
        user=parameters["user"],
        password=parameters["password"],
    )
    dfs = wr.db.read_sql_query(
        sql=f"SELECT * FROM {parameters[db_type]['schema']}.test_sql",
        con=engine,
        chunksize=1,
        dtype={
            "iint8": pa.int8(),
            "iint16": pa.int16(),
            "iint32": pa.int32(),
            "iint64": pa.int64(),
            "float": pa.float32(),
            "double": pa.float64(),
            "decimal": pa.decimal128(3, 2),
            "string_object": pa.string(),
            "string": pa.string(),
            "date": pa.date32(),
            "timestamp": pa.timestamp(unit="ns"),
            "binary": pa.binary(),
            "category": pa.float64(),
        },
    )
    for df in dfs:
        ensure_data_types(df, has_list=False)
    if db_type != "redshift":
        account_id = boto3.client("sts").get_caller_identity().get("Account")
        engine = wr.catalog.get_engine(
            connection=f"aws-data-wrangler-{db_type}", catalog_id=account_id)
        wr.db.to_sql(
            df=pd.DataFrame({"col0": [1, 2, 3]}, dtype="Int32"),
            con=engine,
            name="test_sql",
            schema=parameters[db_type]["schema"],
            if_exists="replace",
            index=True,
            index_label="index",
        )
        schema = None
        if db_type == "postgresql":
            schema = parameters[db_type]["schema"]
        df = wr.db.read_sql_table(con=engine,
                                  table="test_sql",
                                  schema=schema,
                                  index_col="index")
        assert len(df.index) == 3
        assert len(df.columns) == 1
def test_is_dictionary():
    assert types.is_dictionary(
        pa.dictionary(pa.int32(), pa.array(['a', 'b', 'c'])))
    assert not types.is_dictionary(pa.int32())
import numpy as np
import pandas as pd
import pyarrow as pa
from pandas.core.dtypes.common import infer_dtype_from_object
from pandas.core.dtypes.dtypes import CategoricalDtype, CategoricalDtypeType

import cudf
from cudf._lib.scalar import DeviceScalar, _is_null_host_scalar

_NA_REP = "<NA>"

_np_pa_dtypes = {
    np.float64: pa.float64(),
    np.float32: pa.float32(),
    np.int64: pa.int64(),
    np.longlong: pa.int64(),
    np.int32: pa.int32(),
    np.int16: pa.int16(),
    np.int8: pa.int8(),
    np.bool_: pa.int8(),
    np.uint64: pa.uint64(),
    np.uint32: pa.uint32(),
    np.uint16: pa.uint16(),
    np.uint8: pa.uint8(),
    np.datetime64: pa.date64(),
    np.object_: pa.string(),
    np.str_: pa.string(),
}

cudf_dtypes_to_pandas_dtypes = {
    np.dtype("uint8"): pd.UInt8Dtype(),
    np.dtype("uint16"): pd.UInt16Dtype(),
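# A hedged usage note on the mapping above: lookups are keyed by the NumPy
# scalar type object itself, and note that this snippet maps np.bool_ to
# pa.int8() rather than pa.bool_().
assert _np_pa_dtypes[np.int32] == pa.int32()
assert _np_pa_dtypes[np.bool_] == pa.int8()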
def test_iterate_over_timestamp_tz_chunk():
    random.seed(datetime.datetime.now())
    scale = random.randint(0, 9)
    column_meta = [{
        "byteLength": "16" if scale > 3 else "8",
        "logicalType": "TIMESTAMP_TZ",
        "scale": str(scale)
    }, {
        "byteLength": "16" if scale > 3 else "8",
        "logicalType": "TIMESTAMP_TZ",
        "scale": str(scale)
    }]
    type1 = pyarrow.struct([
        pyarrow.field('epoch', pyarrow.int64()),
        pyarrow.field('timezone', pyarrow.int32()),
        pyarrow.field('fraction', pyarrow.int32())
    ])
    type2 = pyarrow.struct([
        pyarrow.field('epoch', pyarrow.int64()),
        pyarrow.field('timezone', pyarrow.int32())
    ])
    data_type = type1 if scale > 3 else type2

    def timestamp_tz_generator(scale):
        epoch = random.randint(-621355968, 2534023007)
        frac = (random.randint(0, 10**scale - 1) * (10**(9 - scale))
                if scale > 3 else random.randint(0, 10**scale - 1))
        timezone = random.randint(1, 2879)
        if scale > 3:
            return {'epoch': epoch, 'timezone': timezone, 'fraction': frac}
        else:
            epoch = str(epoch)
            frac = str(frac)
            ZEROFILL = '000000000'
            frac = ZEROFILL[:scale - len(frac)] + frac
            return {
                'epoch': int(epoch + frac) if scale else int(epoch),
                'timezone': timezone
            }

    def expected_data_transform_tz(_scale):
        def expected_data_transform_tz_impl(data, scale=_scale):
            timezone = data['timezone']
            tzinfo = _generate_tzinfo_from_tzoffset(timezone - 1440)
            epoch = data['epoch']
            if scale > 3:
                frac = data['fraction']
                if epoch < 0:
                    epoch += 1
                    frac = 10**9 - frac
                frac = str(int(frac / 10**(9 - scale)))
                ZERO_FILL = '000000000'
                frac = ZERO_FILL[:scale - len(frac)] + frac
                epoch = int(str(epoch) + frac)
            microsec = str(epoch)
            if scale > 6:
                microsec = (microsec[:-scale] + "." +
                            microsec[-scale:-scale + 6])
            else:
                microsec = (microsec[:-scale] + "." + microsec[-scale:]
                            if scale else microsec)
            if platform.system() == 'Windows':
                t = (datetime.datetime.utcfromtimestamp(0) +
                     datetime.timedelta(seconds=(float(microsec))))
                if pytz.utc != tzinfo:
                    t += tzinfo.utcoffset(t)
                return t.replace(tzinfo=tzinfo)
            else:
                return datetime.datetime.fromtimestamp(float(microsec),
                                                       tz=tzinfo)

        return expected_data_transform_tz_impl

    iterate_over_test_chunk([data_type, data_type], column_meta,
                            lambda: timestamp_tz_generator(scale),
                            expected_data_transform_tz(scale))
def test_iterate_over_timestamp_ltz_chunk():
    random.seed(datetime.datetime.now())
    scale = random.randint(0, 9)
    column_meta = [{
        "logicalType": "TIMESTAMP_LTZ",
        "scale": str(scale)
    }, {
        "logicalType": "TIMESTAMP_LTZ",
        "scale": str(scale)
    }]
    data_type = pyarrow.struct([
        pyarrow.field('epoch', pyarrow.int64()),
        pyarrow.field('fraction', pyarrow.int32())
    ]) if scale > 7 else pyarrow.int64()

    def timestamp_ltz_generator(scale):
        epoch = random.randint(-621355968, 2534023007)
        frac = (random.randint(0, 10**scale - 1) * (10**(9 - scale))
                if scale > 7 else random.randint(0, 10**scale - 1))
        if scale > 7:
            return {'epoch': epoch, 'fraction': frac}
        else:
            epoch = str(epoch)
            frac = str(frac)
            ZEROFILL = '000000000'
            frac = ZEROFILL[:scale - len(frac)] + frac
            return int(epoch + frac) if scale else int(epoch)

    def expected_data_transform_ltz(_scale):
        def expected_data_transform_ltz_impl(data, scale=_scale):
            # can put a string parameter here in the future
            tzinfo = get_timezone()
            if scale > 7:
                frac = data['fraction']
                epoch = data['epoch']
                if epoch < 0:
                    epoch += 1
                    frac = 10**9 - frac
                frac = str(int(frac / 10**(9 - scale)))
                ZERO_FILL = '000000000'
                frac = ZERO_FILL[:scale - len(frac)] + frac
                data = int(str(epoch) + frac)

            microsec = str(data)
            if scale > 6:
                microsec = (microsec[:-scale] + "." +
                            microsec[-scale:-scale + 6])
            else:
                microsec = (microsec[:-scale] + "." + microsec[-scale:]
                            if scale else microsec)

            if platform.system() == 'Windows':
                t0 = (datetime.datetime.utcfromtimestamp(0) +
                      datetime.timedelta(seconds=(float(microsec))))
                return pytz.utc.localize(t0, is_dst=False).astimezone(tzinfo)
            else:
                return datetime.datetime.fromtimestamp(float(microsec),
                                                       tz=tzinfo)

        return expected_data_transform_ltz_impl

    iterate_over_test_chunk([data_type, data_type], column_meta,
                            lambda: timestamp_ltz_generator(scale),
                            expected_data_transform_ltz(scale))
def test_is_primitive():
    assert types.is_primitive(pa.int32())
    assert not types.is_primitive(pa.list_(pa.int32()))
def test_is_decimal():
    assert types.is_decimal(pa.decimal128(19, 4))
    assert not types.is_decimal(pa.int32())
import six

_python_type_map = {
    pa.null().id: six.text_type,
    pa.bool_().id: bool,
    pa.int8().id: int,
    pa.uint8().id: int,
    pa.int16().id: int,
    pa.uint16().id: int,
    pa.int32().id: int,
    pa.uint32().id: int,
    pa.int64().id: int,
    pa.uint64().id: int,
    pa.float16().id: float,
    pa.float32().id: float,
    pa.float64().id: float,
    pa.date32().id: datetime.date,
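# A hedged usage sketch, assuming the (truncated) map above is completed:
# lookups key on `DataType.id`, which is shared by all parametrized variants
# of a type, rather than on type equality.
assert _python_type_map[pa.int32().id] is int
assert _python_type_map[pa.float64().id] is float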
def test_is_null():
    assert types.is_null(pa.null())
    assert not types.is_null(pa.list_(pa.int32()))
def test_writing_empty_lists():
    # ARROW-2591: [Python] Segmentation fault issue in pq.write_table
    arr1 = pa.array([[], []], pa.list_(pa.int32()))
    table = pa.Table.from_arrays([arr1], ['list(int32)'])
    _check_roundtrip(table)
@pytest.fixture(scope="session") def arrow_table(): return pa.Table.from_pydict({ "col_int": [0, 1, 2], "col_float": [0.0, 1.0, 2.0] }) @require_tf @pytest.mark.parametrize( "cast_schema", [ None, [("col_int", pa.int64()), ("col_float", pa.float64())], [("col_int", pa.int32()), ("col_float", pa.float64())], [("col_int", pa.int64()), ("col_float", pa.float32())], ], ) def test_tf_formatter_sets_default_dtypes(cast_schema, arrow_table): import tensorflow as tf from datasets.formatting import TFFormatter if cast_schema: arrow_table = arrow_table.cast(pa.schema(cast_schema)) arrow_table_dict = arrow_table.to_pydict() list_int = arrow_table_dict["col_int"] list_float = arrow_table_dict["col_float"] formatter = TFFormatter()
def test_empty_lists_table_roundtrip(use_legacy_dataset):
    # ARROW-2744: Shouldn't crash when writing an array of empty lists
    arr = pa.array([[], []], type=pa.list_(pa.int32()))
    table = pa.Table.from_arrays([arr], ["A"])
    _check_roundtrip(table, use_legacy_dataset=use_legacy_dataset)
def test_fields_weakrefable():
    field = pa.field('a', pa.int32())
    wr = weakref.ref(field)
    assert wr() is not None
    del field
    assert wr() is None
def ArrowSchema(self):
    return pa.schema(
        [pa.field(c, pa.list_(pa.int32())) for c in self._columns])
import pyarrow as pa

# TODO(kszucs): alphanum_text, surrogate_text
custom_text = st.text(
    alphabet=st.characters(min_codepoint=0x41, max_codepoint=0x7E))

null_type = st.just(pa.null())
bool_type = st.just(pa.bool_())

binary_type = st.just(pa.binary())
string_type = st.just(pa.string())
large_binary_type = st.just(pa.large_binary())
large_string_type = st.just(pa.large_string())

signed_integer_types = st.sampled_from(
    [pa.int8(), pa.int16(), pa.int32(), pa.int64()])
unsigned_integer_types = st.sampled_from(
    [pa.uint8(), pa.uint16(), pa.uint32(), pa.uint64()])
integer_types = st.one_of(signed_integer_types, unsigned_integer_types)

floating_types = st.sampled_from([pa.float16(), pa.float32(), pa.float64()])
decimal_type = st.builds(pa.decimal128,
                         precision=st.integers(min_value=1, max_value=38),
                         scale=st.integers(min_value=1, max_value=38))
numeric_types = st.one_of(integer_types, floating_types, decimal_type)

date_types = st.sampled_from([pa.date32(), pa.date64()])
time_types = st.sampled_from(
    [pa.time32('s'),
except:
    CYTHON = False

try:
    import pyarrow as pa
    from pyarrow import csv
    import numpy as np
    ARROW = True
except:
    ARROW = False
else:
    sqream_to_pa = {
        'ftBool': pa.bool_(),
        'ftUByte': pa.uint8(),
        'ftShort': pa.int16(),
        'ftInt': pa.int32(),
        'ftLong': pa.int64(),
        'ftFloat': pa.float32(),
        'ftDouble': pa.float64(),
        'ftDate': pa.timestamp('ns'),
        'ftDateTime': pa.timestamp('ns'),
        'ftVarchar': pa.string(),
        'ftBlob': pa.utf8()
    }

__version__ = '3.0.0'

PROTOCOL_VERSION = 7
BUFFER_SIZE = 100 * int(1e6)  # For setting auto-flushing on network insert
ROWS_PER_FLUSH = 100000
DEFAULT_CHUNKSIZE = 0  # Dummy variable for some jsons
def csv_to_table(self, csv_path, table_name, read=None, parse=None,
                 convert=None, con=None, auto_infer=False):
    """Pyarrow CSV reader documentation:
    https://arrow.apache.org/docs/python/generated/pyarrow.csv.read_csv.html
    """
    if not ARROW:
        return ("Optional pyarrow dependency not found. "
                "To install: pip3 install pyarrow")

    sqream_to_pa = {
        'ftBool': pa.bool_(),
        'ftUByte': pa.uint8(),
        'ftShort': pa.int16(),
        'ftInt': pa.int32(),
        'ftLong': pa.int64(),
        'ftFloat': pa.float32(),
        'ftDouble': pa.float64(),
        'ftDate': pa.timestamp('ns'),
        'ftDateTime': pa.timestamp('ns'),
        'ftVarchar': pa.string(),
        'ftBlob': pa.utf8()
    }

    start = time.time()
    # Get table metadata
    con = con or self
    con.execute(f'select * from {table_name} where 1=0')

    # Map column names to pyarrow types and set Arrow's CSV parameters
    sqream_col_types = [col_type[0] for col_type in con.col_type_tups]
    column_types = zip(
        con.col_names,
        [sqream_to_pa[col_type[0]] for col_type in con.col_type_tups])
    read = read or csv.ReadOptions(column_names=con.col_names)
    parse = parse or csv.ParseOptions(delimiter='|')
    convert = convert or csv.ConvertOptions(
        column_types=None if auto_infer else column_types)

    # Read CSV to in-memory arrow format
    csv_arrow = csv.read_csv(csv_path,
                             read_options=read,
                             parse_options=parse,
                             convert_options=convert).combine_chunks()
    num_chunks = len(csv_arrow[0].chunks)
    numpy_cols = []

    # For each column, get the numpy representation for quick packing
    for col_type, col in zip(sqream_col_types, csv_arrow):
        # Only one chunk after combine_chunks()
        col = col.chunks[0]
        if col_type in ('ftVarchar', 'ftBlob', 'ftDate', 'ftDateTime'):
            col = col.to_pandas()
        else:
            col = col.to_numpy()
        numpy_cols.append(col)

    print(f'total loading csv: {time.time()-start}')
    start = time.time()

    # Insert columns into SQream
    col_num = csv_arrow.shape[1]
    con.executemany(
        f'insert into {table_name} values ({"?,"*(col_num-1)}?)',
        numpy_cols)
    print(f'total inserting csv: {time.time()-start}')
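# A hedged usage sketch for `csv_to_table`; 'data.csv' and 'my_table' are
# placeholder names, and `conn` is assumed to be an open sqream connection
# whose target table already exists so the column-metadata probe succeeds.
conn.csv_to_table('data.csv', 'my_table', auto_infer=False)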
def test_is_list():
    assert types.is_list(pa.list_(pa.int32()))
    assert not types.is_list(pa.int32())
# them using Java code as well as enables us to define them as parameters
# without having to invoke the JVM.
#
# The specifications were created using:
#
#   om = jpype.JClass('com.fasterxml.jackson.databind.ObjectMapper')()
#   field = …  # Code to instantiate the field
#   jvm_spec = om.writeValueAsString(field)
@pytest.mark.parametrize(
    'pa_type,jvm_spec',
    [
        (pa.null(), '{"name":"null"}'),
        (pa.bool_(), '{"name":"bool"}'),
        (pa.int8(), '{"name":"int","bitWidth":8,"isSigned":true}'),
        (pa.int16(), '{"name":"int","bitWidth":16,"isSigned":true}'),
        (pa.int32(), '{"name":"int","bitWidth":32,"isSigned":true}'),
        (pa.int64(), '{"name":"int","bitWidth":64,"isSigned":true}'),
        (pa.uint8(), '{"name":"int","bitWidth":8,"isSigned":false}'),
        (pa.uint16(), '{"name":"int","bitWidth":16,"isSigned":false}'),
        (pa.uint32(), '{"name":"int","bitWidth":32,"isSigned":false}'),
        (pa.uint64(), '{"name":"int","bitWidth":64,"isSigned":false}'),
        (pa.float16(), '{"name":"floatingpoint","precision":"HALF"}'),
        (pa.float32(), '{"name":"floatingpoint","precision":"SINGLE"}'),
        (pa.float64(), '{"name":"floatingpoint","precision":"DOUBLE"}'),
        (pa.time32('s'), '{"name":"time","unit":"SECOND","bitWidth":32}'),
        (pa.time32('ms'),
         '{"name":"time","unit":"MILLISECOND","bitWidth":32}'),
        (pa.time64('us'),
         '{"name":"time","unit":"MICROSECOND","bitWidth":64}'),
        (pa.time64('ns'),
         '{"name":"time","unit":"NANOSECOND","bitWidth":64}'),
        (pa.timestamp('s'), '{"name":"timestamp","unit":"SECOND",'
import pytest

from pyarrow.compat import unittest, u  # noqa
import pyarrow as pa

import collections
import datetime
import decimal
import itertools
import numpy as np
import six
import pytz

int_type_pairs = [
    (np.int8, pa.int8()),
    (np.int16, pa.int16()),
    (np.int32, pa.int32()),
    (np.int64, pa.int64()),
    (np.uint8, pa.uint8()),
    (np.uint16, pa.uint16()),
    (np.uint32, pa.uint32()),
    (np.uint64, pa.uint64())]

np_int_types, _ = zip(*int_type_pairs)


class StrangeIterable:
    def __init__(self, lst):
        self.lst = lst

    def __iter__(self):
        return self.lst.__iter__()


def check_struct_type(ty, expected):
def test_iterate_over_decimal_chunk():
    random.seed(datetime.datetime.now())
    precision = random.randint(1, 38)
    scale = random.randint(0, precision)

    datatype = None
    if precision <= 2:
        datatype = pyarrow.int8()
    elif precision <= 4:
        datatype = pyarrow.int16()
    elif precision <= 9:
        datatype = pyarrow.int32()
    elif precision <= 19:
        datatype = pyarrow.int64()
    else:
        datatype = pyarrow.decimal128(precision, scale)

    def decimal_generator(_precision, _scale):
        def decimal128_generator(precision, scale):
            data = []
            for i in range(precision):
                data.append(str(random.randint(0, 9)))
            if scale:
                data.insert(-scale, '.')
            return decimal.Decimal("".join(data))

        def int64_generator(precision):
            data = random.randint(-9223372036854775808, 9223372036854775807)
            return int(str(data)[:precision if data >= 0 else precision + 1])

        def int32_generator(precision):
            data = random.randint(-2147483648, 2147483637)
            return int(str(data)[:precision if data >= 0 else precision + 1])

        def int16_generator(precision):
            data = random.randint(-32768, 32767)
            return int(str(data)[:precision if data >= 0 else precision + 1])

        def int8_generator(precision):
            data = random.randint(-128, 127)
            return int(str(data)[:precision if data >= 0 else precision + 1])

        if _precision <= 2:
            return int8_generator(_precision)
        elif _precision <= 4:
            return int16_generator(_precision)
        elif _precision <= 9:
            return int32_generator(_precision)
        elif _precision <= 19:
            return int64_generator(_precision)
        else:
            return decimal128_generator(_precision, _scale)

    def expected_data_transform_decimal(_precision, _scale):
        def expected_data_transform_decimal_impl(data,
                                                 precision=_precision,
                                                 scale=_scale):
            if precision <= 19:
                return decimal.Decimal(data).scaleb(-scale)
            else:
                return data

        return expected_data_transform_decimal_impl

    column_meta = {
        "logicalType": "FIXED",
        "precision": str(precision),
        "scale": str(scale)
    }
    iterate_over_test_chunk([datatype, datatype],
                            [column_meta, column_meta],
                            lambda: decimal_generator(precision, scale),
                            expected_data_transform_decimal(precision, scale))
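# A hedged worked example of the precision -> storage mapping exercised above
# (illustration only): precision 5 fits in pyarrow.int32(), so a raw chunk
# value of 12345 at scale=2 is rescaled by the expected-data transform to
# Decimal(12345).scaleb(-2) == Decimal('123.45'); precision 25 is stored as
# decimal128(25, scale) and passed through unchanged.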