Example 1
def test_type_comparisons():
    val = pa.int32()
    assert val == pa.int32()
    assert val == 'int32'

    with pytest.raises(TypeError):
        val == 5
Example 2
def test_schema():
    fields = [
        pa.field('foo', pa.int32()),
        pa.field('bar', pa.string()),
        pa.field('baz', pa.list_(pa.int8()))
    ]
    sch = pa.schema(fields)

    assert sch.names == ['foo', 'bar', 'baz']
    assert sch.types == [pa.int32(), pa.string(), pa.list_(pa.int8())]

    assert len(sch) == 3
    assert sch[0].name == 'foo'
    assert sch[0].type == fields[0].type
    assert sch.field_by_name('foo').name == 'foo'
    assert sch.field_by_name('foo').type == fields[0].type

    assert repr(sch) == """\
foo: int32
bar: string
baz: list<item: int8>
  child 0, item: int8"""

    with pytest.raises(TypeError):
        pa.schema([None])
Example 3
def test_field_add_remove_metadata():
    import collections

    f0 = pa.field('foo', pa.int32())

    assert f0.metadata is None

    metadata = {b'foo': b'bar', b'pandas': b'badger'}
    metadata2 = collections.OrderedDict([
        (b'a', b'alpha'),
        (b'b', b'beta')
    ])

    f1 = f0.add_metadata(metadata)
    assert f1.metadata == metadata

    f2 = f0.add_metadata(metadata2)
    assert f2.metadata == metadata2

    with pytest.raises(TypeError):
        f0.add_metadata([1, 2, 3])

    f3 = f1.remove_metadata()
    assert f3.metadata is None

    # idempotent
    f4 = f3.remove_metadata()
    assert f4.metadata is None

    f5 = pa.field('foo', pa.int32(), True, metadata)
    f6 = f0.add_metadata(metadata)
    assert f5.equals(f6)
Example 4
def test_table_unsafe_casting():
    data = [
        pa.array(range(5), type=pa.int64()),
        pa.array([-10, -5, 0, 5, 10], type=pa.int32()),
        pa.array([1.1, 2.2, 3.3, 4.4, 5.5], type=pa.float64()),
        pa.array(['ab', 'bc', 'cd', 'de', 'ef'], type=pa.string())
    ]
    table = pa.Table.from_arrays(data, names=tuple('abcd'))

    expected_data = [
        pa.array(range(5), type=pa.int32()),
        pa.array([-10, -5, 0, 5, 10], type=pa.int16()),
        pa.array([1, 2, 3, 4, 5], type=pa.int64()),
        pa.array(['ab', 'bc', 'cd', 'de', 'ef'], type=pa.string())
    ]
    expected_table = pa.Table.from_arrays(expected_data, names=tuple('abcd'))

    target_schema = pa.schema([
        pa.field('a', pa.int32()),
        pa.field('b', pa.int16()),
        pa.field('c', pa.int64()),
        pa.field('d', pa.string())
    ])

    with pytest.raises(pa.ArrowInvalid,
                       match='Floating point value truncated'):
        table.cast(target_schema)

    casted_table = table.cast(target_schema, safe=False)
    assert casted_table.equals(expected_table)
Example 5
def test_table_safe_casting():
    data = [
        pa.array(range(5), type=pa.int64()),
        pa.array([-10, -5, 0, 5, 10], type=pa.int32()),
        pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float64()),
        pa.array(['ab', 'bc', 'cd', 'de', 'ef'], type=pa.string())
    ]
    table = pa.Table.from_arrays(data, names=tuple('abcd'))

    expected_data = [
        pa.array(range(5), type=pa.int32()),
        pa.array([-10, -5, 0, 5, 10], type=pa.int16()),
        pa.array([1, 2, 3, 4, 5], type=pa.int64()),
        pa.array(['ab', 'bc', 'cd', 'de', 'ef'], type=pa.string())
    ]
    expected_table = pa.Table.from_arrays(expected_data, names=tuple('abcd'))

    target_schema = pa.schema([
        pa.field('a', pa.int32()),
        pa.field('b', pa.int16()),
        pa.field('c', pa.int64()),
        pa.field('d', pa.string())
    ])
    casted_table = table.cast(target_schema)

    assert casted_table.equals(expected_table)
Example 6
def test_cast_from_null():
    in_data = [None] * 3
    in_type = pa.null()
    out_types = [
        pa.null(),
        pa.uint8(),
        pa.float16(),
        pa.utf8(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.int16()),
        pa.decimal128(19, 4),
        pa.timestamp('us'),
        pa.timestamp('us', tz='UTC'),
        pa.timestamp('us', tz='Europe/Paris'),
        pa.struct([pa.field('a', pa.int32()),
                   pa.field('b', pa.list_(pa.int8())),
                   pa.field('c', pa.string())]),
        ]
    for out_type in out_types:
        _check_cast_case((in_data, in_type, in_data, out_type))

    out_types = [
        pa.dictionary(pa.int32(), pa.string()),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())], mode=pa.lib.UnionMode_DENSE),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())], mode=pa.lib.UnionMode_SPARSE),
        ]
    in_arr = pa.array(in_data, type=pa.null())
    for out_type in out_types:
        with pytest.raises(NotImplementedError):
            in_arr.cast(out_type)
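
For reference, a standalone sketch of the supported path above, assuming a pyarrow version where null-to-primitive casts are implemented:

import pyarrow as pa

# Casting an all-null array only changes the type; every value stays null.
arr = pa.array([None] * 3, type=pa.null())
assert arr.cast(pa.uint8()).to_pylist() == [None, None, None]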
Example 7
def test_is_union():
    for mode in [pa.lib.UnionMode_SPARSE, pa.lib.UnionMode_DENSE]:
        assert types.is_union(pa.union([pa.field('a', pa.int32()),
                                        pa.field('b', pa.int8()),
                                        pa.field('c', pa.string())],
                                       mode=mode))
    assert not types.is_union(pa.list_(pa.int32()))
Example 8
def test_orcfile_empty():
    from pyarrow import orc
    f = orc.ORCFile(path_for_orc_example('TestOrcFile.emptyFile'))
    table = f.read()
    assert table.num_rows == 0
    schema = table.schema
    expected_schema = pa.schema([
        ('boolean1', pa.bool_()),
        ('byte1', pa.int8()),
        ('short1', pa.int16()),
        ('int1', pa.int32()),
        ('long1', pa.int64()),
        ('float1', pa.float32()),
        ('double1', pa.float64()),
        ('bytes1', pa.binary()),
        ('string1', pa.string()),
        ('middle', pa.struct([
            ('list', pa.list_(pa.struct([
                ('int1', pa.int32()),
                ('string1', pa.string()),
                ]))),
            ])),
        ('list', pa.list_(pa.struct([
            ('int1', pa.int32()),
            ('string1', pa.string()),
            ]))),
        ('map', pa.list_(pa.struct([
            ('key', pa.string()),
            ('value', pa.struct([
                ('int1', pa.int32()),
                ('string1', pa.string()),
                ])),
            ]))),
        ])
    assert schema == expected_schema
Example 9
def test_array_eq_raises():
    # ARROW-2150: we are raising when comparing arrays until we define the
    # behavior to either be elementwise comparisons or data equality
    arr1 = pa.array([1, 2, 3], type=pa.int32())
    arr2 = pa.array([1, 2, 3], type=pa.int32())

    with pytest.raises(NotImplementedError):
        arr1 == arr2
Example 10
def test_fields_hashable():
    in_dict = {}
    fields = [pa.field('a', pa.int64()),
              pa.field('a', pa.int32()),
              pa.field('b', pa.int32())]
    for i, field in enumerate(fields):
        in_dict[field] = i
    assert len(in_dict) == len(fields)
    for i, field in enumerate(fields):
        assert in_dict[field] == i
Example 11
def test_convert_options():
    cls = ConvertOptions
    opts = cls()

    assert opts.check_utf8 is True
    opts.check_utf8 = False
    assert opts.check_utf8 is False

    assert opts.strings_can_be_null is False
    opts.strings_can_be_null = True
    assert opts.strings_can_be_null is True

    assert opts.column_types == {}
    # Pass column_types as mapping
    opts.column_types = {'b': pa.int16(), 'c': pa.float32()}
    assert opts.column_types == {'b': pa.int16(), 'c': pa.float32()}
    opts.column_types = {'v': 'int16', 'w': 'null'}
    assert opts.column_types == {'v': pa.int16(), 'w': pa.null()}
    # Pass column_types as schema
    schema = pa.schema([('a', pa.int32()), ('b', pa.string())])
    opts.column_types = schema
    assert opts.column_types == {'a': pa.int32(), 'b': pa.string()}
    # Pass column_types as sequence
    opts.column_types = [('x', pa.binary())]
    assert opts.column_types == {'x': pa.binary()}

    with pytest.raises(TypeError, match='DataType expected'):
        opts.column_types = {'a': None}
    with pytest.raises(TypeError):
        opts.column_types = 0

    assert isinstance(opts.null_values, list)
    assert '' in opts.null_values
    assert 'N/A' in opts.null_values
    opts.null_values = ['xxx', 'yyy']
    assert opts.null_values == ['xxx', 'yyy']

    assert isinstance(opts.true_values, list)
    opts.true_values = ['xxx', 'yyy']
    assert opts.true_values == ['xxx', 'yyy']

    assert isinstance(opts.false_values, list)
    opts.false_values = ['xxx', 'yyy']
    assert opts.false_values == ['xxx', 'yyy']

    opts = cls(check_utf8=False, column_types={'a': pa.null()},
               null_values=['N', 'nn'], true_values=['T', 'tt'],
               false_values=['F', 'ff'], strings_can_be_null=True)
    assert opts.check_utf8 is False
    assert opts.column_types == {'a': pa.null()}
    assert opts.null_values == ['N', 'nn']
    assert opts.false_values == ['F', 'ff']
    assert opts.true_values == ['T', 'tt']
    assert opts.strings_can_be_null is True
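
A minimal usage sketch applying similar options to an actual read; this assumes ConvertOptions comes from pyarrow.csv, as in this test module:

import io

import pyarrow as pa
from pyarrow import csv

opts = csv.ConvertOptions(column_types={'a': pa.int32()},
                          null_values=['N/A'],
                          strings_can_be_null=True)
# 'N/A' in column 'b' becomes null because strings_can_be_null is set.
table = csv.read_csv(io.BytesIO(b"a,b\n1,x\n2,N/A\n"), convert_options=opts)
assert table.column('a').type == pa.int32()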
Example 12
def test_struct_type():
    fields = [pa.field('a', pa.int64()),
              pa.field('a', pa.int32()),
              pa.field('b', pa.int32())]
    ty = pa.struct(fields)

    assert len(ty) == ty.num_children == 3
    assert list(ty) == fields

    for a, b in zip(ty, fields):
        assert a == b
Example 13
def test_floating_point_truncate_safe():
    safe_cases = [
        (np.array([1.0, 2.0, 3.0], dtype='float32'), 'float32',
         np.array([1, 2, 3], dtype='i4'), pa.int32()),
        (np.array([1.0, 2.0, 3.0], dtype='float64'), 'float64',
         np.array([1, 2, 3], dtype='i4'), pa.int32()),
        (np.array([-10.0, 20.0, -30.0], dtype='float64'), 'float64',
         np.array([-10, 20, -30], dtype='i4'), pa.int32()),
    ]
    for case in safe_cases:
        _check_cast_case(case, safe=True)
Example 14
def test_is_nested_or_struct():
    struct_ex = pa.struct([pa.field('a', pa.int32()),
                           pa.field('b', pa.int8()),
                           pa.field('c', pa.string())])

    assert types.is_struct(struct_ex)
    assert not types.is_struct(pa.list_(pa.int32()))

    assert types.is_nested(struct_ex)
    assert types.is_nested(pa.list_(pa.int32()))
    assert not types.is_nested(pa.int32())
Example 15
def test_dictionary_type():
    ty0 = pa.dictionary(pa.int32(), pa.array(['a', 'b', 'c']))
    assert ty0.index_type == pa.int32()
    assert isinstance(ty0.dictionary, pa.Array)
    assert ty0.dictionary.to_pylist() == ['a', 'b', 'c']
    assert ty0.ordered is False

    ty1 = pa.dictionary(pa.int8(), pa.array([1.0, 2.0]), ordered=True)
    assert ty1.index_type == pa.int8()
    assert isinstance(ty1.dictionary, pa.Array)
    assert ty1.dictionary.to_pylist() == [1.0, 2.0]
    assert ty1.ordered is True
Example 16
def test_schema_equals_propagates_check_metadata():
    # ARROW-4088
    schema1 = pa.schema([
        pa.field('foo', pa.int32()),
        pa.field('bar', pa.string())
    ])
    schema2 = pa.schema([
        pa.field('foo', pa.int32()),
        pa.field('bar', pa.string(), metadata={'a': 'alpha'}),
    ])
    assert not schema1.equals(schema2)
    assert schema1.equals(schema2, check_metadata=False)
Example 17
def test_nested_lists(seq):
    data = [[], [1, 2], None]
    arr = pa.array(seq(data))
    assert len(arr) == 3
    assert arr.null_count == 1
    assert arr.type == pa.list_(pa.int64())
    assert arr.to_pylist() == data
    # With explicit type
    arr = pa.array(seq(data), type=pa.list_(pa.int32()))
    assert len(arr) == 3
    assert arr.null_count == 1
    assert arr.type == pa.list_(pa.int32())
    assert arr.to_pylist() == data
Example 18
def test_type_to_pandas_dtype():
    M8_ns = np.dtype('datetime64[ns]')
    cases = [
        (pa.null(), np.float64),
        (pa.bool_(), np.bool_),
        (pa.int8(), np.int8),
        (pa.int16(), np.int16),
        (pa.int32(), np.int32),
        (pa.int64(), np.int64),
        (pa.uint8(), np.uint8),
        (pa.uint16(), np.uint16),
        (pa.uint32(), np.uint32),
        (pa.uint64(), np.uint64),
        (pa.float16(), np.float16),
        (pa.float32(), np.float32),
        (pa.float64(), np.float64),
        (pa.date32(), M8_ns),
        (pa.date64(), M8_ns),
        (pa.timestamp('ms'), M8_ns),
        (pa.binary(), np.object_),
        (pa.binary(12), np.object_),
        (pa.string(), np.object_),
        (pa.list_(pa.int8()), np.object_),
    ]
    for arrow_type, numpy_type in cases:
        assert arrow_type.to_pandas_dtype() == numpy_type
Example 19
def test_dictionary_type():
    ty0 = pa.dictionary(pa.int32(), pa.string())
    assert ty0.index_type == pa.int32()
    assert ty0.value_type == pa.string()
    assert ty0.ordered is False

    ty1 = pa.dictionary(pa.int8(), pa.float64(), ordered=True)
    assert ty1.index_type == pa.int8()
    assert ty1.value_type == pa.float64()
    assert ty1.ordered is True

    # construct from non-arrow objects
    ty2 = pa.dictionary('int8', 'string')
    assert ty2.index_type == pa.int8()
    assert ty2.value_type == pa.string()
    assert ty2.ordered is False
Example 20
def test_struct_from_tuples():
    ty = pa.struct([pa.field('a', pa.int32()),
                    pa.field('b', pa.string()),
                    pa.field('c', pa.bool_())])

    data = [(5, 'foo', True),
            (6, 'bar', False)]
    expected = [{'a': 5, 'b': 'foo', 'c': True},
                {'a': 6, 'b': 'bar', 'c': False}]
    arr = pa.array(data, type=ty)

    data_as_ndarray = np.empty(len(data), dtype=object)
    data_as_ndarray[:] = data
    arr2 = pa.array(data_as_ndarray, type=ty)
    assert arr.to_pylist() == expected

    assert arr.equals(arr2)

    # With omitted values
    data = [(5, 'foo', None),
            None,
            (6, None, False)]
    expected = [{'a': 5, 'b': 'foo', 'c': None},
                None,
                {'a': 6, 'b': None, 'c': False}]
    arr = pa.array(data, type=ty)
    assert arr.to_pylist() == expected

    # Invalid tuple size
    for tup in [(5, 'foo'), (), ('5', 'foo', True, None)]:
        with pytest.raises(ValueError, match="(?i)tuple size"):
            pa.array([tup], type=ty)
Example 21
def test_empty_cast():
    types = [
        pa.null(),
        pa.bool_(),
        pa.int8(),
        pa.int16(),
        pa.int32(),
        pa.int64(),
        pa.uint8(),
        pa.uint16(),
        pa.uint32(),
        pa.uint64(),
        pa.float16(),
        pa.float32(),
        pa.float64(),
        pa.date32(),
        pa.date64(),
        pa.binary(),
        pa.binary(length=4),
        pa.string(),
    ]

    for (t1, t2) in itertools.product(types, types):
        try:
            # ARROW-4766: Ensure that supported type conversions don't
            # segfault on empty arrays of common types
            pa.array([], type=t1).cast(t2)
        except pa.lib.ArrowNotImplementedError:
            continue
Example 22
def test_recordbatch_basics():
    data = [
        pa.array(range(5), type='int16'),
        pa.array([-10, -5, 0, 5, 10], type='int32')
    ]

    batch = pa.RecordBatch.from_arrays(data, ['c0', 'c1'])
    assert not batch.schema.metadata

    assert len(batch) == 5
    assert batch.num_rows == 5
    assert batch.num_columns == len(data)
    assert batch.to_pydict() == OrderedDict([
        ('c0', [0, 1, 2, 3, 4]),
        ('c1', [-10, -5, 0, 5, 10])
    ])

    with pytest.raises(IndexError):
        # bounds checking
        batch[2]

    # Schema passed explicitly
    schema = pa.schema([pa.field('c0', pa.int16()),
                        pa.field('c1', pa.int32())],
                       metadata={b'foo': b'bar'})
    batch = pa.RecordBatch.from_arrays(data, schema=schema)
    assert batch.schema == schema
Example 23
def test_cast_integers_safe():
    safe_cases = [
        (np.array([0, 1, 2, 3], dtype='i1'), 'int8',
         np.array([0, 1, 2, 3], dtype='i4'), pa.int32()),
        (np.array([0, 1, 2, 3], dtype='i1'), 'int8',
         np.array([0, 1, 2, 3], dtype='u4'), pa.uint16()),
        (np.array([0, 1, 2, 3], dtype='i1'), 'int8',
         np.array([0, 1, 2, 3], dtype='u1'), pa.uint8()),
        (np.array([0, 1, 2, 3], dtype='i1'), 'int8',
         np.array([0, 1, 2, 3], dtype='f8'), pa.float64())
    ]

    for case in safe_cases:
        _check_cast_case(case)

    unsafe_cases = [
        (np.array([50000], dtype='i4'), 'int32', 'int16'),
        (np.array([70000], dtype='i4'), 'int32', 'uint16'),
        (np.array([-1], dtype='i4'), 'int32', 'uint16'),
        (np.array([50000], dtype='u2'), 'uint16', 'int16')
    ]
    for in_data, in_type, out_type in unsafe_cases:
        in_arr = pa.array(in_data, type=in_type)

        with pytest.raises(pa.ArrowInvalid):
            in_arr.cast(out_type)
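
For contrast, a small sketch of what safe=False does with the first unsafe case above; this assumes pyarrow's unsafe integer downcasts wrap around instead of raising:

import pyarrow as pa

arr = pa.array([50000], type=pa.int32())
# 50000 does not fit in int16; with the overflow check disabled it wraps.
assert arr.cast(pa.int16(), safe=False).to_pylist() == [-15536]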
Example 24
    def test_buffer_lifetime(self):
        # ARROW-2195
        arr = pa.array([1, 12, 23, 3, 34], pa.int32())
        batch = pa.RecordBatch.from_arrays([arr], ['field1'])

        # Serialize RecordBatch into Plasma store
        sink = pa.MockOutputStream()
        writer = pa.RecordBatchStreamWriter(sink, batch.schema)
        writer.write_batch(batch)
        writer.close()

        object_id = random_object_id()
        data_buffer = self.plasma_client.create(object_id, sink.size())
        stream = pa.FixedSizeBufferWriter(data_buffer)
        writer = pa.RecordBatchStreamWriter(stream, batch.schema)
        writer.write_batch(batch)
        writer.close()
        self.plasma_client.seal(object_id)
        del data_buffer

        # Unserialize RecordBatch from Plasma store
        [data_buffer] = self.plasma_client2.get_buffers([object_id])
        reader = pa.RecordBatchStreamReader(data_buffer)
        read_batch = reader.read_next_batch()
        # Lose reference to returned buffer.  The RecordBatch must still
        # be backed by valid memory.
        del data_buffer, reader

        assert read_batch.equals(batch)
Example 25
File: jvm.py Project: rok/arrow
def _from_jvm_int_type(jvm_type):
    """
    Convert a JVM int type to its Python equivalent.

    Parameters
    ----------
    jvm_type: org.apache.arrow.vector.types.pojo.ArrowType$Int

    Returns
    -------
    typ: pyarrow.DataType
    """
    if jvm_type.isSigned:
        if jvm_type.bitWidth == 8:
            return pa.int8()
        elif jvm_type.bitWidth == 16:
            return pa.int16()
        elif jvm_type.bitWidth == 32:
            return pa.int32()
        elif jvm_type.bitWidth == 64:
            return pa.int64()
    else:
        if jvm_type.bitWidth == 8:
            return pa.uint8()
        elif jvm_type.bitWidth == 16:
            return pa.uint16()
        elif jvm_type.bitWidth == 32:
            return pa.uint32()
        elif jvm_type.bitWidth == 64:
            return pa.uint64()
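
Since the converter only reads the bitWidth and isSigned attributes, a hypothetical stand-in object is enough to exercise it; a minimal sketch (FakeJvmIntType is not part of the original file):

import pyarrow as pa

class FakeJvmIntType:
    # Hypothetical stub mimicking org.apache.arrow.vector.types.pojo.ArrowType$Int.
    def __init__(self, bit_width, is_signed):
        self.bitWidth = bit_width
        self.isSigned = is_signed

assert _from_jvm_int_type(FakeJvmIntType(32, True)) == pa.int32()
assert _from_jvm_int_type(FakeJvmIntType(16, False)) == pa.uint16()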
Example 26
def test_floating_point_truncate_unsafe():
    unsafe_cases = [
        (np.array([1.1, 2.2, 3.3], dtype='float32'), 'float32',
         np.array([1, 2, 3], dtype='i4'), pa.int32()),
        (np.array([1.1, 2.2, 3.3], dtype='float64'), 'float64',
         np.array([1, 2, 3], dtype='i4'), pa.int32()),
        (np.array([-10.1, 20.2, -30.3], dtype='float64'), 'float64',
         np.array([-10, 20, -30], dtype='i4'), pa.int32()),
    ]
    for case in unsafe_cases:
        # test safe casting raises
        with pytest.raises(pa.ArrowInvalid,
                           match='Floating point value truncated'):
            _check_cast_case(case, safe=True)

        # test unsafe casting truncates
        _check_cast_case(case, safe=False)
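
The same distinction in a standalone form, assuming pyarrow's C-style truncation toward zero for unsafe float-to-int casts:

import pyarrow as pa

arr = pa.array([1.5, 2.5], type=pa.float64())
assert arr.cast(pa.int32(), safe=False).to_pylist() == [1, 2]
# arr.cast(pa.int32()) would raise pa.ArrowInvalid instead.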
Example 27
def test_type_list():
    value_type = pa.int32()
    list_type = pa.list_(value_type)
    assert str(list_type) == 'list<item: int32>'

    field = pa.field('my_item', pa.string())
    l2 = pa.list_(field)
    assert str(l2) == 'list<my_item: string>'
Example 28
def test_struct_from_mixed_sequence():
    # It is forbidden to mix dicts and tuples when initializing a struct array
    ty = pa.struct([pa.field('a', pa.int32()),
                    pa.field('b', pa.string()),
                    pa.field('c', pa.bool_())])
    data = [(5, 'foo', True),
            {'a': 6, 'b': 'bar', 'c': False}]
    with pytest.raises(TypeError):
        pa.array(data, type=ty)
Example 29
    def do_get(self, ticket):
        data1 = [pa.array([-10, -5, 0, 5, 10], type=pa.int32())]
        data2 = [pa.array([-10.0, -5.0, 0.0, 5.0, 10.0], type=pa.float64())]
        assert data1[0].type != data2[0].type
        table1 = pa.Table.from_arrays(data1, names=['a'])
        table2 = pa.Table.from_arrays(data2, names=['a'])
        assert table1.schema == self.schema

        return flight.GeneratorStream(self.schema, [table1, table2])
Example 30
def test_field_flatten():
    f0 = pa.field('foo', pa.int32()).add_metadata({b'foo': b'bar'})
    assert f0.flatten() == [f0]

    f1 = pa.field('bar', pa.float64(), nullable=False)
    ff = pa.field('ff', pa.struct([f0, f1]), nullable=False)
    assert ff.flatten() == [
        pa.field('ff.foo', pa.int32()).add_metadata({b'foo': b'bar'}),
        pa.field('ff.bar', pa.float64(), nullable=False)]  # XXX

    # Nullable parent makes flattened child nullable
    ff = pa.field('ff', pa.struct([f0, f1]))
    assert ff.flatten() == [
        pa.field('ff.foo', pa.int32()).add_metadata({b'foo': b'bar'}),
        pa.field('ff.bar', pa.float64())]

    fff = pa.field('fff', pa.struct([ff]))
    assert fff.flatten() == [pa.field('fff.ff', pa.struct([f0, f1]))]
Example 31
def test_is_floating():
    for t in [pa.float16(), pa.float32(), pa.float64()]:
        assert types.is_floating(t)

    assert not types.is_floating(pa.int32())
Example 32
    pq.write_table(table, tempdir / 'test_metadata_segfault.parquet')
    parquet_file = pq.ParquetFile(tempdir / 'test_metadata_segfault.parquet')
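    # Regression check: accessing the column statistics used to segfault.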
    parquet_file.metadata.row_group(0).column(0).statistics


@pytest.mark.pandas
@pytest.mark.parametrize(
    ('data', 'type', 'physical_type', 'min_value', 'max_value', 'null_count',
     'num_values', 'distinct_count'), [
         ([1, 2, 2, None, 4], pa.uint8(), 'INT32', 1, 4, 1, 4, 0),
         ([1, 2, 2, None, 4], pa.uint16(), 'INT32', 1, 4, 1, 4, 0),
         ([1, 2, 2, None, 4], pa.uint32(), 'INT32', 1, 4, 1, 4, 0),
         ([1, 2, 2, None, 4], pa.uint64(), 'INT64', 1, 4, 1, 4, 0),
         ([-1, 2, 2, None, 4], pa.int8(), 'INT32', -1, 4, 1, 4, 0),
         ([-1, 2, 2, None, 4], pa.int16(), 'INT32', -1, 4, 1, 4, 0),
         ([-1, 2, 2, None, 4], pa.int32(), 'INT32', -1, 4, 1, 4, 0),
         ([-1, 2, 2, None, 4], pa.int64(), 'INT64', -1, 4, 1, 4, 0),
         ([-1.1, 2.2, 2.3, None, 4.4], pa.float32(),
          'FLOAT', -1.1, 4.4, 1, 4, 0),
         ([-1.1, 2.2, 2.3, None, 4.4], pa.float64(),
          'DOUBLE', -1.1, 4.4, 1, 4, 0),
         (['', 'b', chr(1000), None, 'aaa'], pa.binary(), 'BYTE_ARRAY', b'',
          chr(1000).encode('utf-8'), 1, 4, 0),
         ([True, False, False, True, True], pa.bool_(),
          'BOOLEAN', False, True, 0, 5, 0),
         ([b'\x00', b'b', b'12', None, b'aaa'], pa.binary(),
          'BYTE_ARRAY', b'\x00', b'b', 1, 4, 0),
     ])
def test_parquet_column_statistics_api(data, type, physical_type, min_value,
                                       max_value, null_count, num_values,
                                       distinct_count):
Example 33
def pyarrow_types_from_pandas(
        df: pd.DataFrame,
        index: bool,
        ignore_cols: Optional[List[str]] = None,
        index_left: bool = False) -> Dict[str, pa.DataType]:
    """Extract the related Pyarrow data types from any Pandas DataFrame."""
    # Handle exception data types (e.g. Int64, Int32, string)
    ignore_cols = [] if ignore_cols is None else ignore_cols
    cols: List[str] = []
    cols_dtypes: Dict[str, Optional[pa.DataType]] = {}
    for name, dtype in df.dtypes.to_dict().items():
        dtype = str(dtype)
        if name in ignore_cols:
            cols_dtypes[name] = None
        elif dtype == "Int8":
            cols_dtypes[name] = pa.int8()
        elif dtype == "Int16":
            cols_dtypes[name] = pa.int16()
        elif dtype == "Int32":
            cols_dtypes[name] = pa.int32()
        elif dtype == "Int64":
            cols_dtypes[name] = pa.int64()
        elif dtype == "string":
            cols_dtypes[name] = pa.string()
        else:
            cols.append(name)

    # Filling cols_dtypes
    for col in cols:
        _logger.debug("Inferring PyArrow type from column: %s", col)
        try:
            schema: pa.Schema = pa.Schema.from_pandas(df=df[[col]],
                                                      preserve_index=False)
        except pa.ArrowInvalid as ex:
            cols_dtypes[col] = process_not_inferred_dtype(ex)
        except TypeError as ex:
            msg = str(ex)
            if " is required (got type " in msg:
                raise TypeError(
                    f"The {col} columns has a too generic data type ({df[col].dtype}) and seems "
                    f"to have mixed data types ({msg}). "
                    "Please, cast this columns with a more deterministic data type "
                    f"(e.g. df['{col}'] = df['{col}'].astype('string')) or "
                    "pass the column schema as argument for AWS Data Wrangler "
                    f"(e.g. dtype={{'{col}': 'string'}}") from ex
            raise
        else:
            cols_dtypes[col] = schema.field(col).type

    # Filling indexes
    indexes: List[str] = []
    if index is True:
        for field in pa.Schema.from_pandas(df=df[[]], preserve_index=True):
            name = str(field.name)
            _logger.debug("Inferring PyArrow type from index: %s", name)
            cols_dtypes[name] = field.type
            indexes.append(name)

    # Merging Index
    sorted_cols: List[str] = (indexes + list(df.columns) if index_left is True
                              else list(df.columns) + indexes)

    # Filling schema
    columns_types: Dict[str, pa.DataType]
    columns_types = {n: cols_dtypes[n] for n in sorted_cols}
    _logger.debug("columns_types: %s", columns_types)
    return columns_types
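
A hypothetical usage sketch, exercising only the branches shown above (pandas' nullable "Int32" is mapped directly; the object column is inferred through pa.Schema.from_pandas):

import pandas as pd
import pyarrow as pa

df = pd.DataFrame({"a": pd.Series([1, 2], dtype="Int32"),
                   "b": ["x", "y"]})
# Expected mapping, assuming default inference: {'a': int32, 'b': string}
assert pyarrow_types_from_pandas(df=df, index=False) == {
    "a": pa.int32(), "b": pa.string()}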
Example 34
def test_type_comparisons():
    val = pa.int32()
    assert val == pa.int32()
    assert val == 'int32'
    assert val != 5
Example 35
def test_field_id_metadata():
    # ARROW-7080
    field_id = b'PARQUET:field_id'
    inner = pa.field('inner', pa.int32(), metadata={field_id: b'100'})
    middle = pa.field('middle',
                      pa.struct([inner]),
                      metadata={field_id: b'101'})
    fields = [
        pa.field('basic',
                 pa.int32(),
                 metadata={
                     b'other': b'abc',
                     field_id: b'1'
                 }),
        pa.field('list',
                 pa.list_(
                     pa.field('list-inner',
                              pa.int32(),
                              metadata={field_id: b'10'})),
                 metadata={field_id: b'11'}),
        pa.field('struct', pa.struct([middle]), metadata={field_id: b'102'}),
        pa.field('no-metadata', pa.int32()),
        pa.field('non-integral-field-id',
                 pa.int32(),
                 metadata={field_id: b'xyz'}),
        pa.field('negative-field-id',
                 pa.int32(),
                 metadata={field_id: b'-1000'})
    ]
    arrs = [[] for _ in fields]
    table = pa.table(arrs, schema=pa.schema(fields))

    bio = pa.BufferOutputStream()
    pq.write_table(table, bio)
    contents = bio.getvalue()

    pf = pq.ParquetFile(pa.BufferReader(contents))
    schema = pf.schema_arrow

    assert schema[0].metadata[field_id] == b'1'
    assert schema[0].metadata[b'other'] == b'abc'

    list_field = schema[1]
    assert list_field.metadata[field_id] == b'11'

    list_item_field = list_field.type.value_field
    assert list_item_field.metadata[field_id] == b'10'

    struct_field = schema[2]
    assert struct_field.metadata[field_id] == b'102'

    struct_middle_field = struct_field.type[0]
    assert struct_middle_field.metadata[field_id] == b'101'

    struct_inner_field = struct_middle_field.type[0]
    assert struct_inner_field.metadata[field_id] == b'100'

    assert schema[3].metadata is None
    # Invalid input is passed through (ok) but does not
    # have field_id in parquet (not tested)
    assert schema[4].metadata[field_id] == b'xyz'
    assert schema[5].metadata[field_id] == b'-1000'
Example 36
def test_is_dictionary():
    assert types.is_dictionary(pa.dictionary(pa.int32(), pa.string()))
    assert not types.is_dictionary(pa.int32())
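
A small companion sketch, building an actual dictionary-encoded array whose type satisfies the predicate:

import pyarrow as pa
from pyarrow import types

arr = pa.array(['a', 'b', 'a']).dictionary_encode()
assert types.is_dictionary(arr.type)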
Example 37
def test_schema_pyarrow_types():
    field_name = "column1"
    metadata = {b"metadata_k": b"metadata_v"}
    pyarrow_field = pyarrow_field_from_dict({
        "name": field_name,
        "nullable": False,
        "metadata": metadata,
        "type": {
            "name": "int",
            "bitWidth": 8,
            "isSigned": True
        },
    })
    assert pyarrow_field.name == field_name
    assert pyarrow_field.type == pyarrow.int8()
    assert dict(pyarrow_field.metadata) == metadata
    assert pyarrow_field.nullable is False

    field_name = "column_timestamp_no_unit"
    metadata = {b"metadata_k": b"metadata_v"}
    pyarrow_field = pyarrow_field_from_dict({
        "name": field_name,
        "nullable": False,
        "metadata": metadata,
        "type": {
            "name": "timestamp"
        },
    })
    assert pyarrow_field.name == field_name
    assert pyarrow_field.type == pyarrow.timestamp("ns")
    assert dict(pyarrow_field.metadata) == metadata
    assert pyarrow_field.nullable is False

    field_name = "column_timestamp_with_unit"
    metadata = {b"metadata_k": b"metadata_v"}
    pyarrow_field = pyarrow_field_from_dict({
        "name": field_name,
        "nullable": False,
        "metadata": metadata,
        "type": {
            "name": "timestamp",
            "unit": "MICROSECOND"
        },
    })
    assert pyarrow_field.name == field_name
    assert pyarrow_field.type == pyarrow.timestamp("us")
    assert dict(pyarrow_field.metadata) == metadata
    assert pyarrow_field.nullable is False

    field_name = "date_with_day_unit"
    metadata = {b"metadata_k": b"metadata_v"}
    pyarrow_field = pyarrow_field_from_dict({
        "name": field_name,
        "nullable": False,
        "metadata": metadata,
        "type": {
            "name": "date",
            "unit": "DAY"
        },
    })
    assert pyarrow_field.name == field_name
    assert pyarrow_field.type == pyarrow.date32()
    assert dict(pyarrow_field.metadata) == metadata
    assert pyarrow_field.nullable is False

    field_name = "simple_list"
    pyarrow_field = pyarrow_field_from_dict({
        "name":
        field_name,
        "nullable":
        False,
        "metadata":
        metadata,
        "type": {
            "name": "list"
        },
        "children": [{
            "type": {
                "name": "int",
                "bitWidth": 32,
                "isSigned": True
            }
        }],
    })
    assert pyarrow_field.name == field_name
    assert pyarrow_field.type == pyarrow.list_(
        pyarrow.field("element", pyarrow.int32()))
    assert pyarrow_field.metadata == metadata
    assert pyarrow_field.nullable is False

    field_name = "dictionary"
    pyarrow_field = pyarrow_field_from_dict({
        "name": field_name,
        "nullable": False,
        "metadata": metadata,
        "type": {
            "name": "int",
            "bitWidth": 32,
            "isSigned": True
        },
        "children": [],
        "dictionary": {
            "id": 0,
            "indexType": {
                "name": "int",
                "bitWidth": 16,
                "isSigned": True
            },
        },
    })
    assert pyarrow_field.name == field_name
    assert pyarrow_field.type == pyarrow.map_(pyarrow.int16(), pyarrow.int32())
    assert pyarrow_field.metadata == metadata
    assert pyarrow_field.nullable is False

    field_name = "struct_array"
    pyarrow_field = pyarrow_field_from_dict({
        "name": field_name,
        "nullable": False,
        "metadata": metadata,
        "type": {
            "name": "list"
        },
        "children": [],
        "dictionary": {
            "id": 0,
            "indexType": {
                "name": "int",
                "bitWidth": 32,
                "isSigned": True
            },
        },
    })
    assert pyarrow_field.name == field_name
    assert pyarrow_field.type == pyarrow.map_(
        pyarrow.int32(),
        pyarrow.list_(
            pyarrow.field(
                "element",
                pyarrow.struct(
                    [pyarrow.field("val", pyarrow.int32(), False, metadata)]),
            )),
    )
    assert pyarrow_field.metadata == metadata
    assert pyarrow_field.nullable is False

    field_name = "simple_dictionary"
    pyarrow_field = pyarrow_field_from_dict({
        "name":
        field_name,
        "metadata": {
            "metadata_k": "metadata_v"
        },
        "nullable":
        False,
        "type": {
            "name": "dictionary"
        },
        "dictionary": {
            "indexType": {
                "type": {
                    "name": "int",
                    "bitWidth": 8
                }
            }
        },
        "children": [{
            "type": {
                "name": "int",
                "bitWidth": 32
            }
        }],
    })
    assert pyarrow_field.name == field_name
    assert pyarrow_field.type == pyarrow.map_(pyarrow.int8(), pyarrow.int32())
    assert pyarrow_field.metadata == metadata
    assert pyarrow_field.nullable is False

    pyarrow_field = pyarrow_field_from_dict({
        "name":
        field_name,
        "type": {
            "name": "struct"
        },
        "children": [{
            "name": "x",
            "type": {
                "name": "int",
                "bitWidth": 64
            },
            "nullable": True,
            "metadata": {},
        }],
        "metadata": {
            "metadata_k": "metadata_v"
        },
        "nullable":
        False,
    })
    assert pyarrow_field.name == field_name
    assert pyarrow_field.type == pyarrow.struct(
        [pyarrow.field("x", pyarrow.int64(), True, {})])
    assert pyarrow_field.metadata == metadata
    assert pyarrow_field.nullable is False
Example 38
        "instrument_id": pa.dictionary(pa.int64(), pa.string()),
        "bid": pa.string(),
        "ask": pa.string(),
        "bid_size": pa.string(),
        "ask_size": pa.string(),
        "last": pa.string(),
        "ts_event": pa.int64(),
        "ts_init": pa.int64(),
    }),
    BinanceBar: pa.schema({
        "bar_type": pa.dictionary(pa.int8(), pa.string()),
        "instrument_id": pa.dictionary(pa.int64(), pa.string()),
        "open": pa.string(),
        "high": pa.string(),
        "low": pa.string(),
        "close": pa.string(),
        "volume": pa.string(),
        "quote_volume": pa.string(),
        "count": pa.int32(),
        "taker_buy_base_volume": pa.string(),
        "taker_buy_quote_volume": pa.string(),
        "ts_event": pa.int64(),
        "ts_init": pa.int64(),
    }),
}

# default schemas
for cls, schema in NAUTILUS_PARQUET_SCHEMA.items():
    register_parquet(cls, schema=schema)
Example 39
        pa.field('b', pa.int32(), nullable=False)
    ]
    for i, field in enumerate(fields):
        in_dict[field] = i
    assert len(in_dict) == len(fields)
    for i, field in enumerate(fields):
        assert in_dict[field] == i


@pytest.mark.parametrize('t,check_func', [(pa.date32(), types.is_date32),
                                          (pa.date64(), types.is_date64),
                                          (pa.time32('s'), types.is_time32),
                                          (pa.time64('ns'), types.is_time64),
                                          (pa.int8(), types.is_int8),
                                          (pa.int16(), types.is_int16),
                                          (pa.int32(), types.is_int32),
                                          (pa.int64(), types.is_int64),
                                          (pa.uint8(), types.is_uint8),
                                          (pa.uint16(), types.is_uint16),
                                          (pa.uint32(), types.is_uint32),
                                          (pa.uint64(), types.is_uint64),
                                          (pa.float16(), types.is_float16),
                                          (pa.float32(), types.is_float32),
                                          (pa.float64(), types.is_float64)])
def test_exact_primitive_types(t, check_func):
    assert check_func(t)


def test_type_id():
    # enum values are not exposed publicly
    for ty in get_many_types():
Example 40
def test_sql(parameters, db_type):
    df = get_df()
    if db_type == "redshift":
        df.drop(["binary"], axis=1, inplace=True)
    engine = wr.catalog.get_engine(connection=f"aws-data-wrangler-{db_type}")
    wr.db.to_sql(
        df=df,
        con=engine,
        name="test_sql",
        schema=parameters[db_type]["schema"],
        if_exists="replace",
        index=False,
        index_label=None,
        chunksize=None,
        method=None,
        dtype={"iint32": sqlalchemy.types.Integer},
    )
    df = wr.db.read_sql_query(
        sql=f"SELECT * FROM {parameters[db_type]['schema']}.test_sql",
        con=engine)
    ensure_data_types(df, has_list=False)
    engine = wr.db.get_engine(
        db_type=db_type,
        host=parameters[db_type]["host"],
        port=parameters[db_type]["port"],
        database=parameters[db_type]["database"],
        user=parameters["user"],
        password=parameters["password"],
    )
    dfs = wr.db.read_sql_query(
        sql=f"SELECT * FROM {parameters[db_type]['schema']}.test_sql",
        con=engine,
        chunksize=1,
        dtype={
            "iint8": pa.int8(),
            "iint16": pa.int16(),
            "iint32": pa.int32(),
            "iint64": pa.int64(),
            "float": pa.float32(),
            "double": pa.float64(),
            "decimal": pa.decimal128(3, 2),
            "string_object": pa.string(),
            "string": pa.string(),
            "date": pa.date32(),
            "timestamp": pa.timestamp(unit="ns"),
            "binary": pa.binary(),
            "category": pa.float64(),
        },
    )
    for df in dfs:
        ensure_data_types(df, has_list=False)
    if db_type != "redshift":
        account_id = boto3.client("sts").get_caller_identity().get("Account")
        engine = wr.catalog.get_engine(
            connection=f"aws-data-wrangler-{db_type}", catalog_id=account_id)
        wr.db.to_sql(
            df=pd.DataFrame({"col0": [1, 2, 3]}, dtype="Int32"),
            con=engine,
            name="test_sql",
            schema=parameters[db_type]["schema"],
            if_exists="replace",
            index=True,
            index_label="index",
        )
        schema = None
        if db_type == "postgresql":
            schema = parameters[db_type]["schema"]
        df = wr.db.read_sql_table(con=engine,
                                  table="test_sql",
                                  schema=schema,
                                  index_col="index")
        assert len(df.index) == 3
        assert len(df.columns) == 1
Example 41
def test_is_dictionary():
    assert types.is_dictionary(
        pa.dictionary(pa.int32(), pa.array(['a', 'b', 'c'])))
    assert not types.is_dictionary(pa.int32())
Example 42
import numpy as np
import pandas as pd
import pyarrow as pa
from pandas.core.dtypes.common import infer_dtype_from_object
from pandas.core.dtypes.dtypes import CategoricalDtype, CategoricalDtypeType

import cudf
from cudf._lib.scalar import DeviceScalar, _is_null_host_scalar

_NA_REP = "<NA>"
_np_pa_dtypes = {
    np.float64: pa.float64(),
    np.float32: pa.float32(),
    np.int64: pa.int64(),
    np.longlong: pa.int64(),
    np.int32: pa.int32(),
    np.int16: pa.int16(),
    np.int8: pa.int8(),
    np.bool_: pa.int8(),
    np.uint64: pa.uint64(),
    np.uint32: pa.uint32(),
    np.uint16: pa.uint16(),
    np.uint8: pa.uint8(),
    np.datetime64: pa.date64(),
    np.object_: pa.string(),
    np.str_: pa.string(),
}
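
A quick lookup sketch for the table above; note that np.bool_ maps to pa.int8() in this table rather than pa.bool_():

assert _np_pa_dtypes[np.int32] == pa.int32()
assert _np_pa_dtypes[np.bool_] == pa.int8()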

cudf_dtypes_to_pandas_dtypes = {
    np.dtype("uint8"): pd.UInt8Dtype(),
    np.dtype("uint16"): pd.UInt16Dtype(),
Example 43
def test_iterate_over_timestamp_tz_chunk():
    random.seed(datetime.datetime.now())
    scale = random.randint(0, 9)
    column_meta = [{
        "byteLength": "16" if scale > 3 else "8",
        "logicalType": "TIMESTAMP_TZ",
        "scale": str(scale)
    }, {
        "byteLength": "16" if scale > 3 else "8",
        "logicalType": "TIMESTAMP_TZ",
        "scale": str(scale)
    }]

    type1 = pyarrow.struct([
        pyarrow.field('epoch', pyarrow.int64()),
        pyarrow.field('timezone', pyarrow.int32()),
        pyarrow.field('fraction', pyarrow.int32())
    ])
    type2 = pyarrow.struct([
        pyarrow.field('epoch', pyarrow.int64()),
        pyarrow.field('timezone', pyarrow.int32())
    ])
    data_type = type1 if scale > 3 else type2

    def timestamp_tz_generator(scale):
        epoch = random.randint(-621355968, 2534023007)
        frac = random.randint(0, 10**scale - 1) * (10**(
            9 - scale)) if scale > 3 else random.randint(0, 10**scale - 1)
        timezone = random.randint(1, 2879)
        if scale > 3:
            return {'epoch': epoch, 'timezone': timezone, 'fraction': frac}
        else:
            epoch = str(epoch)
            frac = str(frac)
            ZEROFILL = '000000000'
            frac = ZEROFILL[:scale - len(frac)] + frac
            return {
                'epoch': int(epoch + frac) if scale else int(epoch),
                'timezone': timezone
            }

    def expected_data_transform_tz(_scale):
        def expected_data_transform_tz_impl(data, scale=_scale):
            timezone = data['timezone']
            tzinfo = _generate_tzinfo_from_tzoffset(timezone - 1440)
            epoch = data['epoch']
            if scale > 3:
                frac = data['fraction']
                if epoch < 0:
                    epoch += 1
                    frac = 10**9 - frac
                frac = str(int(frac / 10**(9 - scale)))
                ZERO_FILL = '000000000'
                frac = ZERO_FILL[:scale - len(frac)] + frac
                epoch = int(str(epoch) + frac)

            microsec = str(epoch)
            if scale > 6:
                microsec = (microsec[:-scale] + "." +
                            microsec[-scale:-scale + 6])
            else:
                microsec = (microsec[:-scale] + "." + microsec[-scale:]
                            if scale else microsec)

            if platform.system() == 'Windows':
                t = datetime.datetime.utcfromtimestamp(0) + datetime.timedelta(
                    seconds=(float(microsec)))
                if pytz.utc != tzinfo:
                    t += tzinfo.utcoffset(t)
                return t.replace(tzinfo=tzinfo)
            else:
                return datetime.datetime.fromtimestamp(float(microsec),
                                                       tz=tzinfo)

        return expected_data_transform_tz_impl

    iterate_over_test_chunk([data_type, data_type], column_meta,
                            lambda: timestamp_tz_generator(scale),
                            expected_data_transform_tz(scale))
Example 44
def test_iterate_over_timestamp_ltz_chunk():
    random.seed(datetime.datetime.now())
    scale = random.randint(0, 9)
    column_meta = [{
        "logicalType": "TIMESTAMP_LTZ",
        "scale": str(scale)
    }, {
        "logicalType": "TIMESTAMP_LTZ",
        "scale": str(scale)
    }]
    data_type = pyarrow.struct([
        pyarrow.field('epoch', pyarrow.int64()),
        pyarrow.field('fraction', pyarrow.int32())
    ]) if scale > 7 else pyarrow.int64()

    def timestamp_ltz_generator(scale):
        epoch = random.randint(-621355968, 2534023007)
        frac = random.randint(0, 10**scale - 1) * (10**(
            9 - scale)) if scale > 7 else random.randint(0, 10**scale - 1)
        if scale > 7:
            return {'epoch': epoch, 'fraction': frac}
        else:
            epoch = str(epoch)
            frac = str(frac)
            ZEROFILL = '000000000'
            frac = ZEROFILL[:scale - len(frac)] + frac
            return int(epoch + frac) if scale else int(epoch)

    def expected_data_transform_ltz(_scale):
        def expected_data_transform_ltz_impl(data, scale=_scale):
            tzinfo = get_timezone()  # can put a string parameter here in the future
            if scale > 7:
                frac = data['fraction']
                epoch = data['epoch']
                if epoch < 0:
                    epoch += 1
                    frac = 10**9 - frac
                frac = str(int(frac / 10**(9 - scale)))
                ZERO_FILL = '000000000'
                frac = ZERO_FILL[:scale - len(frac)] + frac
                data = int(str(epoch) + frac)

            microsec = str(data)
            if scale > 6:
                microsec = (microsec[:-scale] + "." +
                            microsec[-scale:-scale + 6])
            else:
                microsec = (microsec[:-scale] + "." + microsec[-scale:]
                            if scale else microsec)

            if platform.system() == 'Windows':
                t0 = datetime.datetime.utcfromtimestamp(
                    0) + datetime.timedelta(seconds=(float(microsec)))
                return pytz.utc.localize(t0, is_dst=False).astimezone(tzinfo)
            else:
                return datetime.datetime.fromtimestamp(float(microsec),
                                                       tz=tzinfo)

        return expected_data_transform_ltz_impl

    iterate_over_test_chunk([data_type, data_type], column_meta,
                            lambda: timestamp_ltz_generator(scale),
                            expected_data_transform_ltz(scale))
Example 45
def test_is_primitive():
    assert types.is_primitive(pa.int32())
    assert not types.is_primitive(pa.list_(pa.int32()))
Example 46
def test_is_decimal():
    assert types.is_decimal(pa.decimal128(19, 4))
    assert not types.is_decimal(pa.int32())
Example 47
import datetime

import pyarrow as pa
import six

_python_type_map = {
    pa.null().id: six.text_type,
    pa.bool_().id: bool,
    pa.int8().id: int,
    pa.uint8().id: int,
    pa.int16().id: int,
    pa.uint16().id: int,
    pa.int32().id: int,
    pa.uint32().id: int,
    pa.int64().id: int,
    pa.uint64().id: int,
    pa.float16().id: float,
    pa.float32().id: float,
    pa.float64().id: float,
    pa.date32().id: datetime.date,
Example 48
def test_is_null():
    assert types.is_null(pa.null())
    assert not types.is_null(pa.list_(pa.int32()))
Example 49
def test_writing_empty_lists():
    # ARROW-2591: [Python] Segmentation fault issue in pq.write_table
    arr1 = pa.array([[], []], pa.list_(pa.int32()))
    table = pa.Table.from_arrays([arr1], ['list(int32)'])
    _check_roundtrip(table)
Example 50
@pytest.fixture(scope="session")
def arrow_table():
    return pa.Table.from_pydict({
        "col_int": [0, 1, 2],
        "col_float": [0.0, 1.0, 2.0]
    })


@require_tf
@pytest.mark.parametrize(
    "cast_schema",
    [
        None,
        [("col_int", pa.int64()), ("col_float", pa.float64())],
        [("col_int", pa.int32()), ("col_float", pa.float64())],
        [("col_int", pa.int64()), ("col_float", pa.float32())],
    ],
)
def test_tf_formatter_sets_default_dtypes(cast_schema, arrow_table):
    import tensorflow as tf

    from datasets.formatting import TFFormatter

    if cast_schema:
        arrow_table = arrow_table.cast(pa.schema(cast_schema))
    arrow_table_dict = arrow_table.to_pydict()
    list_int = arrow_table_dict["col_int"]
    list_float = arrow_table_dict["col_float"]
    formatter = TFFormatter()
Example 51
def test_empty_lists_table_roundtrip(use_legacy_dataset):
    # ARROW-2744: Shouldn't crash when writing an array of empty lists
    arr = pa.array([[], []], type=pa.list_(pa.int32()))
    table = pa.Table.from_arrays([arr], ["A"])
    _check_roundtrip(table, use_legacy_dataset=use_legacy_dataset)
Example 52
def test_fields_weakrefable():
    field = pa.field('a', pa.int32())
    wr = weakref.ref(field)
    assert wr() is not None
    del field
    assert wr() is None
Example 53
    def ArrowSchema(self):
        return pa.schema(
            [pa.field(c, pa.list_(pa.int32())) for c in self._columns])
Example 54
import hypothesis.strategies as st
import pyarrow as pa

# TODO(kszucs): alphanum_text, surrogate_text
custom_text = st.text(
    alphabet=st.characters(min_codepoint=0x41, max_codepoint=0x7E))

null_type = st.just(pa.null())
bool_type = st.just(pa.bool_())

binary_type = st.just(pa.binary())
string_type = st.just(pa.string())
large_binary_type = st.just(pa.large_binary())
large_string_type = st.just(pa.large_string())

signed_integer_types = st.sampled_from(
    [pa.int8(), pa.int16(), pa.int32(),
     pa.int64()])
unsigned_integer_types = st.sampled_from(
    [pa.uint8(), pa.uint16(),
     pa.uint32(), pa.uint64()])
integer_types = st.one_of(signed_integer_types, unsigned_integer_types)

floating_types = st.sampled_from([pa.float16(), pa.float32(), pa.float64()])
decimal_type = st.builds(pa.decimal128,
                         precision=st.integers(min_value=1, max_value=38),
                         scale=st.integers(min_value=1, max_value=38))
numeric_types = st.one_of(integer_types, floating_types, decimal_type)

date_types = st.sampled_from([pa.date32(), pa.date64()])
time_types = st.sampled_from(
    [pa.time32('s'),
Example 55
except:
    CYTHON = False

try:
    import pyarrow as pa
    from pyarrow import csv
    import numpy as np
    ARROW = True
except:
    ARROW = False
else:
    sqream_to_pa = {
        'ftBool': pa.bool_(),
        'ftUByte': pa.uint8(),
        'ftShort': pa.int16(),
        'ftInt': pa.int32(),
        'ftLong': pa.int64(),
        'ftFloat': pa.float32(),
        'ftDouble': pa.float64(),
        'ftDate': pa.timestamp('ns'),
        'ftDateTime': pa.timestamp('ns'),
        'ftVarchar': pa.string(),
        'ftBlob': pa.utf8()
    }

__version__ = '3.0.0'

PROTOCOL_VERSION = 7
BUFFER_SIZE = 100 * int(1e6)  # For setting auto-flushing on network insert
ROWS_PER_FLUSH = 100000
DEFAULT_CHUNKSIZE = 0  # Dummy variable for some jsons
Example 56
    def csv_to_table(self,
                     csv_path,
                     table_name,
                     read=None,
                     parse=None,
                     convert=None,
                     con=None,
                     auto_infer=False):
        ' Pyarrow CSV reader documentation: https://arrow.apache.org/docs/python/generated/pyarrow.csv.read_csv.html '

        if not ARROW:
            return "Optional pyarrow dependency not found. To install: pip3 install pyarrow"

        sqream_to_pa = {
            'ftBool': pa.bool_(),
            'ftUByte': pa.uint8(),
            'ftShort': pa.int16(),
            'ftInt': pa.int32(),
            'ftLong': pa.int64(),
            'ftFloat': pa.float32(),
            'ftDouble': pa.float64(),
            'ftDate': pa.timestamp('ns'),
            'ftDateTime': pa.timestamp('ns'),
            'ftVarchar': pa.string(),
            'ftBlob': pa.utf8()
        }

        start = time.time()
        # Get table metadata
        con = con or self
        con.execute(f'select * from {table_name} where 1=0')

        # Map column names to pyarrow types and set Arrow's CSV parameters
        sqream_col_types = [col_type[0] for col_type in con.col_type_tups]
        column_types = zip(
            con.col_names,
            [sqream_to_pa[col_type[0]] for col_type in con.col_type_tups])
        read = read or csv.ReadOptions(column_names=con.col_names)
        parse = parse or csv.ParseOptions(delimiter='|')
        convert = convert or csv.ConvertOptions(
            column_types=None if auto_infer else column_types)

        # Read CSV to in-memory arrow format
        csv_arrow = csv.read_csv(csv_path,
                                 read_options=read,
                                 parse_options=parse,
                                 convert_options=convert).combine_chunks()
        num_chunks = len(csv_arrow[0].chunks)
        numpy_cols = []

        # For each column, get the numpy representation for quick packing
        for col_type, col in zip(sqream_col_types, csv_arrow):
            # Only one chunk after combine_chunks()
            col = col.chunks[0]
            if col_type in ('ftVarchar', 'ftBlob', 'ftDate', 'ftDateTime'):
                col = col.to_pandas()
            else:
                col = col.to_numpy()

            numpy_cols.append(col)

        print(f'total loading csv: {time.time()-start}')
        start = time.time()

        # Insert columns into SQream
        col_num = csv_arrow.shape[1]
        con.executemany(
            f'insert into {table_name} values ({"?,"*(col_num-1)}?)',
            numpy_cols)
        print(f'total inserting csv: {time.time()-start}')
Example 57
def test_is_list():
    assert types.is_list(pa.list_(pa.int32()))
    assert not types.is_list(pa.int32())
Example 58
# them using Java code, and it enables us to define them as parameters
# without invoking the JVM.
#
# The specifications were created using:
#
#   om = jpype.JClass('com.fasterxml.jackson.databind.ObjectMapper')()
#   field = …  # Code to instantiate the field
#   jvm_spec = om.writeValueAsString(field)
@pytest.mark.parametrize(
    'pa_type,jvm_spec',
    [
        (pa.null(), '{"name":"null"}'),
        (pa.bool_(), '{"name":"bool"}'),
        (pa.int8(), '{"name":"int","bitWidth":8,"isSigned":true}'),
        (pa.int16(), '{"name":"int","bitWidth":16,"isSigned":true}'),
        (pa.int32(), '{"name":"int","bitWidth":32,"isSigned":true}'),
        (pa.int64(), '{"name":"int","bitWidth":64,"isSigned":true}'),
        (pa.uint8(), '{"name":"int","bitWidth":8,"isSigned":false}'),
        (pa.uint16(), '{"name":"int","bitWidth":16,"isSigned":false}'),
        (pa.uint32(), '{"name":"int","bitWidth":32,"isSigned":false}'),
        (pa.uint64(), '{"name":"int","bitWidth":64,"isSigned":false}'),
        (pa.float16(), '{"name":"floatingpoint","precision":"HALF"}'),
        (pa.float32(), '{"name":"floatingpoint","precision":"SINGLE"}'),
        (pa.float64(), '{"name":"floatingpoint","precision":"DOUBLE"}'),
        (pa.time32('s'), '{"name":"time","unit":"SECOND","bitWidth":32}'),
        (pa.time32('ms'),
         '{"name":"time","unit":"MILLISECOND","bitWidth":32}'),
        (pa.time64('us'),
         '{"name":"time","unit":"MICROSECOND","bitWidth":64}'),
        (pa.time64('ns'), '{"name":"time","unit":"NANOSECOND","bitWidth":64}'),
        (pa.timestamp('s'), '{"name":"timestamp","unit":"SECOND",'
Example 59
import pytest

from pyarrow.compat import unittest, u  # noqa
import pyarrow as pa

import collections
import datetime
import decimal
import itertools
import numpy as np
import six
import pytz

int_type_pairs = [(np.int8, pa.int8()), (np.int16, pa.int16()),
                  (np.int32, pa.int32()), (np.int64, pa.int64()),
                  (np.uint8, pa.uint8()), (np.uint16, pa.uint16()),
                  (np.uint32, pa.uint32()), (np.uint64, pa.uint64())]

np_int_types, _ = zip(*int_type_pairs)


class StrangeIterable:
    def __init__(self, lst):
        self.lst = lst

    def __iter__(self):
        return self.lst.__iter__()


def check_struct_type(ty, expected):
Example 60
def test_iterate_over_decimal_chunk():
    random.seed(datetime.datetime.now())
    precision = random.randint(1, 38)
    scale = random.randint(0, precision)
    datatype = None
    if precision <= 2:
        datatype = pyarrow.int8()
    elif precision <= 4:
        datatype = pyarrow.int16()
    elif precision <= 9:
        datatype = pyarrow.int32()
    elif precision <= 19:
        datatype = pyarrow.int64()
    else:
        datatype = pyarrow.decimal128(precision, scale)

    def decimal_generator(_precision, _scale):
        def decimal128_generator(precision, scale):
            data = []
            for i in range(precision):
                data.append(str(random.randint(0, 9)))

            if scale:
                data.insert(-scale, '.')
            return decimal.Decimal("".join(data))

        def int64_generator(precision):
            data = random.randint(-9223372036854775808, 9223372036854775807)
            return int(str(data)[:precision if data >= 0 else precision + 1])

        def int32_generator(precision):
            data = random.randint(-2147483648, 2147483637)
            return int(str(data)[:precision if data >= 0 else precision + 1])

        def int16_generator(precision):
            data = random.randint(-32768, 32767)
            return int(str(data)[:precision if data >= 0 else precision + 1])

        def int8_generator(precision):
            data = random.randint(-128, 127)
            return int(str(data)[:precision if data >= 0 else precision + 1])

        if _precision <= 2:
            return int8_generator(_precision)
        elif _precision <= 4:
            return int16_generator(_precision)
        elif _precision <= 9:
            return int32_generator(_precision)
        elif _precision <= 19:
            return int64_generator(_precision)
        else:
            return decimal128_generator(_precision, _scale)

    def expected_data_transform_decimal(_precision, _scale):
        def expected_data_transform_decimal_impl(data,
                                                 precision=_precision,
                                                 scale=_scale):
            if precision <= 19:
                return decimal.Decimal(data).scaleb(-scale)
            else:
                return data

        return expected_data_transform_decimal_impl

    column_meta = {
        "logicalType": "FIXED",
        "precision": str(precision),
        "scale": str(scale)
    }
    iterate_over_test_chunk([datatype, datatype], [column_meta, column_meta],
                            lambda: decimal_generator(precision, scale),
                            expected_data_transform_decimal(precision, scale))