コード例 #1
0
ファイル: test_array.py プロジェクト: rok/arrow
def test_cast_from_null():
    """Check which target types an all-null array can be cast to.

    The first group of target types is verified through
    ``_check_cast_case``; dictionary and union targets are expected to
    raise ``NotImplementedError``.
    """
    nulls = [None] * 3
    null_type = pa.null()

    castable_targets = [
        pa.null(),
        pa.uint8(),
        pa.float16(),
        pa.utf8(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.int16()),
        pa.decimal128(19, 4),
        pa.timestamp('us'),
        pa.timestamp('us', tz='UTC'),
        pa.timestamp('us', tz='Europe/Paris'),
        pa.struct([
            pa.field('a', pa.int32()),
            pa.field('b', pa.list_(pa.int8())),
            pa.field('c', pa.string()),
        ]),
    ]
    for target in castable_targets:
        # Input and expected output are the same list of nulls.
        _check_cast_case((nulls, null_type, nulls, target))

    unsupported_targets = [
        pa.dictionary(pa.int32(), pa.string()),
        pa.union(
            [pa.field('a', pa.binary(10)), pa.field('b', pa.string())],
            mode=pa.lib.UnionMode_DENSE),
        pa.union(
            [pa.field('a', pa.binary(10)), pa.field('b', pa.string())],
            mode=pa.lib.UnionMode_SPARSE),
    ]
    null_array = pa.array(nulls, type=pa.null())
    for target in unsupported_targets:
        with pytest.raises(NotImplementedError):
            null_array.cast(target)
コード例 #2
0
ファイル: test_csv.py プロジェクト: wesm/arrow
def test_convert_options():
    """Exercise ConvertOptions: defaults, attribute setters and constructor."""
    options = ConvertOptions()

    # Boolean flags: verify the default, then flip it.
    assert options.check_utf8 is True
    options.check_utf8 = False
    assert options.check_utf8 is False

    assert options.strings_can_be_null is False
    options.strings_can_be_null = True
    assert options.strings_can_be_null is True

    # column_types starts empty and accepts several input shapes.
    assert options.column_types == {}

    # ... a mapping of name -> DataType
    options.column_types = {'b': pa.int16(), 'c': pa.float32()}
    assert options.column_types == {'b': pa.int16(), 'c': pa.float32()}

    # ... a mapping of name -> type alias string
    options.column_types = {'v': 'int16', 'w': 'null'}
    assert options.column_types == {'v': pa.int16(), 'w': pa.null()}

    # ... a schema
    schema = pa.schema([('a', pa.int32()), ('b', pa.string())])
    options.column_types = schema
    assert options.column_types == {'a': pa.int32(), 'b': pa.string()}

    # ... a sequence of (name, type) pairs
    options.column_types = [('x', pa.binary())]
    assert options.column_types == {'x': pa.binary()}

    # Invalid values are rejected with TypeError.
    with pytest.raises(TypeError, match='DataType expected'):
        options.column_types = {'a': None}
    with pytest.raises(TypeError):
        options.column_types = 0

    # The default null spellings include '' and 'N/A'; the list is settable.
    assert isinstance(options.null_values, list)
    assert '' in options.null_values
    assert 'N/A' in options.null_values
    options.null_values = ['xxx', 'yyy']
    assert options.null_values == ['xxx', 'yyy']

    assert isinstance(options.true_values, list)
    options.true_values = ['xxx', 'yyy']
    assert options.true_values == ['xxx', 'yyy']

    assert isinstance(options.false_values, list)
    options.false_values = ['xxx', 'yyy']
    assert options.false_values == ['xxx', 'yyy']

    # All of the above can also be supplied through the constructor.
    options = ConvertOptions(
        check_utf8=False,
        column_types={'a': pa.null()},
        null_values=['N', 'nn'],
        true_values=['T', 'tt'],
        false_values=['F', 'ff'],
        strings_can_be_null=True,
    )
    assert options.check_utf8 is False
    assert options.column_types == {'a': pa.null()}
    assert options.null_values == ['N', 'nn']
    assert options.false_values == ['F', 'ff']
    assert options.true_values == ['T', 'tt']
    assert options.strings_can_be_null is True
コード例 #3
0
ファイル: test_array.py プロジェクト: emkornfield/arrow
def test_empty_cast():
    """Cast empty arrays between every pair of the listed common types.

    ARROW-4766: supported conversions must not segfault on empty arrays.
    Conversions that are not implemented are tolerated.
    """
    common_types = [
        pa.null(),
        pa.bool_(),
        pa.int8(),
        pa.int16(),
        pa.int32(),
        pa.int64(),
        pa.uint8(),
        pa.uint16(),
        pa.uint32(),
        pa.uint64(),
        pa.float16(),
        pa.float32(),
        pa.float64(),
        pa.date32(),
        pa.date64(),
        pa.binary(),
        pa.binary(length=4),
        pa.string(),
    ]

    for source_type in common_types:
        for target_type in common_types:
            try:
                pa.array([], type=source_type).cast(target_type)
            except pa.lib.ArrowNotImplementedError:
                # Unsupported pairs are fine; only crashes would be a failure.
                pass
コード例 #4
0
ファイル: test_csv.py プロジェクト: laurentgo/arrow
    def test_custom_nulls(self):
        """Type inference honours a user-supplied list of null spellings."""
        # Custom null spellings: only 'Xxx' and 'Zzz' count as null.
        custom_opts = ConvertOptions(null_values=['Xxx', 'Zzz'])
        csv_bytes = b"a,b,c,d\nZzz,Xxx,1,2\nXxx,#N/A,,Zzz\n"
        table = self.read_bytes(csv_bytes, convert_options=custom_opts)
        expected_schema = pa.schema([
            ('a', pa.null()),
            ('b', pa.string()),
            ('c', pa.string()),
            ('d', pa.int64()),
        ])
        assert table.schema == expected_schema
        expected_data = {
            'a': [None, None],
            'b': [u"Xxx", u"#N/A"],
            'c': [u"1", u""],
            'd': [2, None],
        }
        assert table.to_pydict() == expected_data

        # No null spellings at all: every cell is kept as a string.
        no_null_opts = ConvertOptions(null_values=[])
        csv_bytes = b"a,b\n#N/A,\n"
        table = self.read_bytes(csv_bytes, convert_options=no_null_opts)
        expected_schema = pa.schema([
            ('a', pa.string()),
            ('b', pa.string()),
        ])
        assert table.schema == expected_schema
        expected_data = {
            'a': [u"#N/A"],
            'b': [u""],
        }
        assert table.to_pydict() == expected_data
コード例 #5
0
ファイル: test_schema.py プロジェクト: giantwhale/arrow
def test_type_to_pandas_dtype():
    """Check ``to_pandas_dtype()`` for a range of Arrow types."""
    datetime_ns = np.dtype('datetime64[ns]')

    # Numeric types whose Arrow factory and NumPy scalar type share a name.
    shared_names = (
        'int8', 'int16', 'int32', 'int64',
        'uint8', 'uint16', 'uint32', 'uint64',
        'float16', 'float32', 'float64',
    )

    expectations = [
        (pa.null(), np.float64),
        (pa.bool_(), np.bool_),
    ]
    expectations.extend(
        (getattr(pa, name)(), getattr(np, name)) for name in shared_names
    )
    expectations.extend([
        (pa.date32(), datetime_ns),
        (pa.date64(), datetime_ns),
        (pa.timestamp('ms'), datetime_ns),
        (pa.binary(), np.object_),
        (pa.binary(12), np.object_),
        (pa.string(), np.object_),
        (pa.list_(pa.int8()), np.object_),
    ])

    for arrow_type, expected_dtype in expectations:
        assert arrow_type.to_pandas_dtype() == expected_dtype
コード例 #6
0
ファイル: jvm.py プロジェクト: rok/arrow
def field(jvm_field):
    """
    Convert an org.apache.arrow.vector.types.pojo.Field instance into a
    pyarrow Field.

    Parameters
    ----------
    jvm_field: org.apache.arrow.vector.types.pojo.Field

    Returns
    -------
    pyarrow.Field

    Raises
    ------
    NotImplementedError
        If the JVM type is complex, or its type ID is not handled here.
    """
    field_name = jvm_field.getName()
    jvm_type = jvm_field.getType()

    if jvm_type.isComplex():
        # TODO: The following JVM types are not implemented:
        #       Struct, List, FixedSizeList, Union, Dictionary
        raise NotImplementedError(
            "JVM field conversion only implemented for primitive types.")

    # Primitive types are told apart by the string form of their type ID.
    type_id = jvm_type.getTypeID().toString()
    if type_id == 'Null':
        arrow_type = pa.null()
    elif type_id == 'Int':
        arrow_type = _from_jvm_int_type(jvm_type)
    elif type_id == 'FloatingPoint':
        arrow_type = _from_jvm_float_type(jvm_type)
    elif type_id == 'Utf8':
        arrow_type = pa.string()
    elif type_id == 'Binary':
        arrow_type = pa.binary()
    elif type_id == 'FixedSizeBinary':
        arrow_type = pa.binary(jvm_type.getByteWidth())
    elif type_id == 'Bool':
        arrow_type = pa.bool_()
    elif type_id == 'Time':
        arrow_type = _from_jvm_time_type(jvm_type)
    elif type_id == 'Timestamp':
        arrow_type = _from_jvm_timestamp_type(jvm_type)
    elif type_id == 'Date':
        arrow_type = _from_jvm_date_type(jvm_type)
    elif type_id == 'Decimal':
        arrow_type = pa.decimal128(jvm_type.getPrecision(),
                                   jvm_type.getScale())
    else:
        raise NotImplementedError(
            "Unsupported JVM type: {}".format(type_id))

    is_nullable = jvm_field.isNullable()
    # An empty JVM metadata map is passed on as None rather than as {}.
    field_metadata = (None if jvm_field.getMetadata().isEmpty()
                      else dict(jvm_field.getMetadata()))
    return pa.field(field_name, arrow_type, is_nullable, field_metadata)
コード例 #7
0
    def test_nested_lists_all_none(self):
        """Object ndarrays holding only None / lists of None convert cleanly."""
        nested = np.array([[None, None], None], dtype=object)

        converted = pa.array(nested)
        from_list = pa.array(list(nested))
        assert converted.equals(from_list)
        assert converted.type == pa.list_(pa.null())

        # Mix top-level None, a plain list and an object ndarray of None.
        mixed = np.array(
            [None, None, [None, None], np.array([None, None], dtype=object)],
            dtype=object,
        )
        converted = pa.array(mixed)
        from_list = pa.array([None, None, [None, None], [None, None]])
        assert converted.equals(from_list)
コード例 #8
0
ファイル: test_json.py プロジェクト: rok/arrow
 def test_simple_nulls(self):
     """JSON columns that contain nulls are inferred as the expected types."""
     # Each column mixes nulls with one kind of value; 'd' is null only.
     json_lines = (
         b'{"a": 1, "b": 2, "c": null, "d": null, "e": null}\n'
         b'{"a": null, "b": -5, "c": "foo", "d": null, "e": true}\n'
         b'{"a": 4.5, "b": null, "c": "nan", "d": null,"e": false}\n'
     )
     table = self.read_bytes(json_lines)

     expected_schema = pa.schema([
         ('a', pa.float64()),
         ('b', pa.int64()),
         ('c', pa.string()),
         ('d', pa.null()),
         ('e', pa.bool_()),
     ])
     assert table.schema == expected_schema

     expected_data = {
         'a': [1.0, None, 4.5],
         'b': [2, -5, None],
         'c': [None, u"foo", u"nan"],
         'd': [None, None, None],
         'e': [None, True, False],
     }
     assert table.to_pydict() == expected_data
コード例 #9
0
ファイル: test_csv.py プロジェクト: laurentgo/arrow
 def test_simple_nulls(self):
     """CSV columns that contain nulls are inferred as the expected types."""
     # Column 'd' is empty throughout; 'e' ends with a non-UTF8 byte.
     csv_bytes = (
         b"a,b,c,d,e\n"
         b"1,2,,,3\n"
         b"nan,-5,foo,,nan\n"
         b"4.5,#N/A,nan,,\xff\n"
     )
     table = self.read_bytes(csv_bytes)

     expected_schema = pa.schema([
         ('a', pa.float64()),
         ('b', pa.int64()),
         ('c', pa.string()),
         ('d', pa.null()),
         ('e', pa.binary()),
     ])
     assert table.schema == expected_schema

     expected_data = {
         'a': [1.0, None, 4.5],
         'b': [2, -5, None],
         'c': [u"", u"foo", u"nan"],
         'd': [None, None, None],
         'e': [b"3", b"nan", b"\xff"],
     }
     assert table.to_pydict() == expected_data
コード例 #10
0
ファイル: test_types.py プロジェクト: giantwhale/arrow
def test_types_hashable():
    """Each listed type hashes consistently and works as a dictionary key."""
    struct_type = pa.struct([
        pa.field('a', pa.int32()),
        pa.field('b', pa.int8()),
        pa.field('c', pa.string()),
    ])
    hashable_types = [
        pa.null(),
        pa.int32(),
        pa.time32('s'),
        pa.time64('us'),
        pa.date32(),
        pa.timestamp('us'),
        pa.string(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.int32()),
        struct_type,
    ]

    index_by_type = {}
    for position, data_type in enumerate(hashable_types):
        assert hash(data_type) == hash(data_type)
        # Store and immediately read back through the type as the key.
        index_by_type[data_type] = position
        assert index_by_type[data_type] == position
コード例 #11
0
ファイル: test_types.py プロジェクト: rok/arrow
def get_many_types():
    """Return a tuple of assorted pyarrow data types."""
    # The types are built inside a function (per the original author's note)
    # because the pa.dictionary type holds a pyarrow array, and a test in
    # test_array.py checks that the default memory pool has zero allocated
    # bytes.
    flat_types = (
        pa.null(),
        pa.bool_(),
        pa.int32(),
        pa.time32('s'),
        pa.time64('us'),
        pa.date32(),
        pa.timestamp('us'),
        pa.timestamp('us', tz='UTC'),
        pa.timestamp('us', tz='Europe/Paris'),
        pa.float16(),
        pa.float32(),
        pa.float64(),
        pa.decimal128(19, 4),
        pa.string(),
        pa.binary(),
        pa.binary(10),
    )
    nested_types = (
        pa.list_(pa.int32()),
        pa.struct([
            pa.field('a', pa.int32()),
            pa.field('b', pa.int8()),
            pa.field('c', pa.string()),
        ]),
        pa.struct([
            pa.field('a', pa.int32(), nullable=False),
            pa.field('b', pa.int8(), nullable=False),
            pa.field('c', pa.string()),
        ]),
        pa.union(
            [pa.field('a', pa.binary(10)), pa.field('b', pa.string())],
            mode=pa.lib.UnionMode_DENSE),
        pa.union(
            [pa.field('a', pa.binary(10)), pa.field('b', pa.string())],
            mode=pa.lib.UnionMode_SPARSE),
        pa.union(
            [pa.field('a', pa.binary(10), nullable=False),
             pa.field('b', pa.string())],
            mode=pa.lib.UnionMode_SPARSE),
        pa.dictionary(pa.int32(), pa.string()),
    )
    return flat_types + nested_types
コード例 #12
0
def test_empty_iterable():
    """An empty StrangeIterable converts to a zero-length null-typed array."""
    converted = pa.array(StrangeIterable([]))

    assert len(converted) == 0
    assert converted.null_count == 0
    assert converted.type == pa.null()
    assert converted.to_pylist() == []
コード例 #13
0
def test_empty_list(seq):
    """An empty sequence built by ``seq`` converts to an empty null array."""
    empty_array = pa.array(seq([]))

    assert len(empty_array) == 0
    assert empty_array.null_count == 0
    assert empty_array.type == pa.null()
    assert empty_array.to_pylist() == []
コード例 #14
0
    pojo_Field = jpype.JClass('org.apache.arrow.vector.types.pojo.Field')
    return om.readValue(jvm_spec, pojo_Field)


# In the following, we use the JSON serialization of the Field objects in Java.
# This ensures that we do not rely on the exact mechanics of how to construct
# them using Java code, and it also lets us define them as parameters
# without invoking the JVM.
#
# The specifications were created using:
#
#   om = jpype.JClass('com.fasterxml.jackson.databind.ObjectMapper')()
#   field = …  # Code to instantiate the field
#   jvm_spec = om.writeValueAsString(field)
@pytest.mark.parametrize('typ,jvm_spec', [
    (pa.null(), '{"name":"null"}'),
    (pa.bool_(), '{"name":"bool"}'),
    (pa.int8(), '{"name":"int","bitWidth":8,"isSigned":true}'),
    (pa.int16(), '{"name":"int","bitWidth":16,"isSigned":true}'),
    (pa.int32(), '{"name":"int","bitWidth":32,"isSigned":true}'),
    (pa.int64(), '{"name":"int","bitWidth":64,"isSigned":true}'),
    (pa.uint8(), '{"name":"int","bitWidth":8,"isSigned":false}'),
    (pa.uint16(), '{"name":"int","bitWidth":16,"isSigned":false}'),
    (pa.uint32(), '{"name":"int","bitWidth":32,"isSigned":false}'),
    (pa.uint64(), '{"name":"int","bitWidth":64,"isSigned":false}'),
    (pa.float16(), '{"name":"floatingpoint","precision":"HALF"}'),
    (pa.float32(), '{"name":"floatingpoint","precision":"SINGLE"}'),
    (pa.float64(), '{"name":"floatingpoint","precision":"DOUBLE"}'),
    (pa.time32('s'), '{"name":"time","unit":"SECOND","bitWidth":32}'),
    (pa.time32('ms'), '{"name":"time","unit":"MILLISECOND","bitWidth":32}'),
    (pa.time64('us'), '{"name":"time","unit":"MICROSECOND","bitWidth":64}'),
コード例 #15
0
ファイル: test_types.py プロジェクト: sfc-gh-hyu/arrow
def test_is_null():
    """``types.is_null`` accepts the null type and rejects a list type."""
    null_type = pa.null()
    list_type = pa.list_(pa.int32())

    assert types.is_null(null_type)
    assert not types.is_null(list_type)
コード例 #16
0
ファイル: test_convert_builtin.py プロジェクト: apache/arrow
 def test_all_none(self):
     """A list holding only None values becomes a null-typed array."""
     converted = pyarrow.from_pylist([None, None])

     assert len(converted) == 2
     assert converted.null_count == 2
     assert converted.type == pyarrow.null()
     assert converted.to_pylist() == [None, None]
コード例 #17
0
ファイル: test_convert_builtin.py プロジェクト: dremio/arrow
def test_sequence_all_none(seq):
    """A sequence of only None values converts to a null-typed array."""
    all_null = pa.array(seq([None, None]))

    assert len(all_null) == 2
    assert all_null.null_count == 2
    assert all_null.type == pa.null()
    assert all_null.to_pylist() == [None, None]
コード例 #18
0
ファイル: test_types.py プロジェクト: giantwhale/arrow
def test_is_null():
    """Only the null type is reported as null by ``types.is_null``."""
    is_null = types.is_null

    assert is_null(pa.null())
    assert not is_null(pa.list_(pa.int32()))
コード例 #19
0
 def test_all_none(self):
     """``pa.array`` infers the null type when every element is None."""
     all_null = pa.array([None, None])

     assert len(all_null) == 2
     assert all_null.null_count == 2
     assert all_null.type == pa.null()
     assert all_null.to_pylist() == [None, None]
コード例 #20
0
        np.arange(10, dtype=np.float16),
    ]
)
def test_to_numpy_roundtrip(narr):
    """NumPy -> Arrow -> NumPy keeps dtype and values, including for slices."""
    arr = pa.array(narr)

    assert narr.dtype == arr.to_numpy().dtype
    np.testing.assert_array_equal(narr, arr.to_numpy())

    # The same windows as [:6], [2:] and [2:6], applied to both arrays.
    for window in (slice(None, 6), slice(2, None), slice(2, 6)):
        np.testing.assert_array_equal(narr[window], arr[window].to_numpy())


@pytest.mark.parametrize(
    ('type', 'expected'),
    [
        (pa.null(), 'empty'),
        (pa.bool_(), 'bool'),
        (pa.int8(), 'int8'),
        (pa.int16(), 'int16'),
        (pa.int32(), 'int32'),
        (pa.int64(), 'int64'),
        (pa.uint8(), 'uint8'),
        (pa.uint16(), 'uint16'),
        (pa.uint32(), 'uint32'),
        (pa.uint64(), 'uint64'),
        (pa.float16(), 'float16'),
        (pa.float32(), 'float32'),
        (pa.float64(), 'float64'),
        (pa.date32(), 'date'),
        (pa.date64(), 'date'),
        (pa.binary(), 'bytes'),
コード例 #21
0
ファイル: base.py プロジェクト: jbrockmendel/fletcher
import pyarrow as pa
import six
from pandas.api.types import (
    is_array_like,
    is_bool_dtype,
    is_int64_dtype,
    is_integer,
    is_integer_dtype,
)
from pandas.core.arrays import ExtensionArray
from pandas.core.dtypes.dtypes import ExtensionDtype

from ._algorithms import all_op, any_op, extract_isnull_bytemap

_python_type_map = {
    pa.null().id: six.text_type,
    pa.bool_().id: bool,
    pa.int8().id: int,
    pa.uint8().id: int,
    pa.int16().id: int,
    pa.uint16().id: int,
    pa.int32().id: int,
    pa.uint32().id: int,
    pa.int64().id: int,
    pa.uint64().id: int,
    pa.float16().id: float,
    pa.float32().id: float,
    pa.float64().id: float,
    pa.date32().id: datetime.date,
    pa.date64().id: datetime.date,
    pa.timestamp("ms").id: datetime.datetime,
コード例 #22
0
 def test_all_none(self):
     """Two None values yield a length-2 null array that round-trips."""
     source = [None, None]
     result = pa.array([None, None])

     assert len(result) == 2
     assert result.null_count == 2
     assert result.type == pa.null()
     assert result.to_pylist() == source
コード例 #23
0
ファイル: test_types.py プロジェクト: rok/arrow
def test_is_boolean_value():
    """Python and NumPy booleans are boolean values; the integer 1 is not."""
    assert not pa.types.is_boolean_value(1)

    for flag in (True, False, np.bool_(True), np.bool_(False)):
        assert pa.types.is_boolean_value(flag)


@h.given(
    past.all_types |
    past.all_fields |
    past.all_schemas
)
@h.example(
    pa.field(name='', type=pa.null(), metadata={'0': '', '': ''})
)
def test_pickling(field):
    """A pickled-then-unpickled object compares equal to the original."""
    serialized = pickle.dumps(field)
    restored = pickle.loads(serialized)
    assert restored == field


@h.given(
    st.lists(past.all_types) |
    st.lists(past.all_fields) |
    st.lists(past.all_schemas)
)
def test_hashing(items):
    h.assume(
        # well, this is still O(n^2), but makes the input unique
        all(not a.equals(b) for i, a in enumerate(items) for b in items[:i])
コード例 #24
0
                 pa.array([[3]], type=pa.list_(pa.int32()))
             ], ["f1", "f2"])
     }],
     expected_output={
         "struct<binary, list<int32>>":
             pa.StructArray.from_arrays([
                 pa.array([b"abc", None, b"def", b"ghi"]),
                 pa.array([[None], [1, 2], [], [3]],
                          type=pa.list_(pa.int32()))
             ], ["f1", "f2"])
     }),
 dict(
     testcase_name="missing_or_null_column_fixed_width",
     inputs=[
         {
             "int32": pa.array([None, None], type=pa.null())
         },
         {
             "int64": pa.array([None, None], type=pa.null())
         },
         {
             "int64": pa.array([123], type=pa.int64())
         },
         {
             "int32": pa.array([456], type=pa.int32())
         },
     ],
     expected_output={
         "int32":
             pa.array([None, None, None, None, None, 456], type=pa.int32()),
         "int64":
コード例 #25
0
ファイル: test_array.py プロジェクト: CodingCat/arrow
        ([(1, 'a'), (2, 'c'), None],
            pa.struct([pa.field('a', pa.int64()), pa.field('b', pa.string())]))
    ]
)
def test_array_pickle(data, typ):
    """An array survives a pickle round trip unchanged."""
    # Build the array inside the test so that no Arrow data is allocated
    # beforehand; the allocator tests rely on that to be reliable.
    original = pa.array(data, type=typ)
    serialized = pickle.dumps(original)
    restored = pickle.loads(serialized)
    assert original.equals(restored)


@pytest.mark.parametrize(
    ('type', 'expected'),
    [
        (pa.null(), 'empty'),
        (pa.bool_(), 'bool'),
        (pa.int8(), 'int8'),
        (pa.int16(), 'int16'),
        (pa.int32(), 'int32'),
        (pa.int64(), 'int64'),
        (pa.uint8(), 'uint8'),
        (pa.uint16(), 'uint16'),
        (pa.uint32(), 'uint32'),
        (pa.uint64(), 'uint64'),
        (pa.float16(), 'float16'),
        (pa.float32(), 'float32'),
        (pa.float64(), 'float64'),
        (pa.date32(), 'date'),
        (pa.date64(), 'date'),
        (pa.binary(), 'bytes'),
コード例 #26
0
ファイル: strategies.py プロジェクト: emkornfield/arrow
import hypothesis.extra.numpy as npst
import hypothesis.extra.pytz as tzst
import numpy as np

import pyarrow as pa


# TODO(kszucs): alphanum_text, surrogate_text
# NOTE(review): `st` is presumably hypothesis.strategies; its import is not
# visible in this excerpt -- confirm.
# Text strategy whose alphabet is limited to code points 0x41 ('A')
# through 0x7E ('~').
custom_text = st.text(
    alphabet=st.characters(
        min_codepoint=0x41,
        max_codepoint=0x7E
    )
)

# Strategies that each wrap one fixed pyarrow type via st.just.
null_type = st.just(pa.null())
bool_type = st.just(pa.bool_())

binary_type = st.just(pa.binary())
string_type = st.just(pa.string())

# Strategy that samples from the four signed integer types listed below.
signed_integer_types = st.sampled_from([
    pa.int8(),
    pa.int16(),
    pa.int32(),
    pa.int64()
])
unsigned_integer_types = st.sampled_from([
    pa.uint8(),
    pa.uint16(),
    pa.uint32(),
コード例 #27
0
    expected = pa.array([0, 1, 2], type='i8')

    result = arr.cast('i8')

    assert result.equals(expected)


def test_simple_type_construction():
    """``str()`` of a directly constructed TimestampType raises TypeError."""
    bare_type = pa.lib.TimestampType()

    with pytest.raises(TypeError):
        str(bare_type)


@pytest.mark.parametrize(
    ('type', 'expected'),
    [
        (pa.null(), 'float64'),
        (pa.bool_(), 'bool'),
        (pa.int8(), 'int8'),
        (pa.int16(), 'int16'),
        (pa.int32(), 'int32'),
        (pa.int64(), 'int64'),
        (pa.uint8(), 'uint8'),
        (pa.uint16(), 'uint16'),
        (pa.uint32(), 'uint32'),
        (pa.uint64(), 'uint64'),
        (pa.float16(), 'float16'),
        (pa.float32(), 'float32'),
        (pa.float64(), 'float64'),
        (pa.date32(), 'date'),
        (pa.date64(), 'date'),
        (pa.binary(), 'bytes'),
        (pa.binary(length=4), 'bytes'),
        (pa.string(), 'unicode'),
        (pa.list_(pa.list_(pa.int16())), 'list[list[int16]]'),
        (pa.decimal128(18, 3), 'decimal'),
        (pa.timestamp('ms'), 'datetime'),
        (pa.timestamp('us', 'UTC'), 'datetimetz'),
        (pa.time32('s'), 'time'),
        (pa.time64('us'), 'time'),
    ]
)
def test_logical_type(type, expected):
    """``get_logical_type`` returns the expected name for each Arrow type."""
    logical_name = get_logical_type(type)
    assert logical_name == expected


def test_array_conversions_no_sentinel_values():
コード例 #28
0
    ]

    for case in safe_cases:
        _check_cast_case(case)


def test_simple_type_construction():
    """Stringifying a TimestampType built without arguments is a TypeError."""
    uninitialized = pa.lib.TimestampType()

    with pytest.raises(TypeError):
        str(uninitialized)


@pytest.mark.parametrize(
    ('type', 'expected'),
    [
        (pa.null(), 'float64'),
        (pa.bool_(), 'bool'),
        (pa.int8(), 'int8'),
        (pa.int16(), 'int16'),
        (pa.int32(), 'int32'),
        (pa.int64(), 'int64'),
        (pa.uint8(), 'uint8'),
        (pa.uint16(), 'uint16'),
        (pa.uint32(), 'uint32'),
        (pa.uint64(), 'uint64'),
        (pa.float16(), 'float16'),
        (pa.float32(), 'float32'),
        (pa.float64(), 'float64'),
        (pa.date32(), 'date'),
        (pa.date64(), 'date'),
        (pa.binary(), 'bytes'),
コード例 #29
0
             ),
             pa.array([0.1, 0.2], type=pa.float32()),
         ],
         names=["b", "time", "a"],
     ),
 ),
 (
     pd.DataFrame({
         "nanfloat": [None, 1.0],
         "nans": [pd.NA, pd.NA],
         "str": ["a", "b"]
     }),
     pa.Table.from_arrays(
         [
             pa.array([None, 1.0], type=pa.float32()),
             pa.array([None, None], type=pa.null()),
             pa.array(["a", "b"], type=pa.string()),
         ],
         names=["nanfloat", "nans", "str"],
     ),
 ),
 httpfail(
     pd.DataFrame({
         "nanint": [pd.NA, 3],  # arrow doesn't like this
     }),
     None,
 ),
 httpfail(
     pd.DataFrame({
         "nanstr": [pd.NA, "string"],
     }),
コード例 #30
0
ファイル: test_convert_builtin.py プロジェクト: dremio/arrow
def test_empty_range():
    """An empty ``range`` converts to a zero-length null-typed array."""
    empty_array = pa.array(range(0))

    assert len(empty_array) == 0
    assert empty_array.null_count == 0
    assert empty_array.type == pa.null()
    assert empty_array.to_pylist() == []
コード例 #31
0
ファイル: table_util_test.py プロジェクト: tensorflow/tfx-bsl
                 pa.array([[3]], type=pa.list_(pa.int32()))
             ], ["f1", "f2"])
     }],
     expected_output={
         "struct<binary, list<int32>>":
             pa.StructArray.from_arrays([
                 pa.array([b"abc", None, b"def", b"ghi"]),
                 pa.array([[None], [1, 2], [], [3]],
                          type=pa.list_(pa.int32()))
             ], ["f1", "f2"])
     }),
 dict(
     testcase_name="missing_or_null_column_fixed_width",
     inputs=[
         {
             "int32": pa.array([None, None], type=pa.null())
         },
         {
             "int64": pa.array([None, None], type=pa.null())
         },
         {
             "int64": pa.array([123], type=pa.int64())
         },
         {
             "int32": pa.array([456], type=pa.int32())
         },
     ],
     expected_output={
         "int32":
             pa.array([None, None, None, None, None, 456], type=pa.int32()),
         "int64":
コード例 #32
0
_DECODE_CASES = [
    dict(testcase_name="without_schema_first_example_typed",
         schema_text_proto=None,
         sequence_examples_text_proto=[
             _TYPED_SEQUENCE_EXAMPLE, _UNTYPED_SEQUENCE_EXAMPLE,
             _SOME_FEATURES_TYPED_SEQUENCE_EXAMPLE,
             _EMPTY_VALUES_LIST_SEQUENCE_EXAMPLE
         ],
         expected=pa.RecordBatch.from_arrays([
             pa.array([[1], None, None, []], type=pa.large_list(pa.int64())),
             pa.array([[1.0, 2.0], None, None, []],
                      type=pa.large_list(pa.float32())),
             pa.array([[b"a", b"b", b"c"], None, None, []],
                      type=pa.large_list(pa.large_binary())),
             pa.array([None, None, None, None], pa.null()),
             pa.array([None, None, [1.0], None],
                      type=pa.large_list(pa.float32())),
             pa.StructArray.from_arrays([
                 pa.array([None, None, [[1.0]], None],
                          type=pa.large_list(pa.large_list(pa.float32()))),
                 pa.array([[[1, 2], [3]], [], [None, None, None], [[], []]],
                          type=pa.large_list(pa.large_list(pa.int64()))),
                 pa.array([[[3.0, 4.0], [1.0, 2.0]], [], [None], [[]]],
                          type=pa.large_list(pa.large_list(pa.float32()))),
                 pa.array([[[b"a", b"b"], [b"c"]], [], [None], [[]]],
                          type=pa.large_list(pa.large_list(pa.large_binary())))
             ],
                                        names=[
                                            "sequence_v", "sequence_x",
                                            "sequence_y", "sequence_z"
コード例 #33
0
def test_convert_options():
    """Exercise ConvertOptions: defaults, attribute setters and constructor."""
    options = ConvertOptions()

    # Simple attributes are checked through the shared helper, which receives
    # a pair of values for each attribute name.
    check_options_class(
        ConvertOptions,
        check_utf8=[True, False],
        strings_can_be_null=[False, True],
        include_columns=[[], ['def', 'abc']],
        include_missing_columns=[False, True],
        auto_dict_encode=[False, True],
        timestamp_parsers=[[], [ISO8601, '%y-%m']],
    )

    assert options.auto_dict_max_cardinality > 0
    options.auto_dict_max_cardinality = 99999
    assert options.auto_dict_max_cardinality == 99999

    # column_types starts empty and accepts several input shapes.
    assert options.column_types == {}

    # ... a mapping of name -> DataType
    options.column_types = {'b': pa.int16(), 'c': pa.float32()}
    assert options.column_types == {'b': pa.int16(), 'c': pa.float32()}

    # ... a mapping of name -> type alias string
    options.column_types = {'v': 'int16', 'w': 'null'}
    assert options.column_types == {'v': pa.int16(), 'w': pa.null()}

    # ... a schema
    schema = pa.schema([('a', pa.int32()), ('b', pa.string())])
    options.column_types = schema
    assert options.column_types == {'a': pa.int32(), 'b': pa.string()}

    # ... a sequence of (name, type) pairs
    options.column_types = [('x', pa.binary())]
    assert options.column_types == {'x': pa.binary()}

    # Invalid values are rejected with TypeError.
    with pytest.raises(TypeError, match='DataType expected'):
        options.column_types = {'a': None}
    with pytest.raises(TypeError):
        options.column_types = 0

    # The default null spellings include '' and 'N/A'; the list is settable.
    assert isinstance(options.null_values, list)
    assert '' in options.null_values
    assert 'N/A' in options.null_values
    options.null_values = ['xxx', 'yyy']
    assert options.null_values == ['xxx', 'yyy']

    assert isinstance(options.true_values, list)
    options.true_values = ['xxx', 'yyy']
    assert options.true_values == ['xxx', 'yyy']

    assert isinstance(options.false_values, list)
    options.false_values = ['xxx', 'yyy']
    assert options.false_values == ['xxx', 'yyy']

    assert options.timestamp_parsers == []
    options.timestamp_parsers = [ISO8601]
    assert options.timestamp_parsers == [ISO8601]

    # The same settings can also be supplied through the constructor.
    options = ConvertOptions(
        column_types={'a': pa.null()},
        null_values=['N', 'nn'],
        true_values=['T', 'tt'],
        false_values=['F', 'ff'],
        auto_dict_max_cardinality=999,
        timestamp_parsers=[ISO8601, '%Y-%m-%d'],
    )
    assert options.column_types == {'a': pa.null()}
    assert options.null_values == ['N', 'nn']
    assert options.false_values == ['F', 'ff']
    assert options.true_values == ['T', 'tt']
    assert options.auto_dict_max_cardinality == 999
    assert options.timestamp_parsers == [ISO8601, '%Y-%m-%d']
コード例 #34
0
ファイル: test_types.py プロジェクト: sfc-gh-hyu/arrow
def test_null_field_may_not_be_non_nullable():
    """ARROW-7273: a null-typed field with nullable=False is a ValueError."""
    with pytest.raises(ValueError):
        pa.field(
            'f0',
            pa.null(),
            nullable=False,
        )
コード例 #35
0
 def test_header(self):
     """A header-only CSV yields null-typed columns and no batches."""
     header_only = b"abc,def,gh\n"
     reader = self.open_bytes(header_only)

     expected_schema = pa.schema([
         ('abc', pa.null()),
         ('def', pa.null()),
         ('gh', pa.null()),
     ])
     self.check_reader(reader, expected_schema, [])
コード例 #36
0
def test_empty_range():
    """``pa.array(range(0))`` is empty, has no nulls and has the null type."""
    result = pa.array(range(0))

    assert len(result) == 0
    assert result.null_count == 0
    assert result.type == pa.null()
    assert result.to_pylist() == []
コード例 #37
0
    def test_column_options(self):
        """Check the reader's schema and data under various column options.

        Covers ``column_names``, ``include_columns``, ``column_types`` and
        ``include_missing_columns``.
        """
        csv_bytes = b"1,2,3\n4,5,6"

        # With column_names
        read_opts = ReadOptions()
        read_opts.column_names = ['d', 'e', 'f']
        reader = self.open_bytes(csv_bytes, read_options=read_opts)
        schema = pa.schema([
            ('d', pa.int64()),
            ('e', pa.int64()),
            ('f', pa.int64()),
        ])
        batches = [{'d': [1, 4], 'e': [2, 5], 'f': [3, 6]}]
        self.check_reader(reader, schema, batches)

        # With include_columns
        conv_opts = ConvertOptions()
        conv_opts.include_columns = ['f', 'e']
        reader = self.open_bytes(
            csv_bytes, read_options=read_opts, convert_options=conv_opts)
        schema = pa.schema([('f', pa.int64()), ('e', pa.int64())])
        batches = [{'e': [2, 5], 'f': [3, 6]}]
        self.check_reader(reader, schema, batches)

        # With column_types
        conv_opts.column_types = {'e': pa.string()}
        reader = self.open_bytes(
            csv_bytes, read_options=read_opts, convert_options=conv_opts)
        schema = pa.schema([('f', pa.int64()), ('e', pa.string())])
        batches = [{'e': ["2", "5"], 'f': [3, 6]}]
        self.check_reader(reader, schema, batches)

        # Missing columns in include_columns
        conv_opts.include_columns = ['g', 'f', 'e']
        with pytest.raises(
                KeyError,
                match="Column 'g' in include_columns does not exist"):
            reader = self.open_bytes(
                csv_bytes, read_options=read_opts, convert_options=conv_opts)

        # With include_missing_columns the absent column is null-typed.
        conv_opts.include_missing_columns = True
        reader = self.open_bytes(
            csv_bytes, read_options=read_opts, convert_options=conv_opts)
        schema = pa.schema([
            ('g', pa.null()),
            ('f', pa.int64()),
            ('e', pa.string()),
        ])
        batches = [{'g': [None, None], 'e': ["2", "5"], 'f': [3, 6]}]
        self.check_reader(reader, schema, batches)

        # An explicit column type also applies to the missing column.
        conv_opts.column_types = {'e': pa.string(), 'g': pa.float64()}
        reader = self.open_bytes(
            csv_bytes, read_options=read_opts, convert_options=conv_opts)
        schema = pa.schema([
            ('g', pa.float64()),
            ('f', pa.int64()),
            ('e', pa.string()),
        ])
        batches = [{'g': [None, None], 'e': ["2", "5"], 'f': [3, 6]}]
        self.check_reader(reader, schema, batches)
コード例 #38
0
def test_sequence_all_none(seq):
    """Convert a sequence holding only ``None`` and check the null array.

    ``seq`` is the sequence factory supplied by the test parametrization; it
    wraps the plain Python list before it is handed to ``pa.array``.
    """
    values = [None, None]
    result = pa.array(seq(values))
    assert (len(result), result.null_count) == (2, 2)
    assert result.type == pa.null()
    assert result.to_pylist() == values
コード例 #39
0
 def test_lift_slice_aware(self):
   """Check LiftStatsGenerator output when the input is keyed by slice.

   Feeds (slice key, table) pairs for two slices into a LiftStatsGenerator
   whose y_path is 'string_y' and compares the per-slice cross-feature lift
   statistics against hand-written expectations.
   """
   # Each example is a (slice key, pa.Table) pair; both slice keys occur twice.
   examples = [
       ('slice1', pa.Table.from_arrays([
           pa.array([['a'], ['a'], ['b'], ['a']]),
           pa.array([['cat'], ['dog'], ['cat'], ['dog']]),
       ], ['categorical_x', 'string_y'])),
       ('slice2', pa.Table.from_arrays([
           pa.array([['a'], ['a'], ['a']]),
           pa.array([['cat'], ['dog'], ['dog']]),
       ], ['categorical_x', 'string_y'])),
       # Same table as the first 'slice1' entry.
       ('slice1', pa.Table.from_arrays([
           pa.array([['a'], ['a'], ['b'], ['a']]),
           pa.array([['cat'], ['dog'], ['cat'], ['dog']]),
       ], ['categorical_x', 'string_y'])),
       # Second 'slice2' table: categorical_x is an all-null column of null
       # type, while string_y still carries values.
       ('slice2', pa.Table.from_arrays([
           pa.array([None, None, None, None], type=pa.null()),
           pa.array([['cat'], ['dog'], ['cat'], ['dog']]),
       ], ['categorical_x', 'string_y'])),
   ]
   # Both features are declared as BYTES in the schema handed to the generator.
   schema = text_format.Parse(
       """
       feature {
         name: 'categorical_x'
         type: BYTES
       }
       feature {
         name: 'string_y'
         type: BYTES
       }
       """, schema_pb2.Schema())
   # One (slice key, DatasetFeatureStatistics) pair per distinct slice key.
   expected_result = [
       ('slice1',
        text_format.Parse(
            """
           cross_features {
             path_x {
               step: "categorical_x"
             }
             path_y {
               step: "string_y"
             }
             categorical_cross_stats {
               lift {
                 lift_series {
                   y_string: "cat"
                   y_count: 4
                   lift_values {
                     x_string: "b"
                     lift: 2.0
                     x_count: 2
                     x_and_y_count: 2
                   }
                   lift_values {
                     x_string: "a"
                     lift: 0.6666667
                     x_count: 6
                     x_and_y_count: 2
                   }
                 }
                 lift_series {
                   y_string: "dog"
                   y_count: 4
                   lift_values {
                     x_string: "a"
                     lift: 1.3333333
                     x_count: 6
                     x_and_y_count: 4
                   }
                   lift_values {
                     x_string: "b"
                     lift: 0.0
                     x_count: 2
                     x_and_y_count: 0
                   }
                 }
               }
             }
           }""", statistics_pb2.DatasetFeatureStatistics())),
       # In 'slice2' the y counts (3 and 4) include the rows of the table whose
       # categorical_x column is null, while x_count for "a" stays at 3.
       ('slice2',
        text_format.Parse(
            """
           cross_features {
             path_x {
               step: "categorical_x"
             }
             path_y {
               step: "string_y"
             }
             categorical_cross_stats {
               lift {
                 lift_series {
                   y_string: "cat"
                   y_count: 3
                   lift_values {
                     x_string: "a"
                     lift: 0.7777778
                     x_count: 3
                     x_and_y_count: 1
                   }
                 }
                 lift_series {
                   y_string: "dog"
                   y_count: 4
                   lift_values {
                     x_string: "a"
                     lift: 1.1666667
                     x_count: 3
                     x_and_y_count: 2
                   }
                 }
               }
             }
           }""", statistics_pb2.DatasetFeatureStatistics())),
   ]
   generator = lift_stats_generator.LiftStatsGenerator(
       schema=schema, y_path=types.FeaturePath(['string_y']))
   self.assertSlicingAwareTransformOutputEqual(
       examples,
       generator,
       expected_result)
コード例 #40
0
     secondary_delimiter='|',
     expected_result=[
         pa.RecordBatch.from_arrays([
             pa.array([[1, 2.3]], pa.list_(pa.float32())),
             pa.array([[b'test']], pa.list_(pa.binary()))
         ], ['multivalent_feature', 'test_feature'])
     ]),
 dict(
     testcase_name='empty_multivalent_column',
     input_lines=['|,test'],
     column_names=['empty_feature', 'test_feature'],
     multivalent_columns_names=['empty_feature'],
     secondary_delimiter='|',
     expected_result=[
         pa.RecordBatch.from_arrays([
             pa.array([None], pa.null()),
             pa.array([[b'test']], pa.list_(pa.binary()))
         ], ['empty_feature', 'test_feature'])
     ]),
 dict(
     testcase_name='empty_string_multivalent_column',
     input_lines=['|,test', 'a|b,test'],
     column_names=['string_feature', 'test_feature'],
     multivalent_columns_names=['string_feature'],
     secondary_delimiter='|',
     expected_result=[
         pa.RecordBatch.from_arrays([
             pa.array([[b'', b''], [b'a', b'b']], pa.list_(pa.binary())),
             pa.array([[b'test'], [b'test']], pa.list_(pa.binary()))
         ], ['string_feature', 'test_feature'])
     ]),
コード例 #41
0
          [pa.array([None, [1]], pa.list_(pa.int64()))], ['f1'])),
 dict(
     testcase_name='empty_csv',
     input_lines=[],
     column_names=['f1'],
     expected_csv_cells=[],
     expected_types=[csv_decoder.ColumnType.UNKNOWN],
     expected_record_batch=[],
 ),
 dict(testcase_name='null_column',
      input_lines=['', ''],
      column_names=['f1'],
      expected_csv_cells=[[], []],
      expected_types=[csv_decoder.ColumnType.UNKNOWN],
      expected_record_batch=pa.RecordBatch.from_arrays(
          [pa.array([None, None], pa.null())], ['f1'])),
 dict(testcase_name='size_2_vector_int_multivalent',
      input_lines=['12|14'],
      column_names=['x'],
      expected_csv_cells=[[b'12|14']],
      expected_types=[csv_decoder.ColumnType.INT],
      expected_record_batch=pa.RecordBatch.from_arrays(
          [pa.array([[12, 14]], pa.list_(pa.int64()))], ['x']),
      delimiter=' ',
      multivalent_columns=['x'],
      secondary_delimiter='|'),
 dict(testcase_name='space_and_comma_delimiter',
      input_lines=['1,2 "abcdef"', '5,1 "wxxyyz"'],
      column_names=['f1', 'f2'],
      expected_csv_cells=[[b'1,2', b'abcdef'], [b'5,1', b'wxxyyz']],
      expected_types=[
コード例 #42
0
ファイル: test_jvm.py プロジェクト: emkornfield/arrow
    else:
        return schema_cls(fields)


# In the following, we use the JSON serialization of the Field objects in Java.
# This ensures that we do not rely on the exact mechanics of how to construct
# them using Java code, and it also enables us to define them as parameters
# without having to invoke the JVM.
#
# The specifications were created using:
#
#   om = jpype.JClass('com.fasterxml.jackson.databind.ObjectMapper')()
#   field = …  # Code to instantiate the field
#   jvm_spec = om.writeValueAsString(field)
@pytest.mark.parametrize('pa_type,jvm_spec', [
    (pa.null(), '{"name":"null"}'),
    (pa.bool_(), '{"name":"bool"}'),
    (pa.int8(), '{"name":"int","bitWidth":8,"isSigned":true}'),
    (pa.int16(), '{"name":"int","bitWidth":16,"isSigned":true}'),
    (pa.int32(), '{"name":"int","bitWidth":32,"isSigned":true}'),
    (pa.int64(), '{"name":"int","bitWidth":64,"isSigned":true}'),
    (pa.uint8(), '{"name":"int","bitWidth":8,"isSigned":false}'),
    (pa.uint16(), '{"name":"int","bitWidth":16,"isSigned":false}'),
    (pa.uint32(), '{"name":"int","bitWidth":32,"isSigned":false}'),
    (pa.uint64(), '{"name":"int","bitWidth":64,"isSigned":false}'),
    (pa.float16(), '{"name":"floatingpoint","precision":"HALF"}'),
    (pa.float32(), '{"name":"floatingpoint","precision":"SINGLE"}'),
    (pa.float64(), '{"name":"floatingpoint","precision":"DOUBLE"}'),
    (pa.time32('s'), '{"name":"time","unit":"SECOND","bitWidth":32}'),
    (pa.time32('ms'), '{"name":"time","unit":"MILLISECOND","bitWidth":32}'),
    (pa.time64('us'), '{"name":"time","unit":"MICROSECOND","bitWidth":64}'),
コード例 #43
0
def _determine_schemas_to_compare(schemas, ignore_pandas):
    """
    Elect a reference schema from ``schemas`` and collect the remaining schemas for comparison.

    Every input that is not already a ``SchemaWrapper`` is wrapped with the origin ``"__unknown__"``.
    For each schema the set of null-typed columns is determined, because such columns cannot be
    compared.

    .. note::

        When pandas metadata is taken into account, the ``pandas_version`` stored in the metadata is
        replaced by the currently installed pandas version before the schema is used any further.

    Parameters
    ----------
    schemas:
        Iterable of schemas (wrapped or unwrapped).
    ignore_pandas: bool
        If true, pandas metadata is neither required nor normalised.

    Returns
    -------
    reference: Schema
        The elected reference schema, or ``None`` if ``schemas`` was empty. While iterating, the null
        columns of the reference are kept a subset of the null columns of every schema collected
        for comparison; where no input satisfies this on its own, null fields of the reference are
        replaced via ``_swap_fields_by_name`` with the corresponding fields of the other schema.
    list_of_schemas: List[Tuple[Schema, Set]]
        Pairs of (schema, null_columns), where ``null_columns`` holds the names of the null-typed
        columns of that schema.

    Raises
    ------
    ValueError
        If pandas metadata is taken into account and a schema carries no ``b"pandas"`` metadata.
    """
    has_pandas = _pandas_in_schemas(schemas) and not ignore_pandas
    reference = None
    reference_nulls = set()
    pending = []

    for candidate in schemas:
        if isinstance(candidate, SchemaWrapper):
            wrapped = candidate
        else:
            wrapped = SchemaWrapper(candidate, "__unknown__")

        current = wrapped
        if has_pandas:
            raw_meta = wrapped.metadata
            if raw_meta is None or b"pandas" not in raw_meta:
                raise ValueError(
                    "Pandas and non-Pandas schemas are not comparable. "
                    "Use ignore_pandas=True if you only want to compare "
                    "on Arrow level.")
            pandas_meta = load_json(raw_meta[b"pandas"].decode("utf8"))

            # The pandas version is deliberately normalised: datasets written by
            # older or newer pandas versions are assumed to be readable.
            pandas_meta["pandas_version"] = "{}".format(pd.__version__)

            patched_meta = deepcopy(raw_meta)
            patched_meta[b"pandas"] = _dict_to_binary(pandas_meta)
            current = SchemaWrapper(pa.schema(wrapped, patched_meta),
                                    wrapped.origin)

        # Null-typed fields carry no type information and cannot be compared.
        current_nulls = set()
        for field in current:
            if field.type == pa.null():
                current_nulls.add(field.name)

        # First schema seen: it becomes the reference unconditionally.
        if reference is None:
            reference, reference_nulls = current, current_nulls
            continue

        # The reference already knows at least as much as the current schema,
        # so the current schema only needs to be verified against it.
        if reference_nulls.issubset(current_nulls):
            pending.append((current, current_nulls))
            continue

        # The current schema knows everything the reference knows and more:
        # demote the old reference to the comparison list and promote current.
        if current_nulls.issubset(reference_nulls):
            pending.append((reference, reference_nulls))
            reference, reference_nulls = current, current_nulls
            continue

        # Neither null set contains the other. Keep the schema with fewer null
        # columns as reference ...
        if len(current_nulls) < len(reference_nulls):
            reference, current = current, reference
            reference_nulls, current_nulls = current_nulls, reference_nulls

        # ... and fill in every field that is null in the reference but typed in
        # the other schema, which restores the subset relation checked above.
        for col in reference_nulls - current_nulls:
            reference = _swap_fields_by_name(reference, current, col)
            reference_nulls.remove(col)
        pending.append((current, current_nulls))

    # A non-empty comparison list without a reference would be inconsistent.
    assert not (reference is None and pending)

    return reference, pending
コード例 #44
0
 def test_empty_list(self):
     """An empty Python list converts to an empty array of null type."""
     empty = pyarrow.from_pylist([])
     assert (len(empty), empty.null_count) == (0, 0)
     assert empty.type == pyarrow.null()
 def test_topk_uniques_combiner_with_categorical_feature(self):
     """Check top-k/uniques statistics for an INT feature marked categorical.

     Three record batches for feature 'fa' (the last one an all-null column of
     null type) are passed to a TopKUniquesCombinerStatsGenerator configured
     with 4 top values and 3 rank-histogram buckets; the result is compared
     with a hand-written FeatureNameStatistics proto.
     """
     # Value counts for 'fa' over all batches, written as "<count> <value>":
     # 4 12, 2 23, 2 34, 2 45
     batches = [
         pa.RecordBatch.from_arrays(
             [pa.array([[12, 23, 34, 12], [45, 23]])], ['fa']),
         pa.RecordBatch.from_arrays([pa.array([[12, 12, 34, 45]])], ['fa']),
         # All-null batch of null type.
         pa.RecordBatch.from_arrays(
             [pa.array([None, None, None, None], type=pa.null())], ['fa']),
     ]
     # Expected statistics keyed by feature path: four top_values entries and
     # three rank-histogram buckets, with the integer values given as strings.
     expected_result = {
         types.FeaturePath(['fa']):
         text_format.Parse(
             """
             path {
               step: 'fa'
             }
             type: INT
             string_stats {
               unique: 4
               top_values {
                 value: '12'
                 frequency: 4
               }
               top_values {
                 value: '45'
                 frequency: 2
               }
               top_values {
                 value: '34'
                 frequency: 2
               }
               top_values {
                 value: '23'
                 frequency: 2
               }
               rank_histogram {
                 buckets {
                   low_rank: 0
                   high_rank: 0
                   label: "12"
                   sample_count: 4.0
                 }
                 buckets {
                   low_rank: 1
                   high_rank: 1
                   label: "45"
                   sample_count: 2.0
                 }
                 buckets {
                   low_rank: 2
                   high_rank: 2
                   label: "34"
                   sample_count: 2.0
                 }
               }
           }""", statistics_pb2.FeatureNameStatistics())
     }
     # The schema declares 'fa' as an INT feature with is_categorical set.
     schema = text_format.Parse(
         """
     feature {
       name: "fa"
       type: INT
       int_domain {
         is_categorical: true
       }
     }
     """, schema_pb2.Schema())
     generator = (top_k_uniques_combiner_stats_generator.
                  TopKUniquesCombinerStatsGenerator(
                      schema=schema,
                      num_top_values=4,
                      num_rank_histogram_buckets=3))
     self.assertCombinerOutputEqual(batches, generator, expected_result)
コード例 #46
0
 dict(
     testcase_name="without_schema_first_example_typed",
     schema_text_proto=None,
     sequence_examples_text_proto=[
         _TYPED_SEQUENCE_EXAMPLE, _UNTYPED_SEQUENCE_EXAMPLE,
         _SOME_FEATURES_TYPED_SEQUENCE_EXAMPLE,
         _EMPTY_VALUES_LIST_SEQUENCE_EXAMPLE
     ],
     create_expected=lambda list_factory, binary_type: pa.RecordBatch.
     from_arrays([
         pa.array([[1], None, None, []], type=list_factory(pa.int64())),
         pa.array([[1.0, 2.0], None, None, []],
                  type=list_factory(pa.float32())),
         pa.array([[b"a", b"b", b"c"], None, None, []],
                  type=list_factory(binary_type)),
         pa.array([None, None, None, None], pa.null()),
         pa.array([None, None, [1.0], None],
                  type=list_factory(pa.float32())),
         pa.StructArray.from_arrays(
             [
                 pa.array([None, None, [[1.0]], None],
                          type=list_factory(list_factory(pa.float32()))),
                 pa.array([[[1, 2], [3]], [], [None, None, None], [[], []]],
                          type=list_factory(list_factory(pa.int64()))),
                 pa.array([[[3.0, 4.0], [1.0, 2.0]], [], [None], [[]]],
                          type=list_factory(list_factory(pa.float32()))),
                 pa.array([[[b"a", b"b"], [b"c"]], [], [None], [[]]],
                          type=list_factory(list_factory(binary_type)))
             ],
             names=["sequence_v", "sequence_x", "sequence_y", "sequence_z"])
     ], [
コード例 #47
0
ファイル: test_types.py プロジェクト: CodingCat/arrow
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

import pickle

import pytest

import pyarrow as pa
import pyarrow.types as types


MANY_TYPES = [
    pa.null(),
    pa.bool_(),
    pa.int32(),
    pa.time32('s'),
    pa.time64('us'),
    pa.date32(),
    pa.timestamp('us'),
    pa.timestamp('us', tz='UTC'),
    pa.timestamp('us', tz='Europe/Paris'),
    pa.float16(),
    pa.float32(),
    pa.float64(),
    pa.decimal128(19, 4),
    pa.string(),
    pa.binary(),
    pa.binary(10),
コード例 #48
0
ファイル: test_convert_builtin.py プロジェクト: dremio/arrow
def test_empty_list(seq):
    """Convert an empty sequence and check that an empty null array results.

    ``seq`` is the sequence factory supplied by the test parametrization.
    """
    no_values = []
    result = pa.array(seq(no_values))
    assert (len(result), result.null_count) == (0, 0)
    assert result.type == pa.null()
    assert result.to_pylist() == []
コード例 #49
0
     ([], None), ([[1, 2], [3]], pa.list_(pa.int64())),
     ([['a'], None, ['b', 'c']], pa.list_(pa.string())),
     ([(1, 'a'), (2, 'c'), None
       ], pa.struct([pa.field('a', pa.int64()),
                     pa.field('b', pa.string())]))])
def test_array_pickle(data, typ):
    """Round-trip an array through pickle and check it equals the original."""
    # The array is built inside the test body on purpose: no Arrow data may be
    # allocated outside of it, otherwise the allocator tests become unreliable.
    original = pa.array(data, type=typ)
    serialized = pickle.dumps(original)
    restored = pickle.loads(serialized)
    assert original.equals(restored)


@pytest.mark.parametrize(
    'type,expected',
    [
        (pa.null(), 'empty'),
        (pa.bool_(), 'bool'),
        (pa.int8(), 'int8'),
        (pa.int16(), 'int16'),
        (pa.int32(), 'int32'),
        (pa.int64(), 'int64'),
        (pa.uint8(), 'uint8'),
        (pa.uint16(), 'uint16'),
        (pa.uint32(), 'uint32'),
        (pa.uint64(), 'uint64'),
        (pa.float16(), 'float16'),
        (pa.float32(), 'float32'),
        (pa.float64(), 'float64'),
        (pa.date32(), 'date'),
        (pa.date64(), 'date'),
        (pa.binary(), 'bytes'),
        (pa.binary(length=4), 'bytes'),
        (pa.string(), 'unicode'),
        (pa.list_(pa.list_(pa.int16())), 'list[list[int16]]'),
        (pa.decimal128(18, 3), 'decimal'),
        (pa.timestamp('ms'), 'datetime'),
        (pa.timestamp('us', 'UTC'), 'datetimetz'),
        (pa.time32('s'), 'time'),
        (pa.time64('us'), 'time'),
    ])
def test_logical_type(type, expected):
    """Check the logical type name that ``get_logical_type`` reports for an Arrow type."""
    logical_name = get_logical_type(type)
    assert logical_name == expected


def test_array_uint64_from_py_over_range():
コード例 #50
0
ファイル: test_convert_builtin.py プロジェクト: dremio/arrow
def test_empty_iterable():
    """An empty ``StrangeIterable`` converts to an empty array of null type."""
    source = StrangeIterable([])
    result = pa.array(source)
    assert (len(result), result.null_count) == (0, 0)
    assert result.type == pa.null()
    assert result.to_pylist() == []
コード例 #51
0
    old_allocation = pa.total_allocated_bytes()
    try:
        yield
    finally:
        assert pa.total_allocated_bytes() == old_allocation


@pytest.fixture(autouse=True)
def assert_pyarrow_leak():
    """Run every test case inside ``no_pyarrow_leak``.

    The fixture is declared with ``autouse=True``, so it is applied to all
    test cases automatically.
    """
    leak_guard = no_pyarrow_leak()
    with leak_guard:
        yield


_supported_pyarrow_types = [
    pa.null(),
    pa.bool_(),
    pa.int32(),
    pa.time32("s"),
    pa.time64("us"),
    pa.date32(),
    pa.timestamp("us"),
    pa.timestamp("us", tz="UTC"),
    pa.timestamp("us", tz="Europe/Paris"),
    pa.float16(),
    pa.float32(),
    pa.float64(),
    pa.decimal128(19, 4),
    pa.string(),
    pa.binary(),
    pa.large_string(),
コード例 #52
0
ファイル: test_convert_builtin.py プロジェクト: apache/arrow
 def test_empty_list(self):
     """An empty Python list converts to an empty array of null type."""
     empty = pyarrow.from_pylist([])
     assert (len(empty), empty.null_count) == (0, 0)
     assert empty.type == pyarrow.null()
     assert empty.to_pylist() == []
コード例 #53
0
ファイル: test_types.py プロジェクト: sfc-gh-hyu/arrow
def test_is_boolean_value():
    """Python and NumPy booleans are boolean values; the integer 1 is not."""
    is_bool = pa.types.is_boolean_value
    assert not is_bool(1)
    for flag in (True, False):
        assert is_bool(flag)
    for flag in (np.bool_(True), np.bool_(False)):
        assert is_bool(flag)


@h.given(past.all_types | past.all_fields | past.all_schemas)
@h.example(pa.field(name='', type=pa.null(), metadata={'0': '', '': ''}))
def test_pickling(field):
    """Pickle round trip of generated types, fields and schemas preserves equality.

    The explicit example pins a null-typed field with an empty name and
    metadata containing empty keys and values.
    """
    restored = pickle.loads(pickle.dumps(field))
    assert restored == field


@h.given(
    st.lists(past.all_types) |
    st.lists(past.all_fields) |
    st.lists(past.all_schemas)
)
def test_hashing(items):
    h.assume(
        # well, this is still O(n^2), but makes the input unique
        all(not a.equals(b) for i, a in enumerate(items) for b in items[:i])
コード例 #54
0
ファイル: strategies.py プロジェクト: SofyanS/CH_redact
import hypothesis.extra.numpy as npst
import hypothesis.extra.pytz as tzst
import numpy as np

import pyarrow as pa


# TODO(kszucs): alphanum_text, surrogate_text
# Text strategy whose alphabet is limited to the code points 0x41..0x7E,
# i.e. 'A' through '~'.
# NOTE(review): `st` is presumably hypothesis.strategies, imported above this
# excerpt - confirm.
custom_text = st.text(
    alphabet=st.characters(
        min_codepoint=0x41,
        max_codepoint=0x7E
    )
)

# Strategies wrapping exactly one parameterless Arrow type each.
null_type = st.just(pa.null())
bool_type = st.just(pa.bool_())

binary_type = st.just(pa.binary())
string_type = st.just(pa.string())
large_binary_type = st.just(pa.large_binary())
large_string_type = st.just(pa.large_string())

# Strategy choosing among the signed integer types of every bit width.
signed_integer_types = st.sampled_from([
    pa.int8(),
    pa.int16(),
    pa.int32(),
    pa.int64()
])
unsigned_integer_types = st.sampled_from([
    pa.uint8(),
コード例 #55
0
 def test_all_none(self):
     arr = pyarrow.from_pylist([None, None])
     assert len(arr) == 2
     assert arr.null_count == 2
     assert arr.type == pyarrow.null()