Esempio n. 1
0
def test_table_safe_casting():
    """Cast a table to a schema whose types can hold every value exactly."""
    column_names = tuple('abcd')
    strings = ['ab', 'bc', 'cd', 'de', 'ef']

    source = pa.Table.from_arrays(
        [
            pa.array(range(5), type=pa.int64()),
            pa.array([-10, -5, 0, 5, 10], type=pa.int32()),
            pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float64()),
            pa.array(strings, type=pa.string()),
        ],
        names=column_names,
    )
    expected = pa.Table.from_arrays(
        [
            pa.array(range(5), type=pa.int32()),
            pa.array([-10, -5, 0, 5, 10], type=pa.int16()),
            pa.array([1, 2, 3, 4, 5], type=pa.int64()),
            pa.array(strings, type=pa.string()),
        ],
        names=column_names,
    )

    # One target type per column, in column order.
    target_types = (pa.int32(), pa.int16(), pa.int64(), pa.string())
    target_schema = pa.schema(
        [pa.field(name, typ) for name, typ in zip(column_names, target_types)]
    )

    assert source.cast(target_schema).equals(expected)
Esempio n. 2
0
def test_sequence_nesting_levels():
    """Check type inference for nested sequences of consistent depth."""
    valid_cases = [
        ([1, 2, None], pa.int64()),
        ([[1], [2], None], pa.list_(pa.int64())),
        ([[1], [2, 3, 4], [None]], pa.list_(pa.int64())),
        ([None, [[None, 1]], [[2, 3, 4], None], [None]],
         pa.list_(pa.list_(pa.int64()))),
    ]
    for values, inferred_type in valid_cases:
        converted = pa.array(values)
        assert converted.type == inferred_type
        assert converted.to_pylist() == values

    # Mixed nesting levels are rejected
    conversion_errors = (pa.ArrowInvalid, pa.ArrowTypeError)
    mixed_inputs = (
        [1, 2, [1]],
        [1, 2, []],
        [[1], [2], [None, [1]]],
    )
    for mixed in mixed_inputs:
        with pytest.raises(conversion_errors):
            pa.array(mixed)
Esempio n. 3
0
def test_table_unsafe_casting():
    """A truncating float-to-int cast fails by default, passes if unsafe."""
    column_names = tuple('abcd')
    strings = ['ab', 'bc', 'cd', 'de', 'ef']

    source = pa.Table.from_arrays(
        [
            pa.array(range(5), type=pa.int64()),
            pa.array([-10, -5, 0, 5, 10], type=pa.int32()),
            pa.array([1.1, 2.2, 3.3, 4.4, 5.5], type=pa.float64()),
            pa.array(strings, type=pa.string()),
        ],
        names=column_names,
    )
    expected = pa.Table.from_arrays(
        [
            pa.array(range(5), type=pa.int32()),
            pa.array([-10, -5, 0, 5, 10], type=pa.int16()),
            pa.array([1, 2, 3, 4, 5], type=pa.int64()),
            pa.array(strings, type=pa.string()),
        ],
        names=column_names,
    )

    target_types = (pa.int32(), pa.int16(), pa.int64(), pa.string())
    target_schema = pa.schema(
        [pa.field(name, typ) for name, typ in zip(column_names, target_types)]
    )

    # Column 'c' holds non-integral floats, so the default cast must refuse.
    with pytest.raises(pa.ArrowInvalid,
                       match='Floating point value truncated'):
        source.cast(target_schema)

    truncated = source.cast(target_schema, safe=False)
    assert truncated.equals(expected)
Esempio n. 4
0
def test_struct_type():
    """Build struct types from fields, tuples and a mapping.

    Each construction is checked against the expected list of fields, both
    as a whole list and field by field.
    """
    # Two of the fields share the name 'a'.
    fields = [pa.field('a', pa.int64()),
              pa.field('a', pa.int32()),
              pa.field('b', pa.int32())]
    ty = pa.struct(fields)

    assert len(ty) == ty.num_children == 3
    assert list(ty) == fields

    # The comparison must be asserted; a bare `a == b` checks nothing.
    for a, b in zip(ty, fields):
        assert a == b

    # Construct from list of tuples
    ty = pa.struct([('a', pa.int64()),
                    ('a', pa.int32()),
                    ('b', pa.int32())])
    assert list(ty) == fields
    for a, b in zip(ty, fields):
        assert a == b

    # Construct from mapping
    fields = [pa.field('a', pa.int64()),
              pa.field('b', pa.int32())]
    ty = pa.struct(OrderedDict([('a', pa.int64()),
                                ('b', pa.int32())]))
    assert list(ty) == fields
    for a, b in zip(ty, fields):
        assert a == b
def dataframe_with_arrays(include_index=False):
    """
    Build a DataFrame whose cells are NumPy arrays of primitive types.

    Columns cover signed and unsigned integers, floats, strings and
    millisecond timestamps; some cells are None.

    Parameters
    ----------
    include_index : bool, default False
        When true, an int64 ``__index_level_0__`` field is appended to the
        returned schema.

    Returns
    -------
    df: pandas.DataFrame
    schema: pyarrow.Schema
        Arrow schema definition that is in line with the constructed df.
    """
    numeric_types = [
        ('i1', pa.int8()), ('i2', pa.int16()),
        ('i4', pa.int32()), ('i8', pa.int64()),
        ('u1', pa.uint8()), ('u2', pa.uint16()),
        ('u4', pa.uint32()), ('u8', pa.uint64()),
        ('f4', pa.float32()), ('f8', pa.float64()),
    ]

    columns = OrderedDict()
    schema_fields = []

    # The NumPy dtype code doubles as the column name.
    for np_code, arrow_type in numeric_types:
        schema_fields.append(pa.field(np_code, pa.list_(arrow_type)))
        columns[np_code] = [
            np.arange(10, dtype=np_code),
            np.arange(5, dtype=np_code),
            None,
            np.arange(1, dtype=np_code),
        ]

    schema_fields.append(pa.field('str', pa.list_(pa.string())))
    columns['str'] = [
        np.array([u"1", u"ä"], dtype="object"),
        None,
        np.array([u"1"], dtype="object"),
        np.array([u"1", u"2", u"3"], dtype="object"),
    ]

    schema_fields.append(
        pa.field('datetime64', pa.list_(pa.timestamp('ms')))
    )
    columns['datetime64'] = [
        np.array(['2007-07-13T01:23:34.123456789',
                  None,
                  '2010-08-13T05:46:57.437699912'],
                 dtype='datetime64[ms]'),
        None,
        None,
        np.array(['2007-07-13T02',
                  None,
                  '2010-08-13T05:46:57.437699912'],
                 dtype='datetime64[ms]'),
    ]

    if include_index:
        schema_fields.append(pa.field('__index_level_0__', pa.int64()))

    return pd.DataFrame(columns), pa.schema(schema_fields)
Esempio n. 6
0
def test_struct_type():
    """Exercise struct type construction, indexing and field comparison.

    Covers lookup by position and by name (including duplicate and missing
    names), construction from tuples and from a mapping, and rejection of
    an invalid field type.
    """
    fields = [
        # Duplicate field name on purpose
        pa.field('a', pa.int64()),
        pa.field('a', pa.int32()),
        pa.field('b', pa.int32())
    ]
    ty = pa.struct(fields)

    assert len(ty) == ty.num_children == 3
    assert list(ty) == fields
    assert ty[0].name == 'a'
    assert ty[2].type == pa.int32()
    with pytest.raises(IndexError):
        assert ty[3]

    assert ty['b'] == ty[2]

    # Duplicate
    with pytest.warns(UserWarning):
        with pytest.raises(KeyError):
            ty['a']

    # Not found
    with pytest.raises(KeyError):
        ty['c']

    # Neither integer nor string
    with pytest.raises(TypeError):
        ty[None]

    # The comparison must be asserted; a bare `a == b` checks nothing.
    for a, b in zip(ty, fields):
        assert a == b

    # Construct from list of tuples
    ty = pa.struct([('a', pa.int64()),
                    ('a', pa.int32()),
                    ('b', pa.int32())])
    assert list(ty) == fields
    for a, b in zip(ty, fields):
        assert a == b

    # Construct from mapping
    fields = [pa.field('a', pa.int64()),
              pa.field('b', pa.int32())]
    ty = pa.struct(OrderedDict([('a', pa.int64()),
                                ('b', pa.int32())]))
    assert list(ty) == fields
    for a, b in zip(ty, fields):
        assert a == b

    # Invalid args
    with pytest.raises(TypeError):
        pa.struct([('a', None)])
Esempio n. 7
0
def test_fields_hashable():
    """Fields differing in name, type or nullability are distinct dict keys."""
    fields = [
        pa.field('a', pa.int32()),
        pa.field('a', pa.int64()),
        pa.field('a', pa.int64(), nullable=False),
        pa.field('b', pa.int32()),
        pa.field('b', pa.int32(), nullable=False),
    ]
    position_by_field = {field: pos for pos, field in enumerate(fields)}

    # No two fields may collapse onto the same key.
    assert len(position_by_field) == len(fields)
    for pos, field in enumerate(fields):
        assert position_by_field[field] == pos
Esempio n. 8
0
def dataframe_with_lists(include_index=False):
    """
    Build a DataFrame whose cells are lists of primitive values.

    Columns cover int64, double, bytes and string lists; cells include
    None, empty lists and (for the numeric columns) strided NumPy arrays.

    Parameters
    ----------
    include_index : bool, default False
        When true, an int64 ``__index_level_0__`` field is appended to the
        returned schema.

    Returns
    -------
    df: pandas.DataFrame
    schema: pyarrow.Schema
        Arrow schema definition that is in line with the constructed df.
    """
    ten_ints = list(range(10))
    ten_floats = [float(value) for value in ten_ints]

    columns = OrderedDict()
    columns['int64'] = [
        ten_ints,
        ten_ints[:5],
        None,
        [],
        # Every second element of the doubled list: a non-contiguous view.
        np.array(ten_ints * 2, dtype=np.int64)[::2],
    ]
    columns['double'] = [
        ten_floats,
        ten_floats[:5],
        None,
        [],
        np.array(ten_floats * 2)[::2],
    ]
    columns['bytes_list'] = [
        [b"1", b"f"],
        None,
        [b"1"],
        [b"1", b"2", b"3"],
        [],
    ]
    columns['str_list'] = [
        [u"1", u"ä"],
        None,
        [u"1"],
        [u"1", u"2", u"3"],
        [],
    ]

    schema_fields = [
        pa.field('int64', pa.list_(pa.int64())),
        pa.field('double', pa.list_(pa.float64())),
        pa.field('bytes_list', pa.list_(pa.binary())),
        pa.field('str_list', pa.list_(pa.string())),
    ]
    if include_index:
        schema_fields.append(pa.field('__index_level_0__', pa.int64()))

    return pd.DataFrame(columns), pa.schema(schema_fields)
Esempio n. 9
0
 def test_simple_ints(self):
     """Integer-looking CSV columns are read as int64."""
     # Infer integer columns
     csv_bytes = b"a,b,c\n1,2,3\n4,5,6\n"
     parsed = self.read_bytes(csv_bytes)

     expected_schema = pa.schema(
         [(name, pa.int64()) for name in ('a', 'b', 'c')]
     )
     expected_columns = {'a': [1, 4], 'b': [2, 5], 'c': [3, 6]}

     assert parsed.schema == expected_schema
     assert parsed.to_pydict() == expected_columns
Esempio n. 10
0
def test_table_from_arrays_preserves_column_metadata():
    """Field metadata and nullability survive Table.from_arrays."""
    # Added to test https://issues.apache.org/jira/browse/ARROW-3866
    annotated_field = pa.field('field1', pa.int64(),
                               metadata=dict(a="A", b="B"))
    required_field = pa.field('field2', pa.int64(), nullable=False)

    table = pa.Table.from_arrays([
        pa.column(annotated_field, pa.array([1, 2])),
        pa.column(required_field, pa.array([3, 4])),
    ])

    # Metadata keys are looked up as bytes.
    assert b"a" in table.column(0).field.metadata
    assert table.column(1).field.nullable is False
Esempio n. 11
0
 def test_simple_ints(self):
     """Integer values in line-delimited JSON are read as int64."""
     # Infer integer columns
     json_lines = b'{"a": 1,"b": 2, "c": 3}\n{"a": 4,"b": 5, "c": 6}\n'
     parsed = self.read_bytes(json_lines)

     expected_schema = pa.schema(
         [(name, pa.int64()) for name in ('a', 'b', 'c')]
     )
     expected_columns = {'a': [1, 4], 'b': [2, 5], 'c': [3, 6]}

     assert parsed.schema == expected_schema
     assert parsed.to_pydict() == expected_columns
Esempio n. 12
0
def test_list_array_flatten():
    """Flattening a list array removes exactly one level of nesting."""
    # Depth 2: list<list<int64>>
    nested_twice = pa.array([
        None,
        [[1, None, 2], None, [3, 4]],
        [],
        [[], [5, 6], None],
        [[7, 8]],
    ])
    assert nested_twice.type.equals(pa.list_(pa.list_(pa.int64())))

    # Depth 1: list<int64>
    nested_once = pa.array([
        [1, None, 2],
        None,
        [3, 4],
        [],
        [5, 6],
        None,
        [7, 8],
    ])
    assert nested_once.type.equals(pa.list_(pa.int64()))

    # Depth 0: plain int64 values
    flat = pa.array([1, None, 2, 3, 4, 5, 6, 7, 8])
    assert flat.type.equals(pa.int64())

    assert nested_twice.flatten().equals(nested_once)
    assert nested_once.flatten().equals(flat)
    assert nested_twice.flatten().flatten().equals(flat)
Esempio n. 13
0
def test_orcfile_empty():
    """Reading the empty ORC example gives zero rows and the full schema."""
    from pyarrow import orc

    orc_file = orc.ORCFile(path_for_orc_example('TestOrcFile.emptyFile'))
    table = orc_file.read()
    assert table.num_rows == 0

    # The same two-field struct appears in several places in the schema.
    inner_struct = pa.struct([
        ('int1', pa.int32()),
        ('string1', pa.string()),
    ])
    map_entry = pa.struct([
        ('key', pa.string()),
        ('value', inner_struct),
    ])
    expected_schema = pa.schema([
        ('boolean1', pa.bool_()),
        ('byte1', pa.int8()),
        ('short1', pa.int16()),
        ('int1', pa.int32()),
        ('long1', pa.int64()),
        ('float1', pa.float32()),
        ('double1', pa.float64()),
        ('bytes1', pa.binary()),
        ('string1', pa.string()),
        ('middle', pa.struct([('list', pa.list_(inner_struct))])),
        ('list', pa.list_(inner_struct)),
        ('map', pa.list_(map_entry)),
    ])
    assert table.schema == expected_schema
Esempio n. 14
0
def test_type_to_pandas_dtype():
    """Each Arrow type reports the expected NumPy dtype for pandas."""
    datetime_ns = np.dtype('datetime64[ns]')
    expected_dtypes = [
        (pa.null(), np.float64),
        (pa.bool_(), np.bool_),
        # Signed integers
        (pa.int8(), np.int8),
        (pa.int16(), np.int16),
        (pa.int32(), np.int32),
        (pa.int64(), np.int64),
        # Unsigned integers
        (pa.uint8(), np.uint8),
        (pa.uint16(), np.uint16),
        (pa.uint32(), np.uint32),
        (pa.uint64(), np.uint64),
        # Floating point
        (pa.float16(), np.float16),
        (pa.float32(), np.float32),
        (pa.float64(), np.float64),
        # Dates and timestamps all map to nanosecond datetimes
        (pa.date32(), datetime_ns),
        (pa.date64(), datetime_ns),
        (pa.timestamp('ms'), datetime_ns),
        # Everything else is an object column
        (pa.binary(), np.object_),
        (pa.binary(12), np.object_),
        (pa.string(), np.object_),
        (pa.list_(pa.int8()), np.object_),
    ]
    for arrow_type, pandas_dtype in expected_dtypes:
        assert arrow_type.to_pandas_dtype() == pandas_dtype
Esempio n. 15
0
def test_iterator_without_size():
    """An iterator without a size hint converts like the equal tuple."""
    reference = pa.array((0, 1, 2))

    inferred = pa.array(iter(range(3)))
    assert inferred.equals(reference)

    # Same with explicit type
    typed = pa.array(iter(range(3)), type=pa.int64())
    assert typed.equals(reference)
Esempio n. 16
0
 def test_list_of_int(self):
     """Lists of ints, with an empty list and a None, become list<int64>."""
     values = [[1, 2, 3], [], None, [1, 2]]
     converted = pyarrow.from_pylist(values)

     assert len(converted) == 4
     # Only the None entry is null; the empty list is a valid value.
     assert converted.null_count == 1
     assert converted.type == pyarrow.list_(pyarrow.int64())
     assert converted.to_pylist() == values
Esempio n. 17
0
def test_sequence_integer_inferred(seq):
    """Integers mixed with None are inferred as int64 with nulls kept."""
    values = [1, None, 3, None]
    converted = pa.array(seq(values))

    assert len(converted) == 4
    assert converted.null_count == 2
    assert converted.type == pa.int64()
    assert converted.to_pylist() == values
Esempio n. 18
0
 def test_integer(self):
     """A list of ints and Nones converts to int64 and round-trips."""
     values = [1, None, 3, None]
     converted = pyarrow.from_pylist(values)

     assert len(converted) == 4
     assert converted.null_count == 2
     assert converted.type == pyarrow.int64()
     assert converted.to_pylist() == values
Esempio n. 19
0
def test_empty_cast():
    """Cast empty arrays between every ordered pair of common types."""
    common_types = [
        pa.null(),
        pa.bool_(),
        pa.int8(),
        pa.int16(),
        pa.int32(),
        pa.int64(),
        pa.uint8(),
        pa.uint16(),
        pa.uint32(),
        pa.uint64(),
        pa.float16(),
        pa.float32(),
        pa.float64(),
        pa.date32(),
        pa.date64(),
        pa.binary(),
        pa.binary(length=4),
        pa.string(),
    ]

    for source_type in common_types:
        for target_type in common_types:
            try:
                # ARROW-4766: Ensure that supported types conversion don't
                # segfault on empty arrays of common types
                pa.array([], type=source_type).cast(target_type)
            except pa.lib.ArrowNotImplementedError:
                # Unsupported conversions are fine; move on to the next pair.
                pass
Esempio n. 20
0
def test_nested_arrays(seq):
    """A sequence of int64 ndarrays and None converts to list<int64>."""
    items = [
        np.array([], dtype=np.int64),
        np.array([1, 2], dtype=np.int64),
        None,
    ]
    converted = pa.array(seq(items))

    assert len(converted) == 3
    assert converted.null_count == 1
    assert converted.type == pa.list_(pa.int64())
    assert converted.to_pylist() == [[], [1, 2], None]
Esempio n. 21
0
def test_asarray():
    """Check how np.asarray treats Arrow arrays, with and without nulls."""
    arrow_ints = pa.array(range(4))

    # The iterator interface gives back an array of Int64Value's
    from_scalars = np.asarray([scalar for scalar in arrow_ints])
    assert from_scalars.tolist() == [0, 1, 2, 3]
    assert from_scalars.dtype == np.dtype('O')
    assert type(from_scalars[0]) == pa.lib.Int64Value

    # Calling with the arrow array gives back an array with 'int64' dtype
    direct = np.asarray(arrow_ints)
    assert direct.tolist() == [0, 1, 2, 3]
    assert direct.dtype == np.dtype('int64')

    # An optional type can be specified when calling np.asarray
    as_text = np.asarray(arrow_ints, dtype='str')
    assert as_text.tolist() == ['0', '1', '2', '3']

    # If PyArrow array has null values, numpy type will be changed as needed
    # to support nulls.
    with_null = pa.array([0, 1, 2, None])
    assert with_null.type == pa.int64()
    widened = np.asarray(with_null)
    values = widened.tolist()
    assert values[:3] == [0., 1., 2.]
    assert np.isnan(values[3])
    assert widened.dtype == np.dtype('float64')
Esempio n. 22
0
def test_buffers_nested():
    """Inspect the raw buffers behind a list array and a struct array."""
    def first_byte(buf):
        # First byte of a buffer, as an int.
        return bytearray(buf.to_pybytes())[0]

    list_arr = pa.array([[1, 2], None, [3, None, 4, 5]],
                        type=pa.list_(pa.int64()))
    list_bufs = list_arr.buffers()
    assert len(list_bufs) == 4
    # The parent buffers
    assert first_byte(list_bufs[0]) == 0b00000101
    assert struct.unpack('4i', list_bufs[1].to_pybytes()) == (0, 2, 2, 6)
    # The child buffers
    assert first_byte(list_bufs[2]) == 0b00110111
    child_values = list_bufs[3].to_pybytes()
    # '8x' skips the 8-byte slot of the null child value.
    assert struct.unpack('qqq8xqq', child_values) == (1, 2, 3, 4, 5)

    struct_arr = pa.array([(42, None), None, (None, 43)],
                          type=pa.struct([pa.field('a', pa.int8()),
                                          pa.field('b', pa.int16())]))
    struct_bufs = struct_arr.buffers()
    assert len(struct_bufs) == 5
    # The parent buffer
    assert first_byte(struct_bufs[0]) == 0b00000101
    # The child buffers: 'a'
    assert first_byte(struct_bufs[1]) == 0b00000001
    assert struct.unpack('bxx', struct_bufs[2].to_pybytes()) == (42,)
    # The child buffers: 'b'
    assert first_byte(struct_bufs[3]) == 0b00000100
    assert struct.unpack('4xh', struct_bufs[4].to_pybytes()) == (43,)
Esempio n. 23
0
File: jvm.py Progetto: rok/arrow
def _from_jvm_int_type(jvm_type):
    """
    Convert a JVM int type to its Python equivalent.

    The signedness flag selects the signed or unsigned family and the bit
    width selects the member within it.

    Parameters
    ----------
    jvm_type: org.apache.arrow.vector.types.pojo.ArrowType$Int

    Returns
    -------
    typ: pyarrow.DataType
        None if the bit width is not one of 8, 16, 32 or 64.
    """
    if jvm_type.isSigned:
        candidates = (
            (8, pa.int8), (16, pa.int16), (32, pa.int32), (64, pa.int64),
        )
    else:
        candidates = (
            (8, pa.uint8), (16, pa.uint16), (32, pa.uint32), (64, pa.uint64),
        )

    for width, make_type in candidates:
        if jvm_type.bitWidth == width:
            return make_type()
    return None
Esempio n. 24
0
    def test_custom_nulls(self):
        """Only the configured spellings are treated as null when reading."""
        # Infer nulls with custom values
        custom = ConvertOptions(null_values=['Xxx', 'Zzz'])
        table = self.read_bytes(b"a,b,c,d\nZzz,Xxx,1,2\nXxx,#N/A,,Zzz\n",
                                convert_options=custom)
        expected_schema = pa.schema([
            ('a', pa.null()),
            ('b', pa.string()),
            ('c', pa.string()),
            ('d', pa.int64()),
        ])
        expected_columns = {
            'a': [None, None],
            'b': [u"Xxx", u"#N/A"],
            'c': [u"1", u""],
            'd': [2, None],
        }
        assert table.schema == expected_schema
        assert table.to_pydict() == expected_columns

        # With an empty null list, nothing is read as null.
        no_nulls = ConvertOptions(null_values=[])
        table = self.read_bytes(b"a,b\n#N/A,\n", convert_options=no_nulls)
        expected_schema = pa.schema([
            ('a', pa.string()),
            ('b', pa.string()),
        ])
        assert table.schema == expected_schema
        assert table.to_pydict() == {'a': [u"#N/A"], 'b': [u""]}
Esempio n. 25
0
def test_infinite_iterator():
    """With an explicit size, only that many items of an iterator are read."""
    reference = pa.array((0, 1, 2))

    inferred = pa.array(itertools.count(0), size=3)
    assert inferred.equals(reference)

    # Same with explicit type
    typed = pa.array(itertools.count(0), type=pa.int64(), size=3)
    assert typed.equals(reference)
Esempio n. 26
0
def test_struct_from_dicts_inference():
    """Infer struct types from dicts, including omitted keys and nesting.

    Also checks that an object ndarray of the same dicts converts to an
    equal array, and that mixing dicts with scalars is rejected.
    """
    expected_type = pa.struct([pa.field('a', pa.int64()),
                               pa.field('b', pa.string()),
                               pa.field('c', pa.bool_())])
    data = [{'a': 5, 'b': u'foo', 'c': True},
            {'a': 6, 'b': u'bar', 'c': False}]
    arr = pa.array(data)
    check_struct_type(arr.type, expected_type)
    assert arr.to_pylist() == data

    # With omitted values
    data = [{'a': 5, 'c': True},
            None,
            {},
            {'a': None, 'b': u'bar'}]
    expected = [{'a': 5, 'b': None, 'c': True},
                None,
                {'a': None, 'b': None, 'c': None},
                {'a': None, 'b': u'bar', 'c': None}]
    arr = pa.array(data)
    # Same records in an object ndarray. Previously this array was built
    # but never passed to pa.array, so the comparison below was vacuous.
    data_as_ndarray = np.empty(len(data), dtype=object)
    data_as_ndarray[:] = data
    arr2 = pa.array(data_as_ndarray)

    check_struct_type(arr.type, expected_type)
    assert arr.to_pylist() == expected
    assert arr.equals(arr2)

    # Nested
    expected_type = pa.struct([
        pa.field('a', pa.struct([pa.field('aa', pa.list_(pa.int64())),
                                 pa.field('ab', pa.bool_())])),
        pa.field('b', pa.string())])
    data = [{'a': {'aa': [5, 6], 'ab': True}, 'b': 'foo'},
            {'a': {'aa': None, 'ab': False}, 'b': None},
            {'a': None, 'b': 'bar'}]
    arr = pa.array(data)
    assert arr.to_pylist() == data

    # Edge cases
    arr = pa.array([{}])
    assert arr.type == pa.struct([])
    assert arr.to_pylist() == [{}]

    # Mixing structs and scalars is rejected
    with pytest.raises((pa.ArrowInvalid, pa.ArrowTypeError)):
        pa.array([1, {'a': 2}])
 def test_int_object_nulls(self):
     """Round-trip an object column of ints and Nones as Arrow int64."""
     raw = np.array([None, 1, np.int64(3)] * 5, dtype=object)
     frame = pd.DataFrame({'ints': raw})
     # The round-trip is compared against the numerically converted column.
     numeric_frame = pd.DataFrame({'ints': pd.to_numeric(raw)})
     int_schema = pa.schema([pa.field('ints', pa.int64())])
     self._check_pandas_roundtrip(frame, expected=numeric_frame,
                                  expected_schema=int_schema)
Esempio n. 28
0
def test_empty_table():
    """Schema.empty_table() returns a zero-row Table with that schema."""
    one_field_schema = pa.schema([pa.field('oneField', pa.int64())])
    empty = one_field_schema.empty_table()

    assert isinstance(empty, pa.Table)
    assert empty.num_rows == 0
    assert empty.schema == one_field_schema
    def test_infer_lists(self):
        """List-valued DataFrame columns round-trip with inferred list types."""
        frame = pd.DataFrame(OrderedDict([
            ('nan_ints', [[None, 1], [2, 3]]),
            ('ints', [[0, 1], [2, 3]]),
            ('strs', [[None, u'b'], [u'c', u'd']]),
            ('nested_strs', [[[None, u'b'], [u'c', u'd']], None])
        ]))

        int_list = pa.list_(pa.int64())
        str_list = pa.list_(pa.string())
        inferred_schema = pa.schema([
            pa.field('nan_ints', int_list),
            pa.field('ints', int_list),
            pa.field('strs', str_list),
            pa.field('nested_strs', pa.list_(str_list))
        ])

        self._check_pandas_roundtrip(frame, expected_schema=inferred_schema)
Esempio n. 30
0
def test_chunked_array_equals():
    """Chunked arrays compare by type and values, not by chunk layout."""
    def as_chunked(chunks):
        # Accept either a ready ChunkedArray or a list of arrays.
        if isinstance(chunks, pa.ChunkedArray):
            return chunks
        return pa.chunked_array(chunks)

    def eq(xarrs, yarrs):
        x = as_chunked(xarrs)
        y = as_chunked(yarrs)
        assert x.equals(y)
        assert y.equals(x)
        assert x == y
        assert x != str(y)

    def ne(xarrs, yarrs):
        x = as_chunked(xarrs)
        y = as_chunked(yarrs)
        assert not x.equals(y)
        assert not y.equals(x)
        assert x != y

    empty_int32 = pa.chunked_array([], type=pa.int32())
    eq(empty_int32, pa.chunked_array([], type=pa.int32()))
    ne(pa.chunked_array([], type=pa.int32()),
       pa.chunked_array([], type=pa.int64()))

    a = pa.array([0, 2], type=pa.int32())
    b = pa.array([0, 2], type=pa.int64())
    c = pa.array([0, 3], type=pa.int32())
    # d holds the values of a followed by c in a single chunk.
    d = pa.array([0, 2, 0, 3], type=pa.int32())

    eq([a], [a])
    ne([a], [b])
    eq([a, c], [a, c])
    eq([a, c], [d])
    ne([c, a], [a, c])

    assert not pa.chunked_array([], type=pa.int32()).equals(None)
Esempio n. 31
0
    assert all_op(arrow, skipna) == pandas.all(skipna=skipna)

    # Split in the middle and check whether this still works
    if len(data) > 2:
        arrow = pa.chunked_array(
            [data[: len(data) // 2], data[len(data) // 2 :]], type=pa.bool_()
        )
        assert all_op(arrow, skipna) == pandas.all(skipna=skipna)


@pytest.mark.parametrize(
    ("array", "fill_null_value", "expected"),
    [
        (
            pa.array([2, 1], type=pa.int16()),
            -1,
            np.array([2, 1], dtype=np.int16),
        ),
        (
            pa.array([2, None], type=pa.int32()),
            -1,
            np.array([2, -1], dtype=np.int32),
        ),
        (
            pa.array([2, None], type=pa.int64()),
            -1.5,
            np.array([2, -1], dtype=np.int64),
        ),
        (
            pa.array([1, None], type=pa.uint8()),
            257,
            np.array([1, 1], dtype=np.uint8),
        ),
        (
            pa.array([None, None], type=pa.int8()),
            5,
            np.array([5, 5], dtype=np.int8),
        ),
        (
            pa.array([], type=pa.int8()),
            5,
            np.array([], dtype=np.int8),
        ),
    ],
)
def test_integer_array_to_numpy(array, fill_null_value, expected):
    """The conversion keeps the integer dtype and fills nulls as expected."""
    converted = integer_array_to_numpy(array, fill_null_value)
    assert converted.dtype == expected.dtype
    np.testing.assert_array_equal(converted, expected)


@pytest.mark.parametrize(
    ("array", "indices"),
    [
        (
Esempio n. 32
0
def test_in_chunk_offsets(data: List[List[int]]):
    """The array's own chunk offsets must give a valid in-chunk selection."""
    chunked = pa.chunked_array(data, type=pa.int64())
    # Simple case: Passing in the actual chunk offsets should yield a valid
    # selection
    chunk_starts = list(_calculate_chunk_offsets(chunked))
    check_valid_in_offsets(chunked, _in_chunk_offsets(chunked, chunk_starts))
def test_iterate_over_decimal_chunk():
    """Iterate over a chunk of two FIXED columns with random precision/scale.

    The Arrow storage type is chosen from the precision: int8 up to 2
    digits, int16 up to 4, int32 up to 9, int64 up to 19, otherwise
    decimal128. Values are generated to match that storage type, and the
    expected values for integer-backed columns are rescaled by ``scale``.
    """
    # Seed from the system's default source. Passing a datetime object to
    # random.seed() raises TypeError on Python 3.11+.
    random.seed()
    precision = random.randint(1, 38)
    scale = random.randint(0, precision)
    if precision <= 2:
        datatype = pyarrow.int8()
    elif precision <= 4:
        datatype = pyarrow.int16()
    elif precision <= 9:
        datatype = pyarrow.int32()
    elif precision <= 19:
        datatype = pyarrow.int64()
    else:
        datatype = pyarrow.decimal128(precision, scale)

    def decimal_generator(_precision, _scale):
        """Return one random value fitting the storage type for _precision."""
        def decimal128_generator(precision, scale):
            digits = [str(random.randint(0, 9)) for _ in range(precision)]
            if scale:
                # Put the decimal point `scale` digits from the end.
                digits.insert(-scale, '.')
            return decimal.Decimal("".join(digits))

        def truncate(value, precision):
            # Keep at most `precision` digits; a leading '-' does not count.
            return int(str(value)[:precision if value >= 0 else precision + 1])

        def int64_generator(precision):
            return truncate(
                random.randint(-9223372036854775808, 9223372036854775807),
                precision)

        def int32_generator(precision):
            # Full int32 range (the upper bound used to read 2147483637).
            return truncate(random.randint(-2147483648, 2147483647),
                            precision)

        def int16_generator(precision):
            return truncate(random.randint(-32768, 32767), precision)

        def int8_generator(precision):
            return truncate(random.randint(-128, 127), precision)

        if _precision <= 2:
            return int8_generator(_precision)
        elif _precision <= 4:
            return int16_generator(_precision)
        elif _precision <= 9:
            return int32_generator(_precision)
        elif _precision <= 19:
            return int64_generator(_precision)
        else:
            return decimal128_generator(_precision, _scale)

    def expected_data_transform_decimal(_precision, _scale):
        """Return the function mapping a generated value to its expectation."""
        def expected_data_transform_decimal_impl(data, precision=_precision,
                                                 scale=_scale):
            if precision <= 19:
                # Integer-backed values are shifted down by `scale` digits.
                return decimal.Decimal(data).scaleb(-scale)
            else:
                return data

        return expected_data_transform_decimal_impl

    column_meta = {
        "logicalType": "FIXED",
        "precision": str(precision),
        "scale": str(scale),
    }
    iterate_over_test_chunk(
        [datatype, datatype], [column_meta, column_meta],
        lambda: decimal_generator(precision, scale),
        expected_data_transform_decimal(precision, scale))
def test_limited_iterator_types():
    """A typed, sized iterator converts like the equivalent tuple."""
    from_tuple = pa.array((0, 1, 2))
    from_iterator = pa.array(iter(range(3)), type=pa.int64(), size=3)
    assert from_iterator.equals(from_tuple)
Esempio n. 35
0
 def __init__(self):
     """Initialise the extension type with int64 storage."""
     storage_type = pa.int64()
     pa.PyExtensionType.__init__(self, storage_type)
def test_column_type_int64_same():
    """An int64 column with nulls and a negative passes the identity check."""
    values = pyarrow.array([1, 2, -1, None, 3, None, 1], pyarrow.int64())
    table = pyarrow.table({"A": values})
    assert_arrow_table_identity(table)
Esempio n. 37
0
    b = pa.array([0, 2], type=pa.int64())
    c = pa.array([0, 3], type=pa.int32())
    d = pa.array([0, 2, 0, 3], type=pa.int32())

    eq([a], [a])
    ne([a], [b])
    eq([a, c], [a, c])
    eq([a, c], [d])
    ne([c, a], [a, c])

    assert not pa.chunked_array([], type=pa.int32()).equals(None)


@pytest.mark.parametrize(
    ('data', 'typ'),
    [([True, False, True, True], pa.bool_()), ([1, 2, 4, 6], pa.int64()),
     ([1.0, 2.5, None], pa.float64()), (['a', None, 'b'], pa.string()),
     ([], pa.list_(pa.uint8())), ([[1, 2], [3]], pa.list_(pa.int64())),
     ([['a'], None, ['b', 'c']], pa.list_(pa.string())),
     ([(1, 'a'), (2, 'c'), None
       ], pa.struct([pa.field('a', pa.int64()),
                     pa.field('b', pa.string())]))])
def test_chunked_array_pickle(data, typ):
    """A chunked array survives a pickle round trip unchanged.

    The data is split into chunks of at most two elements; empty data
    yields a chunked array with no chunks.
    """
    arrays = []
    while data:
        arrays.append(pa.array(data[:2], type=typ))
        data = data[2:]
    array = pa.chunked_array(arrays, type=typ)
    array.validate()
    result = pickle.loads(pickle.dumps(array))
    result.validate()
    # Validity alone says nothing about the content; compare with the input.
    assert result.equals(array)
 def __init__(self, freq):
     """Store the frequency, then initialise the int64-backed type."""
     # attributes need to be set first before calling
     # super init (as that calls serialize)
     self._freq = freq
     storage_type = pa.int64()
     pa.ExtensionType.__init__(self, storage_type, 'pandas.period')
def test_generic_ext_type():
    """A PeriodType reports its extension name and int64 storage type."""
    daily = PeriodType('D')
    assert daily.extension_name == "pandas.period"
    assert daily.storage_type == pa.int64()
Esempio n. 40
0
# fmt:on

# True when the installed pandas version is at least 0.26.0; the import
# below is only attempted in that case.
PANDAS_GE_0_26_0 = LooseVersion(pd.__version__) >= "0.26.0"
if PANDAS_GE_0_26_0:
    from pandas.core.indexers import check_array_indexer

# Maps the id of an Arrow data type to a Python type. Keys are type ids, so
# one representative instance per type family is enough.
_python_type_map = {
    pa.null().id: six.text_type,
    pa.bool_().id: bool,
    pa.int8().id: int,
    pa.uint8().id: int,
    pa.int16().id: int,
    pa.uint16().id: int,
    pa.int32().id: int,
    pa.uint32().id: int,
    pa.int64().id: int,
    pa.uint64().id: int,
    pa.float16().id: float,
    pa.float32().id: float,
    pa.float64().id: float,
    pa.date32().id: datetime.date,
    pa.date64().id: datetime.date,
    pa.timestamp("ms").id: datetime.datetime,
    pa.binary().id: six.binary_type,
    pa.string().id: six.text_type,
    # Use any list type here, only LIST is important
    pa.list_(pa.string()).id: list,
    pa.large_list(pa.string()).id: list,
}

# Maps a type name given as a string to the Arrow data type it denotes.
_string_type_map = {"date64[ms]": pa.date64(), "string": pa.string()}
Esempio n. 41
0
        def __arrow_ext_serialize__(self):
            """Return the subtype and closed side as JSON-encoded bytes."""
            payload = json.dumps(
                {"subtype": str(self.subtype), "closed": self.closed}
            )
            return payload.encode()

        @classmethod
        def __arrow_ext_deserialize__(cls, storage_type, serialized):
            """Build an ArrowIntervalType from JSON-encoded bytes.

            The payload must carry a 'subtype' type alias and a 'closed'
            value; ``storage_type`` is not consulted.
            """
            fields = json.loads(serialized.decode())
            return ArrowIntervalType(
                pyarrow.type_for_alias(fields["subtype"]),
                fields["closed"],
            )

        def __eq__(self, other):
            """Equal when the class, subtype and closed side all match."""
            if not isinstance(other, pyarrow.BaseExtensionType):
                # Let Python try the reflected comparison.
                return NotImplemented
            return (type(self) == type(other)
                    and self.subtype == other.subtype
                    and self.closed == other.closed)

        def __hash__(self):
            """Hash on the string form, the subtype's string form and closed."""
            identity = (str(self), str(self.subtype), self.closed)
            return hash(identity)

        def to_pandas_dtype(self):
            """Return the pandas IntervalDtype built from the subtype's dtype."""
            import pandas as pd

            subtype_dtype = self.subtype.to_pandas_dtype()
            return pd.IntervalDtype(subtype_dtype)

    # register the type with a dummy instance
    _interval_type = ArrowIntervalType(pyarrow.int64(), "left")
    pyarrow.register_extension_type(_interval_type)
from pyarrow.compat import unittest  # noqa
from pyarrow.pandas_compat import _pandas_api  # noqa
import pyarrow as pa

import collections
import datetime
import decimal
import itertools
import math
import traceback

import numpy as np
import pytz

# (NumPy scalar type, Arrow type) pairs for the fixed-width integers; NumPy
# and pyarrow expose each of them under the same attribute name.
int_type_pairs = [
    (getattr(np, type_name), getattr(pa, type_name)())
    for type_name in ('int8', 'int16', 'int32', 'int64',
                      'uint8', 'uint16', 'uint32', 'uint64')
]

# Only the NumPy half of each pair is kept under a meaningful name.
np_int_types, _ = zip(*int_type_pairs)


class StrangeIterable:
    """Wrapper that exposes only iteration over the wrapped object."""

    def __init__(self, lst):
        # Keep a reference to the wrapped object; no copy is made.
        self.lst = lst

    def __iter__(self):
        """Delegate to the wrapped object's own ``__iter__``."""
        inner_iterator = self.lst.__iter__()
        return inner_iterator


class MyInt:
    def test_get_pyarrow_translated_schema(self):
        """Compare the translated schema with a hand-built pyarrow schema.

        The input is a list of ``type``/``name``/``mode`` dicts, the last of
        which is a repeated RECORD with two nested fields. The expected value
        is the matching ``pa.schema``, with the record as a list of structs.
        """

        def column(column_type, name, mode, **extra):
            # Keys are inserted as type, name, mode, then any extras.
            spec = {"type": column_type, "name": name, "mode": mode}
            spec.update(extra)
            return spec

        string_input_schema = [
            column("STRING", "string1", "REQUIRED"),
            column("NUMERIC", "numeric1", "NULLABLE"),
            column("INTEGER", "integer1", "REQUIRED"),
            column("FLOAT", "float1", "NULLABLE"),
            column("BOOLEAN", "boolean1", "REQUIRED"),
            column("TIMESTAMP", "timestamp1", "REQUIRED"),
            column("DATE", "date1", "REQUIRED"),
            column("TIME", "time1", "REQUIRED"),
            column("DATETIME", "datetime1", "REQUIRED"),
            column(
                "RECORD",
                "record1",
                "REPEATED",
                fields=[
                    column("BOOLEAN", "boolean1", "REQUIRED"),
                    column("TIMESTAMP", "timestamp1", "REQUIRED"),
                ],
            ),
        ]

        # NOTE(review): no field passes nullable=, so the REQUIRED/NULLABLE
        # modes above are not reflected in the expected schema.
        record_struct = pa.struct([
            pa.field(name='boolean1', type=pa.bool_()),
            pa.field(name='timestamp1', type=pa.timestamp('us')),
        ])
        expected_pa_schema = pa.schema([
            pa.field(name='string1', type=pa.string()),
            pa.field(name='numeric1', type=pa.int64()),
            pa.field(name='integer1', type=pa.int64()),
            pa.field(name='float1', type=pa.float64()),
            pa.field(name='boolean1', type=pa.bool_()),
            pa.field(name='timestamp1', type=pa.timestamp('us')),
            pa.field(name='date1', type=pa.date32()),
            pa.field(name='time1', type=pa.time64('us')),
            pa.field(name='datetime1', type=pa.timestamp('us')),
            pa.field(name='record1', type=pa.list_(record_struct)),
        ])

        pyarrow_schema = get_pyarrow_translated_schema(string_input_schema)
        self.assertEqual(pyarrow_schema, expected_pa_schema)
Esempio n. 44
0
    def get_pyarrow_schema():
        """Return the pyarrow schema, one field per column in the order listed."""
        columns = (
            ('id', pa.int64()),
            ('ts_submit', pa.int64()),
            ('submission_site', pa.int32()),
            ('runtime', pa.int64()),
            ('resource_type', pa.string()),
            ('resource_amount_requested', pa.float64()),
            ('parents', pa.list_(pa.int64())),
            ('children', pa.list_(pa.int64())),
            ('user_id', pa.int32()),
            ('group_id', pa.int32()),
            ('nfrs', pa.string()),
            ('workflow_id', pa.int64()),
            ('wait_time', pa.int64()),
            ('params', pa.string()),
            ('memory_requested', pa.float64()),
            ('network_io_time', pa.int64()),
            ('disk_io_time', pa.int64()),
            ('disk_space_requested', pa.float64()),
            ('energy_consumption', pa.int64()),
            ('resource_used', pa.int64()),
        )

        return pa.schema(
            [pa.field(column_name, arrow_type)
             for column_name, arrow_type in columns]
        )
Esempio n. 45
0
    for i, field in enumerate(fields):
        in_dict[field] = i
    assert len(in_dict) == len(fields)
    for i, field in enumerate(fields):
        assert in_dict[field] == i


# Each case pairs an Arrow type instance with the `types.is_*` predicate
# that is expected to return a truthy value for it.
@pytest.mark.parametrize('t,check_func', [
    (pa.date32(), types.is_date32),
    (pa.date64(), types.is_date64),
    (pa.time32('s'), types.is_time32),
    (pa.time64('ns'), types.is_time64),
    (pa.int8(), types.is_int8),
    (pa.int16(), types.is_int16),
    (pa.int32(), types.is_int32),
    (pa.int64(), types.is_int64),
    (pa.uint8(), types.is_uint8),
    (pa.uint16(), types.is_uint16),
    (pa.uint32(), types.is_uint32),
    (pa.uint64(), types.is_uint64),
    (pa.float16(), types.is_float16),
    (pa.float32(), types.is_float32),
    (pa.float64(), types.is_float64)
])
def test_exact_primitive_types(t, check_func):
    """Assert that ``check_func`` accepts the type ``t`` it is paired with."""
    assert check_func(t)


def test_bit_width():
    for ty, expected in [(pa.bool_(), 1),
                         (pa.int8(), 8),
Esempio n. 46
0
def test_concat_tables_with_promotion_error():
    """concat_tables(promote=True) raises ArrowInvalid for int64 vs float32 'f'."""
    int_table, float_table = (
        pa.Table.from_arrays([pa.array([1, 2], type=value_type)], ["f"])
        for value_type in (pa.int64(), pa.float32())
    )

    with pytest.raises(pa.ArrowInvalid):
        pa.concat_tables([int_table, float_table], promote=True)
Esempio n. 47
0
def test_casting_to_extension_type_raises():
    """Casting an int64 array to ``IntegerType`` raises ArrowNotImplementedError."""
    int_array = pa.array([1, 2, 3, 4], pa.int64())
    expect_not_implemented = pytest.raises(pa.ArrowNotImplementedError)
    with expect_not_implemented:
        int_array.cast(IntegerType())
def test_list_with_non_list(seq):
    """A scalar among list values raises TypeError for list and large_list."""
    # List types don't accept non-sequences
    for make_list_type in (pa.list_, pa.large_list):
        with pytest.raises(TypeError):
            pa.array(seq([[], [1, 2], 3]), type=make_list_type(pa.int64()))
Esempio n. 49
0
    def clean_data_common(self, processed_data, raw_data):
        """Fix the type and default value of each extracted field.

        This routine is common to all services. It ensures that all the missing
        fields, as defined by the schema, are added to the records extracted.
        Furthermore, each field is set to the specified type.

        Args:
            processed_data: iterable of dict records; each one is updated in
                place.
            raw_data: the raw input, or a list of them (only the first element
                is read). Must provide "hostname", "namespace" and
                "timestamp".

        Returns:
            ``processed_data``, after the in-place cleanup.
        """
        def_vals = self._get_default_vals()

        # Arrow type -> Python type that values of that field are coerced to.
        ptype_map = {
            pa.string(): str,
            pa.int32(): int,
            pa.int64(): int,
            pa.float32(): float,
            pa.float64(): float,
            pa.date64(): float,
            pa.list_(pa.string()): list,
            pa.list_(pa.int64()): list,
            pa.bool_(): bool,
        }

        # Default value for every schema field, keyed by field name.
        schema_rec = {field.name: def_vals[field.type] for field in self.schema}

        read_from = raw_data[0] if isinstance(raw_data, list) else raw_data

        for entry in processed_data:
            entry["hostname"] = read_from["hostname"]
            entry["namespace"] = read_from["namespace"]
            entry["timestamp"] = read_from["timestamp"]
            entry["sqvers"] = self.version

            for fld, default in schema_rec.items():
                if fld not in entry:
                    # "active" is the one field whose fallback is not the
                    # per-type default.
                    entry[fld] = True if fld == "active" else default
                    continue

                fld_type = self.schema.field(fld).type
                py_type = ptype_map[fld_type]
                value = entry[fld]

                if not isinstance(value, py_type):
                    try:
                        entry[fld] = py_type(value)
                    except (ValueError, TypeError):
                        entry[fld] = default
                elif isinstance(value, list):
                    elem_type = ptype_map[fld_type.value_type]
                    # An element that cannot be converted falls back to the
                    # default of the *element* type. Using the field default
                    # here would nest the list default inside the list. The
                    # field default is kept only when no element default is
                    # defined.
                    try:
                        elem_default = def_vals[fld_type.value_type]
                    except KeyError:
                        elem_default = default

                    for i, ele in enumerate(value):
                        if isinstance(ele, elem_type):
                            continue
                        try:
                            if elem_type not in (int, str):
                                raise ValueError
                            value[i] = elem_type(ele)
                        except (ValueError, TypeError):
                            value[i] = elem_default
        return processed_data
def test_sequence_custom_integers(seq):
    """``MyInt`` wrappers convert to int64 and read back as the wrapped values."""
    expected = [0, 42, 2**33 + 1, -2**63]
    wrapped = [MyInt(value) for value in expected]
    converted = pa.array(seq(wrapped), type=pa.int64())
    assert converted.to_pylist() == expected
def test_iterate_over_timestamp_tz_chunk():
    """Run the chunk-iteration check on two random TIMESTAMP_TZ columns.

    A random scale in 0..9 picks the storage layout: for scale > 3 each value
    is a struct of (epoch, timezone, fraction) with byteLength "16", otherwise
    a struct of (epoch, timezone) with byteLength "8". The generator produces
    raw structs in that layout and the transform turns a raw struct into the
    timezone-aware ``datetime`` expected from the iterator.
    """
    # random.seed() only accepts None/int/float/str/bytes/bytearray; passing
    # a datetime object raises TypeError on Python 3.11+, so seed with the
    # float timestamp instead.
    random.seed(datetime.datetime.now().timestamp())
    scale = random.randint(0, 9)
    column_meta = [
        {"byteLength": "16" if scale > 3 else "8", "logicalType": "TIMESTAMP_TZ", "scale": str(scale)},
        {"byteLength": "16" if scale > 3 else "8", "logicalType": "TIMESTAMP_TZ", "scale": str(scale)}
    ]

    type1 = pyarrow.struct([pyarrow.field('epoch', pyarrow.int64()),
              pyarrow.field('timezone', pyarrow.int32()),
              pyarrow.field('fraction', pyarrow.int32())])
    type2 = pyarrow.struct([pyarrow.field('epoch', pyarrow.int64()),
              pyarrow.field('timezone', pyarrow.int32())])
    data_type = type1 if scale > 3 else type2

    def timestamp_tz_generator(scale):
        """Return one random raw value in the layout selected by ``scale``."""
        epoch = random.randint(-621355968, 2534023007)
        # For scale > 3 the fraction is scaled up to nine digits; otherwise
        # it is a plain integer below 10**scale.
        frac = random.randint(0, 10**scale - 1) * (10**(9 - scale)) if scale > 3 else random.randint(0, 10**scale - 1)
        timezone = random.randint(1, 2879)
        if scale > 3:
            return {'epoch': epoch, 'timezone': timezone, 'fraction': frac}
        else:
            # Small scales fold the zero-padded fraction digits into the
            # epoch integer itself.
            epoch = str(epoch)
            frac = str(frac)
            ZEROFILL = '000000000'
            frac = ZEROFILL[:scale - len(frac)] + frac
            return {'epoch': int(epoch + frac) if scale else int(epoch), 'timezone': timezone}

    def expected_data_transform_tz(_scale):
        """Return a function mapping a raw value to the expected datetime."""
        def expected_data_transform_tz_impl(data, scale=_scale):
            timezone = data['timezone']
            # The stored timezone carries a +1440 bias that is removed here;
            # presumably an offset in minutes -- confirm against the helper.
            tzinfo = _generate_tzinfo_from_tzoffset(timezone - 1440)
            epoch = data['epoch']
            if scale > 3:
                frac = data['fraction']
                if epoch < 0:
                    epoch += 1
                    frac = 10**9 - frac
                frac = str(int(frac / 10**(9 - scale)))
                ZERO_FILL = '000000000'
                frac = ZERO_FILL[:scale - len(frac)] + frac
                epoch = int(str(epoch) + frac)

            # Insert the decimal point `scale` digits from the right, keeping
            # at most six fractional digits.
            microsec = str(epoch)
            if scale > 6:
                microsec = microsec[:-scale] + "." + microsec[-scale:-scale + 6]
            else:
                microsec = microsec[:-scale] + "." + microsec[-scale:] if scale else microsec

            if platform.system() == 'Windows':
                # Naive epoch start; same value utcfromtimestamp(0) returned,
                # without the call deprecated since Python 3.12.
                t = datetime.datetime(1970, 1, 1) + datetime.timedelta(seconds=(float(microsec)))
                if pytz.utc != tzinfo:
                    t += tzinfo.utcoffset(t)
                return t.replace(tzinfo=tzinfo)
            else:
                return datetime.datetime.fromtimestamp(float(microsec), tz=tzinfo)

        return expected_data_transform_tz_impl

    iterate_over_test_chunk([data_type, data_type],
        column_meta, lambda: timestamp_tz_generator(scale), expected_data_transform_tz(scale))
def test_struct_from_dicts_inference():
    """Check struct type inference when ``pa.array`` is given dicts.

    Covers fully populated dicts, dicts with omitted keys and ``None`` rows
    (from both a list and an object ndarray), nested structs, the empty-dict
    edge case, and rejection of mixed struct/scalar input.
    """
    expected_type = pa.struct([
        pa.field('a', pa.int64()),
        pa.field('b', pa.string()),
        pa.field('c', pa.bool_())
    ])
    data = [{'a': 5, 'b': 'foo', 'c': True}, {'a': 6, 'b': 'bar', 'c': False}]
    arr = pa.array(data)
    check_struct_type(arr.type, expected_type)
    assert arr.to_pylist() == data

    # With omitted values
    data = [{'a': 5, 'c': True}, None, {}, {'a': None, 'b': 'bar'}]
    expected = [{
        'a': 5,
        'b': None,
        'c': True
    }, None, {
        'a': None,
        'b': None,
        'c': None
    }, {
        'a': None,
        'b': 'bar',
        'c': None
    }]
    arr = pa.array(data)
    # Same rows as an object ndarray. Converting the ndarray (rather than
    # `data` a second time) is what makes the equals() check below meaningful.
    data_as_ndarray = np.empty(len(data), dtype=object)
    data_as_ndarray[:] = data
    arr2 = pa.array(data_as_ndarray)

    check_struct_type(arr.type, expected_type)
    assert arr.to_pylist() == expected
    assert arr.equals(arr2)

    # Nested
    expected_type = pa.struct([
        pa.field(
            'a',
            pa.struct([
                pa.field('aa', pa.list_(pa.int64())),
                pa.field('ab', pa.bool_())
            ])),
        pa.field('b', pa.string())
    ])
    data = [{
        'a': {
            'aa': [5, 6],
            'ab': True
        },
        'b': 'foo'
    }, {
        'a': {
            'aa': None,
            'ab': False
        },
        'b': None
    }, {
        'a': None,
        'b': 'bar'
    }]
    arr = pa.array(data)
    assert arr.to_pylist() == data

    # Edge cases
    arr = pa.array([{}])
    assert arr.type == pa.struct([])
    assert arr.to_pylist() == [{}]

    # Mixing structs and scalars is rejected
    with pytest.raises((pa.ArrowInvalid, pa.ArrowTypeError)):
        pa.array([1, {'a': 2}])
Esempio n. 53
0
def test_empty_table():
    """``Schema.empty_table()`` gives a zero-row Table with the same schema."""
    one_field_schema = pa.schema([pa.field('oneField', pa.int64())])
    empty = one_field_schema.empty_table()
    assert isinstance(empty, pa.Table)
    assert empty.num_rows == 0
    assert empty.schema == one_field_schema
Esempio n. 54
0
        original_index.update(new_index)
    assert (
        str(e.value) ==
        "Trying to update an index with the wrong column. Got `another_col` but expected `col`"
    )


# Runs once per Arrow dtype listed below.
@pytest.mark.parametrize(
    "dtype",
    [
        pa.binary(),
        pa.bool_(),
        pa.date32(),
        pa.float32(),
        pa.float64(),
        pa.int64(),
        pa.int8(),
        pa.string(),
        pa.timestamp("ns"),
    ],
)
def test_index_empty(store, dtype):
    """Store an index with an empty ``index_dct`` and load it back by its key.

    NOTE(review): nothing visible here compares ``index2`` with ``index1``;
    confirm the assertions exist in the full test.
    """
    storage_key = "dataset_uuid/some_index.parquet"
    # Empty index for column "col" with the parametrized dtype.
    index1 = ExplicitSecondaryIndex(column="col",
                                    index_dct={},
                                    dtype=dtype,
                                    index_storage_key=storage_key)
    key1 = index1.store(store, "dataset_uuid")

    # Reload using only the column name and the key returned by store().
    index2 = ExplicitSecondaryIndex(column="col",
                                    index_storage_key=key1).load(store)
Esempio n. 55
0
def test_safe_cast_nan_to_int_raises():
    """A safe cast of a float array containing NaN to int64 raises ArrowInvalid."""
    float_array = pa.array([np.nan, 1.])
    expect_truncation_error = pytest.raises(
        pa.ArrowInvalid, match='Floating point value truncated')

    with expect_truncation_error:
        float_array.cast(pa.int64(), safe=True)
Esempio n. 56
0
def test_column_selection(tempdir):
    """Write a nested table to ORC and read back various column selections.

    Covers selection by column name, by dotted path into structs and lists of
    structs, by field index, and the errors raised for unknown selections.
    """
    from pyarrow import orc

    # create a table with nested types
    nested_struct = pa.struct([
        pa.field('middle', pa.struct([pa.field('inner', pa.int64())])),
        pa.field('inner2', pa.int64()),
    ])
    list_item_struct = pa.struct([
        pa.field('inner1', pa.int64()),
        pa.field('inner2', pa.int64()),
    ])
    schema = pa.schema([
        pa.field('basic', pa.int32()),
        pa.field('list', pa.list_(pa.field('item', pa.int32()))),
        pa.field('struct', nested_struct),
        pa.field('list-struct', pa.list_(pa.field('item', list_item_struct))),
        pa.field('basic2', pa.int64()),
    ])
    column_values = [
        [0],
        [[1, 2]],
        [{"middle": {"inner": 3}, "inner2": 4}],
        [[{"inner1": 5, "inner2": 6}, {"inner1": 7, "inner2": 8}]],
        [9],
    ]
    table = pa.table(column_values, schema=schema)

    path = str(tempdir / 'test.orc')
    orc.write_table(table, path)
    orc_file = orc.ORCFile(path)

    def assert_selection(columns, expected):
        # Read only `columns` and compare the result with `expected`.
        assert orc_file.read(columns=columns).equals(expected)

    # default selecting all columns
    assert orc_file.read().equals(table)

    # selecting with columns names
    assert_selection(["basic", "basic2"], table.select(["basic", "basic2"]))
    assert_selection(["list", "struct", "basic2"],
                     table.select(["list", "struct", "basic2"]))

    # using dotted paths
    assert_selection(["struct.middle.inner"],
                     pa.table({"struct": [{"middle": {"inner": 3}}]}))
    assert_selection(["struct.inner2"],
                     pa.table({"struct": [{"inner2": 4}]}))
    assert_selection(["list", "struct.middle.inner", "struct.inner2"],
                     table.select(["list", "struct"]))
    assert_selection(
        ["list-struct.inner1"],
        pa.table({"list-struct": [[{"inner1": 5}, {"inner1": 7}]]}))

    # selecting with (Arrow-based) field indices
    assert_selection([0, 4], table.select(["basic", "basic2"]))
    assert_selection([1, 2, 3],
                     table.select(["list", "struct", "list-struct"]))

    # error on non-existing name or index
    with pytest.raises(IOError):
        # liborc returns ParseError, which gets translated into IOError
        # instead of ValueError
        orc_file.read(columns=["wrong"])

    with pytest.raises(ValueError):
        orc_file.read(columns=[5])
Esempio n. 57
0
from redvox.common import file_statistics as fs
from redvox.common.parallel_utils import maybe_parallel_map
from redvox.common.station import Station
from redvox.common.errors import RedVoxExceptions

# Arrow struct type with string "id" and "uuid" fields and a float64
# "start_time" field.
id_py_stct = pa.struct([
    ("id", pa.string()),
    ("uuid", pa.string()),
    ("start_time", pa.float64()),
])
# Arrow struct type for the metadata fields listed below; the field names and
# types are exactly as declared here.
meta_py_stct = pa.struct([
    ("api", pa.float64()),
    ("sub_api", pa.float64()),
    ("make", pa.string()),
    ("model", pa.string()),
    ("os", pa.int64()),
    ("os_version", pa.string()),
    ("app", pa.string()),
    ("app_version", pa.string()),
    ("is_private", pa.bool_()),
    ("packet_duration_s", pa.float64()),
    ("station_description", pa.string()),
])

PERCENT_FREE_MEM_USE = .8  # Percentage of total free memory to use when creating stations (1. is 100%)


class ApiReader:
    """
    Reads data from api 900 or api 1000 format, converting all data read into RedvoxPacketM for
    ease of comparison and use.
    def testIsListLike(self):
        """``is_list_like`` is true for list/large_list types and false otherwise."""
        list_like_types = (pa.list_(pa.int64()), pa.large_list(pa.int64()))
        for arrow_type in list_like_types:
            self.assertTrue(arrow_util.is_list_like(arrow_type))

        other_types = (pa.binary(), pa.int64(), pa.large_string())
        for arrow_type in other_types:
            self.assertFalse(arrow_util.is_list_like(arrow_type))
def test_limited_iterator_size_underflow():
    """A ``size`` larger than the iterator's length still yields only its items."""
    from_iterator = pa.array(iter(range(3)), type=pa.int64(), size=10)
    from_tuple = pa.array((0, 1, 2))
    assert from_iterator.equals(from_tuple)
Esempio n. 60
0
    b = pa.array([0, 2], type=pa.int64())
    c = pa.array([0, 3], type=pa.int32())
    d = pa.array([0, 2, 0, 3], type=pa.int32())

    eq([a], [a])
    ne([a], [b])
    eq([a, c], [a, c])
    eq([a, c], [d])
    ne([c, a], [a, c])


@pytest.mark.parametrize(
    ('data', 'typ'),
    [
        ([True, False, True, True], pa.bool_()),
        ([1, 2, 4, 6], pa.int64()),
        ([1.0, 2.5, None], pa.float64()),
        (['a', None, 'b'], pa.string()),
        ([], pa.list_(pa.uint8())),
        ([[1, 2], [3]], pa.list_(pa.int64())),
        ([['a'], None, ['b', 'c']], pa.list_(pa.string())),
        ([(1, 'a'), (2, 'c'), None],
            pa.struct([pa.field('a', pa.int64()), pa.field('b', pa.string())]))
    ]
)
def test_chunked_array_pickle(data, typ):
    arrays = []
    while data:
        arrays.append(pa.array(data[:2], type=typ))
        data = data[2:]
    array = pa.chunked_array(arrays, type=typ)