Example #1
def test_orcfile_empty():
    from pyarrow import orc
    f = orc.ORCFile(path_for_orc_example('TestOrcFile.emptyFile'))
    table = f.read()
    assert table.num_rows == 0
    schema = table.schema
    expected_schema = pa.schema([
        ('boolean1', pa.bool_()),
        ('byte1', pa.int8()),
        ('short1', pa.int16()),
        ('int1', pa.int32()),
        ('long1', pa.int64()),
        ('float1', pa.float32()),
        ('double1', pa.float64()),
        ('bytes1', pa.binary()),
        ('string1', pa.string()),
        ('middle', pa.struct([
            ('list', pa.list_(pa.struct([
                ('int1', pa.int32()),
                ('string1', pa.string()),
                ]))),
            ])),
        ('list', pa.list_(pa.struct([
            ('int1', pa.int32()),
            ('string1', pa.string()),
            ]))),
        ('map', pa.list_(pa.struct([
            ('key', pa.string()),
            ('value', pa.struct([
                ('int1', pa.int32()),
                ('string1', pa.string()),
                ])),
            ]))),
        ])
    assert schema == expected_schema
Example #2
def test_cast_from_null():
    in_data = [None] * 3
    in_type = pa.null()
    out_types = [
        pa.null(),
        pa.uint8(),
        pa.float16(),
        pa.utf8(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.int16()),
        pa.decimal128(19, 4),
        pa.timestamp('us'),
        pa.timestamp('us', tz='UTC'),
        pa.timestamp('us', tz='Europe/Paris'),
        pa.struct([pa.field('a', pa.int32()),
                   pa.field('b', pa.list_(pa.int8())),
                   pa.field('c', pa.string())]),
        ]
    for out_type in out_types:
        _check_cast_case((in_data, in_type, in_data, out_type))

    out_types = [
        pa.dictionary(pa.int32(), pa.string()),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())], mode=pa.lib.UnionMode_DENSE),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())], mode=pa.lib.UnionMode_SPARSE),
        ]
    in_arr = pa.array(in_data, type=pa.null())
    for out_type in out_types:
        with pytest.raises(NotImplementedError):
            in_arr.cast(out_type)
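A minimal standalone sketch of the same cast, assuming a recent pyarrow: casting a null array to a concrete type yields an all-null array of that type.

import pyarrow as pa

# Casting from null produces an all-null array of the target type.
arr = pa.array([None, None, None], type=pa.null())
result = arr.cast(pa.int32())
assert result.type == pa.int32()
assert result.null_count == 3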
Example #3
def test_sequence_nesting_levels():
    data = [1, 2, None]
    arr = pa.array(data)
    assert arr.type == pa.int64()
    assert arr.to_pylist() == data

    data = [[1], [2], None]
    arr = pa.array(data)
    assert arr.type == pa.list_(pa.int64())
    assert arr.to_pylist() == data

    data = [[1], [2, 3, 4], [None]]
    arr = pa.array(data)
    assert arr.type == pa.list_(pa.int64())
    assert arr.to_pylist() == data

    data = [None, [[None, 1]], [[2, 3, 4], None], [None]]
    arr = pa.array(data)
    assert arr.type == pa.list_(pa.list_(pa.int64()))
    assert arr.to_pylist() == data

    exceptions = (pa.ArrowInvalid, pa.ArrowTypeError)

    # Mixed nesting levels are rejected
    with pytest.raises(exceptions):
        pa.array([1, 2, [1]])

    with pytest.raises(exceptions):
        pa.array([1, 2, []])

    with pytest.raises(exceptions):
        pa.array([[1], [2], [None, [1]]])
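A minimal sketch of the inference rule these assertions exercise, assuming a recent pyarrow: each sequence nesting level maps to one list_ level, and None is accepted as a null at any level.

import pyarrow as pa

# One nesting level in the input maps to one list_ level in the type.
arr = pa.array([[1, 2], None, [3]])
assert arr.type == pa.list_(pa.int64())
assert arr.to_pylist() == [[1, 2], None, [3]]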
Example #4
def test_schema():
    fields = [
        pa.field('foo', pa.int32()),
        pa.field('bar', pa.string()),
        pa.field('baz', pa.list_(pa.int8()))
    ]
    sch = pa.schema(fields)

    assert sch.names == ['foo', 'bar', 'baz']
    assert sch.types == [pa.int32(), pa.string(), pa.list_(pa.int8())]

    assert len(sch) == 3
    assert sch[0].name == 'foo'
    assert sch[0].type == fields[0].type
    assert sch.field_by_name('foo').name == 'foo'
    assert sch.field_by_name('foo').type == fields[0].type

    assert repr(sch) == """\
foo: int32
bar: string
baz: list<item: int8>
  child 0, item: int8"""

    with pytest.raises(TypeError):
        pa.schema([None])
Example #5
def test_type_list():
    value_type = pa.int32()
    list_type = pa.list_(value_type)
    assert str(list_type) == 'list<item: int32>'

    field = pa.field('my_item', pa.string())
    l2 = pa.list_(field)
    assert str(l2) == 'list<my_item: string>'
Example #6
    def test_array_from_pandas_typed_array_with_mask(self, t, data, expected):
        m = np.array([True, False, True])

        s = pd.Series(data)
        result = pa.Array.from_pandas(s, mask=m, type=pa.list_(t()))

        assert pa.Array.from_pandas(expected,
                                    type=pa.list_(t())).equals(result)
Example #7
def dataframe_with_arrays(include_index=False):
    """
    DataFrame with numpy array columns of every possible primitive type.

    Returns
    -------
    df: pandas.DataFrame
    schema: pyarrow.Schema
        Arrow schema definition that is in line with the constructed df.
    """
    dtypes = [('i1', pa.int8()), ('i2', pa.int16()),
              ('i4', pa.int32()), ('i8', pa.int64()),
              ('u1', pa.uint8()), ('u2', pa.uint16()),
              ('u4', pa.uint32()), ('u8', pa.uint64()),
              ('f4', pa.float32()), ('f8', pa.float64())]

    arrays = OrderedDict()
    fields = []
    for dtype, arrow_dtype in dtypes:
        fields.append(pa.field(dtype, pa.list_(arrow_dtype)))
        arrays[dtype] = [
            np.arange(10, dtype=dtype),
            np.arange(5, dtype=dtype),
            None,
            np.arange(1, dtype=dtype)
        ]

    fields.append(pa.field('str', pa.list_(pa.string())))
    arrays['str'] = [
        np.array([u"1", u"ä"], dtype="object"),
        None,
        np.array([u"1"], dtype="object"),
        np.array([u"1", u"2", u"3"], dtype="object")
    ]

    fields.append(pa.field('datetime64', pa.list_(pa.timestamp('ms'))))
    arrays['datetime64'] = [
        np.array(['2007-07-13T01:23:34.123456789',
                  None,
                  '2010-08-13T05:46:57.437699912'],
                 dtype='datetime64[ms]'),
        None,
        None,
        np.array(['2007-07-13T02',
                  None,
                  '2010-08-13T05:46:57.437699912'],
                 dtype='datetime64[ms]'),
    ]

    if include_index:
        fields.append(pa.field('__index_level_0__', pa.int64()))
    df = pd.DataFrame(arrays)
    schema = pa.schema(fields)

    return df, schema
Example #8
def test_is_nested_or_struct():
    struct_ex = pa.struct([pa.field('a', pa.int32()),
                           pa.field('b', pa.int8()),
                           pa.field('c', pa.string())])

    assert types.is_struct(struct_ex)
    assert not types.is_struct(pa.list_(pa.int32()))

    assert types.is_nested(struct_ex)
    assert types.is_nested(pa.list_(pa.int32()))
    assert not types.is_nested(pa.int32())
Example #9
def dataframe_with_lists(include_index=False):
    """
    DataFrame with list columns of every possible primitive type.

    Returns
    -------
    df: pandas.DataFrame
    schema: pyarrow.Schema
        Arrow schema definition that is in line with the constructed df.
    """
    arrays = OrderedDict()
    fields = []

    fields.append(pa.field('int64', pa.list_(pa.int64())))
    arrays['int64'] = [
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
        [0, 1, 2, 3, 4],
        None,
        [],
        np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9] * 2,
                 dtype=np.int64)[::2]
    ]
    fields.append(pa.field('double', pa.list_(pa.float64())))
    arrays['double'] = [
        [0., 1., 2., 3., 4., 5., 6., 7., 8., 9.],
        [0., 1., 2., 3., 4.],
        None,
        [],
        np.array([0., 1., 2., 3., 4., 5., 6., 7., 8., 9.] * 2)[::2],
    ]
    fields.append(pa.field('bytes_list', pa.list_(pa.binary())))
    arrays['bytes_list'] = [
        [b"1", b"f"],
        None,
        [b"1"],
        [b"1", b"2", b"3"],
        [],
    ]
    fields.append(pa.field('str_list', pa.list_(pa.string())))
    arrays['str_list'] = [
        [u"1", u"ä"],
        None,
        [u"1"],
        [u"1", u"2", u"3"],
        [],
    ]

    if include_index:
        fields.append(pa.field('__index_level_0__', pa.int64()))
    df = pd.DataFrame(arrays)
    schema = pa.schema(fields)

    return df, schema
Example #10
def test_nested_lists(seq):
    data = [[], [1, 2], None]
    arr = pa.array(seq(data))
    assert len(arr) == 3
    assert arr.null_count == 1
    assert arr.type == pa.list_(pa.int64())
    assert arr.to_pylist() == data
    # With explicit type
    arr = pa.array(seq(data), type=pa.list_(pa.int32()))
    assert len(arr) == 3
    assert arr.null_count == 1
    assert arr.type == pa.list_(pa.int32())
    assert arr.to_pylist() == data
Example #11
def test_sequence_integer_nested_np_nan(seq, np_scalar_pa_type):
    # ARROW-2806: numpy.nan is a double value and thus should produce
    # a double array.
    _, pa_type = np_scalar_pa_type
    with pytest.raises(ValueError):
        pa.array(seq([[np.nan]]), type=pa.list_(pa_type), from_pandas=False)

    arr = pa.array(seq([[np.nan]]), type=pa.list_(pa_type), from_pandas=True)
    expected = [[None]]
    assert len(arr) == 1
    assert arr.null_count == 0
    assert arr.type == pa.list_(pa_type)
    assert arr.to_pylist() == expected
Example #12
def test_list_array_flatten():
    typ2 = pa.list_(
        pa.list_(
            pa.int64()
        )
    )
    arr2 = pa.array([
        None,
        [
            [1, None, 2],
            None,
            [3, 4]
        ],
        [],
        [
            [],
            [5, 6],
            None
        ],
        [
            [7, 8]
        ]
    ])
    assert arr2.type.equals(typ2)

    typ1 = pa.list_(pa.int64())
    arr1 = pa.array([
        [1, None, 2],
        None,
        [3, 4],
        [],
        [5, 6],
        None,
        [7, 8]
    ])
    assert arr1.type.equals(typ1)

    typ0 = pa.int64()
    arr0 = pa.array([
        1, None, 2,
        3, 4,
        5, 6,
        7, 8
    ])
    assert arr0.type.equals(typ0)

    assert arr2.flatten().equals(arr1)
    assert arr1.flatten().equals(arr0)
    assert arr2.flatten().flatten().equals(arr0)
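A short sketch of the flatten() invariant, assuming a recent pyarrow: flattening removes exactly one list level, and null or empty lists contribute no values.

import pyarrow as pa

# flatten() strips one nesting level; nulls and empty lists vanish.
nested = pa.array([[1, 2], None, [], [3]])
flat = nested.flatten()
assert flat.type == pa.int64()
assert flat.to_pylist() == [1, 2, 3]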
Example #13
def test_buffers_nested():
    a = pa.array([[1, 2], None, [3, None, 4, 5]], type=pa.list_(pa.int64()))
    buffers = a.buffers()
    assert len(buffers) == 4
    # The parent buffers
    null_bitmap = buffers[0].to_pybytes()
    assert bytearray(null_bitmap)[0] == 0b00000101
    offsets = buffers[1].to_pybytes()
    assert struct.unpack('4i', offsets) == (0, 2, 2, 6)
    # The child buffers
    null_bitmap = buffers[2].to_pybytes()
    assert bytearray(null_bitmap)[0] == 0b00110111
    values = buffers[3].to_pybytes()
    assert struct.unpack('qqq8xqq', values) == (1, 2, 3, 4, 5)

    a = pa.array([(42, None), None, (None, 43)],
                 type=pa.struct([pa.field('a', pa.int8()),
                                 pa.field('b', pa.int16())]))
    buffers = a.buffers()
    assert len(buffers) == 5
    # The parent buffer
    null_bitmap = buffers[0].to_pybytes()
    assert bytearray(null_bitmap)[0] == 0b00000101
    # The child buffers: 'a'
    null_bitmap = buffers[1].to_pybytes()
    assert bytearray(null_bitmap)[0] == 0b00000001
    values = buffers[2].to_pybytes()
    assert struct.unpack('bxx', values) == (42,)
    # The child buffers: 'b'
    null_bitmap = buffers[3].to_pybytes()
    assert bytearray(null_bitmap)[0] == 0b00000100
    values = buffers[4].to_pybytes()
    assert struct.unpack('4xh', values) == (43,)
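A sketch of the buffer layout asserted above, assuming a recent pyarrow: a list array exposes its validity bitmap and int32 offsets first, followed by the child's buffers; a slot may be None when no bitmap was allocated.

import pyarrow as pa

a = pa.array([[1, 2], None], type=pa.list_(pa.int64()))
bufs = a.buffers()
# parent bitmap, parent offsets, child bitmap, child values
assert len(bufs) == 4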
Example #14
 def test_list_of_int(self):
     data = [[1, 2, 3], [], None, [1, 2]]
     arr = pyarrow.from_pylist(data)
     assert len(arr) == 4
     assert arr.null_count == 1
     assert arr.type == pyarrow.list_(pyarrow.int64())
     assert arr.to_pylist() == data
Example #15
def test_type_schema_pickling():
    cases = [
        pa.int8(),
        pa.string(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.string()),
        pa.struct([
            pa.field('a', 'int8'),
            pa.field('b', 'string')
        ]),
        pa.time32('s'),
        pa.time64('us'),
        pa.date32(),
        pa.date64(),
        pa.timestamp('ms'),
        pa.timestamp('ns'),
        pa.decimal(12, 2),
        pa.field('a', 'string', metadata={b'foo': b'bar'})
    ]

    for val in cases:
        roundtripped = pickle.loads(pickle.dumps(val))
        assert val == roundtripped

    fields = []
    for i, f in enumerate(cases):
        if isinstance(f, pa.Field):
            fields.append(f)
        else:
            fields.append(pa.field('_f{}'.format(i), f))

    schema = pa.schema(fields, metadata={b'foo': b'bar'})
    roundtripped = pickle.loads(pickle.dumps(schema))
    assert schema == roundtripped
Example #16
def test_nested_arrays(seq):
    arr = pa.array(seq([np.array([], dtype=np.int64),
                        np.array([1, 2], dtype=np.int64), None]))
    assert len(arr) == 3
    assert arr.null_count == 1
    assert arr.type == pa.list_(pa.int64())
    assert arr.to_pylist() == [[], [1, 2], None]
Example #17
def test_nested_ndarray_different_dtypes():
    data = [
        np.array([1, 2, 3], dtype='int64'),
        None,
        np.array([4, 5, 6], dtype='uint32')
    ]

    arr = pa.array(data)
    expected = pa.array([[1, 2, 3], None, [4, 5, 6]],
                        type=pa.list_(pa.int64()))
    assert arr.equals(expected)

    t2 = pa.list_(pa.uint32())
    arr2 = pa.array(data, type=t2)
    expected2 = expected.cast(t2)
    assert arr2.equals(expected2)
Example #18
def test_is_union():
    for mode in [pa.lib.UnionMode_SPARSE, pa.lib.UnionMode_DENSE]:
        assert types.is_union(pa.union([pa.field('a', pa.int32()),
                                        pa.field('b', pa.int8()),
                                        pa.field('c', pa.string())],
                                       mode=mode))
    assert not types.is_union(pa.list_(pa.int32()))
Example #19
def test_type_to_pandas_dtype():
    M8_ns = np.dtype('datetime64[ns]')
    cases = [
        (pa.null(), np.float64),
        (pa.bool_(), np.bool_),
        (pa.int8(), np.int8),
        (pa.int16(), np.int16),
        (pa.int32(), np.int32),
        (pa.int64(), np.int64),
        (pa.uint8(), np.uint8),
        (pa.uint16(), np.uint16),
        (pa.uint32(), np.uint32),
        (pa.uint64(), np.uint64),
        (pa.float16(), np.float16),
        (pa.float32(), np.float32),
        (pa.float64(), np.float64),
        (pa.date32(), M8_ns),
        (pa.date64(), M8_ns),
        (pa.timestamp('ms'), M8_ns),
        (pa.binary(), np.object_),
        (pa.binary(12), np.object_),
        (pa.string(), np.object_),
        (pa.list_(pa.int8()), np.object_),
    ]
    for arrow_type, numpy_type in cases:
        assert arrow_type.to_pandas_dtype() == numpy_type
Example #20
    def test_infer_lists(self):
        data = OrderedDict([
            ('nan_ints', [[None, 1], [2, 3]]),
            ('ints', [[0, 1], [2, 3]]),
            ('strs', [[None, u'b'], [u'c', u'd']]),
            ('nested_strs', [[[None, u'b'], [u'c', u'd']], None])
        ])
        df = pd.DataFrame(data)

        expected_schema = pa.schema([
            pa.field('nan_ints', pa.list_(pa.int64())),
            pa.field('ints', pa.list_(pa.int64())),
            pa.field('strs', pa.list_(pa.string())),
            pa.field('nested_strs', pa.list_(pa.list_(pa.string())))
        ])

        self._check_pandas_roundtrip(df, expected_schema=expected_schema)
Example #21
def test_schema_from_tuples():
    fields = [
        ('foo', pa.int32()),
        ('bar', pa.string()),
        ('baz', pa.list_(pa.int8())),
    ]
    sch = pa.schema(fields)
    assert sch.names == ['foo', 'bar', 'baz']
    assert sch.types == [pa.int32(), pa.string(), pa.list_(pa.int8())]
    assert len(sch) == 3
    assert repr(sch) == """\
foo: int32
bar: string
baz: list<item: int8>
  child 0, item: int8"""

    with pytest.raises(TypeError):
        pa.schema([('foo', None)])
Example #22
    def test_list_metadata(self):
        df = pd.DataFrame({'data': [[1], [2, 3, 4], [5] * 7]})
        schema = pa.schema([pa.field('data', type=pa.list_(pa.int64()))])
        table = pa.Table.from_pandas(df, schema=schema)
        metadata = table.schema.metadata
        assert b'mixed' not in metadata[b'pandas']

        js = json.loads(metadata[b'pandas'].decode('utf8'))
        data_column = js['columns'][0]
        assert data_column['pandas_type'] == 'list[int64]'
        assert data_column['numpy_type'] == 'object'
Example #23
def test_bit_width():
    for ty, expected in [(pa.bool_(), 1),
                         (pa.int8(), 8),
                         (pa.uint32(), 32),
                         (pa.float16(), 16),
                         (pa.decimal128(19, 4), 128),
                         (pa.binary(42), 42 * 8)]:
        assert ty.bit_width == expected
    for ty in [pa.binary(), pa.string(), pa.list_(pa.int16())]:
        with pytest.raises(ValueError, match="fixed width"):
            ty.bit_width
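A compact sketch of the same distinction, assuming a recent pyarrow: bit_width is defined only for fixed-width types, while variable-width types raise ValueError.

import pyarrow as pa

assert pa.int32().bit_width == 32
try:
    pa.list_(pa.int16()).bit_width
except ValueError as exc:
    assert "fixed width" in str(exc)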
Example #24
def test_schema_duplicate_fields():
    fields = [
        pa.field('foo', pa.int32()),
        pa.field('bar', pa.string()),
        pa.field('foo', pa.list_(pa.int8())),
    ]
    sch = pa.schema(fields)
    assert sch.names == ['foo', 'bar', 'foo']
    assert sch.types == [pa.int32(), pa.string(), pa.list_(pa.int8())]
    assert len(sch) == 3
    assert repr(sch) == """\
foo: int32
bar: string
foo: list<item: int8>
  child 0, item: int8"""

    assert sch[0].name == 'foo'
    assert sch[0].type == fields[0].type
    assert sch.field_by_name('bar') == fields[1]
    assert sch.field_by_name('xxx') is None
    with pytest.warns(UserWarning):
        assert sch.field_by_name('foo') is None
Example #25
    def test_infer_numpy_array(self):
        data = OrderedDict([
            ('ints', [
                np.array([0, 1], dtype=np.int64),
                np.array([2, 3], dtype=np.int64)
            ])
        ])
        df = pd.DataFrame(data)
        expected_schema = pa.schema([
            pa.field('ints', pa.list_(pa.int64()))
        ])

        self._check_pandas_roundtrip(df, expected_schema=expected_schema)
Example #26
def dataframe_with_lists():
    """
    DataFrame with list columns of every possible primitive type.

    Returns
    -------
    df: pandas.DataFrame
    schema: pyarrow.Schema
        Arrow schema definition that is in line with the constructed df.
    """
    arrays = OrderedDict()
    fields = []

    fields.append(pa.field('int64', pa.list_(pa.int64())))
    arrays['int64'] = [
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
        [0, 1, 2, 3, 4],
        None,
        [0]
    ]
    fields.append(pa.field('double', pa.list_(pa.double())))
    arrays['double'] = [
        [0., 1., 2., 3., 4., 5., 6., 7., 8., 9.],
        [0., 1., 2., 3., 4.],
        None,
        [0.]
    ]
    fields.append(pa.field('str_list', pa.list_(pa.string())))
    arrays['str_list'] = [
        [u"1", u"ä"],
        None,
        [u"1"],
        [u"1", u"2", u"3"]
    ]

    df = pd.DataFrame(arrays)
    schema = pa.Schema.from_fields(fields)

    return df, schema
Example #27
def test_schema_equals():
    fields = [
        pa.field('foo', pa.int32()),
        pa.field('bar', pa.string()),
        pa.field('baz', pa.list_(pa.int8()))
    ]

    sch1 = pa.schema(fields)
    sch2 = pa.schema(fields)
    assert sch1.equals(sch2)

    del fields[-1]
    sch3 = pa.schema(fields)
    assert not sch1.equals(sch3)
Example #28
    def test_nested_lists_all_none(self):
        data = np.array([[None, None], None], dtype=object)

        arr = pa.array(data)
        expected = pa.array(list(data))
        assert arr.equals(expected)
        assert arr.type == pa.list_(pa.null())

        data2 = np.array([None, None, [None, None],
                          np.array([None, None], dtype=object)],
                         dtype=object)
        arr = pa.array(data2)
        expected = pa.array([None, None, [None, None], [None, None]])
        assert arr.equals(expected)
Example #29
def test_array_from_py_float32():
    data = [[1.2, 3.4], [9.0, 42.0]]

    t = pa.float32()

    arr1 = pa.array(data[0], type=t)
    arr2 = pa.array(data, type=pa.list_(t))

    expected1 = np.array(data[0], dtype=np.float32)
    expected2 = pd.Series([np.array(data[0], dtype=np.float32),
                           np.array(data[1], dtype=np.float32)])

    assert arr1.type == t
    assert arr1.equals(pa.array(expected1))
    assert arr2.equals(pa.array(expected2))
Example #30
    def test_column_of_arrays_to_py(self):
        # Test regression in ARROW-1199 not caught in above test
        dtype = 'i1'
        arr = np.array([
            np.arange(10, dtype=dtype),
            np.arange(5, dtype=dtype),
            None,
            np.arange(1, dtype=dtype)
        ])
        type_ = pa.list_(pa.int8())
        parr = pa.array(arr, type=type_)

        assert parr[0].as_py() == list(range(10))
        assert parr[1].as_py() == list(range(5))
        assert parr[2].as_py() is None
        assert parr[3].as_py() == [0]
Example #31
def test_is_primitive():
    assert types.is_primitive(pa.int32())
    assert not types.is_primitive(pa.list_(pa.int32()))
Example #32
     ]),
 ],
 "type_schema":
 OrderedDict([
     ("a", int),
     ("b", float),
     ("c", str),
     ("d", np.ndarray),
     ("e", bytes),
 ]),
 "pyarrow_schema":
 pa.schema([
     ("a", pa.int64()),
     ("b", pa.float64()),
     ("c", pa.string()),
     ("d", pa.list_(pa.int64())),
     ("e", pa.binary()),
 ]) if pa is not None else None,
 "avro_schema": {
     "namespace":
     "example.avro",
     "name":
     "User",
     "type":
     "record",
     "fields": [
         {
             "name": "a",
             "type": "int"
         },
         {
Example #33
    buf = s.as_buffer()
    assert isinstance(buf, pa.Buffer)
    assert buf.to_pybytes() == value


def test_fixed_size_binary():
    s = pa.scalar(b'foof', type=pa.binary(4))
    assert isinstance(s, pa.FixedSizeBinaryScalar)
    assert s.as_py() == b'foof'

    with pytest.raises(pa.ArrowInvalid):
        pa.scalar(b'foof5', type=pa.binary(4))


@pytest.mark.parametrize(('ty', 'klass'),
                         [(pa.list_(pa.string()), pa.ListScalar),
                          (pa.large_list(pa.string()), pa.LargeListScalar)])
def test_list(ty, klass):
    v = ['foo', None]
    s = pa.scalar(v, type=ty)
    assert s.type == ty
    assert len(s) == 2
    assert isinstance(s.values, pa.Array)
    assert s.values == v
    assert isinstance(s, klass)
    assert repr(v) in repr(s)
    assert s.as_py() == v
    assert s[0].as_py() == 'foo'
    assert s[1].as_py() is None
    assert s[-1] == s[1]
    assert s[-2] == s[0]
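A standalone sketch of the list scalar API used above, assuming a recent pyarrow: pa.scalar wraps a Python list as a ListScalar whose values attribute is an ordinary pa.Array.

import pyarrow as pa

s = pa.scalar(['foo', None], type=pa.list_(pa.string()))
assert s.as_py() == ['foo', None]
assert isinstance(s.values, pa.Array)  # the wrapped values array
assert s[1].as_py() is None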
Example #34
def test_arrow_list_functions():
    lst = np.array([['a, bc'], ['de'], ['e', 'ee'], ['中文', '中文2']],
                   dtype=object)
    has_na_lst = lst.copy()
    has_na_lst[1] = None

    for pandas_only in [False, True]:
        with option_context({'dataframe.arrow_array.pandas_only':
                             pandas_only}):
            arrow_array = ArrowListArray(lst)
            has_na_arrow_array = ArrowListArray(has_na_lst)

            # getitem, scalar
            assert arrow_array[1] == lst[1]
            assert list(arrow_array[-1]) == lst[-1]
            # getitem, slice
            np.testing.assert_array_equal(arrow_array[:2].to_numpy(), lst[:2])

            # setitem
            arrow_array2 = arrow_array.copy()
            lst2 = lst.copy()
            for s in [['ss'], pd.Series(['ss'])]:
                arrow_array2[0] = s
                lst2[0] = ['ss']
                np.testing.assert_array_equal(arrow_array2.to_numpy(), lst2)
            arrow_array2[0] = None
            lst2[0] = None
            np.testing.assert_array_equal(arrow_array2.to_numpy(), lst2)
            with pytest.raises(ValueError):
                # must set list like object
                arrow_array2[0] = 'ss'

            # test to_numpy
            np.testing.assert_array_equal(arrow_array.to_numpy(), lst)
            np.testing.assert_array_equal(arrow_array.to_numpy(copy=True), lst)
            np.testing.assert_array_equal(
                has_na_arrow_array.to_numpy(na_value=1),
                pd.Series(has_na_lst).fillna(1).to_numpy())

            # test fillna
            if not pandas_only:
                arrow_array3 = has_na_arrow_array.fillna(lst[1])
                np.testing.assert_array_equal(arrow_array3.to_numpy(), lst)

            # test astype
            with pytest.raises(TypeError):
                arrow_array.astype(np.int64)
            with pytest.raises(TypeError):
                arrow_array.astype(ArrowListDtype(np.int64))
            arrow_array4 = ArrowListArray([[1, 2], [3]])
            expected = np.array([['1', '2'], ['3']], dtype=object)
            np.testing.assert_array_equal(
                arrow_array4.astype(ArrowListDtype(str)), expected)
            np.testing.assert_array_equal(
                arrow_array4.astype(ArrowListDtype(arrow_array4.dtype)),
                arrow_array4)
            np.testing.assert_array_equal(
                arrow_array4.astype(ArrowListDtype(arrow_array4.dtype),
                                    copy=False), arrow_array4)

            # test nbytes
            assert arrow_array.nbytes < pd.Series(lst).memory_usage(deep=True)

            # test memory_usage
            if not pandas_only:
                assert arrow_array.memory_usage(
                    deep=True) == arrow_array.nbytes

            # test isna
            np.testing.assert_array_equal(has_na_arrow_array.isna(),
                                          pd.Series(has_na_lst).isna())

            # test take
            assert list(arrow_array.take([1, 2, -1])) == list(
                pd.Series(lst).take([1, 2, -1]))

            # test shift
            assert list(arrow_array.shift(
                2, fill_value=['aa'])) == [['aa']] * 2 + lst[:-2].tolist()

            # test all any
            if _use_bool_any_all:
                assert arrow_array.all() == pd.array(lst).all()
                assert arrow_array.any() == pd.array(lst).any()
            else:
                assert arrow_array.all() == lst.all()
                assert arrow_array.any() == lst.any()

            # test repr
            assert 'ArrowListArray' in repr(arrow_array)

            # test concat empty
            arrow_array5 = ArrowListArray(
                pa.chunked_array([], type=pa.list_(pa.string())))
            concatenated = ArrowListArray._concat_same_type(
                [arrow_array5, arrow_array5])
            if not pandas_only:
                assert len(concatenated._arrow_array.chunks) == 1
            pd.testing.assert_series_equal(pd.Series(arrow_array5),
                                           pd.Series(concatenated))
Example #35
all_array_types = [
    ('bool', [True, False, False, True, True]),
    ('uint8', np.arange(5)),
    ('int8', np.arange(5)),
    ('uint16', np.arange(5)),
    ('int16', np.arange(5)),
    ('uint32', np.arange(5)),
    ('int32', np.arange(5)),
    ('uint64', np.arange(5, 10)),
    ('int64', np.arange(5, 10)),
    ('float', np.arange(0, 0.5, 0.1)),
    ('double', np.arange(0, 0.5, 0.1)),
    ('string', ['a', 'b', None, 'ddd', 'ee']),
    ('binary', [b'a', b'b', b'c', b'ddd', b'ee']),
    (pa.binary(3), [b'abc', b'bcd', b'cde', b'def', b'efg']),
    (pa.list_(pa.int8()), [[1, 2], [3, 4], [5, 6], None, [9, 16]]),
    (pa.large_list(pa.int16()), [[1], [2, 3, 4], [5, 6], None, [9, 16]]),
    (pa.struct([('a', pa.int8()), ('b', pa.int8())]), [{
        'a': 1,
        'b': 2
    }, None, {
        'a': 3,
        'b': 4
    }, None, {
        'a': 5,
        'b': 6
    }]),
]

exported_functions = [
    func for (name, func) in sorted(pc.__dict__.items())
Example #36
def test_writing_empty_lists():
    # ARROW-2591: [Python] Segmentation fault issue in pq.write_table
    arr1 = pa.array([[], []], pa.list_(pa.int32()))
    table = pa.Table.from_arrays([arr1], ['list(int32)'])
    _check_roundtrip(table)
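A self-contained variant of the same roundtrip, assuming pyarrow.parquet is available; it replaces the private test helper _check_roundtrip with an explicit in-memory write and read.

import pyarrow as pa
import pyarrow.parquet as pq

arr = pa.array([[], []], pa.list_(pa.int32()))
table = pa.Table.from_arrays([arr], ['list(int32)'])
sink = pa.BufferOutputStream()
pq.write_table(table, sink)
# Reading back from the in-memory buffer must reproduce the table.
assert pq.read_table(pa.BufferReader(sink.getvalue())).equals(table)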
Example #37
    (np.float16(1.0), pa.float16(), pa.HalfFloatScalar, pa.HalfFloatValue),
    (1.0, pa.float32(), pa.FloatScalar, pa.FloatValue),
    (decimal.Decimal("1.123"), None, pa.Decimal128Scalar, pa.Decimal128Value),
    (decimal.Decimal("1.1234567890123456789012345678901234567890"), None,
     pa.Decimal256Scalar, pa.Decimal256Value),
    ("string", None, pa.StringScalar, pa.StringValue),
    (b"bytes", None, pa.BinaryScalar, pa.BinaryValue),
    ("largestring", pa.large_string(), pa.LargeStringScalar,
     pa.LargeStringValue),
    (b"largebytes", pa.large_binary(), pa.LargeBinaryScalar,
     pa.LargeBinaryValue),
    (b"abc", pa.binary(3), pa.FixedSizeBinaryScalar, pa.FixedSizeBinaryValue),
    ([1, 2, 3], None, pa.ListScalar, pa.ListValue),
    ([1, 2, 3, 4], pa.large_list(
        pa.int8()), pa.LargeListScalar, pa.LargeListValue),
    ([1, 2, 3, 4, 5], pa.list_(
        pa.int8(), 5), pa.FixedSizeListScalar, pa.FixedSizeListValue),
    (datetime.date.today(), None, pa.Date32Scalar, pa.Date32Value),
    (datetime.date.today(), pa.date64(), pa.Date64Scalar, pa.Date64Value),
    (datetime.datetime.now(), None, pa.TimestampScalar, pa.TimestampValue),
    (datetime.datetime.now().time().replace(microsecond=0), pa.time32('s'),
     pa.Time32Scalar, pa.Time32Value),
    (datetime.datetime.now().time(), None, pa.Time64Scalar, pa.Time64Value),
    (datetime.timedelta(days=1), None, pa.DurationScalar, pa.DurationValue),
    ({
        'a': 1,
        'b': [1, 2]
    }, None, pa.StructScalar, pa.StructValue),
    ([('a', 1),
      ('b', 2)], pa.map_(pa.string(), pa.int8()), pa.MapScalar, pa.MapValue),
])
def test_basics(value, ty, klass, deprecated):
Example #38
def test_recordbatch_from_arrays_validate_schema():
    # ARROW-6263
    arr = pa.array([1, 2])
    schema = pa.schema([pa.field('f0', pa.list_(pa.utf8()))])
    with pytest.raises(NotImplementedError):
        pa.record_batch([arr], schema=schema)
Example #39
def test_list_type():
    ty = pa.list_(pa.int64())
    assert ty.value_type == pa.int64()
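A sketch of the related accessors, assuming a recent pyarrow: a ListType also exposes the full value field (as used by Example #50 below), not just its type.

import pyarrow as pa

ty = pa.list_(pa.field('item', pa.int64()))
assert ty.value_type == pa.int64()
assert ty.value_field.name == 'item'  # the child field, name included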
Example #40
def test_schema_pyarrow_types():
    field_name = "column1"
    metadata = {b"metadata_k": b"metadata_v"}
    pyarrow_field = pyarrow_field_from_dict({
        "name": field_name,
        "nullable": False,
        "metadata": metadata,
        "type": {
            "name": "int",
            "bitWidth": 8,
            "isSigned": True
        },
    })
    assert pyarrow_field.name == field_name
    assert pyarrow_field.type == pyarrow.int8()
    assert dict(pyarrow_field.metadata) == metadata
    assert pyarrow_field.nullable is False

    field_name = "column_timestamp_no_unit"
    metadata = {b"metadata_k": b"metadata_v"}
    pyarrow_field = pyarrow_field_from_dict({
        "name": field_name,
        "nullable": False,
        "metadata": metadata,
        "type": {
            "name": "timestamp"
        },
    })
    assert pyarrow_field.name == field_name
    assert pyarrow_field.type == pyarrow.timestamp("ns")
    assert dict(pyarrow_field.metadata) == metadata
    assert pyarrow_field.nullable is False

    field_name = "column_timestamp_with_unit"
    metadata = {b"metadata_k": b"metadata_v"}
    pyarrow_field = pyarrow_field_from_dict({
        "name": field_name,
        "nullable": False,
        "metadata": metadata,
        "type": {
            "name": "timestamp",
            "unit": "MICROSECOND"
        },
    })
    assert pyarrow_field.name == field_name
    assert pyarrow_field.type == pyarrow.timestamp("us")
    assert dict(pyarrow_field.metadata) == metadata
    assert pyarrow_field.nullable is False

    field_name = "date_with_day_unit"
    metadata = {b"metadata_k": b"metadata_v"}
    pyarrow_field = pyarrow_field_from_dict({
        "name": field_name,
        "nullable": False,
        "metadata": metadata,
        "type": {
            "name": "date",
            "unit": "DAY"
        },
    })
    assert pyarrow_field.name == field_name
    assert pyarrow_field.type == pyarrow.date32()
    assert dict(pyarrow_field.metadata) == metadata
    assert pyarrow_field.nullable is False

    field_name = "simple_list"
    pyarrow_field = pyarrow_field_from_dict({
        "name":
        field_name,
        "nullable":
        False,
        "metadata":
        metadata,
        "type": {
            "name": "list"
        },
        "children": [{
            "type": {
                "name": "int",
                "bitWidth": 32,
                "isSigned": True
            }
        }],
    })
    assert pyarrow_field.name == field_name
    assert pyarrow_field.type == pyarrow.list_(
        pyarrow.field("element", pyarrow.int32()))
    assert pyarrow_field.metadata == metadata
    assert pyarrow_field.nullable is False

    field_name = "dictionary"
    pyarrow_field = pyarrow_field_from_dict({
        "name": field_name,
        "nullable": False,
        "metadata": metadata,
        "type": {
            "name": "int",
            "bitWidth": 32,
            "isSigned": True
        },
        "children": [],
        "dictionary": {
            "id": 0,
            "indexType": {
                "name": "int",
                "bitWidth": 16,
                "isSigned": True
            },
        },
    })
    assert pyarrow_field.name == field_name
    assert pyarrow_field.type == pyarrow.map_(pyarrow.int16(), pyarrow.int32())
    assert pyarrow_field.metadata == metadata
    assert pyarrow_field.nullable is False

    field_name = "struct_array"
    pyarrow_field = pyarrow_field_from_dict({
        "name": field_name,
        "nullable": False,
        "metadata": metadata,
        "type": {
            "name": "list"
        },
        "children": [],
        "dictionary": {
            "id": 0,
            "indexType": {
                "name": "int",
                "bitWidth": 32,
                "isSigned": True
            },
        },
    })
    assert pyarrow_field.name == field_name
    assert pyarrow_field.type == pyarrow.map_(
        pyarrow.int32(),
        pyarrow.list_(
            pyarrow.field(
                "element",
                pyarrow.struct(
                    [pyarrow.field("val", pyarrow.int32(), False, metadata)]),
            )),
    )
    assert pyarrow_field.metadata == metadata
    assert pyarrow_field.nullable is False

    field_name = "simple_dictionary"
    pyarrow_field = pyarrow_field_from_dict({
        "name":
        field_name,
        "metadata": {
            "metadata_k": "metadata_v"
        },
        "nullable":
        False,
        "type": {
            "name": "dictionary"
        },
        "dictionary": {
            "indexType": {
                "type": {
                    "name": "int",
                    "bitWidth": 8
                }
            }
        },
        "children": [{
            "type": {
                "name": "int",
                "bitWidth": 32
            }
        }],
    })
    assert pyarrow_field.name == field_name
    assert pyarrow_field.type == pyarrow.map_(pyarrow.int8(), pyarrow.int32())
    assert pyarrow_field.metadata == metadata
    assert pyarrow_field.nullable is False

    pyarrow_field = pyarrow_field_from_dict({
        "name":
        field_name,
        "type": {
            "name": "struct"
        },
        "children": [{
            "name": "x",
            "type": {
                "name": "int",
                "bitWidth": 64
            },
            "nullable": True,
            "metadata": {},
        }],
        "metadata": {
            "metadata_k": "metadata_v"
        },
        "nullable":
        False,
    })
    assert pyarrow_field.name == field_name
    assert pyarrow_field.type == pyarrow.struct(
        [pyarrow.field("x", pyarrow.int64(), True, {})])
    assert pyarrow_field.metadata == metadata
    assert pyarrow_field.nullable is False
Example #41
def test_list_with_non_list(seq):
    # List types don't accept non-sequences
    with pytest.raises(pa.ArrowTypeError):
        pa.array(seq([[], [1, 2], 3]), type=pa.list_(pa.int64()))
Example #42
def test_is_null():
    assert types.is_null(pa.null())
    assert not types.is_null(pa.list_(pa.int32()))
Example #43
     )),
 type_fails_on_stable_pandas(
     FletcherTestType(
         pa.float64(),
         [2.5, 1.0, -1.0, 0, 66.6] * 20,
         [None, 1.1],
         [2.5, 2.5, None, None, -100.1, -100.1, 2.5, 100.1],
         [2.5, 100.99, -10.1],
         [2.5, None, -10.1],
         lambda: choices([2.5, 1.0, -1.0, 0, 66.6], k=10),
     )),
 # Most of the tests fail as assert_extension_array_equal casts to numpy object
 # arrays and on them equality is not defined.
 pytest.param(
     FletcherTestType(
         pa.list_(pa.string()),
         [["B", "C"], ["A"], [None], ["A", "A"], []],
         [None, ["A"]],
         [["B"], ["B"], None, None, ["A"], ["A"], ["B"], ["C"]],
         [["B"], ["C"], ["A"]],
         [["B"], None, ["A"]],
         lambda: choices([["B", "C"], ["A"], [None], ["A", "A"]], k=10),
     ),
     marks=pytest.mark.xfail,
 ),
 FletcherTestType(
     pa.date64(),
     [
         datetime.date(2015, 1, 1),
         datetime.date(2010, 12, 31),
         datetime.date(1970, 1, 1),
Example #44
def test_list_from_numpy():
    s = pa.scalar(np.array([1, 2, 3], dtype=np.int64()))
    assert s.type == pa.list_(pa.int64())
    assert s.as_py() == [1, 2, 3]
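A sketch extrapolating the example above; the assumption is that inference follows the numpy array's dtype for the list value type.

import numpy as np
import pyarrow as pa

# A float64 numpy array should infer a list_(float64) scalar.
s = pa.scalar(np.array([1.5, 2.5], dtype=np.float64))
assert s.type == pa.list_(pa.float64())
assert s.as_py() == [1.5, 2.5]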
Example #45
def test_empty_lists_table_roundtrip(use_legacy_dataset):
    # ARROW-2744: Shouldn't crash when writing an array of empty lists
    arr = pa.array([[], []], type=pa.list_(pa.int32()))
    table = pa.Table.from_arrays([arr], ["A"])
    _check_roundtrip(table, use_legacy_dataset=use_legacy_dataset)
Example #46
 def __arrow_array__(self, type=None):
     """This function is called when calling pa.array(typed_sequence)"""
     assert type is None, "TypedSequence is supposed to be used with pa.array(typed_sequence, type=None)"
     trying_type = False
     if type is not None:  # user explicitly passed the feature
         pass
     elif type is None and self.try_type:
         type = self.try_type
         trying_type = True
     else:
         type = self.type
     try:
         if isinstance(type, _ArrayXDExtensionType):
             if isinstance(self.data, np.ndarray):
                 storage = numpy_to_pyarrow_listarray(self.data, type=type.value_type)
             else:
                 storage = pa.array(self.data, type.storage_dtype)
             out = pa.ExtensionArray.from_storage(type, storage)
         elif isinstance(self.data, np.ndarray):
             out = numpy_to_pyarrow_listarray(self.data)
         elif isinstance(self.data, list) and self.data and isinstance(self.data[0], np.ndarray):
             out = list_of_np_array_to_pyarrow_listarray(self.data)
         else:
             out = pa.array(cast_to_python_objects(self.data, only_1d_for_numpy=True), type=type)
         if trying_type and out[0].as_py() != self.data[0]:
             raise TypeError(
                 "Specified try_type alters data. Please check that the type/feature that you provided match the type/features of the data."
             )
         if self.optimized_int_type and self.type is None and self.try_type is None:
             if pa.types.is_int64(out.type):
                 out = out.cast(self.optimized_int_type)
             elif pa.types.is_list(out.type):
                 if pa.types.is_int64(out.type.value_type):
                     out = out.cast(pa.list_(self.optimized_int_type))
                 elif pa.types.is_list(out.type.value_type) and pa.types.is_int64(out.type.value_type.value_type):
                     out = out.cast(pa.list_(pa.list_(self.optimized_int_type)))
         return out
     except (TypeError, pa.lib.ArrowInvalid) as e:  # handle type errors and overflows
         if trying_type:
             try:  # second chance
                 if isinstance(self.data, np.ndarray):
                     return numpy_to_pyarrow_listarray(self.data, type=None)
                 else:
                     return pa.array(self.data, type=None)
             except pa.lib.ArrowInvalid as e:
                 if "overflow" in str(e):
                     raise OverflowError(
                         "There was an overflow with type {}. Try to reduce writer_batch_size to have batches smaller than 2GB.\n({})".format(
                             type_(self.data), e
                         )
                     ) from None
                 else:
                     raise
         elif "overflow" in str(e):
             raise OverflowError(
                 "There was an overflow with type {}. Try to reduce writer_batch_size to have batches smaller than 2GB.\n({})".format(
                     type_(self.data), e
                 )
             ) from None
         else:
             raise
Example #47
 def __call__(self):
     return pa.struct({
         "language": pa.list_(pa.string()),
         "translation": pa.list_(pa.string())
     })
Example #48
 def ArrowSchema(self):
     return pa.schema(
         [pa.field(c, pa.list_(pa.int32())) for c in self._columns])
Example #49
def test_store_schema_metadata(store, df_all_types):
    store_schema_metadata(
        schema=make_meta(df_all_types, origin="df_all_types"),
        dataset_uuid="some_uuid",
        store=store,
        table="some_table",
    )

    key = "some_uuid/some_table/_common_metadata"
    assert key in store.keys()
    pq_file = pq.ParquetFile(store.open(key))
    actual_schema = pq_file.schema.to_arrow_schema()
    fields = [
        pa.field("array_float32", pa.list_(pa.float64())),
        pa.field("array_float64", pa.list_(pa.float64())),
        pa.field("array_int16", pa.list_(pa.int64())),
        pa.field("array_int32", pa.list_(pa.int64())),
        pa.field("array_int64", pa.list_(pa.int64())),
        pa.field("array_int8", pa.list_(pa.int64())),
        pa.field("array_uint16", pa.list_(pa.uint64())),
        pa.field("array_uint32", pa.list_(pa.uint64())),
        pa.field("array_uint64", pa.list_(pa.uint64())),
        pa.field("array_uint8", pa.list_(pa.uint64())),
        pa.field("array_unicode", pa.list_(pa.string())),
        pa.field("bool", pa.bool_()),
        pa.field("byte", pa.binary()),
        pa.field("date", pa.date32()),
        pa.field("datetime64", pa.timestamp("us")),
        pa.field("float32", pa.float64()),
        pa.field("float64", pa.float64()),
        pa.field("int16", pa.int64()),
        pa.field("int32", pa.int64()),
        pa.field("int64", pa.int64()),
        pa.field("int8", pa.int64()),
        pa.field("null", pa.null()),
        pa.field("uint16", pa.uint64()),
        pa.field("uint32", pa.uint64()),
        pa.field("uint64", pa.uint64()),
        pa.field("uint8", pa.uint64()),
        pa.field("unicode", pa.string()),
    ]
    if not ARROW_LARGER_EQ_0130:
        fields.append(pa.field("__index_level_0__", pa.int64()))
    expected_schema = pa.schema(fields)

    assert actual_schema.remove_metadata() == expected_schema
Example #50
def test_field_id_metadata():
    # ARROW-7080
    field_id = b'PARQUET:field_id'
    inner = pa.field('inner', pa.int32(), metadata={field_id: b'100'})
    middle = pa.field('middle',
                      pa.struct([inner]),
                      metadata={field_id: b'101'})
    fields = [
        pa.field('basic',
                 pa.int32(),
                 metadata={
                     b'other': b'abc',
                     field_id: b'1'
                 }),
        pa.field('list',
                 pa.list_(
                     pa.field('list-inner',
                              pa.int32(),
                              metadata={field_id: b'10'})),
                 metadata={field_id: b'11'}),
        pa.field('struct', pa.struct([middle]), metadata={field_id: b'102'}),
        pa.field('no-metadata', pa.int32()),
        pa.field('non-integral-field-id',
                 pa.int32(),
                 metadata={field_id: b'xyz'}),
        pa.field('negative-field-id',
                 pa.int32(),
                 metadata={field_id: b'-1000'})
    ]
    arrs = [[] for _ in fields]
    table = pa.table(arrs, schema=pa.schema(fields))

    bio = pa.BufferOutputStream()
    pq.write_table(table, bio)
    contents = bio.getvalue()

    pf = pq.ParquetFile(pa.BufferReader(contents))
    schema = pf.schema_arrow

    assert schema[0].metadata[field_id] == b'1'
    assert schema[0].metadata[b'other'] == b'abc'

    list_field = schema[1]
    assert list_field.metadata[field_id] == b'11'

    list_item_field = list_field.type.value_field
    assert list_item_field.metadata[field_id] == b'10'

    struct_field = schema[2]
    assert struct_field.metadata[field_id] == b'102'

    struct_middle_field = struct_field.type[0]
    assert struct_middle_field.metadata[field_id] == b'101'

    struct_inner_field = struct_middle_field.type[0]
    assert struct_inner_field.metadata[field_id] == b'100'

    assert schema[3].metadata is None
    # Invalid input is passed through (ok) but does not
    # have field_id in parquet (not tested)
    assert schema[4].metadata[field_id] == b'xyz'
    assert schema[5].metadata[field_id] == b'-1000'
Example #51
def from_ibis_set(dtype):
    return pa.list_(to_pyarrow_type(dtype.value_type))
Example #52
def test_struct_from_dicts_inference():
    expected_type = pa.struct([
        pa.field('a', pa.int64()),
        pa.field('b', pa.string()),
        pa.field('c', pa.bool_())
    ])
    data = [{
        'a': 5,
        'b': u'foo',
        'c': True
    }, {
        'a': 6,
        'b': u'bar',
        'c': False
    }]
    arr = pa.array(data)
    check_struct_type(arr.type, expected_type)
    assert arr.to_pylist() == data

    # With omitted values
    data = [{'a': 5, 'c': True}, None, {}, {'a': None, 'b': u'bar'}]
    expected = [{
        'a': 5,
        'b': None,
        'c': True
    }, None, {
        'a': None,
        'b': None,
        'c': None
    }, {
        'a': None,
        'b': u'bar',
        'c': None
    }]
    arr = pa.array(data)
    check_struct_type(arr.type, expected_type)
    assert arr.to_pylist() == expected

    # Nested
    expected_type = pa.struct([
        pa.field(
            'a',
            pa.struct([
                pa.field('aa', pa.list_(pa.int64())),
                pa.field('ab', pa.bool_())
            ])),
        pa.field('b', pa.string())
    ])
    data = [{
        'a': {
            'aa': [5, 6],
            'ab': True
        },
        'b': 'foo'
    }, {
        'a': {
            'aa': None,
            'ab': False
        },
        'b': None
    }, {
        'a': None,
        'b': 'bar'
    }]
    arr = pa.array(data)
    assert arr.to_pylist() == data

    # Edge cases
    arr = pa.array([{}])
    assert arr.type == pa.struct([])
    assert arr.to_pylist() == [{}]

    # Mixing structs and scalars is rejected
    with pytest.raises(pa.ArrowInvalid):
        pa.array([1, {'a': 2}])
Example #53
def test_query_indices_external(store, metadata_version):
    expected = {
        "dataset_metadata_version": metadata_version,
        "dataset_uuid": "uuid+namespace-attribute12_underscored",
        "partitions": {
            "part_1": {
                "files": {
                    "core_data": "file.parquest"
                }
            },
            "part_2": {
                "files": {
                    "core_data": "file2.parquest"
                }
            },
        },
        "indices": {
            "product_id":
            "uuid+namespace-attribute12_underscored.product_id.by-dataset-index.parquet",
            "location_id": {
                "1": ["part_1"],
                "2": ["part_2"],
                "3": ["part_1"],
                "4": ["part_2"],
            },
        },
    }
    store.put(
        "uuid+namespace-attribute12_underscored.by-dataset-metadata.json",
        simplejson.dumps(expected).encode("utf-8"),
    )
    df = pd.DataFrame({
        "product_id": [1, 2, 100, 34],
        "partition": [
            np.array(["part_1"], dtype=object),
            np.array(["part_2"], dtype=object),
            np.array(["part_1", "part_2"], dtype=object),
            np.array(["part_1"], dtype=object),
        ],
    })
    schema = pa.schema([
        pa.field("partition", pa.list_(pa.string())),
        pa.field("product_id", pa.int64()),
    ])
    table = pa.Table.from_pandas(df, schema=schema)
    buf = pa.BufferOutputStream()
    pq.write_table(table, buf)
    store.put(
        "uuid+namespace-attribute12_underscored.product_id.by-dataset-index.parquet",
        buf.getvalue().to_pybytes(),
    )
    store_schema_metadata(
        make_meta(df, origin="core"),
        "uuid+namespace-attribute12_underscored",
        store,
        "core_data",
    )

    dmd = DatasetMetadata.load_from_store(
        "uuid+namespace-attribute12_underscored", store)

    dmd = dmd.load_index("product_id", store)
    assert dmd.query(product_id=2) == ["part_2"]
    dmd = dmd.load_all_indices(store)
    assert dmd.query(product_id=2, location_id=2) == ["part_2"]
    assert dmd.query(product_id=100, location_id=3) == ["part_1"]
    assert dmd.query(product_id=2, location_id=2,
                     something_else="bla") == ["part_2"]

    additional_index = ExplicitSecondaryIndex.from_v2(
        "another_column", {"1": ["part_2", "part_3"]})
    assert dmd.query(indices=[additional_index],
                     another_column="1",
                     product_id=2,
                     location_id=2) == ["part_2"]
Example #54
    pa.float32().id:
    float,
    pa.float64().id:
    float,
    pa.date32().id:
    datetime.date,
    pa.date64().id:
    datetime.date,
    pa.timestamp("ms").id:
    datetime.datetime,
    pa.binary().id:
    six.binary_type,
    pa.string().id:
    six.text_type,
    # Use any list type here, only LIST is important
    pa.list_(pa.string()).id:
    list,
}

_string_type_map = {"date64[ms]": pa.date64(), "string": pa.string()}


class FletcherDtype(ExtensionDtype):
    def __init__(self, arrow_dtype):
        self.arrow_dtype = arrow_dtype

    def __hash__(self):
        return hash(self.arrow_dtype)

    def __str__(self):
        return "fletcher[{}]".format(self.arrow_dtype)
Example #55
def test_nested_arrays(seq):
    arr = pa.array(seq([np.array([], dtype=int), np.array([1, 2]), None]))
    assert len(arr) == 3
    assert arr.null_count == 1
    assert arr.type == pa.list_(pa.int64())
    assert arr.to_pylist() == [[], [1, 2], None]
Example #56
    pa.bool_(),
    pa.int32(),
    pa.time32('s'),
    pa.time64('us'),
    pa.date32(),
    pa.timestamp('us'),
    pa.timestamp('us', tz='UTC'),
    pa.timestamp('us', tz='Europe/Paris'),
    pa.float16(),
    pa.float32(),
    pa.float64(),
    pa.decimal128(19, 4),
    pa.string(),
    pa.binary(),
    pa.binary(10),
    pa.list_(pa.int32()),
    pa.struct([
        pa.field('a', pa.int32()),
        pa.field('b', pa.int8()),
        pa.field('c', pa.string())
    ]),
    pa.union([pa.field('a', pa.binary(10)),
              pa.field('b', pa.string())],
             mode=pa.lib.UnionMode_DENSE),
    pa.union([pa.field('a', pa.binary(10)),
              pa.field('b', pa.string())],
             mode=pa.lib.UnionMode_SPARSE),
    # XXX Needs array pickling
    # pa.dictionary(pa.int32(), pa.array(['a', 'b', 'c'])),
]
Example #57
 def arrow_type(self):
     if isinstance(self._value_type, ArrowDtype):
         arrow_subdtype = self._value_type.arrow_type
     else:
         arrow_subdtype = pa.from_numpy_dtype(self._value_type)
     return pa.list_(arrow_subdtype)
Example #58
    d = pa.array([0, 2, 0, 3], type=pa.int32())

    eq([a], [a])
    ne([a], [b])
    eq([a, c], [a, c])
    eq([a, c], [d])
    ne([c, a], [a, c])

    assert not pa.chunked_array([], type=pa.int32()).equals(None)


@pytest.mark.parametrize(
    ('data', 'typ'),
    [([True, False, True, True], pa.bool_()), ([1, 2, 4, 6], pa.int64()),
     ([1.0, 2.5, None], pa.float64()), (['a', None, 'b'], pa.string()),
     ([], pa.list_(pa.uint8())), ([[1, 2], [3]], pa.list_(pa.int64())),
     ([['a'], None, ['b', 'c']], pa.list_(pa.string())),
     ([(1, 'a'), (2, 'c'), None
       ], pa.struct([pa.field('a', pa.int64()),
                     pa.field('b', pa.string())]))])
def test_chunked_array_pickle(data, typ):
    arrays = []
    while data:
        arrays.append(pa.array(data[:2], type=typ))
        data = data[2:]
    array = pa.chunked_array(arrays, type=typ)
    array.validate()
    result = pickle.loads(pickle.dumps(array))
    result.validate()
    assert result.equals(array)
Example #59
def test_is_list():
    assert types.is_list(pa.list_(pa.int32()))
    assert not types.is_list(pa.int32())
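A sketch combining the predicates from the surrounding examples (see Examples #8 and #31), assuming a recent pyarrow: list types are nested but not primitive.

import pyarrow as pa
import pyarrow.types as types

ty = pa.list_(pa.int32())
assert types.is_list(ty)
assert types.is_nested(ty)        # lists count as nested types
assert not types.is_primitive(ty)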
Example #60
# limitations under the License.
#

""" A DoFn that coverts a batch of features into an Arrow table."""


import apache_beam as beam
import pyarrow as pa

from typing import Dict, List, Mapping, Text, Union
from tensorflow_metadata.proto.v0 import schema_pb2
from tensorflow_metadata.proto.v0 import statistics_pb2

_ARROW_TYPE_MAP = {
    ColumnType.UNKNOWN: pa.null(),
    ColumnType.INT: pa.list_(pa.int64()),
    ColumnType.FLOAT: pa.list_(pa.float32()),
    ColumnType.STRING: pa.list_(pa.binary()),
}


SimpleFeatureList = List[Union[int, str, float, bool]]
ColumnName = Union[bytes, Text]

@beam.typehints.with_input_types(List[SimpleFeatureList])
@beam.typehints.with_output_types(pa.RecordBatch)
class BatchedFeatureListsToRecordBatch(beam.DoFn):
    """A DoFn to convert a batch of input instances in a feature
    list format to an Arrow table.
    """