Example #1
def test_type_to_pandas_dtype():
    M8_ns = np.dtype('datetime64[ns]')
    cases = [
        (pa.null(), np.float64),
        (pa.bool_(), np.bool_),
        (pa.int8(), np.int8),
        (pa.int16(), np.int16),
        (pa.int32(), np.int32),
        (pa.int64(), np.int64),
        (pa.uint8(), np.uint8),
        (pa.uint16(), np.uint16),
        (pa.uint32(), np.uint32),
        (pa.uint64(), np.uint64),
        (pa.float16(), np.float16),
        (pa.float32(), np.float32),
        (pa.float64(), np.float64),
        (pa.date32(), M8_ns),
        (pa.date64(), M8_ns),
        (pa.timestamp('ms'), M8_ns),
        (pa.binary(), np.object_),
        (pa.binary(12), np.object_),
        (pa.string(), np.object_),
        (pa.list_(pa.int8()), np.object_),
    ]
    for arrow_type, numpy_type in cases:
        assert arrow_type.to_pandas_dtype() == numpy_type
Example #2
def test_schema():
    fields = [
        pa.field('foo', pa.int32()),
        pa.field('bar', pa.string()),
        pa.field('baz', pa.list_(pa.int8()))
    ]
    sch = pa.schema(fields)

    assert sch.names == ['foo', 'bar', 'baz']
    assert sch.types == [pa.int32(), pa.string(), pa.list_(pa.int8())]

    assert len(sch) == 3
    assert sch[0].name == 'foo'
    assert sch[0].type == fields[0].type
    assert sch.field_by_name('foo').name == 'foo'
    assert sch.field_by_name('foo').type == fields[0].type

    assert repr(sch) == """\
foo: int32
bar: string
baz: list<item: int8>
  child 0, item: int8"""

    with pytest.raises(TypeError):
        pa.schema([None])
Example #3
def test_field_metadata():
    f1 = pa.field('a', pa.int8())
    f2 = pa.field('a', pa.int8(), metadata={})
    f3 = pa.field('a', pa.int8(), metadata={b'bizz': b'bazz'})

    assert f1.metadata is None
    assert f2.metadata == {}
    assert f3.metadata[b'bizz'] == b'bazz'
Example #4
def test_field_equality_operators():
    f1 = pa.field('a', pa.int8(), nullable=True)
    f2 = pa.field('a', pa.int8(), nullable=True)
    f3 = pa.field('b', pa.int8(), nullable=True)
    f4 = pa.field('b', pa.int8(), nullable=False)

    assert f1 == f2
    assert f1 != f3
    assert f3 != f4
    assert f1 != 'foo'
Example #5
def test_dictionary_type():
    ty0 = pa.dictionary(pa.int32(), pa.array(['a', 'b', 'c']))
    assert ty0.index_type == pa.int32()
    assert isinstance(ty0.dictionary, pa.Array)
    assert ty0.dictionary.to_pylist() == ['a', 'b', 'c']
    assert ty0.ordered is False

    ty1 = pa.dictionary(pa.int8(), pa.array([1.0, 2.0]), ordered=True)
    assert ty1.index_type == pa.int8()
    assert isinstance(ty1.dictionary, pa.Array)
    assert ty1.dictionary.to_pylist() == [1.0, 2.0]
    assert ty1.ordered is True
Example #6
File: jvm.py Project: rok/arrow
def _from_jvm_int_type(jvm_type):
    """
    Convert a JVM int type to its Python equivalent.

    Parameters
    ----------
    jvm_type: org.apache.arrow.vector.types.pojo.ArrowType$Int

    Returns
    -------
    typ: pyarrow.DataType
    """
    if jvm_type.isSigned:
        if jvm_type.bitWidth == 8:
            return pa.int8()
        elif jvm_type.bitWidth == 16:
            return pa.int16()
        elif jvm_type.bitWidth == 32:
            return pa.int32()
        elif jvm_type.bitWidth == 64:
            return pa.int64()
    else:
        if jvm_type.bitWidth == 8:
            return pa.uint8()
        elif jvm_type.bitWidth == 16:
            return pa.uint16()
        elif jvm_type.bitWidth == 32:
            return pa.uint32()
        elif jvm_type.bitWidth == 64:
            return pa.uint64()
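The converter above only reads the isSigned and bitWidth attributes of the JVM POJO, so its behavior can be sketched with a plain Python stand-in; FakeJvmInt below is a hypothetical test double, not part of Arrow:

# Hypothetical stand-in for org.apache.arrow.vector.types.pojo.ArrowType$Int,
# exposing only the two attributes _from_jvm_int_type reads.
class FakeJvmInt:
    def __init__(self, bit_width, is_signed):
        self.bitWidth = bit_width
        self.isSigned = is_signed

assert _from_jvm_int_type(FakeJvmInt(32, True)) == pa.int32()
assert _from_jvm_int_type(FakeJvmInt(16, False)) == pa.uint16()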
Example #7
def test_is_union():
    for mode in [pa.lib.UnionMode_SPARSE, pa.lib.UnionMode_DENSE]:
        assert types.is_union(pa.union([pa.field('a', pa.int32()),
                                        pa.field('b', pa.int8()),
                                        pa.field('c', pa.string())],
                                       mode=mode))
    assert not types.is_union(pa.list_(pa.int32()))
Example #8
def test_type_schema_pickling():
    cases = [
        pa.int8(),
        pa.string(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.string()),
        pa.struct([
            pa.field('a', 'int8'),
            pa.field('b', 'string')
        ]),
        pa.time32('s'),
        pa.time64('us'),
        pa.date32(),
        pa.date64(),
        pa.timestamp('ms'),
        pa.timestamp('ns'),
        pa.decimal(12, 2),
        pa.field('a', 'string', metadata={b'foo': b'bar'})
    ]

    for val in cases:
        roundtripped = pickle.loads(pickle.dumps(val))
        assert val == roundtripped

    fields = []
    for i, f in enumerate(cases):
        if isinstance(f, pa.Field):
            fields.append(f)
        else:
            fields.append(pa.field('_f{}'.format(i), f))

    schema = pa.schema(fields, metadata={b'foo': b'bar'})
    roundtripped = pickle.loads(pickle.dumps(schema))
    assert schema == roundtripped
Example #9
def test_dictionary_type():
    ty0 = pa.dictionary(pa.int32(), pa.string())
    assert ty0.index_type == pa.int32()
    assert ty0.value_type == pa.string()
    assert ty0.ordered is False

    ty1 = pa.dictionary(pa.int8(), pa.float64(), ordered=True)
    assert ty1.index_type == pa.int8()
    assert ty1.value_type == pa.float64()
    assert ty1.ordered is True

    # construct from non-arrow objects
    ty2 = pa.dictionary('int8', 'string')
    assert ty2.index_type == pa.int8()
    assert ty2.value_type == pa.string()
    assert ty2.ordered is False
Example #10
def test_struct_array_slice():
    # ARROW-2311: slicing nested arrays needs special care
    ty = pa.struct([pa.field('a', pa.int8()),
                    pa.field('b', pa.float32())])
    arr = pa.array([(1, 2.5), (3, 4.5), (5, 6.5)], type=ty)
    assert arr[1:].to_pylist() == [{'a': 3, 'b': 4.5},
                                   {'a': 5, 'b': 6.5}]
Example #11
def test_empty_cast():
    types = [
        pa.null(),
        pa.bool_(),
        pa.int8(),
        pa.int16(),
        pa.int32(),
        pa.int64(),
        pa.uint8(),
        pa.uint16(),
        pa.uint32(),
        pa.uint64(),
        pa.float16(),
        pa.float32(),
        pa.float64(),
        pa.date32(),
        pa.date64(),
        pa.binary(),
        pa.binary(length=4),
        pa.string(),
    ]

    for (t1, t2) in itertools.product(types, types):
        try:
            # ARROW-4766: Ensure that supported type conversions don't
            # segfault on empty arrays of common types
            pa.array([], type=t1).cast(t2)
        except pa.lib.ArrowNotImplementedError:
            continue
Example #12
def test_buffers_nested():
    a = pa.array([[1, 2], None, [3, None, 4, 5]], type=pa.list_(pa.int64()))
    buffers = a.buffers()
    assert len(buffers) == 4
    # The parent buffers
    null_bitmap = buffers[0].to_pybytes()
    assert bytearray(null_bitmap)[0] == 0b00000101
    offsets = buffers[1].to_pybytes()
    assert struct.unpack('4i', offsets) == (0, 2, 2, 6)
    # The child buffers
    null_bitmap = buffers[2].to_pybytes()
    assert bytearray(null_bitmap)[0] == 0b00110111
    values = buffers[3].to_pybytes()
    assert struct.unpack('qqq8xqq', values) == (1, 2, 3, 4, 5)

    a = pa.array([(42, None), None, (None, 43)],
                 type=pa.struct([pa.field('a', pa.int8()),
                                 pa.field('b', pa.int16())]))
    buffers = a.buffers()
    assert len(buffers) == 5
    # The parent buffer
    null_bitmap = buffers[0].to_pybytes()
    assert bytearray(null_bitmap)[0] == 0b00000101
    # The child buffers: 'a'
    null_bitmap = buffers[1].to_pybytes()
    assert bytearray(null_bitmap)[0] == 0b00000001
    values = buffers[2].to_pybytes()
    assert struct.unpack('bxx', values) == (42,)
    # The child buffers: 'b'
    null_bitmap = buffers[3].to_pybytes()
    assert bytearray(null_bitmap)[0] == 0b00000100
    values = buffers[4].to_pybytes()
    assert struct.unpack('4xh', values) == (43,)
Example #13
def test_cast_from_null():
    in_data = [None] * 3
    in_type = pa.null()
    out_types = [
        pa.null(),
        pa.uint8(),
        pa.float16(),
        pa.utf8(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.int16()),
        pa.decimal128(19, 4),
        pa.timestamp('us'),
        pa.timestamp('us', tz='UTC'),
        pa.timestamp('us', tz='Europe/Paris'),
        pa.struct([pa.field('a', pa.int32()),
                   pa.field('b', pa.list_(pa.int8())),
                   pa.field('c', pa.string())]),
        ]
    for out_type in out_types:
        _check_cast_case((in_data, in_type, in_data, out_type))

    out_types = [
        pa.dictionary(pa.int32(), pa.string()),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())], mode=pa.lib.UnionMode_DENSE),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())], mode=pa.lib.UnionMode_SPARSE),
        ]
    in_arr = pa.array(in_data, type=pa.null())
    for out_type in out_types:
        with pytest.raises(NotImplementedError):
            in_arr.cast(out_type)
Example #14
def test_orcfile_empty():
    from pyarrow import orc
    f = orc.ORCFile(path_for_orc_example('TestOrcFile.emptyFile'))
    table = f.read()
    assert table.num_rows == 0
    schema = table.schema
    expected_schema = pa.schema([
        ('boolean1', pa.bool_()),
        ('byte1', pa.int8()),
        ('short1', pa.int16()),
        ('int1', pa.int32()),
        ('long1', pa.int64()),
        ('float1', pa.float32()),
        ('double1', pa.float64()),
        ('bytes1', pa.binary()),
        ('string1', pa.string()),
        ('middle', pa.struct([
            ('list', pa.list_(pa.struct([
                ('int1', pa.int32()),
                ('string1', pa.string()),
                ]))),
            ])),
        ('list', pa.list_(pa.struct([
            ('int1', pa.int32()),
            ('string1', pa.string()),
            ]))),
        ('map', pa.list_(pa.struct([
            ('key', pa.string()),
            ('value', pa.struct([
                ('int1', pa.int32()),
                ('string1', pa.string()),
                ])),
            ]))),
        ])
    assert schema == expected_schema
Example #15
    def test_array_from_pandas_type_cast(self):
        arr = np.arange(10, dtype='int64')

        target_type = pa.int8()

        result = pa.array(arr, type=target_type)
        expected = pa.array(arr.astype('int8'))
        assert result.equals(expected)
Example #16
def test_schema_from_tuples():
    fields = [
        ('foo', pa.int32()),
        ('bar', pa.string()),
        ('baz', pa.list_(pa.int8())),
    ]
    sch = pa.schema(fields)
    assert sch.names == ['foo', 'bar', 'baz']
    assert sch.types == [pa.int32(), pa.string(), pa.list_(pa.int8())]
    assert len(sch) == 3
    assert repr(sch) == """\
foo: int32
bar: string
baz: list<item: int8>
  child 0, item: int8"""

    with pytest.raises(TypeError):
        pa.schema([('foo', None)])
Example #17
def dataframe_with_arrays(include_index=False):
    """
    DataFrame with numpy array columns of every possible primitive type.

    Returns
    -------
    df: pandas.DataFrame
    schema: pyarrow.Schema
        Arrow schema definition that is in line with the constructed df.
    """
    dtypes = [('i1', pa.int8()), ('i2', pa.int16()),
              ('i4', pa.int32()), ('i8', pa.int64()),
              ('u1', pa.uint8()), ('u2', pa.uint16()),
              ('u4', pa.uint32()), ('u8', pa.uint64()),
              ('f4', pa.float32()), ('f8', pa.float64())]

    arrays = OrderedDict()
    fields = []
    for dtype, arrow_dtype in dtypes:
        fields.append(pa.field(dtype, pa.list_(arrow_dtype)))
        arrays[dtype] = [
            np.arange(10, dtype=dtype),
            np.arange(5, dtype=dtype),
            None,
            np.arange(1, dtype=dtype)
        ]

    fields.append(pa.field('str', pa.list_(pa.string())))
    arrays['str'] = [
        np.array([u"1", u"ä"], dtype="object"),
        None,
        np.array([u"1"], dtype="object"),
        np.array([u"1", u"2", u"3"], dtype="object")
    ]

    fields.append(pa.field('datetime64', pa.list_(pa.timestamp('ms'))))
    arrays['datetime64'] = [
        np.array(['2007-07-13T01:23:34.123456789',
                  None,
                  '2010-08-13T05:46:57.437699912'],
                 dtype='datetime64[ms]'),
        None,
        None,
        np.array(['2007-07-13T02',
                  None,
                  '2010-08-13T05:46:57.437699912'],
                 dtype='datetime64[ms]'),
    ]

    if include_index:
        fields.append(pa.field('__index_level_0__', pa.int64()))
    df = pd.DataFrame(arrays)
    schema = pa.schema(fields)

    return df, schema
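A minimal sketch of how such a helper is typically consumed in a round-trip test; pa.Table.from_pandas is standard PyArrow, while the schema comparison mirrors the helper's contract:

# Build the frame and its expected Arrow schema, then convert and compare.
df, schema = dataframe_with_arrays()
table = pa.Table.from_pandas(df, schema=schema, preserve_index=False)
assert table.schema.equals(schema)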
Example #18
def test_bit_width():
    for ty, expected in [(pa.bool_(), 1),
                         (pa.int8(), 8),
                         (pa.uint32(), 32),
                         (pa.float16(), 16),
                         (pa.decimal128(19, 4), 128),
                         (pa.binary(42), 42 * 8)]:
        assert ty.bit_width == expected
    for ty in [pa.binary(), pa.string(), pa.list_(pa.int16())]:
        with pytest.raises(ValueError, match="fixed width"):
            ty.bit_width
Example #19
def test_is_nested_or_struct():
    struct_ex = pa.struct([pa.field('a', pa.int32()),
                           pa.field('b', pa.int8()),
                           pa.field('c', pa.string())])

    assert types.is_struct(struct_ex)
    assert not types.is_struct(pa.list_(pa.int32()))

    assert types.is_nested(struct_ex)
    assert types.is_nested(pa.list_(pa.int32()))
    assert not types.is_nested(pa.int32())
Example #20
def test_type_for_alias():
    cases = [
        ('i1', pa.int8()),
        ('int8', pa.int8()),
        ('i2', pa.int16()),
        ('int16', pa.int16()),
        ('i4', pa.int32()),
        ('int32', pa.int32()),
        ('i8', pa.int64()),
        ('int64', pa.int64()),
        ('u1', pa.uint8()),
        ('uint8', pa.uint8()),
        ('u2', pa.uint16()),
        ('uint16', pa.uint16()),
        ('u4', pa.uint32()),
        ('uint32', pa.uint32()),
        ('u8', pa.uint64()),
        ('uint64', pa.uint64()),
        ('f4', pa.float32()),
        ('float32', pa.float32()),
        ('f8', pa.float64()),
        ('float64', pa.float64()),
        ('date32', pa.date32()),
        ('date64', pa.date64()),
        ('string', pa.string()),
        ('str', pa.string()),
        ('binary', pa.binary()),
        ('time32[s]', pa.time32('s')),
        ('time32[ms]', pa.time32('ms')),
        ('time64[us]', pa.time64('us')),
        ('time64[ns]', pa.time64('ns')),
        ('timestamp[s]', pa.timestamp('s')),
        ('timestamp[ms]', pa.timestamp('ms')),
        ('timestamp[us]', pa.timestamp('us')),
        ('timestamp[ns]', pa.timestamp('ns')),
    ]

    for val, expected in cases:
        assert pa.type_for_alias(val) == expected
Example #21
def test_schema_duplicate_fields():
    fields = [
        pa.field('foo', pa.int32()),
        pa.field('bar', pa.string()),
        pa.field('foo', pa.list_(pa.int8())),
    ]
    sch = pa.schema(fields)
    assert sch.names == ['foo', 'bar', 'foo']
    assert sch.types == [pa.int32(), pa.string(), pa.list_(pa.int8())]
    assert len(sch) == 3
    assert repr(sch) == """\
foo: int32
bar: string
foo: list<item: int8>
  child 0, item: int8"""

    assert sch[0].name == 'foo'
    assert sch[0].type == fields[0].type
    assert sch.field_by_name('bar') == fields[1]
    assert sch.field_by_name('xxx') is None
    with pytest.warns(UserWarning):
        assert sch.field_by_name('foo') is None
Example #22
def test_recordbatch_pickle():
    data = [
        pa.array(range(5), type='int8'),
        pa.array([-10, -5, 0, 5, 10], type='float32')
    ]
    schema = pa.schema([pa.field('ints', pa.int8()),
                        pa.field('floats', pa.float32()),
                        ]).add_metadata({b'foo': b'bar'})
    batch = pa.RecordBatch.from_arrays(data, schema)

    result = pickle.loads(pickle.dumps(batch))
    assert result.equals(batch)
    assert result.schema == schema
Example #23
def test_schema_equals():
    fields = [
        pa.field('foo', pa.int32()),
        pa.field('bar', pa.string()),
        pa.field('baz', pa.list_(pa.int8()))
    ]

    sch1 = pa.schema(fields)
    sch2 = pa.schema(fields)
    assert sch1.equals(sch2)

    del fields[-1]
    sch3 = pa.schema(fields)
    assert not sch1.equals(sch3)
Example #24
def get_many_types():
    # returning them from a function is required because the pa.dictionary
    # type holds a pyarrow array and test_array.py::test_total_bytes_allocated
    # checks that the default memory pool has zero allocated bytes
    return (
        pa.null(),
        pa.bool_(),
        pa.int32(),
        pa.time32('s'),
        pa.time64('us'),
        pa.date32(),
        pa.timestamp('us'),
        pa.timestamp('us', tz='UTC'),
        pa.timestamp('us', tz='Europe/Paris'),
        pa.float16(),
        pa.float32(),
        pa.float64(),
        pa.decimal128(19, 4),
        pa.string(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.int32()),
        pa.struct([pa.field('a', pa.int32()),
                   pa.field('b', pa.int8()),
                   pa.field('c', pa.string())]),
        pa.struct([pa.field('a', pa.int32(), nullable=False),
                   pa.field('b', pa.int8(), nullable=False),
                   pa.field('c', pa.string())]),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())], mode=pa.lib.UnionMode_DENSE),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())], mode=pa.lib.UnionMode_SPARSE),
        pa.union([pa.field('a', pa.binary(10), nullable=False),
                  pa.field('b', pa.string())], mode=pa.lib.UnionMode_SPARSE),
        pa.dictionary(pa.int32(), pa.string())
    )
Example #25
def test_schema_serialization_with_metadata():
    field_metadata = {b'foo': b'bar', b'kind': b'field'}
    schema_metadata = {b'foo': b'bar', b'kind': b'schema'}

    f0 = pa.field('a', pa.int8())
    f1 = pa.field('b', pa.string(), metadata=field_metadata)

    schema = pa.schema([f0, f1], metadata=schema_metadata)

    s_schema = schema.serialize()
    recons_schema = pa.read_schema(s_schema)

    assert recons_schema.equals(schema)
    assert recons_schema.metadata == schema_metadata
    assert recons_schema[0].metadata is None
    assert recons_schema[1].metadata == field_metadata
Example #26
    def test_column_of_arrays_to_py(self):
        # Test regression in ARROW-1199 not caught in above test
        dtype = 'i1'
        arr = np.array([
            np.arange(10, dtype=dtype),
            np.arange(5, dtype=dtype),
            None,
            np.arange(1, dtype=dtype)
        ])
        type_ = pa.list_(pa.int8())
        parr = pa.array(arr, type=type_)

        assert parr[0].as_py() == list(range(10))
        assert parr[1].as_py() == list(range(5))
        assert parr[2].as_py() is None
        assert parr[3].as_py() == [0]
Example #27
def test_schema():
    fields = [
        pa.field('foo', pa.int32()),
        pa.field('bar', pa.string()),
        pa.field('baz', pa.list_(pa.int8()))
    ]
    sch = pa.schema(fields)

    assert sch.names == ['foo', 'bar', 'baz']

    assert len(sch) == 3
    assert sch[0].name == 'foo'
    assert sch[0].type == fields[0].type
    assert sch.field_by_name('foo').name == 'foo'
    assert sch.field_by_name('foo').type == fields[0].type

    assert repr(sch) == """\
foo: int32
bar: string
baz: list<item: int8>
  child 0, item: int8"""
Example #28
def test_union_array_slice():
    # ARROW-2314
    arr = pa.UnionArray.from_sparse(pa.array([0, 0, 1, 1], type=pa.int8()),
                                    [pa.array(["a", "b", "c", "d"]),
                                     pa.array([1, 2, 3, 4])])
    assert arr[1:].to_pylist() == ["b", 3, 4]

    binary = pa.array([b'a', b'b', b'c', b'd'], type='binary')
    int64 = pa.array([1, 2, 3], type='int64')
    types = pa.array([0, 1, 0, 0, 1, 1, 0], type='int8')
    value_offsets = pa.array([0, 0, 2, 1, 1, 2, 3], type='int32')

    arr = pa.UnionArray.from_dense(types, value_offsets, [binary, int64])
    lst = arr.to_pylist()
    for i in range(len(arr)):
        for j in range(i, len(arr)):
            assert arr[i:j].to_pylist() == lst[i:j]
Example #29
def test_is_integer():
    signed_ints = [pa.int8(), pa.int16(), pa.int32(), pa.int64()]
    unsigned_ints = [pa.uint8(), pa.uint16(), pa.uint32(), pa.uint64()]

    for t in signed_ints + unsigned_ints:
        assert types.is_integer(t)

    for t in signed_ints:
        assert types.is_signed_integer(t)
        assert not types.is_unsigned_integer(t)

    for t in unsigned_ints:
        assert types.is_unsigned_integer(t)
        assert not types.is_signed_integer(t)

    assert not types.is_integer(pa.float32())
    assert not types.is_signed_integer(pa.float32())
Example #30
def test_schema_negative_indexing():
    fields = [
        pa.field('foo', pa.int32()),
        pa.field('bar', pa.string()),
        pa.field('baz', pa.list_(pa.int8()))
    ]

    schema = pa.schema(fields)

    assert schema[-1].equals(schema[2])
    assert schema[-2].equals(schema[1])
    assert schema[-3].equals(schema[0])

    with pytest.raises(IndexError):
        schema[-4]

    with pytest.raises(IndexError):
        schema[3]
Example #31
def test_sql_types(redshift_table, redshift_con):
    table = redshift_table
    df = get_df()
    df.drop(["binary"], axis=1, inplace=True)
    wr.redshift.to_sql(
        df=df,
        con=redshift_con,
        table=table,
        schema="public",
        mode="overwrite",
        index=True,
        dtype={"iint32": "INTEGER"},
    )
    df = wr.redshift.read_sql_query(f"SELECT * FROM public.{table}", redshift_con)
    ensure_data_types(df, has_list=False)
    dfs = wr.redshift.read_sql_query(
        sql=f"SELECT * FROM public.{table}",
        con=redshift_con,
        chunksize=1,
        dtype={
            "iint8": pa.int8(),
            "iint16": pa.int16(),
            "iint32": pa.int32(),
            "iint64": pa.int64(),
            "float": pa.float32(),
            "ddouble": pa.float64(),
            "decimal": pa.decimal128(3, 2),
            "string_object": pa.string(),
            "string": pa.string(),
            "date": pa.date32(),
            "timestamp": pa.timestamp(unit="ns"),
            "binary": pa.binary(),
            "category": pa.float64(),
        },
    )
    for df in dfs:
        ensure_data_types(df, has_list=False)
Example #32
def test_union():
    # sparse
    arr = pa.UnionArray.from_sparse(
        pa.array([0, 0, 1, 1], type=pa.int8()),
        [pa.array(["a", "b", "c", "d"]),
         pa.array([1, 2, 3, 4])])
    for s in arr:
        assert isinstance(s, pa.UnionScalar)
        assert s.type.equals(arr.type)
        assert s.is_valid is True
        with pytest.raises(pa.ArrowNotImplementedError):
            pickle.loads(pickle.dumps(s))

    assert arr[0].as_py() == "a"
    assert arr[1].as_py() == "b"
    assert arr[2].as_py() == 3
    assert arr[3].as_py() == 4

    # dense
    arr = pa.UnionArray.from_dense(types=pa.array([0, 1, 0, 0, 1, 1, 0],
                                                  type='int8'),
                                   value_offsets=pa.array(
                                       [0, 0, 2, 1, 1, 2, 3], type='int32'),
                                   children=[
                                       pa.array([b'a', b'b', b'c', b'd'],
                                                type='binary'),
                                       pa.array([1, 2, 3], type='int64')
                                   ])
    for s in arr:
        assert isinstance(s, pa.UnionScalar)
        assert s.type.equals(arr.type)
        assert s.is_valid is True
        with pytest.raises(pa.ArrowNotImplementedError):
            pickle.loads(pickle.dumps(s))

    assert arr[0].as_py() == b'a'
    assert arr[5].as_py() == 3
Example #33
def test_cast_from_null():
    in_data = [None] * 3
    in_type = pa.null()
    out_types = [
        pa.null(),
        pa.uint8(),
        pa.float16(),
        pa.utf8(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.int16()),
        pa.decimal128(19, 4),
        pa.timestamp('us'),
        pa.timestamp('us', tz='UTC'),
        pa.timestamp('us', tz='Europe/Paris'),
        pa.struct([
            pa.field('a', pa.int32()),
            pa.field('b', pa.list_(pa.int8())),
            pa.field('c', pa.string())
        ]),
    ]
    for out_type in out_types:
        _check_cast_case((in_data, in_type, in_data, out_type))

    out_types = [
        pa.dictionary(pa.int32(), pa.string()),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())],
                 mode=pa.lib.UnionMode_DENSE),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())],
                 mode=pa.lib.UnionMode_SPARSE),
    ]
    in_arr = pa.array(in_data, type=pa.null())
    for out_type in out_types:
        with pytest.raises(NotImplementedError):
            in_arr.cast(out_type)
Example #34
def test_literals():
    import pyarrow.gandiva as gandiva

    builder = gandiva.TreeExprBuilder()

    builder.make_literal(True, pa.bool_())
    builder.make_literal(0, pa.uint8())
    builder.make_literal(1, pa.uint16())
    builder.make_literal(2, pa.uint32())
    builder.make_literal(3, pa.uint64())
    builder.make_literal(4, pa.int8())
    builder.make_literal(5, pa.int16())
    builder.make_literal(6, pa.int32())
    builder.make_literal(7, pa.int64())
    builder.make_literal(8.0, pa.float32())
    builder.make_literal(9.0, pa.float64())
    builder.make_literal("hello", pa.string())
    builder.make_literal(b"world", pa.binary())

    builder.make_literal(True, "bool")
    builder.make_literal(0, "uint8")
    builder.make_literal(1, "uint16")
    builder.make_literal(2, "uint32")
    builder.make_literal(3, "uint64")
    builder.make_literal(4, "int8")
    builder.make_literal(5, "int16")
    builder.make_literal(6, "int32")
    builder.make_literal(7, "int64")
    builder.make_literal(8.0, "float32")
    builder.make_literal(9.0, "float64")
    builder.make_literal("hello", "string")
    builder.make_literal(b"world", "binary")

    with pytest.raises(TypeError):
        builder.make_literal("hello", pa.int64())
    with pytest.raises(TypeError):
        builder.make_literal(True, None)
Example #35
def test_dictionary_delta(stream_fixture):
    ty = pa.dictionary(pa.int8(), pa.utf8())
    data = [
        ["foo", "foo", None],
        ["foo", "bar", "foo"],  # potential delta
        ["foo", "bar"],
        ["foo", None, "bar", "quux"],  # potential delta
        ["bar", "quux"],  # replacement
    ]
    batches = [
        pa.RecordBatch.from_arrays([pa.array(v, type=ty)], names=['dicts'])
        for v in data
    ]
    schema = batches[0].schema

    def write_batches():
        with stream_fixture._get_writer(pa.MockOutputStream(),
                                        schema) as writer:
            for batch in batches:
                writer.write_batch(batch)
            return writer.stats

    st = write_batches()
    assert st.num_record_batches == 5
    assert st.num_dictionary_batches == 4
    assert st.num_replaced_dictionaries == 3
    assert st.num_dictionary_deltas == 0

    stream_fixture.use_legacy_ipc_format = None
    stream_fixture.options = pa.ipc.IpcWriteOptions(
        emit_dictionary_deltas=True)
    st = write_batches()
    assert st.num_record_batches == 5
    assert st.num_dictionary_batches == 4
    assert st.num_replaced_dictionaries == 1
    assert st.num_dictionary_deltas == 2
Example #36
def athena2pyarrow(dtype: str) -> pa.DataType:  # pylint: disable=too-many-return-statements
    """Athena to PyArrow data types conversion."""
    dtype = dtype.lower().replace(" ", "")
    if dtype == "tinyint":
        return pa.int8()
    if dtype == "smallint":
        return pa.int16()
    if dtype in ("int", "integer"):
        return pa.int32()
    if dtype == "bigint":
        return pa.int64()
    if dtype in ("float", "real"):
        return pa.float32()
    if dtype == "double":
        return pa.float64()
    if dtype == "boolean":
        return pa.bool_()
    if (dtype == "string") or dtype.startswith("char") or dtype.startswith("varchar"):
        return pa.string()
    if dtype == "timestamp":
        return pa.timestamp(unit="ns")
    if dtype == "date":
        return pa.date32()
    if dtype in ("binary" or "varbinary"):
        return pa.binary()
    if dtype.startswith("decimal") is True:
        precision, scale = dtype.replace("decimal(", "").replace(")", "").split(sep=",")
        return pa.decimal128(precision=int(precision), scale=int(scale))
    if dtype.startswith("array") is True:
        return pa.list_(value_type=athena2pyarrow(dtype=dtype[6:-1]), list_size=-1)
    if dtype.startswith("struct") is True:
        return pa.struct([(f.split(":", 1)[0], athena2pyarrow(f.split(":", 1)[1])) for f in _split_struct(dtype[7:-1])])
    if dtype.startswith("map") is True:
        parts: List[str] = _split_map(s=dtype[4:-1])
        return pa.map_(athena2pyarrow(parts[0]), athena2pyarrow(parts[1]))
    raise exceptions.UnsupportedType(f"Unsupported Athena type: {dtype}")
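A few sample conversions that follow directly from the branches above:

assert athena2pyarrow("tinyint") == pa.int8()
assert athena2pyarrow("varchar(20)") == pa.string()
assert athena2pyarrow("decimal(3,2)") == pa.decimal128(precision=3, scale=2)
assert athena2pyarrow("array<bigint>") == pa.list_(pa.int64())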
Example #37
def test_from_numpy_dtype():
    cases = [
        (np.dtype('bool'), pa.bool_()),
        (np.dtype('int8'), pa.int8()),
        (np.dtype('int16'), pa.int16()),
        (np.dtype('int32'), pa.int32()),
        (np.dtype('int64'), pa.int64()),
        (np.dtype('uint8'), pa.uint8()),
        (np.dtype('uint16'), pa.uint16()),
        (np.dtype('uint32'), pa.uint32()),
        (np.dtype('float16'), pa.float16()),
        (np.dtype('float32'), pa.float32()),
        (np.dtype('float64'), pa.float64()),
        (np.dtype('U'), pa.string()),
        (np.dtype('S'), pa.binary()),
        (np.dtype('datetime64[s]'), pa.timestamp('s')),
        (np.dtype('datetime64[ms]'), pa.timestamp('ms')),
        (np.dtype('datetime64[us]'), pa.timestamp('us')),
        (np.dtype('datetime64[ns]'), pa.timestamp('ns'))
    ]

    for dt, pt in cases:
        result = pa.from_numpy_dtype(dt)
        assert result == pt

    # Things convertible to numpy dtypes work
    assert pa.from_numpy_dtype('U') == pa.string()
    assert pa.from_numpy_dtype(np.unicode) == pa.string()
    assert pa.from_numpy_dtype('int32') == pa.int32()
    assert pa.from_numpy_dtype(bool) == pa.bool_()

    with pytest.raises(NotImplementedError):
        pa.from_numpy_dtype(np.dtype('O'))

    with pytest.raises(TypeError):
        pa.from_numpy_dtype('not_convertible_to_dtype')
Example #38
def test_types_hashable():
    types = [
        pa.null(),
        pa.int32(),
        pa.time32('s'),
        pa.time64('us'),
        pa.date32(),
        pa.timestamp('us'),
        pa.string(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.int32()),
        pa.struct([
            pa.field('a', pa.int32()),
            pa.field('b', pa.int8()),
            pa.field('c', pa.string())
        ])
    ]

    in_dict = {}
    for i, type_ in enumerate(types):
        assert hash(type_) == hash(type_)
        in_dict[type_] = i
        assert in_dict[type_] == i
Example #39
    def test_integer_no_nulls(self):
        data = OrderedDict()
        fields = []

        numpy_dtypes = [
            ('i1', pa.int8()), ('i2', pa.int16()),
            ('i4', pa.int32()), ('i8', pa.int64()),
            ('u1', pa.uint8()), ('u2', pa.uint16()),
            ('u4', pa.uint32()), ('u8', pa.uint64()),
            ('longlong', pa.int64()), ('ulonglong', pa.uint64())
        ]
        num_values = 100

        for dtype, arrow_dtype in numpy_dtypes:
            info = np.iinfo(dtype)
            values = np.random.randint(max(info.min, np.iinfo(np.int_).min),
                                       min(info.max, np.iinfo(np.int_).max),
                                       size=num_values)
            data[dtype] = values.astype(dtype)
            fields.append(pa.field(dtype, arrow_dtype))

        df = pd.DataFrame(data)
        schema = pa.schema(fields)
        self._check_pandas_roundtrip(df, expected_schema=schema)
Example #40
def test_field_equals():
    meta1 = {b'foo': b'bar'}
    meta2 = {b'bizz': b'bazz'}

    f1 = pa.field('a', pa.int8(), nullable=True)
    f2 = pa.field('a', pa.int8(), nullable=True)
    f3 = pa.field('a', pa.int8(), nullable=False)
    f4 = pa.field('a', pa.int16(), nullable=False)
    f5 = pa.field('b', pa.int16(), nullable=False)
    f6 = pa.field('a', pa.int8(), nullable=True, metadata=meta1)
    f7 = pa.field('a', pa.int8(), nullable=True, metadata=meta1)
    f8 = pa.field('a', pa.int8(), nullable=True, metadata=meta2)

    assert f1.equals(f2)
    assert f6.equals(f7)
    assert not f1.equals(f3)
    assert not f1.equals(f4)
    assert not f3.equals(f4)
    assert not f1.equals(f6)
    assert not f4.equals(f5)
    assert not f7.equals(f8)
Example #41
def test_sql(redshift_table, postgresql_table, mysql_table,
             databases_parameters, db_type):
    if db_type == "postgresql":
        table = postgresql_table
    elif db_type == "mysql":
        table = mysql_table
    else:
        table = redshift_table
    df = get_df()
    if db_type == "redshift":
        df.drop(["binary"], axis=1, inplace=True)
    engine = wr.catalog.get_engine(connection=f"aws-data-wrangler-{db_type}",
                                   echo=False)
    index = True if engine.name == "redshift" else False
    wr.db.to_sql(
        df=df,
        con=engine,
        name=table,
        schema=databases_parameters[db_type]["schema"],
        if_exists="replace",
        index=index,
        index_label=None,
        chunksize=None,
        method=None,
        dtype={"iint32": sqlalchemy.types.Integer},
    )
    df = wr.db.read_sql_query(
        sql=f"SELECT * FROM {databases_parameters[db_type]['schema']}.{table}",
        con=engine)
    ensure_data_types(df, has_list=False)
    engine = wr.db.get_engine(
        db_type=db_type,
        host=databases_parameters[db_type]["host"],
        port=databases_parameters[db_type]["port"],
        database=databases_parameters[db_type]["database"],
        user=databases_parameters["user"],
        password=databases_parameters["password"],
        echo=False,
    )
    dfs = wr.db.read_sql_query(
        sql=f"SELECT * FROM {databases_parameters[db_type]['schema']}.{table}",
        con=engine,
        chunksize=1,
        dtype={
            "iint8": pa.int8(),
            "iint16": pa.int16(),
            "iint32": pa.int32(),
            "iint64": pa.int64(),
            "float": pa.float32(),
            "double": pa.float64(),
            "decimal": pa.decimal128(3, 2),
            "string_object": pa.string(),
            "string": pa.string(),
            "date": pa.date32(),
            "timestamp": pa.timestamp(unit="ns"),
            "binary": pa.binary(),
            "category": pa.float64(),
        },
    )
    for df in dfs:
        ensure_data_types(df, has_list=False)
    if db_type != "redshift":
        account_id = boto3.client("sts").get_caller_identity().get("Account")
        engine = wr.catalog.get_engine(
            connection=f"aws-data-wrangler-{db_type}", catalog_id=account_id)
        wr.db.to_sql(
            df=pd.DataFrame({"col0": [1, 2, 3]}, dtype="Int32"),
            con=engine,
            name=table,
            schema=databases_parameters[db_type]["schema"],
            if_exists="replace",
            index=True,
            index_label="index",
        )
        schema = None
        if db_type == "postgresql":
            schema = databases_parameters[db_type]["schema"]
        df = wr.db.read_sql_table(con=engine,
                                  table=table,
                                  schema=schema,
                                  index_col="index")
        assert df.shape == (3, 1)
Example #42
def test_argparse_types():
    _map = {}
    csv2parquet.main_with_args(capture_args(_map),
                               ['foo.csv', '--type', '0=string', '0=int8?'])
    assert _map['raw_types'] == [('0', pa.string(), False),
                                 ('0', pa.int8(), True)]
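Judging by the captured raw_types tuples, the trailing ? in a --type spec (as in 0=int8?) marks the column as nullable: it flips the third tuple element from False to True.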
Example #43
import hypothesis.strategies as st
import numpy as np

import pyarrow as pa

# TODO(kszucs): alphanum_text, surrogate_text
custom_text = st.text(
    alphabet=st.characters(min_codepoint=0x41, max_codepoint=0x7E))

null_type = st.just(pa.null())
bool_type = st.just(pa.bool_())

binary_type = st.just(pa.binary())
string_type = st.just(pa.string())

signed_integer_types = st.sampled_from(
    [pa.int8(), pa.int16(), pa.int32(),
     pa.int64()])
unsigned_integer_types = st.sampled_from(
    [pa.uint8(), pa.uint16(),
     pa.uint32(), pa.uint64()])
integer_types = st.one_of(signed_integer_types, unsigned_integer_types)

floating_types = st.sampled_from([pa.float16(), pa.float32(), pa.float64()])
decimal_type = st.builds(pa.decimal128,
                         precision=st.integers(min_value=0, max_value=38),
                         scale=st.integers(min_value=0, max_value=38))
numeric_types = st.one_of(integer_types, floating_types, decimal_type)

date_types = st.sampled_from([pa.date32(), pa.date64()])
time_types = st.sampled_from(
    [pa.time32('s'),
Example #44
def test_dictionary_delta(format_fixture):
    ty = pa.dictionary(pa.int8(), pa.utf8())
    data = [
        ["foo", "foo", None],
        ["foo", "bar", "foo"],  # potential delta
        ["foo", "bar"],  # nothing new
        ["foo", None, "bar", "quux"],  # potential delta
        ["bar", "quux"],  # replacement
    ]
    batches = [
        pa.RecordBatch.from_arrays([pa.array(v, type=ty)], names=['dicts'])
        for v in data
    ]
    batches_delta_only = batches[:4]
    schema = batches[0].schema

    def write_batches(batches, as_table=False):
        with format_fixture._get_writer(pa.MockOutputStream(),
                                        schema) as writer:
            if as_table:
                table = pa.Table.from_batches(batches)
                writer.write_table(table)
            else:
                for batch in batches:
                    writer.write_batch(batch)
            return writer.stats

    if format_fixture.is_file:
        # File format cannot handle replacement
        with pytest.raises(pa.ArrowInvalid):
            write_batches(batches)
        # File format cannot handle deltas if emit_dictionary_deltas
        # is not enabled
        with pytest.raises(pa.ArrowInvalid):
            write_batches(batches_delta_only)
    else:
        st = write_batches(batches)
        assert st.num_record_batches == 5
        assert st.num_dictionary_batches == 4
        assert st.num_replaced_dictionaries == 3
        assert st.num_dictionary_deltas == 0

    format_fixture.use_legacy_ipc_format = None
    format_fixture.options = pa.ipc.IpcWriteOptions(
        emit_dictionary_deltas=True)
    if format_fixture.is_file:
        # File format cannot handle replacement
        with pytest.raises(pa.ArrowInvalid):
            write_batches(batches)
    else:
        st = write_batches(batches)
        assert st.num_record_batches == 5
        assert st.num_dictionary_batches == 4
        assert st.num_replaced_dictionaries == 1
        assert st.num_dictionary_deltas == 2

    st = write_batches(batches_delta_only)
    assert st.num_record_batches == 4
    assert st.num_dictionary_batches == 3
    assert st.num_replaced_dictionaries == 0
    assert st.num_dictionary_deltas == 2

    format_fixture.options = pa.ipc.IpcWriteOptions(unify_dictionaries=True)
    st = write_batches(batches, as_table=True)
    assert st.num_record_batches == 5
    if format_fixture.is_file:
        assert st.num_dictionary_batches == 1
        assert st.num_replaced_dictionaries == 0
        assert st.num_dictionary_deltas == 0
    else:
        assert st.num_dictionary_batches == 4
        assert st.num_replaced_dictionaries == 3
        assert st.num_dictionary_deltas == 0
Example #45
def as_column(arbitrary, nan_as_null=True, dtype=None):
    """Create a Column from an arbitrary object

    Currently supported inputs are:

    * ``Column``
    * ``Buffer``
    * ``Series``
    * ``Index``
    * numba device array
    * cuda array interface
    * numpy array
    * pyarrow array
    * pandas.Categorical

    Returns
    -------
    result : subclass of TypedColumnBase
        - CategoricalColumn for pandas.Categorical input.
        - DatetimeColumn for datetime input
        - NumericalColumn for all other inputs.
    """
    from cudf.dataframe import numerical, categorical, datetime, string
    from cudf.dataframe.series import Series
    from cudf.dataframe.index import Index

    if isinstance(arbitrary, Column):
        categories = None
        if hasattr(arbitrary, "categories"):
            categories = arbitrary.categories
        data = build_column(arbitrary.data,
                            arbitrary.dtype,
                            mask=arbitrary.mask,
                            categories=categories)

    elif isinstance(arbitrary, Series):
        data = arbitrary._column

    elif isinstance(arbitrary, Index):
        data = arbitrary._values

    elif isinstance(arbitrary, Buffer):
        data = numerical.NumericalColumn(data=arbitrary, dtype=arbitrary.dtype)

    elif isinstance(arbitrary, nvstrings.nvstrings):
        data = string.StringColumn(data=arbitrary)

    elif cuda.devicearray.is_cuda_ndarray(arbitrary):
        data = as_column(Buffer(arbitrary))
        if (data.dtype in [np.float16, np.float32, np.float64]
                and arbitrary.size > 0):
            if nan_as_null:
                mask = cudautils.mask_from_devary(arbitrary)
                data = data.set_mask(mask)

    elif cuda.is_cuda_array(arbitrary):
        # Use the CUDA array interface to create a numba device array by
        # reference
        new_dev_array = cuda.as_cuda_array(arbitrary)

        # Allocate new output array using rmm and copy the numba device array
        # to an rmm owned device array
        out_dev_array = rmm.device_array_like(new_dev_array)
        out_dev_array.copy_to_device(new_dev_array)

        data = as_column(out_dev_array)

    elif isinstance(arbitrary, np.ndarray):
        # CUDF assumes values are always contiguous
        if not arbitrary.flags['C_CONTIGUOUS']:
            arbitrary = np.ascontiguousarray(arbitrary)
        if arbitrary.dtype.kind == 'M':
            data = datetime.DatetimeColumn.from_numpy(arbitrary)
        elif arbitrary.dtype.kind in ('O', 'U'):
            data = as_column(pa.Array.from_pandas(arbitrary))
        else:
            data = as_column(rmm.to_device(arbitrary), nan_as_null=nan_as_null)

    elif isinstance(arbitrary, pa.Array):
        if isinstance(arbitrary, pa.StringArray):
            count = len(arbitrary)
            null_count = arbitrary.null_count

            buffers = arbitrary.buffers()
            # Buffer of actual strings values
            if buffers[2] is not None:
                sbuf = np.frombuffer(buffers[2], dtype='int8')
            else:
                sbuf = np.empty(0, dtype='int8')
            # Buffer of offsets values
            obuf = np.frombuffer(buffers[1], dtype='int32')
            # Buffer of null bitmask
            nbuf = None
            if null_count > 0:
                nbuf = np.frombuffer(buffers[0], dtype='int8')

            data = as_column(
                nvstrings.from_offsets(sbuf,
                                       obuf,
                                       count,
                                       nbuf=nbuf,
                                       ncount=null_count))
        elif isinstance(arbitrary, pa.NullArray):
            new_dtype = dtype
            if (type(dtype) == str and dtype == 'empty') or dtype is None:
                new_dtype = np.dtype(arbitrary.type.to_pandas_dtype())

            if pd.api.types.is_categorical_dtype(new_dtype):
                arbitrary = arbitrary.dictionary_encode()
            else:
                if nan_as_null:
                    arbitrary = arbitrary.cast(_gdf.np_to_pa_dtype(new_dtype))
                else:
                    # casting a null array doesn't make nans valid
                    # so we create one with valid nans from scratch:
                    if new_dtype == np.dtype("object"):
                        arbitrary = utils.scalar_broadcast_to(
                            None, (len(arbitrary), ), dtype=new_dtype)
                    else:
                        arbitrary = utils.scalar_broadcast_to(
                            np.nan, (len(arbitrary), ), dtype=new_dtype)
            data = as_column(arbitrary, nan_as_null=nan_as_null)
        elif isinstance(arbitrary, pa.DictionaryArray):
            pamask, padata = buffers_from_pyarrow(arbitrary)
            data = categorical.CategoricalColumn(
                data=padata,
                mask=pamask,
                null_count=arbitrary.null_count,
                categories=arbitrary.dictionary.to_pylist(),
                ordered=arbitrary.type.ordered,
            )
        elif isinstance(arbitrary, pa.TimestampArray):
            arbitrary = arbitrary.cast(pa.timestamp('ms'))
            pamask, padata = buffers_from_pyarrow(arbitrary, dtype='M8[ms]')
            data = datetime.DatetimeColumn(data=padata,
                                           mask=pamask,
                                           null_count=arbitrary.null_count,
                                           dtype=np.dtype('M8[ms]'))
        elif isinstance(arbitrary, pa.Date64Array):
            pamask, padata = buffers_from_pyarrow(arbitrary, dtype='M8[ms]')
            data = datetime.DatetimeColumn(data=padata,
                                           mask=pamask,
                                           null_count=arbitrary.null_count,
                                           dtype=np.dtype('M8[ms]'))
        elif isinstance(arbitrary, pa.Date32Array):
            # No equivalent np dtype and not yet supported
            warnings.warn(
                "Date32 values are not yet supported so this will "
                "be typecast to a Date64 value", UserWarning)
            arbitrary = arbitrary.cast(pa.date64())
            data = as_column(arbitrary)
        elif isinstance(arbitrary, pa.BooleanArray):
            # Arrow uses 1 bit per value while we use int8
            dtype = np.dtype(np.bool)
            # Needed because of bug in PyArrow
            # https://issues.apache.org/jira/browse/ARROW-4766
            if len(arbitrary) > 0:
                arbitrary = arbitrary.cast(pa.int8())
            else:
                arbitrary = pa.array([], type=pa.int8())
            pamask, padata = buffers_from_pyarrow(arbitrary, dtype=dtype)
            data = numerical.NumericalColumn(data=padata,
                                             mask=pamask,
                                             null_count=arbitrary.null_count,
                                             dtype=dtype)
        else:
            pamask, padata = buffers_from_pyarrow(arbitrary)
            data = numerical.NumericalColumn(
                data=padata,
                mask=pamask,
                null_count=arbitrary.null_count,
                dtype=np.dtype(arbitrary.type.to_pandas_dtype()))

    elif isinstance(arbitrary, pa.ChunkedArray):
        gpu_cols = [
            as_column(chunk, dtype=dtype) for chunk in arbitrary.chunks
        ]

        if dtype and dtype != 'empty':
            new_dtype = dtype
        else:
            pa_type = arbitrary.type
            if pa.types.is_dictionary(pa_type):
                new_dtype = 'category'
            else:
                new_dtype = np.dtype(pa_type.to_pandas_dtype())

        data = Column._concat(gpu_cols, dtype=new_dtype)

    elif isinstance(arbitrary, (pd.Series, pd.Categorical)):
        if pd.api.types.is_categorical_dtype(arbitrary):
            data = as_column(pa.array(arbitrary, from_pandas=True))
        elif arbitrary.dtype == np.bool:
            # Bug in PyArrow or HDF that requires us to do this
            data = as_column(pa.array(np.array(arbitrary), from_pandas=True))
        else:
            data = as_column(pa.array(arbitrary, from_pandas=nan_as_null))

    elif isinstance(arbitrary, pd.Timestamp):
        # This will always treat NaT as null since it's not technically a
        # discrete value like NaN
        data = as_column(pa.array(pd.Series([arbitrary]), from_pandas=True))

    elif np.isscalar(arbitrary) and not isinstance(arbitrary, memoryview):
        if hasattr(arbitrary, 'dtype'):
            data_type = _gdf.np_to_pa_dtype(arbitrary.dtype)
            if data_type in (pa.date64(), pa.date32()):
                # PyArrow can't construct date64 or date32 arrays from np
                # datetime types
                arbitrary = arbitrary.astype('int64')
            data = as_column(pa.array([arbitrary], type=data_type))
        else:
            data = as_column(pa.array([arbitrary]), nan_as_null=nan_as_null)

    elif isinstance(arbitrary, memoryview):
        data = as_column(np.array(arbitrary),
                         dtype=dtype,
                         nan_as_null=nan_as_null)

    else:
        try:
            data = as_column(memoryview(arbitrary))
        except TypeError:
            try:
                pa_type = None
                if dtype is not None:
                    if pd.api.types.is_categorical_dtype(dtype):
                        raise TypeError
                    else:
                        np_type = np.dtype(dtype).type
                        if np_type == np.bool_:
                            pa_type = pa.bool_()
                        else:
                            pa_type = _gdf.np_to_pa_dtype(np.dtype(dtype).type)
                data = as_column(pa.array(arbitrary,
                                          type=pa_type,
                                          from_pandas=nan_as_null),
                                 nan_as_null=nan_as_null)
            except (pa.ArrowInvalid, pa.ArrowTypeError, TypeError):
                np_type = None
                if pd.api.types.is_categorical_dtype(dtype):
                    data = as_column(pd.Series(arbitrary, dtype='category'),
                                     nan_as_null=nan_as_null)
                else:
                    if dtype is None:
                        np_type = None
                    else:
                        np_type = np.dtype(dtype)
                    data = as_column(np.array(arbitrary, dtype=np_type),
                                     nan_as_null=nan_as_null)

    return data
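A rough sketch of the dispatch in action, assuming a working cudf/rmm environment; the calls mirror branches shown above and are illustrative rather than a complete test:

# numpy input is made contiguous and routed through rmm.to_device
col = as_column(np.array([1, 2, 3], dtype='int64'))

# a pyarrow BooleanArray is cast to int8 first (ARROW-4766 workaround above)
col = as_column(pa.array([True, False, None]))

# a Python scalar without a dtype is broadcast through pa.array([scalar])
col = as_column(42)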
Example #46
def test_struct_array_slice():
    # ARROW-2311: slicing nested arrays needs special care
    ty = pa.struct([pa.field('a', pa.int8()), pa.field('b', pa.float32())])
    arr = pa.array([(1, 2.5), (3, 4.5), (5, 6.5)], type=ty)
    assert arr[1:].to_pylist() == [{'a': 3, 'b': 4.5}, {'a': 5, 'b': 6.5}]
Example #47
@pytest.mark.parametrize('narr', [
    np.arange(10, dtype=np.float64),
    np.arange(10, dtype=np.float32),
    np.arange(10, dtype=np.float16),
])
def test_to_numpy_roundtrip(narr):
    arr = pa.array(narr)
    assert narr.dtype == arr.to_numpy().dtype
    np.testing.assert_array_equal(narr, arr.to_numpy())
    np.testing.assert_array_equal(narr[:6], arr[:6].to_numpy())
    np.testing.assert_array_equal(narr[2:], arr[2:].to_numpy())
    np.testing.assert_array_equal(narr[2:6], arr[2:6].to_numpy())


@pytest.mark.parametrize(
    ('type', 'expected'),
    [(pa.null(), 'empty'), (pa.bool_(), 'bool'), (pa.int8(), 'int8'),
     (pa.int16(), 'int16'), (pa.int32(), 'int32'), (pa.int64(), 'int64'),
     (pa.uint8(), 'uint8'), (pa.uint16(), 'uint16'), (pa.uint32(), 'uint32'),
     (pa.uint64(), 'uint64'), (pa.float16(), 'float16'),
     (pa.float32(), 'float32'), (pa.float64(), 'float64'),
     (pa.date32(), 'date'), (pa.date64(), 'date'), (pa.binary(), 'bytes'),
     (pa.binary(length=4), 'bytes'), (pa.string(), 'unicode'),
     (pa.list_(pa.list_(pa.int16())), 'list[list[int16]]'),
     (pa.decimal128(18, 3), 'decimal'), (pa.timestamp('ms'), 'datetime'),
     (pa.timestamp('us', 'UTC'), 'datetimetz'), (pa.time32('s'), 'time'),
     (pa.time64('us'), 'time')])
def test_logical_type(type, expected):
    assert get_logical_type(type) == expected


def test_array_uint64_from_py_over_range():
Example #48
from pyarrow.pandas_compat import _pandas_api  # noqa
import pyarrow as pa

import collections
import datetime
import decimal
import itertools
import math
import traceback

import numpy as np
import pytz


int_type_pairs = [
    (np.int8, pa.int8()),
    (np.int16, pa.int16()),
    (np.int32, pa.int32()),
    (np.int64, pa.int64()),
    (np.uint8, pa.uint8()),
    (np.uint16, pa.uint16()),
    (np.uint32, pa.uint32()),
    (np.uint64, pa.uint64())]


np_int_types, _ = zip(*int_type_pairs)


class StrangeIterable:
    def __init__(self, lst):
        self.lst = lst
Example #49
def test_schema_pyarrow_types():
    field_name = "column1"
    metadata = {b"metadata_k": b"metadata_v"}
    pyarrow_field = pyarrow_field_from_dict(
        {
            "name": field_name,
            "nullable": False,
            "metadata": metadata,
            "type": {"name": "int", "bitWidth": 8, "isSigned": True},
        }
    )
    assert pyarrow_field.name == field_name
    assert pyarrow_field.type == pyarrow.int8()
    assert dict(pyarrow_field.metadata) == metadata
    assert pyarrow_field.nullable is False

    field_name = "column_timestamp_no_unit"
    metadata = {b"metadata_k": b"metadata_v"}
    pyarrow_field = pyarrow_field_from_dict(
        {
            "name": field_name,
            "nullable": False,
            "metadata": metadata,
            "type": {"name": "timestamp"},
        }
    )
    assert pyarrow_field.name == field_name
    assert pyarrow_field.type == pyarrow.timestamp("ns")
    assert dict(pyarrow_field.metadata) == metadata
    assert pyarrow_field.nullable is False

    field_name = "column_timestamp_with_unit"
    metadata = {b"metadata_k": b"metadata_v"}
    pyarrow_field = pyarrow_field_from_dict(
        {
            "name": field_name,
            "nullable": False,
            "metadata": metadata,
            "type": {"name": "timestamp", "unit": "MICROSECOND"},
        }
    )
    assert pyarrow_field.name == field_name
    assert pyarrow_field.type == pyarrow.timestamp("us")
    assert dict(pyarrow_field.metadata) == metadata
    assert pyarrow_field.nullable is False

    field_name = "date_with_day_unit"
    metadata = {b"metadata_k": b"metadata_v"}
    pyarrow_field = pyarrow_field_from_dict(
        {
            "name": field_name,
            "nullable": False,
            "metadata": metadata,
            "type": {"name": "date", "unit": "DAY"},
        }
    )
    assert pyarrow_field.name == field_name
    assert pyarrow_field.type == pyarrow.date32()
    assert dict(pyarrow_field.metadata) == metadata
    assert pyarrow_field.nullable is False

    field_name = "simple_list"
    pyarrow_field = pyarrow_field_from_dict(
        {
            "name": field_name,
            "nullable": False,
            "metadata": metadata,
            "type": {"name": "list"},
            "children": [{"type": {"name": "int", "bitWidth": 32, "isSigned": True}}],
        }
    )
    assert pyarrow_field.name == field_name
    assert pyarrow_field.type == pyarrow.list_(
        pyarrow.field("element", pyarrow.int32())
    )
    assert pyarrow_field.metadata == metadata
    assert pyarrow_field.nullable is False

    field_name = "dictionary"
    pyarrow_field = pyarrow_field_from_dict(
        {
            "name": field_name,
            "nullable": False,
            "metadata": metadata,
            "type": {"name": "int", "bitWidth": 32, "isSigned": True},
            "children": [],
            "dictionary": {
                "id": 0,
                "indexType": {"name": "int", "bitWidth": 16, "isSigned": True},
            },
        }
    )
    assert pyarrow_field.name == field_name
    assert pyarrow_field.type == pyarrow.map_(pyarrow.int16(), pyarrow.int32())
    assert pyarrow_field.metadata == metadata
    assert pyarrow_field.nullable is False

    field_name = "struct_array"
    pyarrow_field = pyarrow_field_from_dict(
        {
            "name": field_name,
            "nullable": False,
            "metadata": metadata,
            "type": {"name": "list"},
            "children": [],
            "dictionary": {
                "id": 0,
                "indexType": {"name": "int", "bitWidth": 32, "isSigned": True},
            },
        }
    )
    assert pyarrow_field.name == field_name
    assert pyarrow_field.type == pyarrow.map_(
        pyarrow.int32(),
        pyarrow.list_(
            pyarrow.field(
                "element",
                pyarrow.struct(
                    [pyarrow.field("val", pyarrow.int32(), False, metadata)]
                ),
            )
        ),
    )
    assert pyarrow_field.metadata == metadata
    assert pyarrow_field.nullable is False

    field_name = "simple_dictionary"
    pyarrow_field = pyarrow_field_from_dict(
        {
            "name": field_name,
            "metadata": {"metadata_k": "metadata_v"},
            "nullable": False,
            "type": {"name": "dictionary"},
            "dictionary": {"indexType": {"type": {"name": "int", "bitWidth": 8}}},
            "children": [{"type": {"name": "int", "bitWidth": 32}}],
        }
    )
    assert pyarrow_field.name == field_name
    assert pyarrow_field.type == pyarrow.map_(pyarrow.int8(), pyarrow.int32())
    assert pyarrow_field.metadata == metadata
    assert pyarrow_field.nullable is False

    pyarrow_field = pyarrow_field_from_dict(
        {
            "name": field_name,
            "type": {"name": "struct"},
            "children": [
                {
                    "name": "x",
                    "type": {"name": "int", "bitWidth": 64},
                    "nullable": True,
                    "metadata": {},
                }
            ],
            "metadata": {"metadata_k": "metadata_v"},
            "nullable": False,
        }
    )
    assert pyarrow_field.name == field_name
    assert pyarrow_field.type == pyarrow.struct(
        [pyarrow.field("x", pyarrow.int64(), True, {})]
    )
    assert pyarrow_field.metadata == metadata
    assert pyarrow_field.nullable is False