Esempio n. 1
0
def test_decimal_array_with_none_and_nan():
    """None, np.nan and Decimal('nan') all become nulls in a decimal array."""
    values = [decimal.Decimal('1.234'), None, np.nan, decimal.Decimal('nan')]

    # Type inference should pick the minimal decimal128 for '1.234'.
    inferred = pa.array(values)
    assert inferred.type == pa.decimal128(4, 3)
    # First two entries round-trip; both NaN flavours become nulls.
    assert inferred.to_pylist() == values[:2] + [None, None]

    # With an explicit wider type the value is rescaled and NaNs stay null.
    explicit = pa.array(values, type=pa.decimal128(10, 4))
    assert explicit.to_pylist() == [decimal.Decimal('1.2340'), None, None, None]
Esempio n. 2
0
def test_sequence_decimal_different_precisions():
    """Decimals of differing precisions fit a common decimal128(13, 3) type."""
    values = [
        decimal.Decimal('1234234983.183'),
        decimal.Decimal('80943244.234'),
    ]
    decimal_type = pa.decimal128(precision=13, scale=3)
    result = pa.array(values, type=decimal_type)
    assert result.to_pylist() == values
Esempio n. 3
0
def test_cast_from_null():
    """Casting an all-null array preserves nulls for supported target types;
    dictionary and union targets raise NotImplementedError."""
    in_data = [None] * 3
    in_type = pa.null()

    # Targets for which null -> target casting is implemented.
    supported_targets = [
        pa.null(),
        pa.uint8(),
        pa.float16(),
        pa.utf8(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.int16()),
        pa.decimal128(19, 4),
        pa.timestamp('us'),
        pa.timestamp('us', tz='UTC'),
        pa.timestamp('us', tz='Europe/Paris'),
        pa.struct([pa.field('a', pa.int32()),
                   pa.field('b', pa.list_(pa.int8())),
                   pa.field('c', pa.string())]),
    ]
    for target in supported_targets:
        _check_cast_case((in_data, in_type, in_data, target))

    # Targets that are expected to be unimplemented.
    unsupported_targets = [
        pa.dictionary(pa.int32(), pa.string()),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())], mode=pa.lib.UnionMode_DENSE),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())], mode=pa.lib.UnionMode_SPARSE),
    ]
    null_arr = pa.array(in_data, type=pa.null())
    for target in unsupported_targets:
        with pytest.raises(NotImplementedError):
            null_arr.cast(target)
Esempio n. 4
0
File: jvm.py Progetto: rok/arrow
def field(jvm_field):
    """
    Construct a Field from a org.apache.arrow.vector.types.pojo.Field
    instance.

    Parameters
    ----------
    jvm_field: org.apache.arrow.vector.types.pojo.Field

    Returns
    -------
    pyarrow.Field

    Raises
    ------
    NotImplementedError
        If the JVM field has a complex type or an unsupported primitive
        type ID.
    """
    name = jvm_field.getName()
    jvm_type = jvm_field.getType()

    if jvm_type.isComplex():
        # TODO: The following JVM types are not implemented:
        #       Struct, List, FixedSizeList, Union, Dictionary
        raise NotImplementedError(
            "JVM field conversion only implemented for primitive types.")

    # Dispatch on the JVM type ID string. Each converter receives the JVM
    # type object; parameter-free Arrow types simply ignore it.
    converters = {
        'Null': lambda _: pa.null(),
        'Int': _from_jvm_int_type,
        'FloatingPoint': _from_jvm_float_type,
        'Utf8': lambda _: pa.string(),
        'Binary': lambda _: pa.binary(),
        'FixedSizeBinary': lambda t: pa.binary(t.getByteWidth()),
        'Bool': lambda _: pa.bool_(),
        'Time': _from_jvm_time_type,
        'Timestamp': _from_jvm_timestamp_type,
        'Date': _from_jvm_date_type,
        'Decimal': lambda t: pa.decimal128(t.getPrecision(), t.getScale()),
    }
    type_str = jvm_type.getTypeID().toString()
    converter = converters.get(type_str)
    if converter is None:
        raise NotImplementedError(
            "Unsupported JVM type: {}".format(type_str))
    typ = converter(jvm_type)

    nullable = jvm_field.isNullable()
    jvm_metadata = jvm_field.getMetadata()
    metadata = None if jvm_metadata.isEmpty() else dict(jvm_metadata)
    return pa.field(name, typ, nullable, metadata)
Esempio n. 5
0
 def test_decimal_64_from_pandas(self):
     """A pandas column of 12-digit decimals should infer decimal128(12, 6)."""
     df = pd.DataFrame({
         'decimals': [
             decimal.Decimal('-129934.123331'),
             decimal.Decimal('129534.123731'),
         ]
     })
     table = pa.Table.from_pandas(df, preserve_index=False)
     expected_schema = pa.schema(
         [pa.field('decimals', pa.decimal128(12, 6))])
     assert table.schema.equals(expected_schema)
Esempio n. 6
0
 def test_decimal_128_from_pandas(self):
     """A pandas column of 26-digit decimals should infer decimal128(26, 11)."""
     df = pd.DataFrame({
         'decimals': [
             decimal.Decimal('394092382910493.12341234678'),
             decimal.Decimal('-314292388910493.12343437128'),
         ]
     })
     table = pa.Table.from_pandas(df, preserve_index=False)
     expected_schema = pa.schema(
         [pa.field('decimals', pa.decimal128(26, 11))])
     assert table.schema.equals(expected_schema)
Esempio n. 7
0
def test_bit_width():
    """bit_width is defined for fixed-width types and raises otherwise."""
    fixed_width_cases = [
        (pa.bool_(), 1),
        (pa.int8(), 8),
        (pa.uint32(), 32),
        (pa.float16(), 16),
        (pa.decimal128(19, 4), 128),
        (pa.binary(42), 42 * 8),
    ]
    for typ, bits in fixed_width_cases:
        assert typ.bit_width == bits

    # Variable-width types have no bit width and must raise.
    for typ in (pa.binary(), pa.string(), pa.list_(pa.int16())):
        with pytest.raises(ValueError, match="fixed width"):
            typ.bit_width
Esempio n. 8
0
def test_type_schema_pickling():
    """Types, fields and schemas must survive a pickle round trip."""
    cases = [
        pa.int8(),
        pa.string(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.string()),
        pa.struct([
            pa.field('a', 'int8'),
            pa.field('b', 'string')
        ]),
        pa.union([
            pa.field('a', pa.int8()),
            pa.field('b', pa.int16())
        ], pa.lib.UnionMode_SPARSE),
        pa.union([
            pa.field('a', pa.int8()),
            pa.field('b', pa.int16())
        ], pa.lib.UnionMode_DENSE),
        pa.time32('s'),
        pa.time64('us'),
        pa.date32(),
        pa.date64(),
        pa.timestamp('ms'),
        pa.timestamp('ns'),
        pa.decimal128(12, 2),
        pa.field('a', 'string', metadata={b'foo': b'bar'})
    ]

    # Each type/field individually round-trips through pickle.
    for original in cases:
        assert original == pickle.loads(pickle.dumps(original))

    # Wrap bare types in fields so a schema can be built from all cases.
    fields = [
        item if isinstance(item, pa.Field)
        else pa.field('_f{}'.format(i), item)
        for i, item in enumerate(cases)
    ]

    schema = pa.schema(fields, metadata={b'foo': b'bar'})
    assert schema == pickle.loads(pickle.dumps(schema))
Esempio n. 9
0
def get_many_types():
    # Return a tuple covering a broad sample of Arrow data types.
    # Returning them from a function is required because the pa.dictionary
    # type holds a pyarrow array, and test_array.py::test_toal_bytes_allocated
    # checks that the default memory pool has zero allocated bytes (a
    # module-level constant would keep that allocation alive).
    return (
        pa.null(),
        pa.bool_(),
        pa.int32(),
        pa.time32('s'),
        pa.time64('us'),
        pa.date32(),
        pa.timestamp('us'),
        pa.timestamp('us', tz='UTC'),
        pa.timestamp('us', tz='Europe/Paris'),
        pa.float16(),
        pa.float32(),
        pa.float64(),
        pa.decimal128(19, 4),
        pa.string(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.int32()),
        # Structs differing only in child nullability must compare unequal.
        pa.struct([pa.field('a', pa.int32()),
                   pa.field('b', pa.int8()),
                   pa.field('c', pa.string())]),
        pa.struct([pa.field('a', pa.int32(), nullable=False),
                   pa.field('b', pa.int8(), nullable=False),
                   pa.field('c', pa.string())]),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())], mode=pa.lib.UnionMode_DENSE),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())], mode=pa.lib.UnionMode_SPARSE),
        pa.union([pa.field('a', pa.binary(10), nullable=False),
                  pa.field('b', pa.string())], mode=pa.lib.UnionMode_SPARSE),
        pa.dictionary(pa.int32(), pa.string())
    )
Esempio n. 10
0
def test_sequence_decimal_different_precisions():
    """Decimals with different precisions round-trip via decimal128(13, 3)."""
    values = [decimal.Decimal('1234234983.183'),
              decimal.Decimal('80943244.234')]
    target_type = pa.decimal128(precision=13, scale=3)
    result = pa.array(values, type=target_type)
    assert result.to_pylist() == values
Esempio n. 11
0
def test_sequence_decimal_from_integers():
    """Plain Python ints (including a huge negative) convert to decimal128."""
    raw_ints = [0, 1, -39402950693754869342983]
    as_decimals = [decimal.Decimal(v) for v in raw_ints]
    target_type = pa.decimal128(precision=28, scale=5)
    result = pa.array(raw_ints, type=target_type)
    assert result.to_pylist() == as_decimals
Esempio n. 12
0
def test_decimal_overflow():
    """decimal128 accepts precisions in [1, 38] and rejects values outside.

    Bug fix: the original loop ignored its loop variable and called
    ``pa.decimal128(39, 0)`` three times, so precisions 0 and -1 were
    never actually tested.
    """
    # Boundary precisions that must be accepted.
    pa.decimal128(1, 0)
    pa.decimal128(38, 0)
    # Precisions below the minimum (0, -1) and above the maximum (39)
    # must all raise.
    for precision in (0, -1, 39):
        with pytest.raises(ValueError):
            pa.decimal128(precision, 0)
Esempio n. 13
0
def test_is_decimal():
    """types.is_decimal accepts decimal128 and rejects non-decimal types."""
    decimal_type = pa.decimal128(19, 4)
    integer_type = pa.int32()
    assert types.is_decimal(decimal_type)
    assert not types.is_decimal(integer_type)
Esempio n. 14
0
def test_sequence_decimal_large_integer():
    """Decimals with large integer parts round-trip via decimal128(23, 5)."""
    values = [decimal.Decimal('-394029506937548693.42983'),
              decimal.Decimal('32358695912932.01033')]
    target_type = pa.decimal128(precision=23, scale=5)
    result = pa.array(values, type=target_type)
    assert result.to_pylist() == values
Esempio n. 15
0
     (pa.timestamp('s'), '{"name":"timestamp","unit":"SECOND",'
      '"timezone":null}'),
     (pa.timestamp('ms'), '{"name":"timestamp","unit":"MILLISECOND",'
      '"timezone":null}'),
     (pa.timestamp('us'), '{"name":"timestamp","unit":"MICROSECOND",'
      '"timezone":null}'),
     (pa.timestamp('ns'), '{"name":"timestamp","unit":"NANOSECOND",'
      '"timezone":null}'),
     (pa.timestamp('ns',
                   tz='UTC'), '{"name":"timestamp","unit":"NANOSECOND"'
      ',"timezone":"UTC"}'),
     (pa.timestamp('ns', tz='Europe/Paris'), '{"name":"timestamp",'
      '"unit":"NANOSECOND","timezone":"Europe/Paris"}'),
     (pa.date32(), '{"name":"date","unit":"DAY"}'),
     (pa.date64(), '{"name":"date","unit":"MILLISECOND"}'),
     (pa.decimal128(19, 4), '{"name":"decimal","precision":19,"scale":4}'),
     (pa.string(), '{"name":"utf8"}'),
     (pa.binary(), '{"name":"binary"}'),
     (pa.binary(10), '{"name":"fixedsizebinary","byteWidth":10}'),
     # TODO(ARROW-2609): complex types that have children
     # pa.list_(pa.int32()),
     # pa.struct([pa.field('a', pa.int32()),
     #            pa.field('b', pa.int8()),
     #            pa.field('c', pa.string())]),
     # pa.union([pa.field('a', pa.binary(10)),
     #           pa.field('b', pa.string())], mode=pa.lib.UnionMode_DENSE),
     # pa.union([pa.field('a', pa.binary(10)),
     #           pa.field('b', pa.string())], mode=pa.lib.UnionMode_SPARSE),
     # TODO: DictionaryType requires a vector in the type
     # pa.dictionary(pa.int32(), pa.array(['a', 'b', 'c'])),
 ])
Esempio n. 16
0
 def precision(self, value):
     # Property setter: validate the new precision against the current
     # scale, then rebuild the wrapped decimal128 type with it.
     # NOTE(review): assumes self._validate raises on an invalid
     # precision/scale combination -- confirm against the class definition.
     self._validate(value, self.scale)
     self._typ = pa.decimal128(precision=value, scale=self.scale)
Esempio n. 17
0
def generate_test_parquet():
    """Build and write a Parquet fixture covering a wide range of Arrow types.

    Creates five-row arrays for boolean/integer/float scalars, strings,
    timestamps in several timezones, times, dates, durations, binary,
    decimals, lists, fixed-size lists, a struct, maps, a dictionary array
    and a WKB "geometry" column; assembles them into a table, attaches
    "geo" schema metadata and writes ogr/data/parquet/test.parquet.

    NOTE(review): relies on a module-level ``wkt_epsg_4326`` defined
    elsewhere in this file -- confirm it is in scope before running.
    """
    import pyarrow as pa
    import datetime
    import decimal
    import json
    import pandas as pd
    import pathlib
    import pyarrow.parquet as pq
    import struct

    # --- Scalar columns: most use None at index 2 as the null slot. ---
    boolean = pa.array([True, False, None, False, True], type=pa.bool_())
    uint8 = pa.array([None if i == 2 else 1 + i for i in range(5)],
                     type=pa.uint8())
    int8 = pa.array([None if i == 2 else -2 + i for i in range(5)],
                    type=pa.int8())
    uint16 = pa.array([None if i == 2 else 1 + i * 10000 for i in range(5)],
                      type=pa.uint16())
    int16 = pa.array(
        [None if i == 2 else -20000 + i * 10000 for i in range(5)],
        type=pa.int16())
    uint32 = pa.array(
        [None if i == 2 else 1 + i * 1000000000 for i in range(5)],
        type=pa.uint32())
    int32 = pa.array(
        [None if i == 2 else -2000000000 + i * 1000000000 for i in range(5)],
        type=pa.int32())
    uint64 = pa.array(
        [None if i == 2 else 1 + i * 100000000000 for i in range(5)],
        type=pa.uint64())
    int64 = pa.array([
        None if i == 2 else -200000000000 + i * 100000000000 for i in range(5)
    ],
                     type=pa.int64())
    float32 = pa.array([None if i == 2 else 1.5 + i for i in range(5)],
                       type=pa.float32())
    float64 = pa.array([None if i == 2 else 1.5 + i for i in range(5)],
                       type=pa.float64())
    string = pa.array(["abcd", "", None, "c", "d"], type=pa.string())
    large_string = pa.array(["abcd", "", None, "c", "d"],
                            type=pa.large_string())
    # --- Timestamp columns in several fixed-offset timezones. ---
    # NOTE(review): nanosecond=500 * 1e6 passes a float for the nanosecond
    # component -- presumably intended as 500 ms; confirm pandas accepts it.
    gmt_plus_2 = datetime.timezone(datetime.timedelta(hours=2))
    timestamp_ms_gmt_plus_2 = pa.array([
        pd.Timestamp(year=2019,
                     month=1,
                     day=1,
                     hour=14,
                     nanosecond=500 * 1e6,
                     tz=gmt_plus_2)
    ] * 5,
                                       type=pa.timestamp('ms', tz=gmt_plus_2))
    gmt = datetime.timezone(datetime.timedelta(hours=0))
    timestamp_ms_gmt = pa.array([
        pd.Timestamp(
            year=2019, month=1, day=1, hour=14, nanosecond=500 * 1e6, tz=gmt)
    ] * 5,
                                type=pa.timestamp('ms', tz=gmt))
    gmt_minus_0215 = datetime.timezone(datetime.timedelta(hours=-2.25))
    timestamp_ms_gmt_minus_0215 = pa.array([
        pd.Timestamp(year=2019,
                     month=1,
                     day=1,
                     hour=14,
                     nanosecond=500 * 1e6,
                     tz=gmt_minus_0215)
    ] * 5,
                                           type=pa.timestamp(
                                               'ms', tz=gmt_minus_0215))
    timestamp_s_no_tz = pa.array([
        pd.Timestamp(year=2019, month=1, day=1, hour=14, nanosecond=500 * 1e6)
    ] * 5,
                                 type=pa.timestamp('s'))
    # --- Time, date and duration columns. ---
    time32_s = pa.array([3600 + 120 + 3, None, 3, 4, 5], type=pa.time32('s'))
    time32_ms = pa.array([(3600 + 120 + 3) * 1000 + 456, 2, 3, 4, 5],
                         type=pa.time32('ms'))
    time64_us = pa.array([(3600 + 120 + 3) * 1e6, None, 3, 4, 5],
                         type=pa.time64('us'))
    time64_ns = pa.array([(3600 + 120 + 3) * 1e9 + 456, 2, 3, 4, 5],
                         type=pa.time64('ns'))
    date32 = pa.array([1, 2, 3, 4, 5], type=pa.date32())
    date64 = pa.array([86400 * 1000, 2, 3, 4, 5], type=pa.date64())
    # Durations are built but excluded from `names` below.
    duration_s = pa.array([1, 2, 3, 4, 5], type=pa.duration('s'))
    duration_ms = pa.array([1, 2, 3, 4, 5], type=pa.duration('ms'))
    # --- Binary and decimal columns. ---
    binary = pa.array([b'\x00\x01'] * 5, type=pa.binary())
    large_binary = pa.array([b'\x00\x01'] * 5, type=pa.large_binary())
    fixed_size_binary = pa.array([b'\x00\x01'] * 5, type=pa.binary(2))
    decimal128 = pa.array([
        decimal.Decimal('1234.567'),
        decimal.Decimal('-1234.567'), None,
        decimal.Decimal('1234.567'),
        decimal.Decimal('-1234.567')
    ],
                          type=pa.decimal128(7, 3))
    decimal256 = pa.array([
        decimal.Decimal('1234.567'),
        decimal.Decimal('-1234.567'), None,
        decimal.Decimal('1234.567'),
        decimal.Decimal('-1234.567')
    ],
                          type=pa.decimal256(7, 3))
    # --- Variable-length list columns (row 2 is null, inner item 0 null). ---
    list_boolean = pa.array([
        None if i == 2 else [
            None if j == 0 else True if (j % 2) == 0 else False
            for j in range(i)
        ] for i in range(5)
    ],
                            type=pa.list_(pa.bool_()))
    list_uint8 = pa.array([
        None if i == 2 else
        [None if j == 0 else j + i * (i - 1) // 2 for j in range(i)]
        for i in range(5)
    ],
                          type=pa.list_(pa.uint8()))
    list_int8 = pa.array([
        None if i == 2 else
        [None if j == 0 else j + i * (i - 1) // 2 for j in range(i)]
        for i in range(5)
    ],
                         type=pa.list_(pa.int8()))
    list_uint16 = pa.array([
        None if i == 2 else
        [None if j == 0 else j + i * (i - 1) // 2 for j in range(i)]
        for i in range(5)
    ],
                           type=pa.list_(pa.uint16()))
    list_int16 = pa.array([
        None if i == 2 else
        [None if j == 0 else j + i * (i - 1) // 2 for j in range(i)]
        for i in range(5)
    ],
                          type=pa.list_(pa.int16()))
    list_uint32 = pa.array([
        None if i == 2 else
        [None if j == 0 else j + i * (i - 1) // 2 for j in range(i)]
        for i in range(5)
    ],
                           type=pa.list_(pa.uint32()))
    list_int32 = pa.array([
        None if i == 2 else
        [None if j == 0 else j + i * (i - 1) // 2 for j in range(i)]
        for i in range(5)
    ],
                          type=pa.list_(pa.int32()))
    list_uint64 = pa.array([
        None if i == 2 else
        [None if j == 0 else j + i * (i - 1) // 2 for j in range(i)]
        for i in range(5)
    ],
                           type=pa.list_(pa.uint64()))
    list_int64 = pa.array([
        None if i == 2 else
        [None if j == 0 else j + i * (i - 1) // 2 for j in range(i)]
        for i in range(5)
    ],
                          type=pa.list_(pa.int64()))
    list_float32 = pa.array([
        None if i == 2 else
        [None if j == 0 else 0.5 + j + i * (i - 1) // 2 for j in range(i)]
        for i in range(5)
    ],
                            type=pa.list_(pa.float32()))
    list_float64 = pa.array([
        None if i == 2 else
        [None if j == 0 else 0.5 + j + i * (i - 1) // 2 for j in range(i)]
        for i in range(5)
    ],
                            type=pa.list_(pa.float64()))
    list_string = pa.array([
        None if i == 2 else [
            "".join(["%c" % (65 + j + k) for k in range(1 + j)])
            for j in range(i)
        ] for i in range(5)
    ])
    # --- Fixed-size (length 2) list columns. ---
    fixed_size_list_boolean = pa.array(
        [[True, False], [False, True], [True, False], [False, True],
         [True, False]],
        type=pa.list_(pa.bool_(), 2))
    fixed_size_list_uint8 = pa.array([[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]],
                                     type=pa.list_(pa.uint8(), 2))
    fixed_size_list_int8 = pa.array([[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]],
                                    type=pa.list_(pa.int8(), 2))
    fixed_size_list_uint16 = pa.array([[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]],
                                      type=pa.list_(pa.uint16(), 2))
    fixed_size_list_int16 = pa.array([[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]],
                                     type=pa.list_(pa.int16(), 2))
    fixed_size_list_uint32 = pa.array([[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]],
                                      type=pa.list_(pa.uint32(), 2))
    fixed_size_list_int32 = pa.array([[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]],
                                     type=pa.list_(pa.int32(), 2))
    fixed_size_list_uint64 = pa.array([[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]],
                                      type=pa.list_(pa.uint64(), 2))
    fixed_size_list_int64 = pa.array([[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]],
                                     type=pa.list_(pa.int64(), 2))
    fixed_size_list_float32 = pa.array(
        [[0, None], [2, 3], [4, 5], [6, 7], [8, 9]],
        type=pa.list_(pa.float32(), 2))
    fixed_size_list_float64 = pa.array(
        [[0, None], [2, 3], [4, 5], [6, 7], [8, 9]],
        type=pa.list_(pa.float64(), 2))
    fixed_size_list_string = pa.array(
        [["a", "b"], ["c", "d"], ["e", "f"], ["g", "h"], ["i", "j"]],
        type=pa.list_(pa.string(), 2))
    # --- Struct column with a nested struct, a list and scalar fields. ---
    struct_field = pa.array([{
        "a": 1,
        "b": 2.5,
        "c": {
            "d": "e",
            "f": "g"
        },
        "h": [5, 6],
        "i": 3
    }] * 5)

    #struct_val = { "a": 5 }
    #for i in range(123):
    #    struct_val = { "a": struct_val }
    #struct_field = pa.array([struct_val] * 5)

    # --- Map columns: rows are [2 entries], [1 entry], null, [], []. ---
    map_boolean = pa.array([[('x', None),
                             ('y', True)], [('z', True)], None, [], []],
                           type=pa.map_(pa.string(), pa.bool_()))
    map_uint8 = pa.array([[('x', 1), ('y', None)], [('z', 3)], None, [], []],
                         type=pa.map_(pa.string(), pa.uint8()))
    map_int8 = pa.array([[('x', 1), ('y', None)], [('z', 3)], None, [], []],
                        type=pa.map_(pa.string(), pa.int8()))
    map_uint16 = pa.array([[('x', 1), ('y', None)], [('z', 3)], None, [], []],
                          type=pa.map_(pa.string(), pa.uint16()))
    map_int16 = pa.array([[('x', 1), ('y', None)], [('z', 3)], None, [], []],
                         type=pa.map_(pa.string(), pa.int16()))
    map_uint32 = pa.array([[('x', 4 * 1000 * 1000 * 1000),
                            ('y', None)], [('z', 3)], None, [], []],
                          type=pa.map_(pa.string(), pa.uint32()))
    map_int32 = pa.array([[('x', 2 * 1000 * 1000 * 1000),
                           ('y', None)], [('z', 3)], None, [], []],
                         type=pa.map_(pa.string(), pa.int32()))
    map_uint64 = pa.array([[('x', 4 * 1000 * 1000 * 1000 * 1000),
                            ('y', None)], [('z', 3)], None, [], []],
                          type=pa.map_(pa.string(), pa.uint64()))
    map_int64 = pa.array([[('x', -2 * 1000 * 1000 * 1000 * 1000),
                           ('y', None)], [('z', 3)], None, [], []],
                         type=pa.map_(pa.string(), pa.int64()))
    map_float32 = pa.array([[('x', 1.5),
                             ('y', None)], [('z', 3)], None, [], []],
                           type=pa.map_(pa.string(), pa.float32()))
    map_float64 = pa.array([[('x', 1.5),
                             ('y', None)], [('z', 3)], None, [], []],
                           type=pa.map_(pa.string(), pa.float64()))
    map_string = pa.array([[('x', 'x_val'),
                            ('y', None)], [('z', 'z_val')], None, [], []],
                          type=pa.map_(pa.string(), pa.string()))

    # Dictionary-encoded column. `dict` deliberately shadows the builtin:
    # the column arrays are looked up by name in locals() below, and the
    # output column must be called "dict".
    indices = pa.array([0, 1, 2, None, 2])
    dictionary = pa.array(['foo', 'bar', 'baz'])
    dict = pa.DictionaryArray.from_arrays(indices, dictionary)

    # Built but excluded from `names` below.
    map_list = pa.array([[('x', []), ('y', [])], [('z', [])], None, [], []],
                        type=pa.map_(pa.string(), pa.list_(pa.uint32())))

    # WKB-encoded point geometries (little-endian POINT header + x, y).
    geometry = pa.array([
        None if i == 1 else
        (b'\x01\x01\x00\x00\x00' + struct.pack('<dd', i, 2)) for i in range(5)
    ],
                        type=pa.binary())

    # Column order for the output table; each name must match a local
    # variable above (commented-out entries are intentionally excluded).
    names = [
        "boolean",
        "uint8",
        "int8",
        "uint16",
        "int16",
        "uint32",
        "int32",
        "uint64",
        "int64",
        "float32",
        "float64",
        "string",
        "large_string",
        "timestamp_ms_gmt",
        "timestamp_ms_gmt_plus_2",
        "timestamp_ms_gmt_minus_0215",
        "timestamp_s_no_tz",
        "time32_s",
        "time32_ms",
        "time64_us",
        "time64_ns",
        "date32",
        "date64",
        # "duration_s",
        # "duration_ms",
        "binary",
        "large_binary",
        "fixed_size_binary",
        "decimal128",
        "decimal256",
        "list_boolean",
        "list_uint8",
        "list_int8",
        "list_uint16",
        "list_int16",
        "list_uint32",
        "list_int32",
        "list_uint64",
        "list_int64",
        "list_float32",
        "list_float64",
        "list_string",
        "fixed_size_list_boolean",
        "fixed_size_list_uint8",
        "fixed_size_list_int8",
        "fixed_size_list_uint16",
        "fixed_size_list_int16",
        "fixed_size_list_uint32",
        "fixed_size_list_int32",
        "fixed_size_list_uint64",
        "fixed_size_list_int64",
        "fixed_size_list_float32",
        "fixed_size_list_float64",
        "fixed_size_list_string",
        "struct_field",
        "map_boolean",
        "map_uint8",
        "map_int8",
        "map_uint16",
        "map_int16",
        "map_uint32",
        "map_int32",
        "map_uint64",
        "map_int64",
        "map_float32",
        "map_float64",
        "map_string",
        # "map_list",
        "dict",
        "geometry",
    ]

    # Gather the column arrays by name from the local namespace.
    locals_ = locals()
    table = pa.table([locals_[x] for x in names], names=names)

    # Attach GeoParquet-style "geo" metadata describing the geometry column.
    my_schema = table.schema.with_metadata({
        "geo":
        json.dumps({
            "version": "0.1.0",
            "primary_column": "geometry",
            "columns": {
                "geometry": {
                    'crs': wkt_epsg_4326,
                    'bbox': [0, 2, 4, 2],
                    'encoding': 'WKB'
                }
            }
        })
    })

    table = table.cast(my_schema)
    HERE = pathlib.Path(__file__).parent
    # Small row groups (3 rows) so readers exercise multi-group files.
    pq.write_table(table,
                   HERE / "ogr/data/parquet/test.parquet",
                   compression='NONE',
                   row_group_size=3)
        "FLOAT64",
        pyarrow.time32("ms").id:
        "TIME",
        pyarrow.time64("ns").id:
        "TIME",
        pyarrow.timestamp("ns").id:
        "TIMESTAMP",
        pyarrow.date32().id:
        "DATE",
        pyarrow.date64().id:
        "DATETIME",  # because millisecond resolution
        pyarrow.binary().id:
        "BYTES",
        pyarrow.string().id:
        "STRING",  # also alias for pyarrow.utf8()
        pyarrow.decimal128(38, scale=9).id:
        "NUMERIC",
        # The exact decimal's scale and precision are not important, as only
        # the type ID matters, and it's the same for all decimal128 instances.
    }

else:  # pragma: NO COVER
    BQ_TO_ARROW_SCALARS = {}  # pragma: NO COVER
    ARROW_SCALAR_IDS_TO_BQ = {}  # pragma: NO_COVER


def bq_to_arrow_struct_data_type(field):
    arrow_fields = []
    for subfield in field.fields:
        arrow_subfield = bq_to_arrow_field(subfield)
        if arrow_subfield:
Esempio n. 19
0

_supported_pyarrow_types = [
    pa.null(),
    pa.bool_(),
    pa.int32(),
    pa.time32("s"),
    pa.time64("us"),
    pa.date32(),
    pa.timestamp("us"),
    pa.timestamp("us", tz="UTC"),
    pa.timestamp("us", tz="Europe/Paris"),
    pa.float16(),
    pa.float32(),
    pa.float64(),
    pa.decimal128(19, 4),
    pa.string(),
    pa.binary(),
    pa.binary(10),
    pa.large_string(),
    pa.large_binary(),
    pa.list_(pa.int32()),
    pa.list_(pa.int32(), 2),
    pa.large_list(pa.uint16()),
    pa.struct(
        [
            pa.field("a", pa.int32()),
            pa.field("b", pa.int8()),
            pa.field("c", pa.string()),
        ]
    ),
Esempio n. 20
0
        (pa.int32(), 'int32'),
        (pa.int64(), 'int64'),
        (pa.uint8(), 'uint8'),
        (pa.uint16(), 'uint16'),
        (pa.uint32(), 'uint32'),
        (pa.uint64(), 'uint64'),
        (pa.float16(), 'float16'),
        (pa.float32(), 'float32'),
        (pa.float64(), 'float64'),
        (pa.date32(), 'date'),
        (pa.date64(), 'date'),
        (pa.binary(), 'bytes'),
        (pa.binary(length=4), 'bytes'),
        (pa.string(), 'unicode'),
        (pa.list_(pa.list_(pa.int16())), 'list[list[int16]]'),
        (pa.decimal128(18, 3), 'decimal'),
        (pa.timestamp('ms'), 'datetime'),
        (pa.timestamp('us', 'UTC'), 'datetimetz'),
        (pa.time32('s'), 'time'),
        (pa.time64('us'), 'time')
    ]
)
def test_logical_type(type, expected):
    """get_logical_type maps each parametrized Arrow type to its name."""
    actual = get_logical_type(type)
    assert actual == expected


def test_array_uint64_from_py_over_range():
    """Python ints above the int64 range still convert to uint64."""
    value = 2 ** 63
    from_python = pa.array([value], type=pa.uint64())
    from_numpy = pa.array(np.array([value], dtype='u8'))
    assert from_python.equals(from_numpy)
Esempio n. 21
0
class TestAbstractFileParserStatics:
    """Tests for the static JSON <-> PyArrow type/schema conversions.

    Bug fix: in both schema tests, ``LOGGER.debug(str(e_info))`` was placed
    *inside* the ``with pytest.raises(...)`` block after the call expected
    to raise, so it could never execute; it is now dedented to run after
    the context manager exits.
    """

    @pytest.mark.parametrize(  # testing all datatypes as laid out here: https://json-schema.org/understanding-json-schema/reference/type.html
        "input_json_type, output_pyarrow_type",
        [
            ("string", pa.large_string()),
            ("number", pa.float64()),
            ("integer", pa.int64()),
            ("object", pa.large_string()),
            ("array", pa.large_string()),
            ("boolean", pa.bool_()),
            ("null", pa.large_string()),
        ],
    )
    def test_json_type_to_pyarrow_type(self, input_json_type: str, output_pyarrow_type: Any) -> None:
        """Json -> PyArrow direction."""
        LOGGER.info(f"asserting that JSON type '{input_json_type}' converts to PyArrow type '{output_pyarrow_type}'...")
        assert AbstractFileParser.json_type_to_pyarrow_type(input_json_type) == output_pyarrow_type

    @pytest.mark.parametrize(  # testing all datatypes as laid out here: https://arrow.apache.org/docs/python/api/datatypes.html
        "input_pyarrow_types, output_json_type",
        [
            ((pa.null(),), "string"),  # null type
            ((pa.bool_(),), "boolean"),  # boolean type
            (
                (pa.int8(), pa.int16(), pa.int32(), pa.int64(), pa.uint8(), pa.uint16(), pa.uint32(), pa.uint64()),
                "integer",
            ),  # integer types
            # NOTE(review): decimal128(5, 10) has scale > precision -- confirm pyarrow accepts this.
            ((pa.float16(), pa.float32(), pa.float64(), pa.decimal128(5, 10), pa.decimal256(3, 8)), "number"),  # number types
            ((pa.time32("s"), pa.time64("ns"), pa.timestamp("ms"), pa.date32(), pa.date64()), "string"),  # temporal types
            ((pa.binary(), pa.large_binary()), "string"),  # binary types
            ((pa.string(), pa.utf8(), pa.large_string(), pa.large_utf8()), "string"),  # string types
            ((pa.list_(pa.string()), pa.large_list(pa.timestamp("us"))), "string"),  # array types
            ((pa.map_(pa.string(), pa.float32()), pa.dictionary(pa.int16(), pa.list_(pa.string()))), "string"),  # object types
        ],
    )
    def test_json_type_to_pyarrow_type_reverse(self, input_pyarrow_types: Tuple[Any], output_json_type: str) -> None:
        """PyArrow -> Json direction (reverse=True)."""
        for typ in input_pyarrow_types:
            LOGGER.info(f"asserting that PyArrow type '{typ}' converts to JSON type '{output_json_type}'...")
            assert AbstractFileParser.json_type_to_pyarrow_type(typ, reverse=True) == output_json_type

    @pytest.mark.parametrize(  # if expecting fail, put pyarrow_schema as None
        "json_schema, pyarrow_schema",
        [
            (
                {"a": "string", "b": "number", "c": "integer", "d": "object", "e": "array", "f": "boolean", "g": "null"},
                {
                    "a": pa.large_string(),
                    "b": pa.float64(),
                    "c": pa.int64(),
                    "d": pa.large_string(),
                    "e": pa.large_string(),
                    "f": pa.bool_(),
                    "g": pa.large_string(),
                },
            ),
            ({"single_column": "object"}, {"single_column": pa.large_string()}),
            ({}, {}),
            ({"a": "NOT A REAL TYPE", "b": "another fake type"}, {"a": pa.large_string(), "b": pa.large_string()}),
            (["string", "object"], None),  # bad input type
        ],
    )
    def test_json_schema_to_pyarrow_schema(self, json_schema: Mapping[str, Any], pyarrow_schema: Mapping[str, Any]) -> None:
        """Json -> PyArrow direction."""
        if pyarrow_schema is not None:
            assert AbstractFileParser.json_schema_to_pyarrow_schema(json_schema) == pyarrow_schema
        else:
            with pytest.raises(Exception) as e_info:
                AbstractFileParser.json_schema_to_pyarrow_schema(json_schema)
            # Must run after the with block: the statement above raises,
            # so anything following it inside the block is unreachable.
            LOGGER.debug(str(e_info))

    @pytest.mark.parametrize(  # if expecting fail, put json_schema as None
        "pyarrow_schema, json_schema",
        [
            (
                {
                    "a": pa.utf8(),
                    "b": pa.float16(),
                    "c": pa.uint32(),
                    "d": pa.map_(pa.string(), pa.float32()),
                    "e": pa.bool_(),
                    "f": pa.date64(),
                },
                {"a": "string", "b": "number", "c": "integer", "d": "string", "e": "boolean", "f": "string"},
            ),
            ({"single_column": pa.int32()}, {"single_column": "integer"}),
            ({}, {}),
            ({"a": "NOT A REAL TYPE", "b": "another fake type"}, {"a": "string", "b": "string"}),
            (["string", "object"], None),  # bad input type
        ],
    )
    def test_json_schema_to_pyarrow_schema_reverse(self, pyarrow_schema: Mapping[str, Any], json_schema: Mapping[str, Any]) -> None:
        """PyArrow -> Json direction (reverse=True)."""
        if json_schema is not None:
            assert AbstractFileParser.json_schema_to_pyarrow_schema(pyarrow_schema, reverse=True) == json_schema
        else:
            with pytest.raises(Exception) as e_info:
                AbstractFileParser.json_schema_to_pyarrow_schema(pyarrow_schema, reverse=True)
            # Dedented for the same reason as above.
            LOGGER.debug(str(e_info))
Esempio n. 22
0
def test_sequence_decimal_negative():
    """Negative Decimal values round-trip through pa.array unchanged."""
    expected = [decimal.Decimal('-1234.234983'), decimal.Decimal('-8.094324')]
    decimal_ty = pa.decimal128(precision=10, scale=6)
    assert pa.array(expected, type=decimal_ty).to_pylist() == expected
Esempio n. 23
0
def _parquet_schema(dataframe: pd.DataFrame,
                    custom_redshift_columns: dict = None):
    """ Translates pandas dtypes to PyArrow types and creates a Schema from them

    Args:
        dataframe (pd.DataFrame): Dataframe to pull the schema of
        custom_redshift_columns (dict, Optional):
            This dictionary contains custom column data type definitions for redshift.
            The params should be formatted as follows:
                - column name (str)
                - data type (str), e.g. "DECIMAL(10,2)"

    Returns:
        PyArrow Schema of the given dataframe
        Potentially modified Dataframe (pandas nullable Int32/Int64 columns
        are cast to object dtype before conversion)

    Raises:
        NotImplementedError: if a column dtype has no Parquet mapping
    """
    fields = []
    for col, dtype in dataframe.dtypes.items():
        dtype = dtype.name
        if dtype == 'object':
            if custom_redshift_columns:
                # Detect if the Pandas object column contains Python decimal objects.
                # str(values)[:9] is exactly the length of "[Decimal(", so this
                # keys off the repr of the column's first element only.
                if "[Decimal(" in str(dataframe[col].values)[:9]:
                    # If Python decimal objects are present, parse out the precision and scale
                    # from the custom_redshift_columns dictionary to use when converting
                    # to PyArrow's decimal128 data type.
                    s = custom_redshift_columns[col]
                    precision = int(s[s.find('DECIMAL(') +
                                      len('DECIMAL('):s.rfind(',')].strip())
                    scale = int(s[s.find(',') + len(','):s.rfind(')')].strip())
                    pa_type = pa.decimal128(precision=precision, scale=scale)
                else:
                    pa_type = pa.string()
            else:
                pa_type = pa.string()
        # NOTE: this startswith chain is order-sensitive — 'int32'/'int64' are
        # tested before 'int8' so the longer names match first.
        elif dtype.startswith('int32'):
            pa_type = pa.int32()
        elif dtype.startswith('int64'):
            pa_type = pa.int64()
        elif dtype.startswith('int8'):
            pa_type = pa.int8()
        # Capitalized Int32/Int64 are pandas nullable dtypes; the column is
        # cast to object before conversion (presumably so missing values
        # survive the Arrow conversion — NOTE(review): confirm).
        elif dtype.startswith('Int32'):
            dataframe = dataframe.astype({col: 'object'})
            pa_type = pa.int32()
        elif dtype.startswith('Int64'):
            dataframe = dataframe.astype({col: 'object'})
            pa_type = pa.int64()
        elif dtype.startswith('float32'):
            pa_type = pa.float32()
        elif dtype.startswith('float64'):
            pa_type = pa.float64()
        elif dtype.startswith('float16'):
            pa_type = pa.float16()
        # 'datetime' must be tested before 'date': both prefixes match
        # datetime dtypes.
        elif dtype.startswith('datetime'):
            pa_type = pa.timestamp('ns')
        elif dtype.startswith('date'):
            pa_type = pa.date64()
        elif dtype.startswith('category'):
            pa_type = pa.string()
        elif dtype == 'bool':
            pa_type = pa.bool_()
        else:
            raise NotImplementedError(
                f"Error: {dtype} is not a datatype which can be mapped to Parquet using s3parq."
            )
        fields.append(pa.field(col, pa_type))

    return (pa.schema(fields=fields), dataframe)
Esempio n. 24
0
def test_from_arrow_max_precision():
    """from_arrow must reject decimal arrays whose precision exceeds the supported maximum."""
    with pytest.raises(ValueError):
        source = pa.array([1, 2, 3], type=pa.decimal128(scale=0, precision=19))
        DecimalColumn.from_arrow(source)
Esempio n. 25
0
def pyarrow_numeric():
    """Return the Arrow decimal128 type with precision 38 and scale 9."""
    numeric_type = pyarrow.decimal128(38, 9)
    return numeric_type
Esempio n. 26
0
def test_decimal_properties():
    """decimal128(19, 4) exposes its byte width, precision and scale."""
    decimal_ty = pa.decimal128(19, 4)
    assert (decimal_ty.byte_width,
            decimal_ty.precision,
            decimal_ty.scale) == (16, 19, 4)
Esempio n. 27
0
 def test_complex_unload_as_arrow(self, arrow_cursor):
     """Unload every supported column of one_row_complex as an Arrow table
     and verify both the resulting schema and the single row of values."""
     # NOT_SUPPORTED: Unsupported Hive type: time
     # NOT_SUPPORTED: Unsupported Hive type: json
     table = arrow_cursor.execute("""
         SELECT
           col_boolean
           ,col_tinyint
           ,col_smallint
           ,col_int
           ,col_bigint
           ,col_float
           ,col_double
           ,col_string
           ,col_varchar
           ,col_timestamp
           ,col_date
           ,col_binary
           ,col_array
           ,col_map
           ,col_struct
           ,col_decimal
         FROM one_row_complex
         """).as_arrow()
     # Exactly one row, one column per selected expression.
     assert table.shape[0] == 1
     assert table.shape[1] == 16
     assert table.schema == pa.schema([
         pa.field("col_boolean", pa.bool_()),
         pa.field("col_tinyint", pa.int32()),
         pa.field("col_smallint", pa.int32()),
         pa.field("col_int", pa.int32()),
         pa.field("col_bigint", pa.int64()),
         pa.field("col_float", pa.float32()),
         pa.field("col_double", pa.float64()),
         pa.field("col_string", pa.string()),
         pa.field("col_varchar", pa.string()),
         pa.field("col_timestamp", pa.timestamp("ns")),
         pa.field("col_date", pa.date32()),
         pa.field("col_binary", pa.binary()),
         pa.field("col_array",
                  pa.list_(pa.field("array_element", pa.int32()))),
         pa.field("col_map",
                  pa.map_(pa.int32(), pa.field("entries", pa.int32()))),
         pa.field(
             "col_struct",
             pa.struct(
                 [pa.field("a", pa.int32()),
                  pa.field("b", pa.int32())]),
         ),
         pa.field("col_decimal", pa.decimal128(10, 1)),
     ])
     # Re-zip the column-oriented dict into rows and compare the one row.
     assert [row for row in zip(*table.to_pydict().values())] == [(
         True,
         127,
         32767,
         2147483647,
         9223372036854775807,
         0.5,
         0.25,
         "a string",
         "varchar",
         pd.Timestamp(2017, 1, 1, 0, 0, 0),
         datetime(2017, 1, 2).date(),
         b"123",
         [1, 2],
         [(1, 2), (3, 4)],
         {
             "a": 1,
             "b": 2
         },
         Decimal("0.1"),
     )]
Esempio n. 28
0
def test_sequence_decimal_negative():
    """pa.array preserves negative Decimal values exactly."""
    ty = pa.decimal128(precision=10, scale=6)
    values = [decimal.Decimal('-1234.234983'), decimal.Decimal('-8.094324')]
    round_tripped = pa.array(values, type=ty).to_pylist()
    assert round_tripped == values
@pytest.mark.parametrize(
    argnames="meta_type,arrow_type",
    argvalues=[
        ("bool_", pa.bool_()),
        ("int8", pa.int8()),
        ("int16", pa.int16()),
        ("int32", pa.int32()),
        ("int64", pa.int64()),
        ("uint8", pa.uint8()),
        ("uint16", pa.uint16()),
        ("uint32", pa.uint32()),
        ("uint64", pa.uint64()),
        ("float16", pa.float16()),
        ("float32", pa.float32()),
        ("float64", pa.float64()),
        ("decimal128(38,1)", pa.decimal128(38, 1)),
        ("decimal128(1,2)", pa.decimal128(1, 2)),
        ("time32(s)", pa.time32("s")),
        ("time32(ms)", pa.time32("ms")),
        ("time64(us)", pa.time64("us")),
        ("time64(ns)", pa.time64("ns")),
        ("timestamp(s)", pa.timestamp("s")),
        ("timestamp(ms)", pa.timestamp("ms")),
        ("timestamp(us)", pa.timestamp("us")),
        ("timestamp(ns)", pa.timestamp("ns")),
        ("date32", pa.date32()),
        ("date64", pa.date64()),
        ("string", pa.string()),
        ("large_string", pa.large_string()),
        ("utf8", pa.utf8()),
        ("large_utf8", pa.large_utf8()),
Esempio n. 30
0
def test_decimal_byte_width():
    # decimal128 values are stored in 16 bytes (128 bits).
    assert pa.decimal128(19, 4).byte_width == 16
def test_generate_from_meta():
    """ArrowConverter.generate_from_meta maps metadata columns onto an Arrow schema,
    with partition columns excluded by default and appended when requested."""
    md = Metadata.from_dict({
        "name": "test_table",
        "file_format": "test-format",
        "columns": [
            {"name": "my_int", "type": "int64",
             "description": "This is an integer", "nullable": False},
            {"name": "my_double", "type": "float64", "nullable": True},
            {"name": "my_date", "type": "date64"},
            {"name": "my_decimal", "type": "decimal128(10,2)"},
            {"name": "my_timestamp", "type": "timestamp(s)",
             "description": "Partition column"},
        ],
        "partitions": ["my_timestamp"],
    })

    ac = ArrowConverter()
    assert isinstance(ac.options, BaseConverterOptions)

    schema1 = ac.generate_from_meta(md)
    schema2 = ac.generate_from_meta(md, False)
    assert isinstance(schema1, pa.Schema)
    assert isinstance(schema2, pa.Schema)

    names = ["my_int", "my_double", "my_date", "my_decimal"]
    types = [pa.int64(), pa.float64(), pa.date64(), pa.decimal128(10, 2)]

    # Default call drops the partition column.
    assert schema1.names == names
    assert all(actual.equals(expected)
               for actual, expected in zip(schema1.types, types))

    # Second call keeps my_timestamp appended at the end.
    names.append("my_timestamp")
    types.append(pa.timestamp("s"))
    assert schema2.names == names
    assert all(actual.equals(expected)
               for actual, expected in zip(schema2.types, types))

    # Spot-check that parametrized type properties survive the conversion.
    decimal_field = schema2.field("my_decimal")
    assert decimal_field.type.precision == 10
    assert decimal_field.type.scale == 2
    assert schema2.field("my_timestamp").type.unit == "s"
Esempio n. 32
0
def test_decimal_properties():
    """A decimal128 type reports its width, precision and scale."""
    decimal_ty = pa.decimal128(19, 4)
    expected = {"byte_width": 16, "precision": 19, "scale": 4}
    actual = {name: getattr(decimal_ty, name) for name in expected}
    assert actual == expected
Esempio n. 33
0
    (pa.time64('ns'), '{"name":"time","unit":"NANOSECOND","bitWidth":64}'),
    (pa.timestamp('s'), '{"name":"timestamp","unit":"SECOND",'
        '"timezone":null}'),
    (pa.timestamp('ms'), '{"name":"timestamp","unit":"MILLISECOND",'
        '"timezone":null}'),
    (pa.timestamp('us'), '{"name":"timestamp","unit":"MICROSECOND",'
        '"timezone":null}'),
    (pa.timestamp('ns'), '{"name":"timestamp","unit":"NANOSECOND",'
        '"timezone":null}'),
    (pa.timestamp('ns', tz='UTC'), '{"name":"timestamp","unit":"NANOSECOND"'
        ',"timezone":"UTC"}'),
    (pa.timestamp('ns', tz='Europe/Paris'), '{"name":"timestamp",'
        '"unit":"NANOSECOND","timezone":"Europe/Paris"}'),
    (pa.date32(), '{"name":"date","unit":"DAY"}'),
    (pa.date64(), '{"name":"date","unit":"MILLISECOND"}'),
    (pa.decimal128(19, 4), '{"name":"decimal","precision":19,"scale":4}'),
    (pa.string(), '{"name":"utf8"}'),
    (pa.binary(), '{"name":"binary"}'),
    (pa.binary(10), '{"name":"fixedsizebinary","byteWidth":10}'),
    # TODO(ARROW-2609): complex types that have children
    # pa.list_(pa.int32()),
    # pa.struct([pa.field('a', pa.int32()),
    #            pa.field('b', pa.int8()),
    #            pa.field('c', pa.string())]),
    # pa.union([pa.field('a', pa.binary(10)),
    #           pa.field('b', pa.string())], mode=pa.lib.UnionMode_DENSE),
    # pa.union([pa.field('a', pa.binary(10)),
    #           pa.field('b', pa.string())], mode=pa.lib.UnionMode_SPARSE),
    # TODO: DictionaryType requires a vector in the type
    # pa.dictionary(pa.int32(), pa.array(['a', 'b', 'c'])),
])
Esempio n. 34
0
def test_sequence_decimal_no_whole_part():
    """Decimals whose digits are all fractional round-trip through pa.array."""
    fractional = [decimal.Decimal('-.4234983'), decimal.Decimal('.0103943')]
    arr = pa.array(fractional, type=pa.decimal128(precision=7, scale=7))
    assert arr.to_pylist() == fractional
Esempio n. 35
0
        (pa.int32(), 'int32'),
        (pa.int64(), 'int64'),
        (pa.uint8(), 'uint8'),
        (pa.uint16(), 'uint16'),
        (pa.uint32(), 'uint32'),
        (pa.uint64(), 'uint64'),
        (pa.float16(), 'float16'),
        (pa.float32(), 'float32'),
        (pa.float64(), 'float64'),
        (pa.date32(), 'date'),
        (pa.date64(), 'date'),
        (pa.binary(), 'bytes'),
        (pa.binary(length=4), 'bytes'),
        (pa.string(), 'unicode'),
        (pa.list_(pa.list_(pa.int16())), 'list[list[int16]]'),
        (pa.decimal128(18, 3), 'decimal'),
        (pa.timestamp('ms'), 'datetime'),
        (pa.timestamp('us', 'UTC'), 'datetimetz'),
        (pa.time32('s'), 'time'),
        (pa.time64('us'), 'time')
    ]
)
def test_logical_type(type, expected):
    # `type` shadows the builtin, but the parameter name is fixed by the
    # parametrize argnames above; each Arrow type must map to its logical name.
    assert get_logical_type(type) == expected


def test_array_uint64_from_py_over_range():
    """2**63 exceeds int64 range but must still convert as uint64."""
    value = 2 ** 63
    from_python = pa.array([value], type=pa.uint64())
    from_numpy = pa.array(np.array([value], dtype='u8'))
    assert from_python.equals(from_numpy)
Esempio n. 36
0
def test_sequence_decimal():
    """A list of Decimal values round-trips through pa.array."""
    values = [decimal.Decimal('1234.183'), decimal.Decimal('8094.234')]
    arr = pa.array(values, type=pa.decimal128(precision=7, scale=3))
    assert arr.to_pylist() == values
Esempio n. 37
0
def test_is_decimal():
    """types.is_decimal is true for decimal types and false otherwise."""
    decimal_ty = pa.decimal128(19, 4)
    assert types.is_decimal(decimal_ty)
    assert not types.is_decimal(pa.int32())
Esempio n. 38
0
def test_sequence_decimal_no_scale():
    """decimal128 accepts precision without an explicit scale."""
    whole_numbers = [decimal.Decimal('1234234983'), decimal.Decimal('8094324')]
    arr = pa.array(whole_numbers, type=pa.decimal128(precision=10))
    assert arr.to_pylist() == whole_numbers
Esempio n. 39
0
    def get_type_and_builtins(self, n, type_name):
        """
        Return a `(arrow type, list)` tuple where the arrow type
        corresponds to the given logical *type_name*, and the list
        is a list of *n* random-generated Python objects compatible
        with the arrow type.
        """
        size = None

        # Classify the logical type name into a dispatch "kind".
        if type_name in ('bool', 'decimal', 'ascii', 'unicode', 'int64 list'):
            kind = type_name
        elif type_name.startswith(('int', 'uint')):
            kind = 'int'
        elif type_name.startswith('float'):
            kind = 'float'
        elif type_name.startswith('struct'):
            kind = 'struct'
        elif type_name == 'binary':
            kind = 'varying binary'
        elif type_name.startswith('binary'):
            kind = 'fixed binary'
            size = int(type_name[len('binary'):])
            assert size > 0
        else:
            raise ValueError("unrecognized type %r" % (type_name,))

        # Build the Arrow type for that kind.
        if kind in ('int', 'float'):
            ty = getattr(pa, type_name)()
        else:
            builders = {
                'bool': pa.bool_,
                'decimal': lambda: pa.decimal128(9, 9),
                'fixed binary': lambda: pa.binary(size),
                'varying binary': pa.binary,
                'ascii': pa.string,
                'unicode': pa.string,
                'int64 list': lambda: pa.list_(pa.int64()),
                'struct': lambda: pa.struct([pa.field('u', pa.int64()),
                                             pa.field('v', pa.float64()),
                                             pa.field('w', pa.bool_())]),
            }
            ty = builders[kind]()

        # Pick the matching random-data generator.
        factories = {
            'int': self.generate_int_list,
            'float': self.generate_float_list,
            'bool': self.generate_bool_list,
            'decimal': self.generate_decimal_list,
            'fixed binary': partial(self.generate_fixed_binary_list,
                                    size=size),
            'varying binary': partial(self.generate_varying_binary_list,
                                      min_size=3, max_size=40),
            'ascii': partial(self.generate_ascii_string_list,
                             min_size=3, max_size=40),
            'unicode': partial(self.generate_unicode_string_list,
                               min_size=3, max_size=40),
            'int64 list': partial(self.generate_int_list_list,
                                  min_size=0, max_size=20),
            'struct': self.generate_dict_list,
            'struct from tuples': self.generate_tuple_list,
        }
        return ty, factories[kind](n)
Esempio n. 40
0
    [
        [Decimal("1.1"), Decimal("2.2"), Decimal("3.3"), Decimal("4.4")],
        [Decimal("-1.1"), Decimal("2.2"), Decimal("3.3"), Decimal("4.4")],
        [1],
        [-1],
        [1, 2, 3, 4],
        [42, 1729, 4104],
        [1, 2, None, 4],
        [None, None, None],
        [],
    ],
)
@pytest.mark.parametrize(
    "typ",
    [
        pa.decimal128(precision=4, scale=2),
        pa.decimal128(precision=5, scale=3),
        pa.decimal128(precision=6, scale=4),
    ],
)
def test_round_trip_decimal_column(data, typ):
    """Arrow -> DecimalColumn -> Arrow preserves the array exactly."""
    source = pa.array(data, type=typ)
    round_tripped = DecimalColumn.from_arrow(source).to_arrow()
    assert source.equals(round_tripped)


def test_from_arrow_max_precision():
    """from_arrow rejects decimal arrays whose precision is too large."""
    too_wide = pa.decimal128(scale=0, precision=19)
    with pytest.raises(ValueError):
        DecimalColumn.from_arrow(pa.array([1, 2, 3], type=too_wide))
Esempio n. 41
0
def test_sequence_decimal_large_integer():
    """Decimals with very large whole parts round-trip through pa.array."""
    big_values = [decimal.Decimal('-394029506937548693.42983'),
                  decimal.Decimal('32358695912932.01033')]
    arr = pa.array(big_values, type=pa.decimal128(precision=23, scale=5))
    assert arr.to_pylist() == big_values
Esempio n. 42
0
    "time": {
        "type": "long",
        "logicalType": "time-micros"
    },
    "timestamp": {
        "type": "long",
        "logicalType": "timestamp-micros"
    },
}
# This dictionary is duplicated in bigquery/google/cloud/bigquery/_pandas_helpers.py
# When modifying it be sure to update it there as well.
# Maps BigQuery scalar type names to the pyarrow types used to represent them.
BQ_TO_ARROW_TYPES = {
    "int64": pyarrow.int64(),
    "float64": pyarrow.float64(),
    "bool": pyarrow.bool_(),
    "numeric": pyarrow.decimal128(38, 9),  # 38 digits of precision, scale 9
    "string": pyarrow.utf8(),
    "bytes": pyarrow.binary(),
    "date": pyarrow.date32(),  # int32 days since epoch
    "datetime": pyarrow.timestamp("us"),
    "time": pyarrow.time64("us"),
    "timestamp": pyarrow.timestamp("us", tz="UTC"),
}
SCALAR_COLUMNS = [
    {
        "name": "int_col",
        "type": "int64"
    },
    {
        "name": "float_col",
        "type": "float64"
def pyarrow_numeric():
    # decimal128 with 38 digits of precision and 9 digits of scale.
    return pyarrow.decimal128(precision=38, scale=9)
Esempio n. 44
0
def test_sql(redshift_table, postgresql_table, mysql_table,
             databases_parameters, db_type):
    """Round-trip a dataframe through to_sql/read_sql_query for the given
    database engine, then re-read it chunked with explicit Arrow dtypes."""
    # Pick the fixture table matching the engine under test.
    if db_type == "postgresql":
        table = postgresql_table
    elif db_type == "mysql":
        table = mysql_table
    else:
        table = redshift_table
    df = get_df()
    if db_type == "redshift":
        df.drop(["binary"], axis=1, inplace=True)
    engine = wr.catalog.get_engine(connection=f"aws-data-wrangler-{db_type}",
                                   echo=False)
    index = True if engine.name == "redshift" else False
    wr.db.to_sql(
        df=df,
        con=engine,
        name=table,
        schema=databases_parameters[db_type]["schema"],
        if_exists="replace",
        index=index,
        index_label=None,
        chunksize=None,
        method=None,
        dtype={"iint32": sqlalchemy.types.Integer},
    )
    df = wr.db.read_sql_query(
        sql=f"SELECT * FROM {databases_parameters[db_type]['schema']}.{table}",
        con=engine)
    ensure_data_types(df, has_list=False)
    # NOTE(review): host/port/database are keyed by db_type but user/password
    # are read from the top level of databases_parameters — confirm that the
    # credentials really are shared across engines and this is not a typo.
    engine = wr.db.get_engine(
        db_type=db_type,
        host=databases_parameters[db_type]["host"],
        port=databases_parameters[db_type]["port"],
        database=databases_parameters[db_type]["database"],
        user=databases_parameters["user"],
        password=databases_parameters["password"],
        echo=False,
    )
    # Re-read row by row (chunksize=1) with explicit Arrow dtypes per column.
    dfs = wr.db.read_sql_query(
        sql=f"SELECT * FROM {databases_parameters[db_type]['schema']}.{table}",
        con=engine,
        chunksize=1,
        dtype={
            "iint8": pa.int8(),
            "iint16": pa.int16(),
            "iint32": pa.int32(),
            "iint64": pa.int64(),
            "float": pa.float32(),
            "double": pa.float64(),
            "decimal": pa.decimal128(3, 2),
            "string_object": pa.string(),
            "string": pa.string(),
            "date": pa.date32(),
            "timestamp": pa.timestamp(unit="ns"),
            "binary": pa.binary(),
            "category": pa.float64(),
        },
    )
    for df in dfs:
        ensure_data_types(df, has_list=False)
    # Catalog-based engine lookup is exercised only for non-redshift engines.
    if db_type != "redshift":
        account_id = boto3.client("sts").get_caller_identity().get("Account")
        engine = wr.catalog.get_engine(
            connection=f"aws-data-wrangler-{db_type}", catalog_id=account_id)
        wr.db.to_sql(
            df=pd.DataFrame({"col0": [1, 2, 3]}, dtype="Int32"),
            con=engine,
            name=table,
            schema=databases_parameters[db_type]["schema"],
            if_exists="replace",
            index=True,
            index_label="index",
        )
        schema = None
        if db_type == "postgresql":
            schema = databases_parameters[db_type]["schema"]
        df = wr.db.read_sql_table(con=engine,
                                  table=table,
                                  schema=schema,
                                  index_col="index")
        assert df.shape == (3, 1)
Esempio n. 45
0
def test_iterate_over_decimal_chunk():
    """Round-trip random FIXED (decimal) column data through chunk iteration.

    Picks a random precision/scale, stores the data in the narrowest Arrow
    type the precision fits in (int8/16/32/64 for precision <= 19, otherwise
    decimal128), and checks that the chunk iterator returns values with the
    expected scaling applied.
    """
    random.seed(datetime.datetime.now())
    precision = random.randint(1, 38)
    scale = random.randint(0, precision)
    datatype = None
    # Narrowest integer type that can hold `precision` decimal digits.
    if precision <= 2:
        datatype = pyarrow.int8()
    elif precision <= 4:
        datatype = pyarrow.int16()
    elif precision <= 9:
        datatype = pyarrow.int32()
    elif precision <= 19:
        datatype = pyarrow.int64()
    else:
        datatype = pyarrow.decimal128(precision, scale)

    def decimal_generator(_precision, _scale):
        def decimal128_generator(precision, scale):
            # Build `precision` random digits and insert the decimal point
            # `scale` digits from the right (only when scale is non-zero).
            data = []
            for _ in range(precision):
                data.append(str(random.randint(0, 9)))

            if scale:
                data.insert(-scale, '.')
            return decimal.Decimal("".join(data))

        def int64_generator(precision):
            data = random.randint(-9223372036854775808, 9223372036854775807)
            # Keep one extra character for the '-' sign when negative.
            return int(str(data)[:precision if data >= 0 else precision + 1])

        def int32_generator(precision):
            # BUG FIX: the upper bound was 2147483637 (a typo); the int32
            # maximum is 2147483647.
            data = random.randint(-2147483648, 2147483647)
            return int(str(data)[:precision if data >= 0 else precision + 1])

        def int16_generator(precision):
            data = random.randint(-32768, 32767)
            return int(str(data)[:precision if data >= 0 else precision + 1])

        def int8_generator(precision):
            data = random.randint(-128, 127)
            return int(str(data)[:precision if data >= 0 else precision + 1])

        if _precision <= 2:
            return int8_generator(_precision)
        elif _precision <= 4:
            return int16_generator(_precision)
        elif _precision <= 9:
            return int32_generator(_precision)
        elif _precision <= 19:
            return int64_generator(_precision)
        else:
            return decimal128_generator(_precision, _scale)

    def expected_data_transform_decimal(_precision, _scale):
        # Integer-backed values come back scaled by 10**-scale; decimal128
        # values pass through unchanged.
        def expected_data_transform_decimal_impl(data,
                                                 precision=_precision,
                                                 scale=_scale):
            if precision <= 19:
                return decimal.Decimal(data).scaleb(-scale)
            else:
                return data

        return expected_data_transform_decimal_impl

    column_meta = {
        "logicalType": "FIXED",
        "precision": str(precision),
        "scale": str(scale)
    }
    iterate_over_test_chunk([datatype, datatype], [column_meta, column_meta],
                            lambda: decimal_generator(precision, scale),
                            expected_data_transform_decimal(precision, scale))
Esempio n. 46
0
def test_sequence_decimal():
    """Python Decimal values survive a pa.array round trip."""
    decimals = [decimal.Decimal('1234.183'), decimal.Decimal('8094.234')]
    decimal_ty = pa.decimal128(precision=7, scale=3)
    assert pa.array(decimals, type=decimal_ty).to_pylist() == decimals
Esempio n. 47
0
    np.testing.assert_array_equal(narr[:6], arr[:6].to_numpy())
    np.testing.assert_array_equal(narr[2:], arr[2:].to_numpy())
    np.testing.assert_array_equal(narr[2:6], arr[2:6].to_numpy())


@pytest.mark.parametrize(
    ('type', 'expected'),
    [
        (pa.null(), 'empty'),
        (pa.bool_(), 'bool'),
        (pa.int8(), 'int8'),
        (pa.int16(), 'int16'),
        (pa.int32(), 'int32'),
        (pa.int64(), 'int64'),
        (pa.uint8(), 'uint8'),
        (pa.uint16(), 'uint16'),
        (pa.uint32(), 'uint32'),
        (pa.uint64(), 'uint64'),
        (pa.float16(), 'float16'),
        (pa.float32(), 'float32'),
        (pa.float64(), 'float64'),
        (pa.date32(), 'date'),
        (pa.date64(), 'date'),
        (pa.binary(), 'bytes'),
        (pa.binary(length=4), 'bytes'),
        (pa.string(), 'unicode'),
        (pa.list_(pa.list_(pa.int16())), 'list[list[int16]]'),
        (pa.decimal128(18, 3), 'decimal'),
        (pa.timestamp('ms'), 'datetime'),
        (pa.timestamp('us', 'UTC'), 'datetimetz'),
        (pa.time32('s'), 'time'),
        (pa.time64('us'), 'time'),
    ])
def test_logical_type(type, expected):
    # `type` shadows the builtin but the name is fixed by the parametrize ids.
    assert get_logical_type(type) == expected


def test_array_uint64_from_py_over_range():
    """Values above int64 range convert correctly as uint64."""
    expected = pa.array(np.array([2**63], dtype='u8'))
    assert pa.array([2**63], type=pa.uint64()).equals(expected)


def test_array_conversions_no_sentinel_values():
    arr = np.array([1, 2, 3, 4], dtype='int8')
    refcount = sys.getrefcount(arr)
Esempio n. 48
0
def test_sequence_decimal_no_scale():
    """decimal128 built from precision alone still round-trips whole numbers."""
    integers = [decimal.Decimal('1234234983'), decimal.Decimal('8094324')]
    ty = pa.decimal128(precision=10)
    assert pa.array(integers, type=ty).to_pylist() == integers
Esempio n. 49
0
def test_decimal_overflow():
    """decimal128 accepts precision in [1, 38] and rejects values outside it."""
    # Boundary precisions are valid.
    pa.decimal128(1, 0)
    pa.decimal128(38, 0)
    # BUG FIX: the loop previously ignored its variable and called
    # pa.decimal128(39, 0) three times; each out-of-range precision
    # must be tested.
    for invalid_precision in (0, -1, 39):
        with pytest.raises(ValueError):
            pa.decimal128(invalid_precision, 0)
Esempio n. 50
0
def test_sequence_decimal_no_whole_part():
    """Purely fractional decimals (scale == precision) round-trip."""
    data = [decimal.Decimal('-.4234983'), decimal.Decimal('.0103943')]
    out = pa.array(data, type=pa.decimal128(precision=7, scale=7)).to_pylist()
    assert out == data
Esempio n. 51
0
    def get_type_and_builtins(self, n, type_name):
        """
        Return a `(arrow type, list)` tuple where the arrow type
        corresponds to the given logical *type_name*, and the list
        is a list of *n* random-generated Python objects compatible
        with the arrow type.
        """
        size = None

        # Reduce the logical name to an internal dispatch key.
        if type_name in ('bool', 'decimal', 'ascii', 'unicode', 'int64 list'):
            kind = type_name
        elif type_name.startswith(('int', 'uint')):
            kind = 'int'
        elif type_name.startswith('float'):
            kind = 'float'
        elif type_name.startswith('struct'):
            kind = 'struct'
        elif type_name == 'binary':
            kind = 'varying binary'
        elif type_name.startswith('binary'):
            kind = 'fixed binary'
            size = int(type_name[len('binary'):])
            assert size > 0
        else:
            raise ValueError("unrecognized type %r" % (type_name, ))

        # Construct the Arrow type for the dispatch key.
        if kind == 'int' or kind == 'float':
            arrow_ty = getattr(pa, type_name)()
        elif kind == 'bool':
            arrow_ty = pa.bool_()
        elif kind == 'decimal':
            arrow_ty = pa.decimal128(9, 9)
        elif kind == 'fixed binary':
            arrow_ty = pa.binary(size)
        elif kind == 'varying binary':
            arrow_ty = pa.binary()
        elif kind == 'ascii' or kind == 'unicode':
            arrow_ty = pa.string()
        elif kind == 'int64 list':
            arrow_ty = pa.list_(pa.int64())
        elif kind == 'struct':
            arrow_ty = pa.struct([pa.field('u', pa.int64()),
                                  pa.field('v', pa.float64()),
                                  pa.field('w', pa.bool_())])

        # Random-data generator matching the dispatch key.
        generators = {
            'int': self.generate_int_list,
            'float': self.generate_float_list,
            'bool': self.generate_bool_list,
            'decimal': self.generate_decimal_list,
            'fixed binary': partial(self.generate_fixed_binary_list,
                                    size=size),
            'varying binary': partial(self.generate_varying_binary_list,
                                      min_size=3, max_size=40),
            'ascii': partial(self.generate_ascii_string_list,
                             min_size=3, max_size=40),
            'unicode': partial(self.generate_unicode_string_list,
                               min_size=3, max_size=40),
            'int64 list': partial(self.generate_int_list_list,
                                  min_size=0, max_size=20),
            'struct': self.generate_dict_list,
            'struct from tuples': self.generate_tuple_list,
        }
        return arrow_ty, generators[kind](n)
Esempio n. 52
0
def to_arrow_type(dt: DataType) -> "pa.DataType":
    """ Convert Spark data type to pyarrow type

    Args:
        dt: Spark SQL DataType instance to translate.

    Returns:
        The equivalent pyarrow DataType.

    Raises:
        TypeError: for unsupported types, or unsupported nesting
            (struct inside struct; struct/timestamp elements in arrays
            or maps; MapType on pyarrow < 2.0.0).
    """
    from distutils.version import LooseVersion
    import pyarrow as pa
    # NOTE: comparisons use type(dt) == T (exact class match), not
    # isinstance, so subclasses are not matched — NOTE(review): this looks
    # deliberate, confirm before changing.
    if type(dt) == BooleanType:
        arrow_type = pa.bool_()
    elif type(dt) == ByteType:
        arrow_type = pa.int8()
    elif type(dt) == ShortType:
        arrow_type = pa.int16()
    elif type(dt) == IntegerType:
        arrow_type = pa.int32()
    elif type(dt) == LongType:
        arrow_type = pa.int64()
    elif type(dt) == FloatType:
        arrow_type = pa.float32()
    elif type(dt) == DoubleType:
        arrow_type = pa.float64()
    elif type(dt) == DecimalType:
        arrow_type = pa.decimal128(dt.precision, dt.scale)
    elif type(dt) == StringType:
        arrow_type = pa.string()
    elif type(dt) == BinaryType:
        arrow_type = pa.binary()
    elif type(dt) == DateType:
        arrow_type = pa.date32()
    elif type(dt) == TimestampType:
        # Timestamps should be in UTC, JVM Arrow timestamps require a timezone to be read
        arrow_type = pa.timestamp('us', tz='UTC')
    elif type(dt) == TimestampNTZType:
        arrow_type = pa.timestamp('us', tz=None)
    elif type(dt) == ArrayType:
        if type(dt.elementType) in [StructType, TimestampType]:
            raise TypeError("Unsupported type in conversion to Arrow: " +
                            str(dt))
        arrow_type = pa.list_(to_arrow_type(dt.elementType))
    elif type(dt) == MapType:
        if LooseVersion(pa.__version__) < LooseVersion("2.0.0"):
            raise TypeError(
                "MapType is only supported with pyarrow 2.0.0 and above")
        if type(dt.keyType) in [StructType, TimestampType] or \
                type(dt.valueType) in [StructType, TimestampType]:
            raise TypeError("Unsupported type in conversion to Arrow: " +
                            str(dt))
        arrow_type = pa.map_(to_arrow_type(dt.keyType),
                             to_arrow_type(dt.valueType))
    elif type(dt) == StructType:
        # Only one level of struct nesting is supported.
        if any(type(field.dataType) == StructType for field in dt):
            raise TypeError(
                "Nested StructType not supported in conversion to Arrow")
        fields = [
            pa.field(field.name,
                     to_arrow_type(field.dataType),
                     nullable=field.nullable) for field in dt
        ]
        arrow_type = pa.struct(fields)
    elif type(dt) == NullType:
        arrow_type = pa.null()
    else:
        raise TypeError("Unsupported type in conversion to Arrow: " + str(dt))
    return arrow_type
Esempio n. 53
0

# A broad sample of Arrow DataTypes — scalars, temporal, decimal, nested and
# union types — for tests that need "one of each" coverage.
MANY_TYPES = [
    pa.null(),
    pa.bool_(),
    pa.int32(),
    pa.time32('s'),
    pa.time64('us'),
    pa.date32(),
    pa.timestamp('us'),
    pa.timestamp('us', tz='UTC'),
    pa.timestamp('us', tz='Europe/Paris'),
    pa.float16(),
    pa.float32(),
    pa.float64(),
    pa.decimal128(19, 4),
    pa.string(),
    pa.binary(),
    pa.binary(10),
    pa.list_(pa.int32()),
    pa.struct([pa.field('a', pa.int32()),
               pa.field('b', pa.int8()),
               pa.field('c', pa.string())]),
    pa.union([pa.field('a', pa.binary(10)),
              pa.field('b', pa.string())], mode=pa.lib.UnionMode_DENSE),
    pa.union([pa.field('a', pa.binary(10)),
              pa.field('b', pa.string())], mode=pa.lib.UnionMode_SPARSE),
    # XXX Needs array pickling
    # pa.dictionary(pa.int32(), pa.array(['a', 'b', 'c'])),
]