Example #1
def test_empty_cast():
    types = [
        pa.null(),
        pa.bool_(),
        pa.int8(),
        pa.int16(),
        pa.int32(),
        pa.int64(),
        pa.uint8(),
        pa.uint16(),
        pa.uint32(),
        pa.uint64(),
        pa.float16(),
        pa.float32(),
        pa.float64(),
        pa.date32(),
        pa.date64(),
        pa.binary(),
        pa.binary(length=4),
        pa.string(),
    ]

    for (t1, t2) in itertools.product(types, types):
        try:
            # ARROW-4766: Ensure that supported type conversions don't
            # segfault on empty arrays of common types
            pa.array([], type=t1).cast(t2)
        except pa.lib.ArrowNotImplementedError:
            continue
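
A single illustrative cast from the matrix above (a minimal standalone sketch; the 'empty' name is illustrative, not part of the original test):

import pyarrow as pa

# An empty int8 array casts to float64 without touching any values,
# which is exactly the ARROW-4766 regression the loop above guards against.
empty = pa.array([], type=pa.int8()).cast(pa.float64())
assert len(empty) == 0
assert empty.type == pa.float64()
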
def test_sequence_timestamp_from_int_with_unit():
    data = [1]

    s = pa.timestamp('s')
    ms = pa.timestamp('ms')
    us = pa.timestamp('us')
    ns = pa.timestamp('ns')

    arr_s = pa.array(data, type=s)
    assert len(arr_s) == 1
    assert arr_s.type == s
    assert str(arr_s[0]) == "Timestamp('1970-01-01 00:00:01')"

    arr_ms = pa.array(data, type=ms)
    assert len(arr_ms) == 1
    assert arr_ms.type == ms
    assert str(arr_ms[0]) == "Timestamp('1970-01-01 00:00:00.001000')"

    arr_us = pa.array(data, type=us)
    assert len(arr_us) == 1
    assert arr_us.type == us
    assert str(arr_us[0]) == "Timestamp('1970-01-01 00:00:00.000001')"

    arr_ns = pa.array(data, type=ns)
    assert len(arr_ns) == 1
    assert arr_ns.type == ns
    assert str(arr_ns[0]) == "Timestamp('1970-01-01 00:00:00.000000001')"

    class CustomClass:
        pass

    # Each conversion is checked in its own raises block; statements after
    # the first raising call inside a single block would never execute.
    with pytest.raises(pa.ArrowException):
        pa.array([1, CustomClass()], type=ns)
    with pytest.raises(pa.ArrowException):
        pa.array([1, CustomClass()], type=pa.date32())
    with pytest.raises(pa.ArrowException):
        pa.array([1, CustomClass()], type=pa.date64())
Example #3
def test_type_to_pandas_dtype():
    M8_ns = np.dtype('datetime64[ns]')
    cases = [
        (pa.null(), np.float64),
        (pa.bool_(), np.bool_),
        (pa.int8(), np.int8),
        (pa.int16(), np.int16),
        (pa.int32(), np.int32),
        (pa.int64(), np.int64),
        (pa.uint8(), np.uint8),
        (pa.uint16(), np.uint16),
        (pa.uint32(), np.uint32),
        (pa.uint64(), np.uint64),
        (pa.float16(), np.float16),
        (pa.float32(), np.float32),
        (pa.float64(), np.float64),
        (pa.date32(), M8_ns),
        (pa.date64(), M8_ns),
        (pa.timestamp('ms'), M8_ns),
        (pa.binary(), np.object_),
        (pa.binary(12), np.object_),
        (pa.string(), np.object_),
        (pa.list_(pa.int8()), np.object_),
    ]
    for arrow_type, numpy_type in cases:
        assert arrow_type.to_pandas_dtype() == numpy_type
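
A one-off check outside the loop (hedged sketch, mirroring the date64 row of the cases table above):

import numpy as np
import pyarrow as pa

# date64 maps to pandas' nanosecond datetime dtype, as in the cases list.
assert pa.date64().to_pandas_dtype() == np.dtype('datetime64[ns]')
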
def test_type_schema_pickling():
    cases = [
        pa.int8(),
        pa.string(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.string()),
        pa.struct([
            pa.field('a', 'int8'),
            pa.field('b', 'string')
        ]),
        pa.time32('s'),
        pa.time64('us'),
        pa.date32(),
        pa.date64(),
        pa.timestamp('ms'),
        pa.timestamp('ns'),
        pa.decimal(12, 2),
        pa.field('a', 'string', metadata={b'foo': b'bar'})
    ]

    for val in cases:
        roundtripped = pickle.loads(pickle.dumps(val))
        assert val == roundtripped

    fields = []
    for i, f in enumerate(cases):
        if isinstance(f, pa.Field):
            fields.append(f)
        else:
            fields.append(pa.field('_f{}'.format(i), f))

    schema = pa.schema(fields, metadata={b'foo': b'bar'})
    roundtripped = pickle.loads(pickle.dumps(schema))
    assert schema == roundtripped
Example #5
def test_cast_date64_to_int():
    arr = pa.array(np.array([0, 1, 2], dtype='int64'),
                   type=pa.date64())
    expected = pa.array([0, 1, 2], type='i8')

    result = arr.cast('i8')

    assert result.equals(expected)
    def test_date(self):
        data = [datetime.date(2000, 1, 1), None, datetime.date(1970, 1, 1),
                datetime.date(2040, 2, 26)]
        arr = pa.from_pylist(data)
        assert len(arr) == 4
        assert arr.type == pa.date64()
        assert arr.null_count == 1
        assert arr[0].as_py() == datetime.date(2000, 1, 1)
        assert arr[1].as_py() is None
        assert arr[2].as_py() == datetime.date(1970, 1, 1)
        assert arr[3].as_py() == datetime.date(2040, 2, 26)
def test_sequence_date():
    data = [datetime.date(2000, 1, 1), None, datetime.date(1970, 1, 1),
            datetime.date(2040, 2, 26)]
    arr = pa.array(data)
    assert len(arr) == 4
    assert arr.type == pa.date64()
    assert arr.null_count == 1
    assert arr[0].as_py() == datetime.date(2000, 1, 1)
    assert arr[1].as_py() is None
    assert arr[2].as_py() == datetime.date(1970, 1, 1)
    assert arr[3].as_py() == datetime.date(2040, 2, 26)
def test_date_time_types():
    t1 = pa.date32()
    data1 = np.array([17259, 17260, 17261], dtype='int32')
    a1 = pa.Array.from_pandas(data1, type=t1)

    t2 = pa.date64()
    data2 = data1.astype('int64') * 86400000
    a2 = pa.Array.from_pandas(data2, type=t2)

    t3 = pa.timestamp('us')
    start = pd.Timestamp('2000-01-01').value // 1000
    data3 = np.array([start, start + 1, start + 2], dtype='int64')
    a3 = pa.Array.from_pandas(data3, type=t3)

    t4 = pa.time32('ms')
    data4 = np.arange(3, dtype='i4')
    a4 = pa.Array.from_pandas(data4, type=t4)

    t5 = pa.time64('us')
    a5 = pa.Array.from_pandas(data4.astype('int64'), type=t5)

    t6 = pa.time32('s')
    a6 = pa.Array.from_pandas(data4, type=t6)

    ex_t6 = pa.time32('ms')
    ex_a6 = pa.Array.from_pandas(data4 * 1000, type=ex_t6)

    table = pa.Table.from_arrays([a1, a2, a3, a4, a5, a6],
                                 ['date32', 'date64', 'timestamp[us]',
                                  'time32[s]', 'time64[us]',
                                  'time32_from64[s]'])

    # date64 as date32
    # time32[s] to time32[ms]
    expected = pa.Table.from_arrays([a1, a1, a3, a4, a5, ex_a6],
                                    ['date32', 'date64', 'timestamp[us]',
                                     'time32[s]', 'time64[us]',
                                     'time32_from64[s]'])

    _check_roundtrip(table, expected=expected, version='2.0')

    # Unsupported stuff
    def _assert_unsupported(array):
        table = pa.Table.from_arrays([array], ['unsupported'])
        buf = io.BytesIO()

        with pytest.raises(NotImplementedError):
            _write_table(table, buf, version="2.0")

    t7 = pa.time64('ns')
    a7 = pa.Array.from_pandas(data4.astype('int64'), type=t7)

    _assert_unsupported(a7)
    def test_dates_from_integers(self):
        t1 = pa.date32()
        t2 = pa.date64()

        arr = np.array([17259, 17260, 17261], dtype='int32')
        arr2 = arr.astype('int64') * 86400000

        a1 = pa.array(arr, type=t1)
        a2 = pa.array(arr2, type=t2)

        expected = date(2017, 4, 3)
        assert a1[0].as_py() == expected
        assert a2[0].as_py() == expected
    def test_date(self):
        df = pd.DataFrame({
            'date': [datetime.date(2000, 1, 1),
                     None,
                     datetime.date(1970, 1, 1),
                     datetime.date(2040, 2, 26)]})
        table = A.Table.from_pandas(df)
        field = A.Field.from_py('date', A.date64())
        schema = A.Schema.from_fields([field])
        assert table.schema.equals(schema)
        result = table.to_pandas()
        expected = df.copy()
        expected['date'] = pd.to_datetime(df['date'])
        tm.assert_frame_equal(result, expected)
Example #11
File: jvm.py Project: rok/arrow
def _from_jvm_date_type(jvm_type):
    """
    Convert a JVM date type to its Python equivalent

    Parameters
    ----------
    jvm_type: org.apache.arrow.vector.types.pojo.ArrowType$Date

    Returns
    -------
    typ: pyarrow.DataType
    """
    day_unit = jvm_type.getUnit().toString()
    if day_unit == 'DAY':
        return pa.date32()
    elif day_unit == 'MILLISECOND':
        return pa.date64()
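
A minimal usage sketch (MockDateUnit and MockDateType are hypothetical stand-ins for the JVM POJO, included only to exercise the mapping above):

class MockDateUnit:
    def __init__(self, name):
        self._name = name

    def toString(self):
        return self._name


class MockDateType:
    def __init__(self, unit):
        self._unit = unit

    def getUnit(self):
        return MockDateUnit(self._unit)


# DAY-unit dates map to 32-bit days, MILLISECOND-unit dates to 64-bit millis.
assert _from_jvm_date_type(MockDateType('DAY')) == pa.date32()
assert _from_jvm_date_type(MockDateType('MILLISECOND')) == pa.date64()
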
Example #12
def test_type_for_alias():
    cases = [
        ('i1', pa.int8()),
        ('int8', pa.int8()),
        ('i2', pa.int16()),
        ('int16', pa.int16()),
        ('i4', pa.int32()),
        ('int32', pa.int32()),
        ('i8', pa.int64()),
        ('int64', pa.int64()),
        ('u1', pa.uint8()),
        ('uint8', pa.uint8()),
        ('u2', pa.uint16()),
        ('uint16', pa.uint16()),
        ('u4', pa.uint32()),
        ('uint32', pa.uint32()),
        ('u8', pa.uint64()),
        ('uint64', pa.uint64()),
        ('f4', pa.float32()),
        ('float32', pa.float32()),
        ('f8', pa.float64()),
        ('float64', pa.float64()),
        ('date32', pa.date32()),
        ('date64', pa.date64()),
        ('string', pa.string()),
        ('str', pa.string()),
        ('binary', pa.binary()),
        ('time32[s]', pa.time32('s')),
        ('time32[ms]', pa.time32('ms')),
        ('time64[us]', pa.time64('us')),
        ('time64[ns]', pa.time64('ns')),
        ('timestamp[s]', pa.timestamp('s')),
        ('timestamp[ms]', pa.timestamp('ms')),
        ('timestamp[us]', pa.timestamp('us')),
        ('timestamp[ns]', pa.timestamp('ns')),
    ]

    for val, expected in cases:
        assert pa.type_for_alias(val) == expected
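
Outside the test, the alias lookup is a compact alternative to the explicit constructors (hedged sketch):

import pyarrow as pa

assert pa.type_for_alias('f8') == pa.float64()
assert pa.array([1, 2], type=pa.type_for_alias('i2')).type == pa.int16()
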
    def test_date_objects_typed(self):
        arr = np.array([
            date(2017, 4, 3),
            None,
            date(2017, 4, 4),
            date(2017, 4, 5)], dtype=object)

        arr_i4 = np.array([17259, -1, 17260, 17261], dtype='int32')
        arr_i8 = arr_i4.astype('int64') * 86400000
        mask = np.array([False, True, False, False])

        t32 = pa.date32()
        t64 = pa.date64()

        a32 = pa.array(arr, type=t32)
        a64 = pa.array(arr, type=t64)

        a32_expected = pa.array(arr_i4, mask=mask, type=t32)
        a64_expected = pa.array(arr_i8, mask=mask, type=t64)

        assert a32.equals(a32_expected)
        assert a64.equals(a64_expected)

        # Test converting back to pandas
        colnames = ['date32', 'date64']
        table = pa.Table.from_arrays([a32, a64], colnames)
        table_pandas = table.to_pandas()

        ex_values = (np.array(['2017-04-03', '2017-04-04', '2017-04-04',
                               '2017-04-05'],
                              dtype='datetime64[D]')
                     .astype('datetime64[ns]'))
        ex_values[1] = pd.NaT.value
        expected_pandas = pd.DataFrame({'date32': ex_values,
                                        'date64': ex_values},
                                       columns=colnames)
        tm.assert_frame_equal(table_pandas, expected_pandas)
Example #14
def test_is_temporal_date_time_timestamp():
    date_types = [pa.date32(), pa.date64()]
    time_types = [pa.time32('s'), pa.time64('ns')]
    timestamp_types = [pa.timestamp('ms')]

    for case in date_types + time_types + timestamp_types:
        assert types.is_temporal(case)

    for case in date_types:
        assert types.is_date(case)
        assert not types.is_time(case)
        assert not types.is_timestamp(case)

    for case in time_types:
        assert types.is_time(case)
        assert not types.is_date(case)
        assert not types.is_timestamp(case)

    for case in timestamp_types:
        assert types.is_timestamp(case)
        assert not types.is_date(case)
        assert not types.is_time(case)

    assert not types.is_temporal(pa.int32())
Example #15
     [
         ("i", pa.int16()),
         ("my_bool", pa.bool_()),
         ("my_nullable_bool", pa.bool_()),
         ("my_date", pa.date32()),
         ("my_datetime", pa.timestamp("ms")),
         ("my_int", pa.uint16()),
         ("my_string", pa.string()),
     ]
 ),
 pa.schema(
     [
         ("i", pa.int32()),
         ("my_bool", pa.bool_()),
         ("my_nullable_bool", pa.bool_()),
         ("my_date", pa.date64()),
         ("my_datetime", pa.timestamp("us")),
         ("my_int", pa.uint32()),
         ("my_string", pa.string()),
     ]
 ),
 pa.schema(
     [
         ("i", pa.int64()),
         ("my_bool", pa.bool_()),
         ("my_nullable_bool", pa.bool_()),
         ("my_date", pa.date64()),
         ("my_datetime", pa.timestamp("ns")),
         ("my_int", pa.uint64()),
         ("my_string", pa.string()),
     ]
Example #16
 (pa.time64('us'), '{"name":"time","unit":"MICROSECOND","bitWidth":64}'),
 (pa.time64('ns'), '{"name":"time","unit":"NANOSECOND","bitWidth":64}'),
 (pa.timestamp('s'), '{"name":"timestamp","unit":"SECOND",'
     '"timezone":null}'),
 (pa.timestamp('ms'), '{"name":"timestamp","unit":"MILLISECOND",'
     '"timezone":null}'),
 (pa.timestamp('us'), '{"name":"timestamp","unit":"MICROSECOND",'
     '"timezone":null}'),
 (pa.timestamp('ns'), '{"name":"timestamp","unit":"NANOSECOND",'
     '"timezone":null}'),
 (pa.timestamp('ns', tz='UTC'), '{"name":"timestamp","unit":"NANOSECOND"'
     ',"timezone":"UTC"}'),
 (pa.timestamp('ns', tz='Europe/Paris'), '{"name":"timestamp",'
     '"unit":"NANOSECOND","timezone":"Europe/Paris"}'),
 (pa.date32(), '{"name":"date","unit":"DAY"}'),
 (pa.date64(), '{"name":"date","unit":"MILLISECOND"}'),
 (pa.decimal128(19, 4), '{"name":"decimal","precision":19,"scale":4}'),
 (pa.string(), '{"name":"utf8"}'),
 (pa.binary(), '{"name":"binary"}'),
 (pa.binary(10), '{"name":"fixedsizebinary","byteWidth":10}'),
 # TODO(ARROW-2609): complex types that have children
 # pa.list_(pa.int32()),
 # pa.struct([pa.field('a', pa.int32()),
 #            pa.field('b', pa.int8()),
 #            pa.field('c', pa.string())]),
 # pa.union([pa.field('a', pa.binary(10)),
 #           pa.field('b', pa.string())], mode=pa.lib.UnionMode_DENSE),
 # pa.union([pa.field('a', pa.binary(10)),
 #           pa.field('b', pa.string())], mode=pa.lib.UnionMode_SPARSE),
 # TODO: DictionaryType requires a vector in the type
 # pa.dictionary(pa.int32(), pa.array(['a', 'b', 'c'])),
def test_date_time_types():
    t1 = pa.date32()
    data1 = np.array([17259, 17260, 17261], dtype='int32')
    a1 = pa.array(data1, type=t1)

    t2 = pa.date64()
    data2 = data1.astype('int64') * 86400000
    a2 = pa.array(data2, type=t2)

    t3 = pa.timestamp('us')
    start = pd.Timestamp('2000-01-01').value // 1000
    data3 = np.array([start, start + 1, start + 2], dtype='int64')
    a3 = pa.array(data3, type=t3)

    t4 = pa.time32('ms')
    data4 = np.arange(3, dtype='i4')
    a4 = pa.array(data4, type=t4)

    t5 = pa.time64('us')
    a5 = pa.array(data4.astype('int64'), type=t5)

    t6 = pa.time32('s')
    a6 = pa.array(data4, type=t6)

    ex_t6 = pa.time32('ms')
    ex_a6 = pa.array(data4 * 1000, type=ex_t6)

    t7 = pa.timestamp('ns')
    start = pd.Timestamp('2001-01-01').value
    data7 = np.array([start, start + 1000, start + 2000],
                     dtype='int64')
    a7 = pa.array(data7, type=t7)

    t7_us = pa.timestamp('us')
    start = pd.Timestamp('2001-01-01').value
    data7_us = np.array([start, start + 1000, start + 2000],
                        dtype='int64') // 1000
    a7_us = pa.array(data7_us, type=t7_us)

    table = pa.Table.from_arrays([a1, a2, a3, a4, a5, a6, a7],
                                 ['date32', 'date64', 'timestamp[us]',
                                  'time32[s]', 'time64[us]',
                                  'time32_from64[s]',
                                  'timestamp[ns]'])

    # date64 as date32
    # time32[s] to time32[ms]
    # 'timestamp[ns]' to 'timestamp[us]'
    expected = pa.Table.from_arrays([a1, a1, a3, a4, a5, ex_a6, a7_us],
                                    ['date32', 'date64', 'timestamp[us]',
                                     'time32[s]', 'time64[us]',
                                     'time32_from64[s]',
                                     'timestamp[ns]'])

    _check_roundtrip(table, expected=expected, version='2.0')

    # date64 as date32
    # time32[s] to time32[ms]
    # 'timestamp[ns]' is saved as INT96 timestamp
    expected = pa.Table.from_arrays([a1, a1, a3, a4, a5, ex_a6, a7],
                                    ['date32', 'date64', 'timestamp[us]',
                                     'time32[s]', 'time64[us]',
                                     'time32_from64[s]',
                                     'timestamp[ns]'])

    _check_roundtrip(table, expected=expected, version='2.0',
                     use_deprecated_int96_timestamps=True)

    # Check that setting flavor to 'spark' uses int96 timestamps
    _check_roundtrip(table, expected=expected, version='2.0',
                     flavor='spark')

    # Unsupported stuff
    def _assert_unsupported(array):
        table = pa.Table.from_arrays([array], ['unsupported'])
        buf = io.BytesIO()

        with pytest.raises(NotImplementedError):
            _write_table(table, buf, version="2.0")

    t7 = pa.time64('ns')
    a7 = pa.array(data4.astype('int64'), type=t7)

    _assert_unsupported(a7)
Example #18
    [
        (pa.null(), 'empty'),
        (pa.bool_(), 'bool'),
        (pa.int8(), 'int8'),
        (pa.int16(), 'int16'),
        (pa.int32(), 'int32'),
        (pa.int64(), 'int64'),
        (pa.uint8(), 'uint8'),
        (pa.uint16(), 'uint16'),
        (pa.uint32(), 'uint32'),
        (pa.uint64(), 'uint64'),
        (pa.float16(), 'float16'),
        (pa.float32(), 'float32'),
        (pa.float64(), 'float64'),
        (pa.date32(), 'date'),
        (pa.date64(), 'date'),
        (pa.binary(), 'bytes'),
        (pa.binary(length=4), 'bytes'),
        (pa.string(), 'unicode'),
        (pa.list_(pa.list_(pa.int16())), 'list[list[int16]]'),
        (pa.decimal128(18, 3), 'decimal'),
        (pa.timestamp('ms'), 'datetime'),
        (pa.timestamp('us', 'UTC'), 'datetimetz'),
        (pa.time32('s'), 'time'),
        (pa.time64('us'), 'time')
    ]
)
def test_logical_type(type, expected):
    assert get_logical_type(type) == expected
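
A hedged one-off sketch (assuming get_logical_type is pyarrow's internal pyarrow.pandas_compat helper, which is where tests of this vintage import it from):

from pyarrow.pandas_compat import get_logical_type
import pyarrow as pa

# Both date widths report the same logical type, matching the table above.
assert get_logical_type(pa.date32()) == get_logical_type(pa.date64()) == 'date'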

Example #19
    pa.int32().id: int,
    pa.uint32().id: int,
    pa.int64().id: int,
    pa.uint64().id: int,
    pa.float16().id: float,
    pa.float32().id: float,
    pa.float64().id: float,
    pa.date32().id: datetime.date,
    pa.date64().id: datetime.date,
    pa.timestamp("ms").id: datetime.datetime,
    pa.binary().id: six.binary_type,
    pa.string().id: six.text_type,
    # Use any list type here, only LIST is important
    pa.list_(pa.string()).id: list,
}

_string_type_map = {"date64[ms]": pa.date64(), "string": pa.string()}

Example #20
floating_types = st.sampled_from([
    pa.float16(),
    pa.float32(),
    pa.float64()
])
decimal_type = st.builds(
    pa.decimal128,
    precision=st.integers(min_value=1, max_value=38),
    scale=st.integers(min_value=1, max_value=38)
)
numeric_types = st.one_of(integer_types, floating_types, decimal_type)

date_types = st.sampled_from([
    pa.date32(),
    pa.date64()
])
time_types = st.sampled_from([
    pa.time32('s'),
    pa.time32('ms'),
    pa.time64('us'),
    pa.time64('ns')
])
timestamp_types = st.builds(
    pa.timestamp,
    unit=st.sampled_from(['s', 'ms', 'us', 'ns']),
    tz=tzst.timezones()
)
temporal_types = st.one_of(date_types, time_types, timestamp_types)
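
A hedged property-test sketch using the strategies above (assumes hypothesis's @given and pyarrow.types, consistent with the surrounding st/tzst imports):

from hypothesis import given
import pyarrow.types as types

@given(temporal_types)
def test_temporal_strategy_yields_temporal_types(ty):
    # Every drawn type is a date, time, or timestamp.
    assert types.is_temporal(ty)
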

primitive_types = st.one_of(
Example #21
import pyarrow as pa

schema_fields = [
    pa.field("timestamp", pa.date64(), False),
    pa.field("timezone", pa.uint64(), False).with_metadata({
        "illex_MIN": "0",
        "illex_MAX": "1024"
    }),
    pa.field("vin", pa.uint64(), False),
    pa.field("odometer", pa.uint64(), False).with_metadata({
        "illex_MIN": "0",
        "illex_MAX": "1000"
    }),
    pa.field("hypermiling", pa.bool_(), False),
    pa.field("avgspeed", pa.uint64(), False).with_metadata({
        "illex_MIN": "0",
        "illex_MAX": "200"
    }),
    pa.field(
        "sec_in_band",
        pa.list_(
            pa.field("item", pa.uint64(), False).with_metadata({
                "illex_MIN":
                "0",
                "illex_MAX":
                "4192"
            }), 12), False),
    pa.field(
        "miles_in_time_range",
        pa.list_(
            pa.field("item", pa.uint64(), False).with_metadata({
Example #22
def dataframe_with_lists(include_index=False, parquet_compatible=False):
    """
    Dataframe with list columns of every possible primtive type.

    Returns
    -------
    df: pandas.DataFrame
    schema: pyarrow.Schema
        Arrow schema definition that is in line with the constructed df.
    parquet_compatible: bool
        Exclude types not supported by parquet
    """
    arrays = OrderedDict()
    fields = []

    fields.append(pa.field('int64', pa.list_(pa.int64())))
    arrays['int64'] = [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9], [0, 1, 2, 3, 4], None,
                       [],
                       np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9] * 2,
                                dtype=np.int64)[::2]]
    fields.append(pa.field('double', pa.list_(pa.float64())))
    arrays['double'] = [
        [0., 1., 2., 3., 4., 5., 6., 7., 8., 9.],
        [0., 1., 2., 3., 4.],
        None,
        [],
        np.array([0., 1., 2., 3., 4., 5., 6., 7., 8., 9.] * 2)[::2],
    ]
    fields.append(pa.field('bytes_list', pa.list_(pa.binary())))
    arrays['bytes_list'] = [
        [b"1", b"f"],
        None,
        [b"1"],
        [b"1", b"2", b"3"],
        [],
    ]
    fields.append(pa.field('str_list', pa.list_(pa.string())))
    arrays['str_list'] = [
        [u"1", u"ä"],
        None,
        [u"1"],
        [u"1", u"2", u"3"],
        [],
    ]

    date_data = [[], [date(2018, 1, 1), date(2032, 12,
                                             30)], [date(2000, 6, 7)], None,
                 [date(1969, 6, 9), date(1972, 7, 3)]]
    time_data = [[time(23, 11, 11),
                  time(1, 2, 3),
                  time(23, 59, 59)], [], [time(22, 5, 59)], None,
                 [time(0, 0, 0), time(18, 0, 2),
                  time(12, 7, 3)]]

    temporal_pairs = [(pa.date32(), date_data), (pa.date64(), date_data),
                      (pa.time32('s'), time_data),
                      (pa.time32('ms'), time_data),
                      (pa.time64('us'), time_data)]
    if not parquet_compatible:
        temporal_pairs += [
            (pa.time64('ns'), time_data),
        ]

    for value_type, data in temporal_pairs:
        field_name = '{}_list'.format(value_type)
        field_type = pa.list_(value_type)
        field = pa.field(field_name, field_type)
        fields.append(field)
        arrays[field_name] = data

    if include_index:
        fields.append(pa.field('__index_level_0__', pa.int64()))

    df = pd.DataFrame(arrays)
    schema = pa.schema(fields)

    return df, schema
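
A hedged usage sketch of the fixture above (the assertions follow directly from how the function builds its fields; temporal list columns are keyed by the type's string form):

import pyarrow as pa

df, schema = dataframe_with_lists()
assert schema.field('double').type == pa.list_(pa.float64())
# '{}_list'.format(pa.date64()) yields 'date64[ms]_list'
assert schema.field('date64[ms]_list').type == pa.list_(pa.date64())
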
Example #23
            return self.storage_type.num_fields

    pyarrow.register_extension_type(
        AwkwardArrowType(pyarrow.null(), None, None, None, None, None, None))

    # order is important; _string_like[:2] vs _string_like[::2]
    _string_like = (
        pyarrow.string(),
        pyarrow.large_string(),
        pyarrow.binary(),
        pyarrow.large_binary(),
    )

    _pyarrow_to_numpy_dtype = {
        pyarrow.date32(): (True, np.dtype("M8[D]")),
        pyarrow.date64(): (False, np.dtype("M8[ms]")),
        pyarrow.time32("s"): (True, np.dtype("M8[s]")),
        pyarrow.time32("ms"): (True, np.dtype("M8[ms]")),
        pyarrow.time64("us"): (False, np.dtype("M8[us]")),
        pyarrow.time64("ns"): (False, np.dtype("M8[ns]")),
        pyarrow.timestamp("s"): (False, np.dtype("M8[s]")),
        pyarrow.timestamp("ms"): (False, np.dtype("M8[ms]")),
        pyarrow.timestamp("us"): (False, np.dtype("M8[us]")),
        pyarrow.timestamp("ns"): (False, np.dtype("M8[ns]")),
        pyarrow.duration("s"): (False, np.dtype("m8[s]")),
        pyarrow.duration("ms"): (False, np.dtype("m8[ms]")),
        pyarrow.duration("us"): (False, np.dtype("m8[us]")),
        pyarrow.duration("ns"): (False, np.dtype("m8[ns]")),
    }

if not ak._v2._util.numpy_at_least("1.17.0"):
Example #24
    def textfsm_data(self, raw_input, fsm_template, schema, data):
        """Convert unstructured output to structured output"""

        records = []
        fsm_template.Reset()
        res = fsm_template.ParseText(raw_input)

        for entry in res:
            metent = dict(zip(fsm_template.header, entry))
            records.append(metent)

        result = self.clean_data(records, data)

        fields = [fld.name for fld in schema]

        ptype_map = {
            pa.string(): str,
            pa.int32(): int,
            pa.int64(): int,
            pa.float32(): float,
            pa.float64(): float,
            pa.date64(): float,
            pa.list_(pa.string()): list,
            pa.list_(pa.int64()): list,
            pa.bool_(): bool,
            pa.list_(pa.struct([('nexthop', pa.string()),
                                ('oif', pa.string()),
                                ('weight', pa.int32())])): list,
        }

        map_defaults = {
            pa.string(): "",
            pa.int32(): 0,
            pa.int64(): 0,
            pa.float32(): 0.0,
            pa.float64(): 0.0,
            pa.date64(): 0.0,
            pa.bool_(): False,
            pa.list_(pa.string()): [],
            pa.list_(pa.int64()): [],
            pa.list_(pa.struct([('nexthop', pa.string()),
                                ('oif', pa.string()),
                                ('weight', pa.int32())])): [("", "", 1)]
        }

        # Ensure the type is set correctly.
        for entry in result:
            for cent in entry:
                if cent in fields:
                    schent_type = schema.field(cent).type
                    if not isinstance(entry[cent], ptype_map[schent_type]):
                        if entry[cent]:
                            entry[cent] = ptype_map[schent_type](entry[cent])
                        else:
                            entry[cent] = map_defaults[schent_type]
                    elif isinstance(entry[cent], list):
                        for i, ele in enumerate(entry[cent]):
                            if not isinstance(ele, ptype_map[schent_type.value_type]):
                                try:
                                    if ptype_map[schent_type.value_type] == int:
                                        entry[cent][i] = int(entry[cent][i])
                                    else:
                                        raise ValueError
                                except ValueError:
                                    entry[cent][i] = (
                                        map_defaults[schent_type.value_type])

        return result
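
A standalone sketch of the coercion idea above (toy schema and record; as in the method, the ptype_map-style lookup keys on the pyarrow type, which is hashable and compares by equality):

import pyarrow as pa

schema = pa.schema([('hops', pa.int64()), ('name', pa.string())])
ptype_map = {pa.int64(): int, pa.string(): str}

entry = {'hops': '3', 'name': 42}
for cent, value in entry.items():
    want = ptype_map[schema.field(cent).type]
    if not isinstance(value, want):
        entry[cent] = want(value)

assert entry == {'hops': 3, 'name': '42'}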
Example #25
def test_date_time_types(tempdir):
    t1 = pa.date32()
    data1 = np.array([17259, 17260, 17261], dtype='int32')
    a1 = pa.array(data1, type=t1)

    t2 = pa.date64()
    data2 = data1.astype('int64') * 86400000
    a2 = pa.array(data2, type=t2)

    t3 = pa.timestamp('us')
    start = pd.Timestamp('2001-01-01').value // 1000
    data3 = np.array([start, start + 1, start + 2], dtype='int64')
    a3 = pa.array(data3, type=t3)

    t4 = pa.time32('ms')
    data4 = np.arange(3, dtype='i4')
    a4 = pa.array(data4, type=t4)

    t5 = pa.time64('us')
    a5 = pa.array(data4.astype('int64'), type=t5)

    t6 = pa.time32('s')
    a6 = pa.array(data4, type=t6)

    ex_t6 = pa.time32('ms')
    ex_a6 = pa.array(data4 * 1000, type=ex_t6)

    t7 = pa.timestamp('ns')
    start = pd.Timestamp('2001-01-01').value
    data7 = np.array([start, start + 1000, start + 2000], dtype='int64')
    a7 = pa.array(data7, type=t7)

    table = pa.Table.from_arrays([a1, a2, a3, a4, a5, a6, a7], [
        'date32', 'date64', 'timestamp[us]', 'time32[s]', 'time64[us]',
        'time32_from64[s]', 'timestamp[ns]'
    ])

    # date64 as date32
    # time32[s] to time32[ms]
    expected = pa.Table.from_arrays([a1, a1, a3, a4, a5, ex_a6, a7], [
        'date32', 'date64', 'timestamp[us]', 'time32[s]', 'time64[us]',
        'time32_from64[s]', 'timestamp[ns]'
    ])

    _check_roundtrip(table, expected=expected, version='2.6')

    t0 = pa.timestamp('ms')
    data0 = np.arange(4, dtype='int64')
    a0 = pa.array(data0, type=t0)

    t1 = pa.timestamp('us')
    data1 = np.arange(4, dtype='int64')
    a1 = pa.array(data1, type=t1)

    t2 = pa.timestamp('ns')
    data2 = np.arange(4, dtype='int64')
    a2 = pa.array(data2, type=t2)

    table = pa.Table.from_arrays([a0, a1, a2], ['ts[ms]', 'ts[us]', 'ts[ns]'])
    expected = pa.Table.from_arrays([a0, a1, a2],
                                    ['ts[ms]', 'ts[us]', 'ts[ns]'])

    # int64 for all timestamps supported by default
    filename = tempdir / 'int64_timestamps.parquet'
    _write_table(table, filename, version='2.6')
    parquet_schema = pq.ParquetFile(filename).schema
    for i in range(3):
        assert parquet_schema.column(i).physical_type == 'INT64'
    read_table = _read_table(filename)
    assert read_table.equals(expected)

    t0_ns = pa.timestamp('ns')
    data0_ns = np.array(data0 * 1000000, dtype='int64')
    a0_ns = pa.array(data0_ns, type=t0_ns)

    t1_ns = pa.timestamp('ns')
    data1_ns = np.array(data1 * 1000, dtype='int64')
    a1_ns = pa.array(data1_ns, type=t1_ns)

    expected = pa.Table.from_arrays([a0_ns, a1_ns, a2],
                                    ['ts[ms]', 'ts[us]', 'ts[ns]'])

    # int96 nanosecond timestamps produced upon request
    filename = tempdir / 'explicit_int96_timestamps.parquet'
    _write_table(table,
                 filename,
                 version='2.6',
                 use_deprecated_int96_timestamps=True)
    parquet_schema = pq.ParquetFile(filename).schema
    for i in range(3):
        assert parquet_schema.column(i).physical_type == 'INT96'
    read_table = _read_table(filename)
    assert read_table.equals(expected)

    # int96 nanosecond timestamps implied by flavor 'spark'
    filename = tempdir / 'spark_int96_timestamps.parquet'
    _write_table(table, filename, version='2.6', flavor='spark')
    parquet_schema = pq.ParquetFile(filename).schema
    for i in range(3):
        assert parquet_schema.column(i).physical_type == 'INT96'
    read_table = _read_table(filename)
    assert read_table.equals(expected)
Example #26
    [
        (pa.null(), 'empty'),
        (pa.bool_(), 'bool'),
        (pa.int8(), 'int8'),
        (pa.int16(), 'int16'),
        (pa.int32(), 'int32'),
        (pa.int64(), 'int64'),
        (pa.uint8(), 'uint8'),
        (pa.uint16(), 'uint16'),
        (pa.uint32(), 'uint32'),
        (pa.uint64(), 'uint64'),
        (pa.float16(), 'float16'),
        (pa.float32(), 'float32'),
        (pa.float64(), 'float64'),
        (pa.date32(), 'date'),
        (pa.date64(), 'date'),
        (pa.binary(), 'bytes'),
        (pa.binary(length=4), 'bytes'),
        (pa.string(), 'unicode'),
        (pa.list_(pa.list_(pa.int16())), 'list[list[int16]]'),
        (pa.decimal128(18, 3), 'decimal'),
        (pa.timestamp('ms'), 'datetime'),
        (pa.timestamp('us', 'UTC'), 'datetimetz'),
        (pa.time32('s'), 'time'),
        (pa.time64('us'), 'time')
    ]
)
def test_logical_type(type, expected):
    assert get_logical_type(type) == expected

Example #27
_python_type_map = {
    pa.null().id: six.text_type,
    pa.bool_().id: bool,
    pa.int8().id: int,
    pa.uint8().id: int,
    pa.int16().id: int,
    pa.uint16().id: int,
    pa.int32().id: int,
    pa.uint32().id: int,
    pa.int64().id: int,
    pa.uint64().id: int,
    pa.float16().id: float,
    pa.float32().id: float,
    pa.float64().id: float,
    pa.date32().id: datetime.date,
    pa.date64().id: datetime.date,
    pa.timestamp("ms").id: datetime.datetime,
    pa.binary().id: six.binary_type,
    pa.string().id: six.text_type,
    # Use any list type here, only LIST is important
    pa.list_(pa.string()).id: list,
}

_string_type_map = {"date64[ms]": pa.date64(), "string": pa.string()}


class FletcherDtype(ExtensionDtype):
    # na_value = pa.Null()

    def __init__(self, arrow_dtype):
        self.arrow_dtype = arrow_dtype
Example #28
    pa.int8(): dt.Int8,
    pa.int16(): dt.Int16,
    pa.int32(): dt.Int32,
    pa.int64(): dt.Int64,
    pa.uint8(): dt.UInt8,
    pa.uint16(): dt.UInt16,
    pa.uint32(): dt.UInt32,
    pa.uint64(): dt.UInt64,
    pa.float16(): dt.Float16,
    pa.float32(): dt.Float32,
    pa.float64(): dt.Float64,
    pa.string(): dt.String,
    pa.binary(): dt.Binary,
    pa.bool_(): dt.Boolean,
    pa.date32(): dt.Date,
    pa.date64(): dt.Date,
}


@dt.dtype.register(pa.DataType)  # type: ignore[misc]
def from_pyarrow_primitive(
    arrow_type: pa.DataType,
    nullable: bool = True,
) -> dt.DataType:
    return _to_ibis_dtypes[arrow_type](nullable=nullable)
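
A hedged dispatch sketch (assumes dt is ibis.expr.datatypes as in this snippet, and that dt.dtype is the multidispatch function being registered above):

import pyarrow as pa

# Under the registry above, both date widths dispatch to
# from_pyarrow_primitive and collapse to ibis's single Date type.
assert dt.dtype(pa.date32()) == dt.dtype(pa.date64())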


@dt.dtype.register(pa.Time32Type)  # type: ignore[misc]
@dt.dtype.register(pa.Time64Type)  # type: ignore[misc]
def from_pyarrow_time(
    arrow_type: pa.TimestampType,

def test_simple_type_construction():
    result = pa.lib.TimestampType()
    with pytest.raises(TypeError):
        str(result)


@pytest.mark.parametrize(
    ('type', 'expected'),
    [(pa.null(), 'float64'), (pa.bool_(), 'bool'), (pa.int8(), 'int8'),
     (pa.int16(), 'int16'), (pa.int32(), 'int32'), (pa.int64(), 'int64'),
     (pa.uint8(), 'uint8'), (pa.uint16(), 'uint16'), (pa.uint32(), 'uint32'),
     (pa.uint64(), 'uint64'), (pa.float16(), 'float16'),
     (pa.float32(), 'float32'), (pa.float64(), 'float64'),
     (pa.date32(), 'date'), (pa.date64(), 'date'), (pa.binary(), 'bytes'),
     (pa.binary(length=4), 'bytes'), (pa.string(), 'unicode'),
     (pa.list_(pa.list_(pa.int16())), 'list[list[int16]]'),
     (pa.decimal128(18, 3), 'decimal'), (pa.timestamp('ms'), 'datetime'),
     (pa.timestamp('us', 'UTC'), 'datetimetz'), (pa.time32('s'), 'time'),
     (pa.time64('us'), 'time')])
def test_logical_type(type, expected):
    assert get_logical_type(type) == expected


def test_array_conversions_no_sentinel_values():
    arr = np.array([1, 2, 3, 4], dtype='int8')
    refcount = sys.getrefcount(arr)
    arr2 = pa.array(arr)  # noqa
    assert sys.getrefcount(arr) == (refcount + 1)
        pyarrow.int8().id: "INT64",
        pyarrow.int16().id: "INT64",
        pyarrow.int32().id: "INT64",
        pyarrow.int64().id: "INT64",
        pyarrow.uint8().id: "INT64",
        pyarrow.uint16().id: "INT64",
        pyarrow.uint32().id: "INT64",
        pyarrow.uint64().id: "INT64",
        pyarrow.float16().id: "FLOAT64",
        pyarrow.float32().id: "FLOAT64",
        pyarrow.float64().id: "FLOAT64",
        pyarrow.time32("ms").id: "TIME",
        pyarrow.time64("ns").id: "TIME",
        pyarrow.timestamp("ns").id: "TIMESTAMP",
        pyarrow.date32().id: "DATE",
        pyarrow.date64().id: "DATETIME",  # because millisecond resolution
        pyarrow.binary().id: "BYTES",
        pyarrow.string().id: "STRING",  # also alias for pyarrow.utf8()
        # The exact scale and precision don't matter, see below.
        pyarrow.decimal128(38, scale=9).id: "NUMERIC",
    }

    if version.parse(pyarrow.__version__) >= version.parse("3.0.0"):
        BQ_TO_ARROW_SCALARS["BIGNUMERIC"] = pyarrow_bignumeric
        # The exact decimal's scale and precision are not important, as only
        # the type ID matters, and it's the same for all decimal256 instances.
        ARROW_SCALAR_IDS_TO_BQ[pyarrow.decimal256(76, scale=38).id] = "BIGNUMERIC"
        _BIGNUMERIC_SUPPORT = True
    else:
        _BIGNUMERIC_SUPPORT = False
        datetime.date(2000, 1, 1), None,
        datetime.date(1970, 1, 1),
        datetime.date(2040, 2, 26)
    ]
    arr = pa.array(data)
    assert len(arr) == 4
    assert arr.type == pa.date32()
    assert arr.null_count == 1
    assert arr[0].as_py() == datetime.date(2000, 1, 1)
    assert arr[1].as_py() is None
    assert arr[2].as_py() == datetime.date(1970, 1, 1)
    assert arr[3].as_py() == datetime.date(2040, 2, 26)


@pytest.mark.parametrize('input', [(pa.date32(), [10957, None]),
                                   (pa.date64(), [10957 * 86400000, None])])
def test_sequence_explicit_types(input):
    t, ex_values = input
    data = [datetime.date(2000, 1, 1), None]
    arr = pa.array(data, type=t)
    arr2 = pa.array(ex_values, type=t)

    for x in [arr, arr2]:
        assert len(x) == 2
        assert x.type == t
        assert x.null_count == 1
        assert x[0].as_py() == datetime.date(2000, 1, 1)
        assert x[1] is pa.NA


def test_date32_overflow():
Example #32
_NA_REP = "<NA>"
_np_pa_dtypes = {
    np.float64: pa.float64(),
    np.float32: pa.float32(),
    np.int64: pa.int64(),
    np.longlong: pa.int64(),
    np.int32: pa.int32(),
    np.int16: pa.int16(),
    np.int8: pa.int8(),
    np.bool_: pa.int8(),
    np.uint64: pa.uint64(),
    np.uint32: pa.uint32(),
    np.uint16: pa.uint16(),
    np.uint8: pa.uint8(),
    np.datetime64: pa.date64(),
    np.object_: pa.string(),
    np.str_: pa.string(),
}

cudf_dtypes_to_pandas_dtypes = {
    np.dtype("uint8"): pd.UInt8Dtype(),
    np.dtype("uint16"): pd.UInt16Dtype(),
    np.dtype("uint32"): pd.UInt32Dtype(),
    np.dtype("uint64"): pd.UInt64Dtype(),
    np.dtype("int8"): pd.Int8Dtype(),
    np.dtype("int16"): pd.Int16Dtype(),
    np.dtype("int32"): pd.Int32Dtype(),
    np.dtype("int64"): pd.Int64Dtype(),
    np.dtype("bool_"): pd.BooleanDtype(),
    np.dtype("object"): pd.StringDtype(),
Example #33
 (pa.time64('ns'), '{"name":"time","unit":"NANOSECOND","bitWidth":64}'),
 (pa.timestamp('s'), '{"name":"timestamp","unit":"SECOND",'
  '"timezone":null}'),
 (pa.timestamp('ms'), '{"name":"timestamp","unit":"MILLISECOND",'
  '"timezone":null}'),
 (pa.timestamp('us'), '{"name":"timestamp","unit":"MICROSECOND",'
  '"timezone":null}'),
 (pa.timestamp('ns'), '{"name":"timestamp","unit":"NANOSECOND",'
  '"timezone":null}'),
 (pa.timestamp('ns',
               tz='UTC'), '{"name":"timestamp","unit":"NANOSECOND"'
  ',"timezone":"UTC"}'),
 (pa.timestamp('ns', tz='Europe/Paris'), '{"name":"timestamp",'
  '"unit":"NANOSECOND","timezone":"Europe/Paris"}'),
 (pa.date32(), '{"name":"date","unit":"DAY"}'),
 (pa.date64(), '{"name":"date","unit":"MILLISECOND"}'),
 (pa.decimal128(19, 4), '{"name":"decimal","precision":19,"scale":4}'),
 (pa.string(), '{"name":"utf8"}'),
 (pa.binary(), '{"name":"binary"}'),
 (pa.binary(10), '{"name":"fixedsizebinary","byteWidth":10}'),
 # TODO(ARROW-2609): complex types that have children
 # pa.list_(pa.int32()),
 # pa.struct([pa.field('a', pa.int32()),
 #            pa.field('b', pa.int8()),
 #            pa.field('c', pa.string())]),
 # pa.union([pa.field('a', pa.binary(10)),
 #           pa.field('b', pa.string())], mode=pa.lib.UnionMode_DENSE),
 # pa.union([pa.field('a', pa.binary(10)),
 #           pa.field('b', pa.string())], mode=pa.lib.UnionMode_SPARSE),
 # TODO: DictionaryType requires a vector in the type
 # pa.dictionary(pa.int32(), pa.array(['a', 'b', 'c'])),
def test_basics(fletcher_array):
    df = pd.DataFrame(
        {
            "null": fletcher_array(pa.array([None, None], type=pa.null())),
            "bool": fletcher_array(pa.array([None, True], type=pa.bool_())),
            "int8": fletcher_array(pa.array([None, -1], type=pa.int8())),
            "uint8": fletcher_array(pa.array([None, 1], type=pa.uint8())),
            "int16": fletcher_array(pa.array([None, -1], type=pa.int16())),
            "uint16": fletcher_array(pa.array([None, 1], type=pa.uint16())),
            "int32": fletcher_array(pa.array([None, -1], type=pa.int32())),
            "uint32": fletcher_array(pa.array([None, 1], type=pa.uint32())),
            "int64": fletcher_array(pa.array([None, -1], type=pa.int64())),
            "uint64": fletcher_array(pa.array([None, 1], type=pa.uint64())),
            "float16": fletcher_array(
                pa.array([None, np.float16(-0.1)], type=pa.float16())
            ),
            "float32": fletcher_array(pa.array([None, -0.1], type=pa.float32())),
            "float64": fletcher_array(pa.array([None, -0.1], type=pa.float64())),
            "date32": fletcher_array(
                pa.array([None, datetime.date(2010, 9, 8)], type=pa.date32())
            ),
            "date64": fletcher_array(
                pa.array([None, datetime.date(2010, 9, 8)], type=pa.date64())
            ),
            # https://github.com/pandas-dev/pandas/issues/34986
            # "timestamp[s]": fletcher_array(
            #     pa.array(
            #         [None, datetime.datetime(2013, 12, 11, 10, 9, 8)],
            #         type=pa.timestamp("s"),
            #     )
            # ),
            # "timestamp[ms]": fletcher_array(
            #     pa.array(
            #         [None, datetime.datetime(2013, 12, 11, 10, 9, 8, 1000)],
            #         type=pa.timestamp("ms"),
            #     )
            # ),
            # "timestamp[us]": fletcher_array(
            #     pa.array(
            #         [None, datetime.datetime(2013, 12, 11, 10, 9, 8, 7)],
            #         type=pa.timestamp("us"),
            #     )
            # ),
            # FIXME: assert_extension_array_equal casts to numpy object thus cannot handle nanoseconds
            # 'timestamp[ns]': fletcher_array(pa.array([None, datetime.datetime(2013, 12, 11, 10, 9, 8, 7)], type=pa.timestamp("ns"))),
            "binary": fletcher_array(pa.array([None, b"122"], type=pa.binary())),
            "string": fletcher_array(pa.array([None, "🤔"], type=pa.string())),
            "duration[s]": fletcher_array(
                pa.array([None, datetime.timedelta(seconds=9)], type=pa.duration("s"))
            ),
            "duration[ms]": fletcher_array(
                pa.array(
                    [None, datetime.timedelta(milliseconds=8)], type=pa.duration("ms")
                )
            ),
            "duration[us]": fletcher_array(
                pa.array(
                    [None, datetime.timedelta(microseconds=7)], type=pa.duration("us")
                )
            ),
            # FIXME: assert_extension_array_equal casts to numpy object thus cannot handle nanoseconds
            # 'duration[ns]': fletcher_array(pa.array([None, datetime.timedelta(microseconds=7)], type=pa.duration("ns"))),
            "list[string]": fletcher_array(
                pa.array([None, [None, "🤔"]], type=pa.list_(pa.string()))
            ),
        }
    )
    ddf = dd.from_pandas(df, npartitions=2)

    meta_nonempty = ddf._meta_nonempty
    pdt.assert_frame_equal(meta_nonempty, df)

    result = ddf.compute()
    pdt.assert_frame_equal(result, df)
Example #35
def as_column(arbitrary, nan_as_null=True, dtype=None):
    """Create a Column from an arbitrary object

    Currently supported inputs are:

    * ``Column``
    * ``Buffer``
    * ``Series``
    * ``Index``
    * numba device array
    * cuda array interface
    * numpy array
    * pyarrow array
    * pandas.Categorical

    Returns
    -------
    result : subclass of TypedColumnBase
        - CategoricalColumn for pandas.Categorical input.
        - DatetimeColumn for datetime input
        - NumericalColumn for all other inputs.
    """
    from . import numerical, categorical, datetime
    from cudf.dataframe.series import Series
    from cudf.dataframe.index import Index

    if isinstance(arbitrary, Column):
        if not isinstance(arbitrary, TypedColumnBase):
            # interpret as numeric
            data = arbitrary.view(numerical.NumericalColumn,
                                  dtype=arbitrary.dtype)
        else:
            data = arbitrary

    elif isinstance(arbitrary, Series):
        data = arbitrary._column

    elif isinstance(arbitrary, Index):
        data = arbitrary._values

    elif isinstance(arbitrary, Buffer):
        data = numerical.NumericalColumn(data=arbitrary, dtype=arbitrary.dtype)

    elif cuda.devicearray.is_cuda_ndarray(arbitrary):
        data = as_column(Buffer(arbitrary))
        if (data.dtype in [np.float16, np.float32, np.float64]
                and arbitrary.size > 0):
            if nan_as_null:
                mask = cudautils.mask_from_devary(arbitrary)
                data = data.set_mask(mask)

    elif cuda.is_cuda_array(arbitrary):
        # Use the CUDA array interface to create a numba device array by
        # reference
        new_dev_array = cuda.as_cuda_array(arbitrary)

        # Allocate new output array using rmm and copy the numba device array
        # to an rmm owned device array
        out_dev_array = rmm.device_array_like(new_dev_array)
        out_dev_array.copy_to_device(new_dev_array)

        data = as_column(out_dev_array)

    elif isinstance(arbitrary, np.ndarray):
        # CUDF assumes values are always contiguous
        if not arbitrary.flags['C_CONTIGUOUS']:
            arbitrary = np.ascontiguousarray(arbitrary)
        if arbitrary.dtype.kind == 'M':
            data = datetime.DatetimeColumn.from_numpy(arbitrary)
        elif arbitrary.dtype.kind in ('O', 'U'):
            raise NotImplementedError("Strings are not yet supported")
        else:
            data = as_column(rmm.to_device(arbitrary), nan_as_null=nan_as_null)

    elif isinstance(arbitrary, pa.Array):
        if isinstance(arbitrary, pa.StringArray):
            warnings.warn("Strings are not yet supported, so converting to "
                          "categorical")
            data = as_column(arbitrary.dictionary_encode())
        elif isinstance(arbitrary, pa.NullArray):
            new_dtype = dtype
            if (type(dtype) == str and dtype == 'empty') or dtype is None:
                new_dtype = np.dtype(arbitrary.type.to_pandas_dtype())

            if pd.api.types.is_categorical_dtype(new_dtype):
                arbitrary = arbitrary.dictionary_encode()
            else:
                if nan_as_null:
                    arbitrary = arbitrary.cast(_gdf.np_to_pa_dtype(new_dtype))
                else:
                    # casting a null array doesn't make nans valid
                    # so we create one with valid nans from scratch:
                    arbitrary = utils.scalar_broadcast_to(np.nan,
                                                          (len(arbitrary), ),
                                                          dtype=new_dtype)
            data = as_column(arbitrary, nan_as_null=nan_as_null)
        elif isinstance(arbitrary, pa.DictionaryArray):
            pamask, padata = buffers_from_pyarrow(arbitrary)
            data = categorical.CategoricalColumn(
                data=padata,
                mask=pamask,
                null_count=arbitrary.null_count,
                categories=arbitrary.dictionary.to_pylist(),
                ordered=arbitrary.type.ordered,
            )
        elif isinstance(arbitrary, pa.TimestampArray):
            arbitrary = arbitrary.cast(pa.timestamp('ms'))
            pamask, padata = buffers_from_pyarrow(arbitrary, dtype='M8[ms]')
            data = datetime.DatetimeColumn(data=padata,
                                           mask=pamask,
                                           null_count=arbitrary.null_count,
                                           dtype=np.dtype('M8[ms]'))
        elif isinstance(arbitrary, pa.Date64Array):
            pamask, padata = buffers_from_pyarrow(arbitrary, dtype='M8[ms]')
            data = datetime.DatetimeColumn(data=padata,
                                           mask=pamask,
                                           null_count=arbitrary.null_count,
                                           dtype=np.dtype('M8[ms]'))
        elif isinstance(arbitrary, pa.Date32Array):
            # No equivalent np dtype and not yet supported
            warnings.warn(
                "Date32 values are not yet supported so this will "
                "be typecast to a Date64 value", UserWarning)
            arbitrary = arbitrary.cast(pa.date64())
            data = as_column(arbitrary)
        elif isinstance(arbitrary, pa.BooleanArray):
            # Arrow uses 1 bit per value while we use int8
            dtype = np.dtype(np.bool_)
            # Needed because of bug in PyArrow
            # https://issues.apache.org/jira/browse/ARROW-4766
            if len(arbitrary) > 0:
                arbitrary = arbitrary.cast(pa.int8())
            else:
                arbitrary = pa.array([], type=pa.int8())
            pamask, padata = buffers_from_pyarrow(arbitrary, dtype=dtype)
            data = numerical.NumericalColumn(data=padata,
                                             mask=pamask,
                                             null_count=arbitrary.null_count,
                                             dtype=dtype)
        else:
            pamask, padata = buffers_from_pyarrow(arbitrary)
            data = numerical.NumericalColumn(
                data=padata,
                mask=pamask,
                null_count=arbitrary.null_count,
                dtype=np.dtype(arbitrary.type.to_pandas_dtype()))

    elif isinstance(arbitrary, pa.ChunkedArray):
        gpu_cols = [
            as_column(chunk, dtype=dtype) for chunk in arbitrary.chunks
        ]

        if dtype and dtype != 'empty':
            new_dtype = dtype
        else:
            pa_type = arbitrary.type
            if pa.types.is_dictionary(pa_type):
                new_dtype = 'category'
            else:
                new_dtype = np.dtype(pa_type.to_pandas_dtype())

        data = Column._concat(gpu_cols, dtype=new_dtype)

    elif isinstance(arbitrary, (pd.Series, pd.Categorical)):
        if pd.api.types.is_categorical_dtype(arbitrary):
            data = as_column(pa.array(arbitrary, from_pandas=True))
        elif arbitrary.dtype == np.bool_:
            # Bug in PyArrow or HDF that requires us to do this
            data = as_column(pa.array(np.array(arbitrary), from_pandas=True))
        else:
            data = as_column(pa.array(arbitrary, from_pandas=nan_as_null))

    elif isinstance(arbitrary, pd.Timestamp):
        # This will always treat NaTs as nulls since it's not technically a
        # discrete value like NaN
        data = as_column(pa.array(pd.Series([arbitrary]), from_pandas=True))

    elif np.isscalar(arbitrary) and not isinstance(arbitrary, memoryview):
        if hasattr(arbitrary, 'dtype'):
            data_type = _gdf.np_to_pa_dtype(arbitrary.dtype)
            if data_type in (pa.date64(), pa.date32()):
                # PyArrow can't construct date64 or date32 arrays from np
                # datetime types
                arbitrary = arbitrary.astype('int64')
            data = as_column(pa.array([arbitrary], type=data_type))
        else:
            data = as_column(pa.array([arbitrary]), nan_as_null=nan_as_null)

    elif isinstance(arbitrary, memoryview):
        data = as_column(np.array(arbitrary),
                         dtype=dtype,
                         nan_as_null=nan_as_null)

    else:
        try:
            data = as_column(memoryview(arbitrary))
        except TypeError:
            try:
                pa_type = None
                if dtype is not None:
                    if pd.api.types.is_categorical_dtype(dtype):
                        raise TypeError
                    else:
                        np_type = np.dtype(dtype).type
                        if np_type == np.bool_:
                            pa_type = pa.bool_()
                        else:
                            pa_type = _gdf.np_to_pa_dtype(np.dtype(dtype).type)
                data = as_column(pa.array(arbitrary,
                                          type=pa_type,
                                          from_pandas=nan_as_null),
                                 nan_as_null=nan_as_null)
            except (pa.ArrowInvalid, pa.ArrowTypeError, TypeError):
                np_type = None
                if dtype is not None:
                    if pd.api.types.is_categorical_dtype(dtype):
                        data = as_column(pd.Series(arbitrary,
                                                   dtype='category'),
                                         nan_as_null=nan_as_null)
                    else:
                        np_type = np.dtype(dtype)
                        data = as_column(np.array(arbitrary, dtype=np_type),
                                         nan_as_null=nan_as_null)

    return data
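
A hedged usage sketch for as_column (requires a CUDA-capable environment with this legacy cudf stack installed; a date64 input exercises the pa.Date64Array branch above):

import numpy as np
import pyarrow as pa

dates = pa.array([0, 86400000], type=pa.date64())  # 1970-01-01, 1970-01-02
col = as_column(dates)          # pa.Date64Array branch -> DatetimeColumn
nums = as_column(np.arange(3))  # numpy input -> NumericalColumn
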
    metadata={"type": "TradeTick"},
)

TYPE_TO_SCHEMA[BettingInstrument] = pa.schema(
    {
        "venue": pa.string(),
        "currency": pa.string(),
        "instrument_id": pa.string(),
        "event_type_id": pa.string(),
        "event_type_name": pa.string(),
        "competition_id": pa.string(),
        "competition_name": pa.string(),
        "event_id": pa.string(),
        "event_name": pa.string(),
        "event_country_code": pa.string(),
        "event_open_date": pa.date64(),
        "betting_type": pa.string(),
        "market_id": pa.string(),
        "market_name": pa.string(),
        "market_start_time": pa.date64(),
        "market_type": pa.string(),
        "selection_id": pa.string(),
        "selection_name": pa.string(),
        "selection_handicap": pa.string(),
        "ts_recv_ns": pa.int64(),
        "ts_event_ns": pa.int64(),
    },
    metadata={"type": "BettingInstrument"},
)

TYPE_TO_SCHEMA[OrderBookData] = pa.schema(
Example #37
class TestAbstractFileParserStatics:
    @pytest.mark.parametrize(  # testing all datatypes as laid out here: https://json-schema.org/understanding-json-schema/reference/type.html
        "input_json_type, output_pyarrow_type",
        [
            ("string", pa.large_string()),
            ("number", pa.float64()),
            ("integer", pa.int64()),
            ("object", pa.large_string()),
            ("array", pa.large_string()),
            ("boolean", pa.bool_()),
            ("null", pa.large_string()),
        ],
    )
    def test_json_type_to_pyarrow_type(self, input_json_type,
                                       output_pyarrow_type):
        # Json -> PyArrow direction
        LOGGER.info(
            f"asserting that JSON type '{input_json_type}' converts to PyArrow type '{output_pyarrow_type}'..."
        )
        assert AbstractFileParser.json_type_to_pyarrow_type(
            input_json_type) == output_pyarrow_type

    @pytest.mark.parametrize(  # testing all datatypes as laid out here: https://arrow.apache.org/docs/python/api/datatypes.html
        "input_pyarrow_types, output_json_type",
        [
            ((pa.null(), ), "string"),  # null type
            ((pa.bool_(), ), "boolean"),  # boolean type
            (
                (pa.int8(), pa.int16(), pa.int32(), pa.int64(), pa.uint8(),
                 pa.uint16(), pa.uint32(), pa.uint64()),
                "integer",
            ),  # integer types
            ((pa.float16(), pa.float32(), pa.float64(), pa.decimal128(
                5, 10), pa.decimal256(3, 8)), "number"),  # number types
            ((pa.time32("s"), pa.time64("ns"), pa.timestamp("ms"), pa.date32(),
              pa.date64()), "string"),  # temporal types
            ((pa.binary(), pa.large_binary()), "string"),  # binary types
            ((pa.string(), pa.utf8(), pa.large_string(), pa.large_utf8()),
             "string"),  # string types
            ((pa.list_(pa.string()), pa.large_list(
                pa.timestamp("us"))), "string"),  # array types
            ((pa.map_(pa.string(), pa.float32()),
              pa.dictionary(pa.int16(), pa.list_(
                  pa.string()))), "string"),  # object types
        ],
    )
    def test_json_type_to_pyarrow_type_reverse(self, input_pyarrow_types,
                                               output_json_type):
        # PyArrow -> Json direction (reverse=True)
        for typ in input_pyarrow_types:
            LOGGER.info(
                f"asserting that PyArrow type '{typ}' converts to JSON type '{output_json_type}'..."
            )
            assert AbstractFileParser.json_type_to_pyarrow_type(
                typ, reverse=True) == output_json_type

    @pytest.mark.parametrize(  # if expecting fail, put pyarrow_schema as None
        "json_schema, pyarrow_schema",
        [
            (
                {
                    "a": "string",
                    "b": "number",
                    "c": "integer",
                    "d": "object",
                    "e": "array",
                    "f": "boolean",
                    "g": "null"
                },
                {
                    "a": pa.large_string(),
                    "b": pa.float64(),
                    "c": pa.int64(),
                    "d": pa.large_string(),
                    "e": pa.large_string(),
                    "f": pa.bool_(),
                    "g": pa.large_string(),
                },
            ),
            ({
                "single_column": "object"
            }, {
                "single_column": pa.large_string()
            }),
            ({}, {}),
            ({
                "a": "NOT A REAL TYPE",
                "b": "another fake type"
            }, {
                "a": pa.large_string(),
                "b": pa.large_string()
            }),
            (["string", "object"], None),  # bad input type
        ],
    )
    def test_json_schema_to_pyarrow_schema(self, json_schema, pyarrow_schema):
        # Json -> PyArrow direction
        if pyarrow_schema is not None:
            assert AbstractFileParser.json_schema_to_pyarrow_schema(
                json_schema) == pyarrow_schema
        else:
            with pytest.raises(Exception) as e_info:
                AbstractFileParser.json_schema_to_pyarrow_schema(json_schema)
            LOGGER.debug(str(e_info))

    @pytest.mark.parametrize(  # cases expected to fail set json_schema to None
        "pyarrow_schema, json_schema",
        [
            (
                {
                    "a": pa.utf8(),
                    "b": pa.float16(),
                    "c": pa.uint32(),
                    "d": pa.map_(pa.string(), pa.float32()),
                    "e": pa.bool_(),
                    "f": pa.date64(),
                },
                {
                    "a": "string",
                    "b": "number",
                    "c": "integer",
                    "d": "string",
                    "e": "boolean",
                    "f": "string"
                },
            ),
            ({
                "single_column": pa.int32()
            }, {
                "single_column": "integer"
            }),
            ({}, {}),
            ({
                "a": "NOT A REAL TYPE",
                "b": "another fake type"
            }, {
                "a": "string",
                "b": "string"
            }),
            (["string", "object"], None),  # bad input type
        ],
    )
    def test_json_schema_to_pyarrow_schema_reverse(self, pyarrow_schema,
                                                   json_schema):
        # PyArrow -> Json direction (reverse=True)
        if json_schema is not None:
            assert AbstractFileParser.json_schema_to_pyarrow_schema(
                pyarrow_schema, reverse=True) == json_schema
        else:
            with pytest.raises(Exception) as e_info:
                AbstractFileParser.json_schema_to_pyarrow_schema(
                    pyarrow_schema, reverse=True)
            LOGGER.debug(str(e_info))
Beispiel #38
0
def dataframe_with_lists(include_index=False, parquet_compatible=False):
    """
    Dataframe with list columns of every possible primtive type.

    Returns
    -------
    df: pandas.DataFrame
    schema: pyarrow.Schema
        Arrow schema definition that is in line with the constructed df.
    parquet_compatible: bool
        Exclude types not supported by parquet
    """
    arrays = OrderedDict()
    fields = []

    fields.append(pa.field('int64', pa.list_(pa.int64())))
    arrays['int64'] = [
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
        [0, 1, 2, 3, 4],
        None,
        [],
        np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9] * 2,
                 dtype=np.int64)[::2]
    ]
    fields.append(pa.field('double', pa.list_(pa.float64())))
    arrays['double'] = [
        [0., 1., 2., 3., 4., 5., 6., 7., 8., 9.],
        [0., 1., 2., 3., 4.],
        None,
        [],
        np.array([0., 1., 2., 3., 4., 5., 6., 7., 8., 9.] * 2)[::2],
    ]
    fields.append(pa.field('bytes_list', pa.list_(pa.binary())))
    arrays['bytes_list'] = [
        [b"1", b"f"],
        None,
        [b"1"],
        [b"1", b"2", b"3"],
        [],
    ]
    fields.append(pa.field('str_list', pa.list_(pa.string())))
    arrays['str_list'] = [
        [u"1", u"ä"],
        None,
        [u"1"],
        [u"1", u"2", u"3"],
        [],
    ]

    date_data = [
        [],
        [date(2018, 1, 1), date(2032, 12, 30)],
        [date(2000, 6, 7)],
        None,
        [date(1969, 6, 9), date(1972, 7, 3)]
    ]
    time_data = [
        [time(23, 11, 11), time(1, 2, 3), time(23, 59, 59)],
        [],
        [time(22, 5, 59)],
        None,
        [time(0, 0, 0), time(18, 0, 2), time(12, 7, 3)]
    ]

    temporal_pairs = [
        (pa.date32(), date_data),
        (pa.date64(), date_data),
        (pa.time32('s'), time_data),
        (pa.time32('ms'), time_data),
        (pa.time64('us'), time_data)
    ]
    if not parquet_compatible:
        temporal_pairs += [
            (pa.time64('ns'), time_data),
        ]

    for value_type, data in temporal_pairs:
        field_name = '{}_list'.format(value_type)
        field_type = pa.list_(value_type)
        field = pa.field(field_name, field_type)
        fields.append(field)
        arrays[field_name] = data

    if include_index:
        fields.append(pa.field('__index_level_0__', pa.int64()))

    df = pd.DataFrame(arrays)
    schema = pa.schema(fields)

    return df, schema
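
A quick usage sketch for the helper above (hypothetical, assuming the usual pandas/pyarrow imports):

# Hypothetical usage: build the frame and check that the declared schema
# survives conversion to an Arrow table (pandas metadata is stripped first).
df, schema = dataframe_with_lists()
table = pa.Table.from_pandas(df, schema=schema, preserve_index=False)
assert table.schema.remove_metadata().equals(schema)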
Beispiel #39
0
def _parquet_schema(dataframe: pd.DataFrame,
                    custom_redshift_columns: dict = None) -> pa.Schema:
    """ Translates pandas dtypes to PyArrow types and creates a Schema from them

    Args:
        dataframe (pd.DataFrame): Dataframe to pull the schema of
        custom_redshift_columns (dict, Optional): 
            This dictionary contains custom column data type definitions for redshift.
            The params should be formatted as follows:
                - column name (str)
                - data type (str)

    Returns:
        PyArrow Schema of the given dataframe
    """
    fields = []
    for col, dtype in dataframe.dtypes.items():
        dtype = dtype.name
        if dtype == 'object':
            if custom_redshift_columns:
                # Detect if the Pandas object column contains Python decimal objects.
                if "[Decimal(" in str(dataframe[col].values)[:9]:
                    # If Python decimal objects are present, parse out the precision and scale
                    # from the custom_redshift_columns dictionary to use when converting
                    # to PyArrow's decimal128 data type.
                    s = custom_redshift_columns[col]
                    precision = int(s[s.find('DECIMAL(') +
                                      len('DECIMAL('):s.rfind(',')].strip())
                    scale = int(s[s.find(',') + len(','):s.rfind(')')].strip())
                    pa_type = pa.decimal128(precision=precision, scale=scale)
                else:
                    pa_type = pa.string()
            else:
                pa_type = pa.string()
        elif dtype.startswith('int32'):
            pa_type = pa.int32()
        elif dtype.startswith('int64'):
            pa_type = pa.int64()
        elif dtype.startswith('int8'):
            pa_type = pa.int8()
        elif dtype.startswith('float32'):
            pa_type = pa.float32()
        elif dtype.startswith('float64'):
            pa_type = pa.float64()
        elif dtype.startswith('float16'):
            pa_type = pa.float16()
        elif dtype.startswith('datetime'):
            pa_type = pa.timestamp('ns')
        elif dtype.startswith('date'):
            pa_type = pa.date64()
        elif dtype.startswith('category'):
            pa_type = pa.string()
        elif dtype == 'bool':
            pa_type = pa.bool_()
        else:
            raise NotImplementedError(
                f"Error: {dtype} is not a datatype which can be mapped to Parquet using s3parq."
            )
        fields.append(pa.field(col, pa_type))

    return pa.schema(fields=fields)
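
A brief usage sketch (hypothetical column data, no custom_redshift_columns):

# Hypothetical usage: derive an Arrow schema from a small frame.
df = pd.DataFrame({"id": [1, 2], "name": ["a", "b"], "flag": [True, False]})
schema = _parquet_schema(df)
# id -> int64, name -> string, flag -> bool
assert schema[0].type == pa.int64()
assert schema[1].type == pa.string()
assert schema[2].type == pa.bool_()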
Beispiel #40
0
floating_types = st.sampled_from([
    pa.float16(),
    pa.float32(),
    pa.float64()
])
decimal_type = st.builds(
    pa.decimal128,
    precision=st.integers(min_value=1, max_value=38),
    scale=st.integers(min_value=1, max_value=38)
)
numeric_types = st.one_of(integer_types, floating_types, decimal_type)

date_types = st.sampled_from([
    pa.date32(),
    pa.date64()
])
time_types = st.sampled_from([
    pa.time32('s'),
    pa.time32('ms'),
    pa.time64('us'),
    pa.time64('ns')
])
timestamp_types = st.builds(
    pa.timestamp,
    unit=st.sampled_from(['s', 'ms', 'us', 'ns']),
    tz=tzst.timezones()
)
temporal_types = st.one_of(date_types, time_types, timestamp_types)

primitive_types = st.one_of(null_type, bool_type, binary_type, string_type,
                            large_binary_type, large_string_type,
                            numeric_types, temporal_types)
Beispiel #41
0
if pyarrow:
    ARROW_SCALAR_IDS_TO_BQ = {
        # https://arrow.apache.org/docs/python/api/datatypes.html#type-classes
        pyarrow.bool_().id: "BOOL",
        pyarrow.int8().id: "INT64",
        pyarrow.int16().id: "INT64",
        pyarrow.int32().id: "INT64",
        pyarrow.int64().id: "INT64",
        pyarrow.uint8().id: "INT64",
        pyarrow.uint16().id: "INT64",
        pyarrow.uint32().id: "INT64",
        pyarrow.uint64().id: "INT64",
        pyarrow.float16().id: "FLOAT64",
        pyarrow.float32().id: "FLOAT64",
        pyarrow.float64().id: "FLOAT64",
        pyarrow.time32("ms").id: "TIME",
        pyarrow.time64("ns").id: "TIME",
        pyarrow.timestamp("ns").id: "TIMESTAMP",
        pyarrow.date32().id: "DATE",
        pyarrow.date64().id: "DATETIME",  # because millisecond resolution
        pyarrow.binary().id: "BYTES",
        pyarrow.string().id: "STRING",  # also alias for pyarrow.utf8()
        pyarrow.decimal128(38, scale=9).id: "NUMERIC",
        # The exact decimal's scale and precision are not important, as only
        # the type ID matters, and it's the same for all decimal128 instances.
    }

else:  # pragma: NO COVER
    BQ_TO_ARROW_SCALARS = {}  # pragma: NO COVER
    ARROW_SCALAR_IDS_TO_BQ = {}  # pragma: NO COVER
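
A small lookup sketch (hypothetical helper name) showing how the ID map above is meant to be used:

# Hypothetical helper: map an Arrow type to a BigQuery type name via the
# type ID, which is shared by all instances of a given Arrow type class.
def bq_type_for_arrow(arrow_type):
    return ARROW_SCALAR_IDS_TO_BQ.get(arrow_type.id)

# bq_type_for_arrow(pyarrow.int32()) -> "INT64"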
Beispiel #42
0
def test_types_hashable():
    types = [
        pa.list_(pa.int32()),
        pa.struct([pa.field('a', pa.int32()),
                   pa.field('b', pa.int8()),
                   pa.field('c', pa.string())])
    ]

    in_dict = {}
    for i, type_ in enumerate(types):
        assert hash(type_) == hash(type_)
        in_dict[type_] = i
        assert in_dict[type_] == i


@pytest.mark.parametrize('t,check_func', [
    (pa.date32(), types.is_date32),
    (pa.date64(), types.is_date64),
    (pa.time32('s'), types.is_time32),
    (pa.time64('ns'), types.is_time64),
    (pa.int8(), types.is_int8),
    (pa.int16(), types.is_int16),
    (pa.int32(), types.is_int32),
    (pa.int64(), types.is_int64),
    (pa.uint8(), types.is_uint8),
    (pa.uint16(), types.is_uint16),
    (pa.uint32(), types.is_uint32),
    (pa.uint64(), types.is_uint64),
    (pa.float16(), types.is_float16),
    (pa.float32(), types.is_float32),
    (pa.float64(), types.is_float64)
])
def test_exact_primitive_types(t, check_func):
    assert check_func(t)
Beispiel #43
0
@pytest.mark.parametrize('value, ty, klass, deprecated', [
    (1.0, pa.float32(), pa.FloatScalar, pa.FloatValue),
    (decimal.Decimal("1.123"), None, pa.Decimal128Scalar, pa.Decimal128Value),
    ("string", None, pa.StringScalar, pa.StringValue),
    (b"bytes", None, pa.BinaryScalar, pa.BinaryValue),
    ("largestring", pa.large_string(), pa.LargeStringScalar,
     pa.LargeStringValue),
    (b"largebytes", pa.large_binary(), pa.LargeBinaryScalar,
     pa.LargeBinaryValue),
    (b"abc", pa.binary(3), pa.FixedSizeBinaryScalar, pa.FixedSizeBinaryValue),
    ([1, 2, 3], None, pa.ListScalar, pa.ListValue),
    ([1, 2, 3, 4], pa.large_list(pa.int8()), pa.LargeListScalar,
     pa.LargeListValue),
    ([1, 2, 3, 4, 5], pa.list_(pa.int8(), 5), pa.FixedSizeListScalar,
     pa.FixedSizeListValue),
    (datetime.date.today(), None, pa.Date32Scalar, pa.Date32Value),
    (datetime.date.today(), pa.date64(), pa.Date64Scalar, pa.Date64Value),
    (datetime.datetime.now(), None, pa.TimestampScalar, pa.TimestampValue),
    (datetime.datetime.now().time().replace(microsecond=0), pa.time32('s'),
     pa.Time32Scalar, pa.Time32Value),
    (datetime.datetime.now().time(), None, pa.Time64Scalar, pa.Time64Value),
    (datetime.timedelta(days=1), None, pa.DurationScalar, pa.DurationValue),
    ({'a': 1, 'b': [1, 2]}, None, pa.StructScalar, pa.StructValue),
    ([('a', 1), ('b', 2)], pa.map_(pa.string(), pa.int8()), pa.MapScalar,
     pa.MapValue),
])
def test_basics(value, ty, klass, deprecated):
    s = pa.scalar(value, type=ty)
    assert isinstance(s, klass)
    assert s.as_py() == value
    assert s == pa.scalar(value, type=ty)
    assert s != value
Beispiel #44
0
def as_column(arbitrary):
    """Create a Column from an arbitrary object

    Currently supported inputs are:

    * ``Column``
    * ``Buffer``
    * numba device array
    * numpy array
    * pandas.Categorical

    Returns
    -------
    result : subclass of TypedColumnBase
        - CategoricalColumn for pandas.Categorical input.
        - NumericalColumn for all other inputs.
    """
    from . import numerical, categorical, datetime

    if isinstance(arbitrary, Column):
        if not isinstance(arbitrary, TypedColumnBase):
            # interpret as numeric
            data = arbitrary.view(numerical.NumericalColumn,
                                  dtype=arbitrary.dtype)
        else:
            data = arbitrary

    elif isinstance(arbitrary, Buffer):
        data = numerical.NumericalColumn(data=arbitrary, dtype=arbitrary.dtype)

    elif cuda.devicearray.is_cuda_ndarray(arbitrary):
        data = as_column(Buffer(arbitrary))
        if (data.dtype in [np.float16, np.float32, np.float64]
                and arbitrary.size > 0):
            mask = cudautils.mask_from_devary(arbitrary)
            data = data.set_mask(mask)

    elif isinstance(arbitrary, np.ndarray):
        if arbitrary.dtype.kind == 'M':
            data = datetime.DatetimeColumn.from_numpy(arbitrary)
        else:
            data = as_column(rmm.to_device(arbitrary))

    elif isinstance(arbitrary, pa.Array):
        if isinstance(arbitrary, pa.StringArray):
            raise NotImplementedError("Strings are not yet supported")
        elif isinstance(arbitrary, pa.NullArray):
            pamask = Buffer(np.empty(0, dtype='int8'))
            padata = Buffer(np.empty(0,
                                     dtype=arbitrary.type.to_pandas_dtype()))
            data = numerical.NumericalColumn(
                data=padata,
                mask=pamask,
                null_count=0,
                dtype=np.dtype(arbitrary.type.to_pandas_dtype()))
        elif isinstance(arbitrary, pa.DictionaryArray):
            if arbitrary.buffers()[0]:
                pamask = Buffer(np.array(arbitrary.buffers()[0]))
            else:
                pamask = None
            padata = Buffer(
                np.array(arbitrary.buffers()[1]).view(
                    arbitrary.indices.type.to_pandas_dtype()))
            data = categorical.CategoricalColumn(
                data=padata,
                mask=pamask,
                null_count=arbitrary.null_count,
                categories=arbitrary.dictionary.to_pylist(),
                ordered=arbitrary.type.ordered,
                dtype="category"  # What's the correct way to specify this?
            )
        elif isinstance(arbitrary, pa.TimestampArray):
            arbitrary = arbitrary.cast(pa.timestamp('ms'))
            if arbitrary.buffers()[0]:
                pamask = Buffer(np.array(arbitrary.buffers()[0]))
            else:
                pamask = None
            padata = Buffer(
                np.array(arbitrary.buffers()[1]).view(np.dtype('M8[ms]')))
            data = datetime.DatetimeColumn(data=padata,
                                           mask=pamask,
                                           null_count=arbitrary.null_count,
                                           dtype=np.dtype('M8[ms]'))
        elif isinstance(arbitrary, pa.Date64Array):
            if arbitrary.buffers()[0]:
                pamask = Buffer(np.array(arbitrary.buffers()[0]))
            else:
                pamask = None
            padata = Buffer(
                np.array(arbitrary.buffers()[1]).view(np.dtype('M8[ms]')))
            data = datetime.DatetimeColumn(data=padata,
                                           mask=pamask,
                                           null_count=arbitrary.null_count,
                                           dtype=np.dtype('M8[ms]'))
        elif isinstance(arbitrary, pa.Date32Array):
            # No equivalent np dtype and not yet supported
            warnings.warn(
                "Date32 values are not yet supported so this will "
                "be typecast to a Date64 value", UserWarning)
            arbitrary = arbitrary.cast(pa.date64())
            data = as_column(arbitrary)
        else:
            if arbitrary.buffers()[0]:
                pamask = Buffer(np.array(arbitrary.buffers()[0]))
            else:
                pamask = None
            padata = Buffer(
                np.array(arbitrary.buffers()[1]).view(
                    np.dtype(arbitrary.type.to_pandas_dtype())))
            data = numerical.NumericalColumn(
                data=padata,
                mask=pamask,
                null_count=arbitrary.null_count,
                dtype=np.dtype(arbitrary.type.to_pandas_dtype()))

    elif isinstance(arbitrary, (pd.Series, pd.Categorical)):
        data = as_column(pa.array(arbitrary, from_pandas=True))

    elif np.isscalar(arbitrary):
        if hasattr(arbitrary, 'dtype'):
            data_type = _gdf.np_to_pa_dtype(arbitrary.dtype)
            if data_type in (pa.date64(), pa.date32()):
                # PyArrow can't construct date64 or date32 arrays from np
                # datetime types
                arbitrary = arbitrary.astype('int64')
            data = as_column(pa.array([arbitrary], type=data_type))
        else:
            data = as_column(pa.array([arbitrary]))

    else:
        data = as_column(pa.array(arbitrary))

    return data
Beispiel #45
0
signed_integer_types = st.sampled_from(
    [pa.int8(), pa.int16(), pa.int32(),
     pa.int64()])
unsigned_integer_types = st.sampled_from(
    [pa.uint8(), pa.uint16(),
     pa.uint32(), pa.uint64()])
integer_types = st.one_of(signed_integer_types, unsigned_integer_types)

floating_types = st.sampled_from([pa.float16(), pa.float32(), pa.float64()])
decimal_type = st.builds(pa.decimal128,
                         precision=st.integers(min_value=1, max_value=38),
                         scale=st.integers(min_value=1, max_value=38))
numeric_types = st.one_of(integer_types, floating_types, decimal_type)

date_types = st.sampled_from([pa.date32(), pa.date64()])
time_types = st.sampled_from(
    [pa.time32('s'),
     pa.time32('ms'),
     pa.time64('us'),
     pa.time64('ns')])
timestamp_types = st.builds(pa.timestamp,
                            unit=st.sampled_from(['s', 'ms', 'us', 'ns']),
                            tz=tzst.timezones())
temporal_types = st.one_of(date_types, time_types, timestamp_types)

primitive_types = st.one_of(null_type, bool_type, binary_type, string_type,
                            large_binary_type, large_string_type,
                            numeric_types, temporal_types)

metadata = st.dictionaries(st.text(), st.text())
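
A usage sketch (hypothetical test name) showing how these strategies are consumed:

# Hypothetical: draw random temporal types from the strategy above and
# check each one is a pyarrow DataType. Requires hypothesis.
from hypothesis import given

@given(temporal_types)
def test_temporal_strategy_yields_datatypes(ty):
    assert isinstance(ty, pa.DataType)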
Beispiel #46
0
def pyarrow_datatype_from_dict(json_dict: Dict[str, Any]) -> pyarrow.DataType:
    """
    Create a DataType in PyArrow format from a Schema json format.

    :param json_dict: the DataType in json format
    :return: the DataType in PyArrow format
    """
    type_class = json_dict["type"]["name"]
    if type_class == "dictionary":
        key_type = json_dict["dictionary"]["indexType"]
        value_type = json_dict["children"][0]
        key_type = pyarrow_datatype_from_dict(key_type)
        value_type = pyarrow_datatype_from_dict(value_type)
        return pyarrow.map_(key_type, value_type)
    elif "dictionary" in json_dict:
        key_type = {
            "name": "key",
            "type": json_dict["dictionary"]["indexType"],
            "nullable": json_dict["nullable"],
        }
        key = pyarrow_datatype_from_dict(key_type)
        if type_class == "list":
            value_type = {
                "name": "val",
                "type": json_dict["dictionary"]["indexType"],
                "nullable": json_dict["nullable"],
            }
            return pyarrow.map_(
                key,
                pyarrow.list_(
                    pyarrow.field(
                        "entries",
                        pyarrow.struct([pyarrow_field_from_dict(value_type)
                                        ]))),
            )
        value_type = {
            "name": "value",
            "type": json_dict["type"],
            "nullable": json_dict["nullable"],
        }
        return pyarrow.map_(key, pyarrow_datatype_from_dict(value_type))
    elif type_class == "list":
        field = json_dict["children"][0]
        element_type = pyarrow_datatype_from_dict(field)
        return pyarrow.list_(pyarrow.field("item", element_type))
    elif type_class == "struct":
        fields = [
            pyarrow_field_from_dict(field) for field in json_dict["children"]
        ]
        return pyarrow.struct(fields)
    elif type_class == "int":
        return pyarrow.type_for_alias(
            f'{type_class}{json_dict["type"]["bitWidth"]}')
    elif type_class == "date":
        type_info = json_dict["type"]
        if type_info["unit"] == "DAY":
            return pyarrow.date32()
        else:
            return pyarrow.date64()
    elif type_class == "time":
        type_info = json_dict["type"]
        if type_info["unit"] == "MICROSECOND":
            unit = "us"
        elif type_info["unit"] == "NANOSECOND":
            unit = "ns"
        elif type_info["unit"] == "MILLISECOND":
            unit = "ms"
        else:
            unit = "s"
        return pyarrow.type_for_alias(
            f'{type_class}{type_info["bitWidth"]}[{unit}]')
    elif type_class == "timestamp":
        type_info = json_dict["type"]
        if "unit" in type_info:
            if type_info["unit"] == "MICROSECOND":
                unit = "us"
            elif type_info["unit"] == "NANOSECOND":
                unit = "ns"
            elif type_info["unit"] == "MILLISECOND":
                unit = "ms"
            elif type_info["unit"] == "SECOND":
                unit = "s"
        else:
            unit = "ns"
        return pyarrow.type_for_alias(f"{type_class}[{unit}]")
    elif type_class.startswith("decimal"):
        type_info = json_dict["type"]
        return pyarrow.decimal128(precision=type_info["precision"],
                                  scale=type_info["scale"])
    elif type_class.startswith("floatingpoint"):
        type_info = json_dict["type"]
        if type_info["precision"] == "HALF":
            return pyarrow.float16()
        elif type_info["precision"] == "SINGLE":
            return pyarrow.float32()
        elif type_info["precision"] == "DOUBLE":
            return pyarrow.float64()
    else:
        return pyarrow.type_for_alias(type_class)
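
A usage sketch with hypothetical input dicts, following the Arrow JSON layout the function expects:

# Hypothetical usage: decode two simple type descriptions.
assert pyarrow_datatype_from_dict(
    {"type": {"name": "int", "bitWidth": 32, "isSigned": True}}
) == pyarrow.int32()
assert pyarrow_datatype_from_dict(
    {"type": {"name": "date", "unit": "DAY"}}
) == pyarrow.date32()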
Beispiel #47
0
def test_date_time_types():
    t1 = pa.date32()
    data1 = np.array([17259, 17260, 17261], dtype='int32')
    a1 = pa.array(data1, type=t1)

    t2 = pa.date64()
    data2 = data1.astype('int64') * 86400000
    a2 = pa.array(data2, type=t2)

    t3 = pa.timestamp('us')
    start = pd.Timestamp('2000-01-01').value / 1000
    data3 = np.array([start, start + 1, start + 2], dtype='int64')
    a3 = pa.array(data3, type=t3)

    t4 = pa.time32('ms')
    data4 = np.arange(3, dtype='i4')
    a4 = pa.array(data4, type=t4)

    t5 = pa.time64('us')
    a5 = pa.array(data4.astype('int64'), type=t5)

    t6 = pa.time32('s')
    a6 = pa.array(data4, type=t6)

    ex_t6 = pa.time32('ms')
    ex_a6 = pa.array(data4 * 1000, type=ex_t6)

    t7 = pa.timestamp('ns')
    start = pd.Timestamp('2001-01-01').value
    data7 = np.array([start, start + 1000, start + 2000], dtype='int64')
    a7 = pa.array(data7, type=t7)

    t7_us = pa.timestamp('us')
    start = pd.Timestamp('2001-01-01').value
    data7_us = np.array([start, start + 1000, start + 2000],
                        dtype='int64') // 1000
    a7_us = pa.array(data7_us, type=t7_us)

    table = pa.Table.from_arrays([a1, a2, a3, a4, a5, a6, a7], [
        'date32', 'date64', 'timestamp[us]', 'time32[s]', 'time64[us]',
        'time32_from64[s]', 'timestamp[ns]'
    ])

    # date64 as date32
    # time32[s] to time32[ms]
    # 'timestamp[ns]' to 'timestamp[us]'
    expected = pa.Table.from_arrays([a1, a1, a3, a4, a5, ex_a6, a7_us], [
        'date32', 'date64', 'timestamp[us]', 'time32[s]', 'time64[us]',
        'time32_from64[s]', 'timestamp[ns]'
    ])

    _check_roundtrip(table, expected=expected, version='2.0')

    # date64 as date32
    # time32[s] to time32[ms]
    # 'timestamp[ns]' is saved as INT96 timestamp
    expected = pa.Table.from_arrays([a1, a1, a3, a4, a5, ex_a6, a7], [
        'date32', 'date64', 'timestamp[us]', 'time32[s]', 'time64[us]',
        'time32_from64[s]', 'timestamp[ns]'
    ])

    _check_roundtrip(table,
                     expected=expected,
                     version='2.0',
                     use_deprecated_int96_timestamps=True)

    # Check that setting flavor to 'spark' uses int96 timestamps
    _check_roundtrip(table, expected=expected, version='2.0', flavor='spark')

    # Unsupported stuff
    def _assert_unsupported(array):
        table = pa.Table.from_arrays([array], ['unsupported'])
        buf = io.BytesIO()

        with pytest.raises(NotImplementedError):
            _write_table(table, buf, version="2.0")

    t7 = pa.time64('ns')
    a7 = pa.array(data4.astype('int64'), type=t7)

    _assert_unsupported(a7)
Beispiel #48
0
def test_sequence_date():
    data = [datetime.date(2000, 1, 1), None, datetime.date(1970, 1, 1),
            datetime.date(2040, 2, 26)]
    arr = pa.array(data)
    assert len(arr) == 4
    assert arr.type == pa.date32()
    assert arr.null_count == 1
    assert arr[0].as_py() == datetime.date(2000, 1, 1)
    assert arr[1].as_py() is None
    assert arr[2].as_py() == datetime.date(1970, 1, 1)
    assert arr[3].as_py() == datetime.date(2040, 2, 26)


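# Note: 10957 days after the Unix epoch is 2000-01-01; date64 stores
# milliseconds, hence the 86400000 (ms per day) factor below.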
@pytest.mark.parametrize('input',
                         [(pa.date32(), [10957, None]),
                          (pa.date64(), [10957 * 86400000, None])])
def test_sequence_explicit_types(input):
    t, ex_values = input
    data = [datetime.date(2000, 1, 1), None]
    arr = pa.array(data, type=t)
    arr2 = pa.array(ex_values, type=t)

    for x in [arr, arr2]:
        assert len(x) == 2
        assert x.type == t
        assert x.null_count == 1
        assert x[0].as_py() == datetime.date(2000, 1, 1)
        assert x[1] is pa.NA


def test_date32_overflow():
Beispiel #49
0
    def clean_data_common(self, processed_data, raw_data):
        """Fix the type and default value of of each extracted field

        This routine is common to all services. It ensures that all the missing
        fields, as defined by the schema, are added to the records extracted.
        Furthermore, each field is set to the specified type.
        """

        # Build default data structure
        schema_rec = {}
        def_vals = self._get_default_vals()

        ptype_map = {
            pa.string(): str,
            pa.int32(): int,
            pa.int64(): int,
            pa.float32(): float,
            pa.float64(): float,
            pa.date64(): float,
            pa.list_(pa.string()): list,
            pa.list_(pa.int64()): list,
            pa.bool_(): bool,
        }

        for field in self.schema:
            default = def_vals[field.type]
            schema_rec.update({field.name: default})

        if isinstance(raw_data, list):
            read_from = raw_data[0]
        else:
            read_from = raw_data
        for entry in processed_data:
            entry.update({"hostname": read_from["hostname"]})
            entry.update({"namespace": read_from["namespace"]})
            entry.update({"timestamp": read_from["timestamp"]})
            entry.update({"sqvers": self.version})
            for fld in schema_rec:
                if fld not in entry:
                    if fld == "active":
                        entry.update({fld: True})
                    else:
                        entry.update({fld: schema_rec[fld]})
                else:
                    fld_type = self.schema.field(fld).type
                    if not isinstance(entry[fld], ptype_map[fld_type]):
                        try:
                            entry[fld] = ptype_map[fld_type](entry[fld])
                        except (ValueError, TypeError):
                            entry[fld] = schema_rec[fld]
                    elif isinstance(entry[fld], list):
                        for i, ele in enumerate(entry[fld]):
                            if not isinstance(ele,
                                              ptype_map[fld_type.value_type]):
                                try:
                                    if ptype_map[fld_type.value_type] == int:
                                        entry[fld][i] = int(entry[fld][i])
                                    elif ptype_map[fld_type.value_type] == str:
                                        entry[fld][i] = str(entry[fld][i])
                                    else:
                                        raise ValueError
                                except (ValueError, TypeError):
                                    entry[fld][i] = schema_rec[fld]
        return processed_data
Beispiel #50
0
def test_fields_hashable():
    in_dict = {}
    fields = [
        pa.field('a', pa.int64()),
        pa.field('a', pa.int32()),
        pa.field('b', pa.int32())
    ]
    for i, field in enumerate(fields):
        in_dict[field] = i
    assert len(in_dict) == len(fields)
    for i, field in enumerate(fields):
        assert in_dict[field] == i


@pytest.mark.parametrize('t,check_func', [(pa.date32(), types.is_date32),
                                          (pa.date64(), types.is_date64),
                                          (pa.time32('s'), types.is_time32),
                                          (pa.time64('ns'), types.is_time64),
                                          (pa.int8(), types.is_int8),
                                          (pa.int16(), types.is_int16),
                                          (pa.int32(), types.is_int32),
                                          (pa.int64(), types.is_int64),
                                          (pa.uint8(), types.is_uint8),
                                          (pa.uint16(), types.is_uint16),
                                          (pa.uint32(), types.is_uint32),
                                          (pa.uint64(), types.is_uint64),
                                          (pa.float16(), types.is_float16),
                                          (pa.float32(), types.is_float32),
                                          (pa.float64(), types.is_float64)])
def test_exact_primitive_types(t, check_func):
    assert check_func(t)
Beispiel #51
0
    UNSIGNED_INT_PYARROW_DTYPES = [
        pa.uint8(), pa.uint16(),
        pa.uint32(), pa.uint64()
    ]
    SIGNED_INT_PYARROW_DTYPES = [
        pa.int8(), pa.int16(),
        pa.int32(), pa.int64()
    ]
    ALL_INT_PYARROW_DTYPES = UNSIGNED_INT_PYARROW_DTYPES + SIGNED_INT_PYARROW_DTYPES

    FLOAT_PYARROW_DTYPES = [pa.float32(), pa.float64()]
    STRING_PYARROW_DTYPES = [pa.string(), pa.utf8()]

    TIME_PYARROW_DTYPES = [
        pa.time32("s"),
        pa.time32("ms"),
        pa.time64("us"),
        pa.time64("ns"),
    ]
    DATE_PYARROW_DTYPES = [pa.date32(), pa.date64()]
    DATETIME_PYARROW_DTYPES = [
        pa.timestamp(unit=unit, tz=tz) for unit in ["s", "ms", "us", "ns"]
        for tz in [None, "UTC", "US/Pacific", "US/Eastern"]
    ]
    TIMEDELTA_PYARROW_DTYPES = [
        pa.duration(unit) for unit in ["s", "ms", "us", "ns"]
    ]

    BOOL_PYARROW_DTYPES = [pa.bool_()]

    # TODO: Add container like pyarrow types:
    #  https://arrow.apache.org/docs/python/api/datatypes.html#factory-functions
    ALL_PYARROW_DTYPES = (ALL_INT_PYARROW_DTYPES + FLOAT_PYARROW_DTYPES +
                          TIME_PYARROW_DTYPES + DATE_PYARROW_DTYPES +
                          DATETIME_PYARROW_DTYPES + TIMEDELTA_PYARROW_DTYPES +