def test_sequence_timestamp_from_int_with_unit():
    data = [1]

    s = pa.timestamp('s')
    ms = pa.timestamp('ms')
    us = pa.timestamp('us')
    ns = pa.timestamp('ns')

    arr_s = pa.array(data, type=s)
    assert len(arr_s) == 1
    assert arr_s.type == s
    assert str(arr_s[0]) == "Timestamp('1970-01-01 00:00:01')"

    arr_ms = pa.array(data, type=ms)
    assert len(arr_ms) == 1
    assert arr_ms.type == ms
    assert str(arr_ms[0]) == "Timestamp('1970-01-01 00:00:00.001000')"

    arr_us = pa.array(data, type=us)
    assert len(arr_us) == 1
    assert arr_us.type == us
    assert str(arr_us[0]) == "Timestamp('1970-01-01 00:00:00.000001')"

    arr_ns = pa.array(data, type=ns)
    assert len(arr_ns) == 1
    assert arr_ns.type == ns
    assert str(arr_ns[0]) == "Timestamp('1970-01-01 00:00:00.000000001')"

    class CustomClass():
        pass

    # each unsupported value needs its own raises block; statements after
    # the first raise inside a single block would never execute
    for ty in [ns, pa.date32(), pa.date64()]:
        with pytest.raises(pa.ArrowException):
            pa.array([1, CustomClass()], type=ty)

    def test_timestamps_notimezone_nulls(self):
        df = pd.DataFrame({
            'datetime64': np.array([
                '2007-07-13T01:23:34.123',
                None,
                '2010-08-13T05:46:57.437'],
                dtype='datetime64[ms]')
            })
        field = pa.field('datetime64', pa.timestamp('ms'))
        schema = pa.schema([field])
        self._check_pandas_roundtrip(
            df,
            timestamps_to_ms=True,
            expected_schema=schema,
        )

        df = pd.DataFrame({
            'datetime64': np.array([
                '2007-07-13T01:23:34.123456789',
                None,
                '2010-08-13T05:46:57.437699912'],
                dtype='datetime64[ns]')
            })
        field = pa.field('datetime64', pa.timestamp('ns'))
        schema = pa.schema([field])
        self._check_pandas_roundtrip(
            df,
            timestamps_to_ms=False,
            expected_schema=schema,
        )
Example #3
def test_sequence_timestamp_with_unit():
    data = [
        datetime.datetime(2007, 7, 13, 1, 23, 34, 123456),
    ]

    s = pa.timestamp('s')
    ms = pa.timestamp('ms')
    us = pa.timestamp('us')
    ns = pa.timestamp('ns')

    arr_s = pa.array(data, type=s)
    assert len(arr_s) == 1
    assert arr_s.type == s
    assert arr_s[0].as_py() == datetime.datetime(2007, 7, 13, 1,
                                                 23, 34, 0)

    arr_ms = pa.array(data, type=ms)
    assert len(arr_ms) == 1
    assert arr_ms.type == ms
    assert arr_ms[0].as_py() == datetime.datetime(2007, 7, 13, 1,
                                                  23, 34, 123000)

    arr_us = pa.array(data, type=us)
    assert len(arr_us) == 1
    assert arr_us.type == us
    assert arr_us[0].as_py() == datetime.datetime(2007, 7, 13, 1,
                                                  23, 34, 123456)

    arr_ns = pa.array(data, type=ns)
    assert len(arr_ns) == 1
    assert arr_ns.type == ns
    assert arr_ns[0].as_py() == datetime.datetime(2007, 7, 13, 1,
                                                  23, 34, 123456)
def test_type_schema_pickling():
    cases = [
        pa.int8(),
        pa.string(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.string()),
        pa.struct([
            pa.field('a', 'int8'),
            pa.field('b', 'string')
        ]),
        pa.time32('s'),
        pa.time64('us'),
        pa.date32(),
        pa.date64(),
        pa.timestamp('ms'),
        pa.timestamp('ns'),
        pa.decimal128(12, 2),
        pa.field('a', 'string', metadata={b'foo': b'bar'})
    ]

    for val in cases:
        roundtripped = pickle.loads(pickle.dumps(val))
        assert val == roundtripped

    fields = []
    for i, f in enumerate(cases):
        if isinstance(f, pa.Field):
            fields.append(f)
        else:
            fields.append(pa.field('_f{}'.format(i), f))

    schema = pa.schema(fields, metadata={b'foo': b'bar'})
    roundtripped = pickle.loads(pickle.dumps(schema))
    assert schema == roundtripped
Example #5
def test_cast_timestamp_unit():
    # ARROW-1680
    val = datetime.datetime.now()
    s = pd.Series([val])
    s_nyc = s.dt.tz_localize('tzlocal()').dt.tz_convert('America/New_York')

    us_with_tz = pa.timestamp('us', tz='America/New_York')

    arr = pa.Array.from_pandas(s_nyc, type=us_with_tz)

    # ARROW-1906
    assert arr.type == us_with_tz

    arr2 = pa.Array.from_pandas(s, type=pa.timestamp('us'))

    assert arr[0].as_py() == s_nyc[0]
    assert arr2[0].as_py() == s[0]

    # Disallow truncation
    arr = pa.array([123123], type='int64').cast(pa.timestamp('ms'))
    expected = pa.array([123], type='int64').cast(pa.timestamp('s'))

    target = pa.timestamp('s')
    with pytest.raises(ValueError):
        arr.cast(target)

    result = arr.cast(target, safe=False)
    assert result.equals(expected)
Example #6
def test_cast_from_null():
    in_data = [None] * 3
    in_type = pa.null()
    out_types = [
        pa.null(),
        pa.uint8(),
        pa.float16(),
        pa.utf8(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.int16()),
        pa.decimal128(19, 4),
        pa.timestamp('us'),
        pa.timestamp('us', tz='UTC'),
        pa.timestamp('us', tz='Europe/Paris'),
        pa.struct([pa.field('a', pa.int32()),
                   pa.field('b', pa.list_(pa.int8())),
                   pa.field('c', pa.string())]),
        ]
    for out_type in out_types:
        _check_cast_case((in_data, in_type, in_data, out_type))

    out_types = [
        pa.dictionary(pa.int32(), pa.string()),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())], mode=pa.lib.UnionMode_DENSE),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())], mode=pa.lib.UnionMode_SPARSE),
        ]
    in_arr = pa.array(in_data, type=pa.null())
    for out_type in out_types:
        with pytest.raises(NotImplementedError):
            in_arr.cast(out_type)
Example #7
def test_timestamp():
    for unit in ('s', 'ms', 'us', 'ns'):
        for tz in (None, 'UTC', 'Europe/Paris'):
            ty = pa.timestamp(unit, tz=tz)
            assert ty.unit == unit
            assert ty.tz == tz

    for invalid_unit in ('m', 'arbit', 'rary'):
        with pytest.raises(ValueError, match='Invalid TimeUnit string'):
            pa.timestamp(invalid_unit)
Example #8
def test_type_from_numpy_dtype_timestamps():
    cases = [
        (np.dtype('datetime64[s]'), pa.timestamp('s')),
        (np.dtype('datetime64[ms]'), pa.timestamp('ms')),
        (np.dtype('datetime64[us]'), pa.timestamp('us')),
        (np.dtype('datetime64[ns]'), pa.timestamp('ns'))
    ]

    for dt, pt in cases:
        result = pa.from_numpy_dtype(dt)
        assert result == pt
Example #9
def test_cast_timestamp_to_int():
    arr = pa.array(np.array([0, 1, 2], dtype='int64'),
                   type=pa.timestamp('us'))
    expected = pa.array([0, 1, 2], type='i8')

    result = arr.cast('i8')
    assert result.equals(expected)
Example #10
    def test_timestamp(self):
        import pandas as pd
        arr = pd.date_range('2000-01-01 12:34:56', periods=10).values

        units = ['ns', 'us', 'ms', 's']

        for i, unit in enumerate(units):
            dtype = 'datetime64[{0}]'.format(unit)
            arrow_arr = pa.Array.from_pandas(arr.astype(dtype))
            expected = pd.Timestamp('2000-01-01 12:34:56')

            assert arrow_arr[0].as_py() == expected
            assert arrow_arr[0].value * 1000**i == expected.value

            tz = 'America/New_York'
            arrow_type = pa.timestamp(unit, tz=tz)

            dtype = 'datetime64[{0}]'.format(unit)
            arrow_arr = pa.Array.from_pandas(arr.astype(dtype),
                                             type=arrow_type)
            expected = (pd.Timestamp('2000-01-01 12:34:56')
                        .tz_localize('utc')
                        .tz_convert(tz))

            assert arrow_arr[0].as_py() == expected
            assert arrow_arr[0].value * 1000**i == expected.value
Example #11
def test_type_to_pandas_dtype():
    M8_ns = np.dtype('datetime64[ns]')
    cases = [
        (pa.null(), np.float64),
        (pa.bool_(), np.bool_),
        (pa.int8(), np.int8),
        (pa.int16(), np.int16),
        (pa.int32(), np.int32),
        (pa.int64(), np.int64),
        (pa.uint8(), np.uint8),
        (pa.uint16(), np.uint16),
        (pa.uint32(), np.uint32),
        (pa.uint64(), np.uint64),
        (pa.float16(), np.float16),
        (pa.float32(), np.float32),
        (pa.float64(), np.float64),
        (pa.date32(), M8_ns),
        (pa.date64(), M8_ns),
        (pa.timestamp('ms'), M8_ns),
        (pa.binary(), np.object_),
        (pa.binary(12), np.object_),
        (pa.string(), np.object_),
        (pa.list_(pa.int8()), np.object_),
    ]
    for arrow_type, numpy_type in cases:
        assert arrow_type.to_pandas_dtype() == numpy_type
def _add_any_metadata(table, pandas_metadata):
    modified_columns = {}

    schema = table.schema

    # Add time zones
    for i, col_meta in enumerate(pandas_metadata['columns']):
        if col_meta['pandas_type'] == 'datetimetz':
            col = table[i]
            converted = col.to_pandas()
            tz = col_meta['metadata']['timezone']
            tz_aware_type = pa.timestamp('ns', tz=tz)
            with_metadata = pa.Array.from_pandas(converted.values,
                                                 type=tz_aware_type)

            field = pa.field(schema[i].name, tz_aware_type)
            modified_columns[i] = pa.Column.from_array(field,
                                                       with_metadata)

    if len(modified_columns) > 0:
        columns = []
        for i in range(len(table.schema)):
            if i in modified_columns:
                columns.append(modified_columns[i])
            else:
                columns.append(table[i])
        return pa.Table.from_arrays(columns)
    else:
        return table
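
The helper above reads only two keys per column entry. A minimal sketch of the
pandas_metadata shape it expects (field names inferred from the code itself,
not from a documented spec):

pandas_metadata = {
    'columns': [
        # one entry per table column, in column order
        {'pandas_type': 'datetimetz',
         'metadata': {'timezone': 'America/New_York'}},
    ],
}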
Example #13
def test_datetime_subclassing():
    class MyDate(datetime.date):
        pass
    data = [
        MyDate(2007, 7, 13),
    ]
    date_type = pa.date32()
    arr_date = pa.array(data, type=date_type)
    assert len(arr_date) == 1
    assert arr_date.type == date_type
    assert arr_date[0].as_py() == datetime.date(2007, 7, 13)

    class MyDatetime(datetime.datetime):
        pass

    data = [
        MyDatetime(2007, 7, 13, 1, 23, 34, 123456),
    ]

    s = pa.timestamp('s')
    ms = pa.timestamp('ms')
    us = pa.timestamp('us')
    ns = pa.timestamp('ns')

    arr_s = pa.array(data, type=s)
    assert len(arr_s) == 1
    assert arr_s.type == s
    assert arr_s[0].as_py() == datetime.datetime(2007, 7, 13, 1,
                                                 23, 34, 0)

    arr_ms = pa.array(data, type=ms)
    assert len(arr_ms) == 1
    assert arr_ms.type == ms
    assert arr_ms[0].as_py() == datetime.datetime(2007, 7, 13, 1,
                                                  23, 34, 123000)

    arr_us = pa.array(data, type=us)
    assert len(arr_us) == 1
    assert arr_us.type == us
    assert arr_us[0].as_py() == datetime.datetime(2007, 7, 13, 1,
                                                  23, 34, 123456)

    arr_ns = pa.array(data, type=ns)
    assert len(arr_ns) == 1
    assert arr_ns.type == ns
    assert arr_ns[0].as_py() == datetime.datetime(2007, 7, 13, 1,
                                                  23, 34, 123456)
Example #14
@pytest.mark.parametrize('unit', ['s', 'ms', 'us', 'ns'])
def test_timestamp_units_from_list(unit):
    x = np.datetime64('2017-01-01 01:01:01.111111111', unit)
    a1 = pa.array([x])
    a2 = pa.array([x], type=pa.timestamp(unit))

    assert a1.type == a2.type
    assert a1.type.unit == unit
    assert a1[0] == a2[0]
def dataframe_with_arrays(include_index=False):
    """
    DataFrame with numpy array columns of every possible primitive type.

    Returns
    -------
    df: pandas.DataFrame
    schema: pyarrow.Schema
        Arrow schema definition that is in line with the constructed df.
    """
    dtypes = [('i1', pa.int8()), ('i2', pa.int16()),
              ('i4', pa.int32()), ('i8', pa.int64()),
              ('u1', pa.uint8()), ('u2', pa.uint16()),
              ('u4', pa.uint32()), ('u8', pa.uint64()),
              ('f4', pa.float32()), ('f8', pa.float64())]

    arrays = OrderedDict()
    fields = []
    for dtype, arrow_dtype in dtypes:
        fields.append(pa.field(dtype, pa.list_(arrow_dtype)))
        arrays[dtype] = [
            np.arange(10, dtype=dtype),
            np.arange(5, dtype=dtype),
            None,
            np.arange(1, dtype=dtype)
        ]

    fields.append(pa.field('str', pa.list_(pa.string())))
    arrays['str'] = [
        np.array([u"1", u"ä"], dtype="object"),
        None,
        np.array([u"1"], dtype="object"),
        np.array([u"1", u"2", u"3"], dtype="object")
    ]

    fields.append(pa.field('datetime64', pa.list_(pa.timestamp('ms'))))
    arrays['datetime64'] = [
        np.array(['2007-07-13T01:23:34.123456789',
                  None,
                  '2010-08-13T05:46:57.437699912'],
                 dtype='datetime64[ms]'),
        None,
        None,
        np.array(['2007-07-13T02',
                  None,
                  '2010-08-13T05:46:57.437699912'],
                 dtype='datetime64[ms]'),
    ]

    if include_index:
        fields.append(pa.field('__index_level_0__', pa.int64()))
    df = pd.DataFrame(arrays)
    schema = pa.schema(fields)

    return df, schema
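
A hedged usage sketch for the helper above: the DataFrame and schema are built
to line up, so both can be handed to Table.from_pandas together.

df, schema = dataframe_with_arrays()
table = pa.Table.from_pandas(df, schema=schema, preserve_index=False)
# each column arrives as list<...> of the declared value type,
# e.g. 'datetime64' as list<timestamp[ms]>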
Example #16
    def test_simple_timestamps(self):
        # Infer a timestamp column
        rows = b"a,b\n1970,1970-01-01\n1989,1989-07-14\n"
        table = self.read_bytes(rows)
        schema = pa.schema([('a', pa.int64()),
                            ('b', pa.timestamp('s'))])
        assert table.schema == schema
        assert table.to_pydict() == {
            'a': [1970, 1989],
            'b': [datetime(1970, 1, 1), datetime(1989, 7, 14)],
        }
Example #17
@pytest.mark.parametrize('unit', ['s', 'ms', 'us', 'ns'])
def test_array_from_list_of_timestamps(unit):
    n = np.datetime64('NaT', unit)
    x = np.datetime64('2017-01-01 01:01:01.111111111', unit)
    y = np.datetime64('2018-11-22 12:24:48.111111111', unit)

    a1 = pa.array([n, x, y])
    a2 = pa.array([n, x, y], type=pa.timestamp(unit))

    assert a1.type == a2.type
    assert a1.type.unit == unit
    assert a1[0] == a2[0]
def test_date_time_types():
    t1 = pa.date32()
    data1 = np.array([17259, 17260, 17261], dtype='int32')
    a1 = pa.Array.from_pandas(data1, type=t1)

    t2 = pa.date64()
    data2 = data1.astype('int64') * 86400000
    a2 = pa.Array.from_pandas(data2, type=t2)

    t3 = pa.timestamp('us')
    start = pd.Timestamp('2000-01-01').value / 1000
    data3 = np.array([start, start + 1, start + 2], dtype='int64')
    a3 = pa.Array.from_pandas(data3, type=t3)

    t4 = pa.time32('ms')
    data4 = np.arange(3, dtype='i4')
    a4 = pa.Array.from_pandas(data4, type=t4)

    t5 = pa.time64('us')
    a5 = pa.Array.from_pandas(data4.astype('int64'), type=t5)

    t6 = pa.time32('s')
    a6 = pa.Array.from_pandas(data4, type=t6)

    ex_t6 = pa.time32('ms')
    ex_a6 = pa.Array.from_pandas(data4 * 1000, type=ex_t6)

    table = pa.Table.from_arrays([a1, a2, a3, a4, a5, a6],
                                 ['date32', 'date64', 'timestamp[us]',
                                  'time32[s]', 'time64[us]',
                                  'time32_from64[s]'])

    # date64 as date32
    # time32[s] to time32[ms]
    expected = pa.Table.from_arrays([a1, a1, a3, a4, a5, ex_a6],
                                    ['date32', 'date64', 'timestamp[us]',
                                     'time32[s]', 'time64[us]',
                                     'time32_from64[s]'])

    _check_roundtrip(table, expected=expected, version='2.0')

    # Unsupported stuff
    def _assert_unsupported(array):
        table = pa.Table.from_arrays([array], ['unsupported'])
        buf = io.BytesIO()

        with pytest.raises(NotImplementedError):
            _write_table(table, buf, version="2.0")

    t7 = pa.time64('ns')
    a7 = pa.Array.from_pandas(data4.astype('int64'), type=t7)

    _assert_unsupported(a7)
Example #19
def test_type_for_alias():
    cases = [
        ('i1', pa.int8()),
        ('int8', pa.int8()),
        ('i2', pa.int16()),
        ('int16', pa.int16()),
        ('i4', pa.int32()),
        ('int32', pa.int32()),
        ('i8', pa.int64()),
        ('int64', pa.int64()),
        ('u1', pa.uint8()),
        ('uint8', pa.uint8()),
        ('u2', pa.uint16()),
        ('uint16', pa.uint16()),
        ('u4', pa.uint32()),
        ('uint32', pa.uint32()),
        ('u8', pa.uint64()),
        ('uint64', pa.uint64()),
        ('f4', pa.float32()),
        ('float32', pa.float32()),
        ('f8', pa.float64()),
        ('float64', pa.float64()),
        ('date32', pa.date32()),
        ('date64', pa.date64()),
        ('string', pa.string()),
        ('str', pa.string()),
        ('binary', pa.binary()),
        ('time32[s]', pa.time32('s')),
        ('time32[ms]', pa.time32('ms')),
        ('time64[us]', pa.time64('us')),
        ('time64[ns]', pa.time64('ns')),
        ('timestamp[s]', pa.timestamp('s')),
        ('timestamp[ms]', pa.timestamp('ms')),
        ('timestamp[us]', pa.timestamp('us')),
        ('timestamp[ns]', pa.timestamp('ns')),
    ]

    for val, expected in cases:
        assert pa.type_for_alias(val) == expected
Example #20
File: jvm.py, Project: rok/arrow
def _from_jvm_timestamp_type(jvm_type):
    """
    Convert a JVM timestamp type to its Python equivalent.

    Parameters
    ----------
    jvm_type: org.apache.arrow.vector.types.pojo.ArrowType$Timestamp

    Returns
    -------
    typ: pyarrow.DataType
    """
    time_unit = jvm_type.getUnit().toString()
    timezone = jvm_type.getTimezone()
    if time_unit == 'SECOND':
        return pa.timestamp('s', tz=timezone)
    elif time_unit == 'MILLISECOND':
        return pa.timestamp('ms', tz=timezone)
    elif time_unit == 'MICROSECOND':
        return pa.timestamp('us', tz=timezone)
    elif time_unit == 'NANOSECOND':
        return pa.timestamp('ns', tz=timezone)
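
The unit dispatch above is a plain if/elif chain; an equivalent lookup-table
sketch (behavior assumed identical for the four units) keeps the mapping in
one place:

_JVM_TIME_UNITS = {'SECOND': 's', 'MILLISECOND': 'ms',
                   'MICROSECOND': 'us', 'NANOSECOND': 'ns'}

def _from_jvm_timestamp_type_compact(jvm_type):
    unit = _JVM_TIME_UNITS[jvm_type.getUnit().toString()]
    return pa.timestamp(unit, tz=jvm_type.getTimezone())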
    def test_timestamps_notimezone_no_nulls(self):
        df = pd.DataFrame({
            'datetime64': np.array([
                '2007-07-13T01:23:34.123456789',
                '2006-01-13T12:34:56.432539784',
                '2010-08-13T05:46:57.437699912'],
                dtype='datetime64[ns]')
        })
        field = pa.field('datetime64', pa.timestamp('ns'))
        schema = pa.schema([field])
        self._check_pandas_roundtrip(
            df,
            expected_schema=schema,
        )
Example #22
def get_datetimetz_type(values, dtype, type_):
    if values.dtype.type != np.datetime64:
        return values, type_

    if _pandas_api.is_datetimetz(dtype) and type_ is None:
        # If no user type passed, construct a tz-aware timestamp type
        tz = dtype.tz
        unit = dtype.unit
        type_ = pa.timestamp(unit, tz)
    elif type_ is None:
        # Trust the NumPy dtype
        type_ = pa.from_numpy_dtype(values.dtype)

    return values, type_
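
A sketch of what the tz-aware branch computes: pd.DatetimeTZDtype carries the
unit and tz attributes the code reads, and pyarrow normalizes tzinfo objects
to their string names, so the equality below is expected to hold.

dtype = pd.DatetimeTZDtype(unit='ns', tz='UTC')
assert pa.timestamp(dtype.unit, dtype.tz) == pa.timestamp('ns', tz='UTC')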
Example #23
def get_many_types():
    # Returning them from a function is required because the pa.dictionary
    # type holds a pyarrow array, and test_array.py::test_total_bytes_allocated
    # checks that the default memory pool has zero allocated bytes.
    return (
        pa.null(),
        pa.bool_(),
        pa.int32(),
        pa.time32('s'),
        pa.time64('us'),
        pa.date32(),
        pa.timestamp('us'),
        pa.timestamp('us', tz='UTC'),
        pa.timestamp('us', tz='Europe/Paris'),
        pa.float16(),
        pa.float32(),
        pa.float64(),
        pa.decimal128(19, 4),
        pa.string(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.int32()),
        pa.struct([pa.field('a', pa.int32()),
                   pa.field('b', pa.int8()),
                   pa.field('c', pa.string())]),
        pa.struct([pa.field('a', pa.int32(), nullable=False),
                   pa.field('b', pa.int8(), nullable=False),
                   pa.field('c', pa.string())]),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())], mode=pa.lib.UnionMode_DENSE),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())], mode=pa.lib.UnionMode_SPARSE),
        pa.union([pa.field('a', pa.binary(10), nullable=False),
                  pa.field('b', pa.string())], mode=pa.lib.UnionMode_SPARSE),
        pa.dictionary(pa.int32(), pa.string())
    )
Example #24
    def test_timestamp(self):
        data = [
            datetime.datetime(2007, 7, 13, 1, 23, 34, 123456),
            None,
            datetime.datetime(2006, 1, 13, 12, 34, 56, 432539),
            datetime.datetime(2010, 8, 13, 5, 46, 57, 437699),
        ]
        arr = pyarrow.from_pylist(data)
        assert len(arr) == 4
        assert arr.type == pyarrow.timestamp()
        assert arr.null_count == 1
        assert arr[0].as_py() == datetime.datetime(2007, 7, 13, 1, 23, 34, 123456)
        assert arr[1].as_py() is None
        assert arr[2].as_py() == datetime.datetime(2006, 1, 13, 12, 34, 56, 432539)
        assert arr[3].as_py() == datetime.datetime(2010, 8, 13, 5, 46, 57, 437699)
    def test_timestamps_notimezone_no_nulls(self):
        df = pd.DataFrame({
            'datetime64': np.array([
                '2007-07-13T01:23:34.123',
                '2006-01-13T12:34:56.432',
                '2010-08-13T05:46:57.437'],
                dtype='datetime64[ms]')
            })
        field = A.Field.from_py('datetime64', A.timestamp('ms'))
        schema = A.Schema.from_fields([field])
        self._check_pandas_roundtrip(df, timestamps_to_ms=True,
                                     expected_schema=schema)

        df = pd.DataFrame({
            'datetime64': np.array([
                '2007-07-13T01:23:34.123456789',
                '2006-01-13T12:34:56.432539784',
                '2010-08-13T05:46:57.437699912'],
                dtype='datetime64[ns]')
            })
        field = A.Field.from_py('datetime64', A.timestamp('ns'))
        schema = A.Schema.from_fields([field])
        self._check_pandas_roundtrip(df, timestamps_to_ms=False,
                                     expected_schema=schema)
def get_datetimetz_type(values, dtype, type_):
    from pyarrow.compat import DatetimeTZDtype

    if values.dtype.type != np.datetime64:
        return values, type_

    if isinstance(dtype, DatetimeTZDtype):
        tz = dtype.tz
        unit = dtype.unit
        type_ = pa.timestamp(unit, tz)
    elif type_ is None:
        # Trust the NumPy dtype
        type_ = pa.from_numpy_dtype(values.dtype)

    return values, type_
Example #27
def test_from_numpy_dtype():
    cases = [
        (np.dtype('bool'), pa.bool_()),
        (np.dtype('int8'), pa.int8()),
        (np.dtype('int16'), pa.int16()),
        (np.dtype('int32'), pa.int32()),
        (np.dtype('int64'), pa.int64()),
        (np.dtype('uint8'), pa.uint8()),
        (np.dtype('uint16'), pa.uint16()),
        (np.dtype('uint32'), pa.uint32()),
        (np.dtype('float16'), pa.float16()),
        (np.dtype('float32'), pa.float32()),
        (np.dtype('float64'), pa.float64()),
        (np.dtype('U'), pa.string()),
        (np.dtype('S'), pa.binary()),
        (np.dtype('datetime64[s]'), pa.timestamp('s')),
        (np.dtype('datetime64[ms]'), pa.timestamp('ms')),
        (np.dtype('datetime64[us]'), pa.timestamp('us')),
        (np.dtype('datetime64[ns]'), pa.timestamp('ns'))
    ]

    for dt, pt in cases:
        result = pa.from_numpy_dtype(dt)
        assert result == pt

    # Things convertible to numpy dtypes work
    assert pa.from_numpy_dtype('U') == pa.string()
    assert pa.from_numpy_dtype(np.unicode) == pa.string()
    assert pa.from_numpy_dtype('int32') == pa.int32()
    assert pa.from_numpy_dtype(bool) == pa.bool_()

    with pytest.raises(NotImplementedError):
        pa.from_numpy_dtype(np.dtype('O'))

    with pytest.raises(TypeError):
        pa.from_numpy_dtype('not_convertible_to_dtype')
Example #28
def _add_any_metadata(table, pandas_metadata):
    modified_columns = {}

    schema = table.schema

    index_columns = pandas_metadata['index_columns']
    n_index_levels = len(index_columns)
    n_columns = len(pandas_metadata['columns']) - n_index_levels

    # Add time zones
    for i, col_meta in enumerate(pandas_metadata['columns']):

        raw_name = col_meta.get('field_name')
        if not raw_name:
            # deal with metadata written with arrow < 0.8
            raw_name = col_meta['name']
            if i >= n_columns:
                # index columns
                raw_name = index_columns[i - n_columns]
            if raw_name is None:
                raw_name = 'None'

        idx = schema.get_field_index(raw_name)
        if idx != -1:
            if col_meta['pandas_type'] == 'datetimetz':
                col = table[idx]
                converted = col.to_pandas()
                tz = col_meta['metadata']['timezone']
                tz_aware_type = pa.timestamp('ns', tz=tz)
                with_metadata = pa.Array.from_pandas(converted.values,
                                                     type=tz_aware_type)

                field = pa.field(schema[idx].name, tz_aware_type)
                modified_columns[idx] = pa.Column.from_array(field,
                                                             with_metadata)

    if len(modified_columns) > 0:
        columns = []
        for i in range(len(table.schema)):
            if i in modified_columns:
                columns.append(modified_columns[i])
            else:
                columns.append(table[i])
        return pa.Table.from_arrays(columns)
    else:
        return table
Example #29
def test_sequence_numpy_timestamp():
    data = [
        np.datetime64(datetime.datetime(2007, 7, 13, 1, 23, 34, 123456)),
        None,
        np.datetime64(datetime.datetime(2006, 1, 13, 12, 34, 56, 432539)),
        np.datetime64(datetime.datetime(2010, 8, 13, 5, 46, 57, 437699))
    ]
    arr = pa.array(data)
    assert len(arr) == 4
    assert arr.type == pa.timestamp('us')
    assert arr.null_count == 1
    assert arr[0].as_py() == datetime.datetime(2007, 7, 13, 1,
                                               23, 34, 123456)
    assert arr[1].as_py() is None
    assert arr[2].as_py() == datetime.datetime(2006, 1, 13, 12,
                                               34, 56, 432539)
    assert arr[3].as_py() == datetime.datetime(2010, 8, 13, 5,
                                               46, 57, 437699)
def test_coerce_timestamps(tmpdir):
    from collections import OrderedDict
    # ARROW-622
    arrays = OrderedDict()
    fields = [pa.field('datetime64',
                       pa.list_(pa.timestamp('ms')))]
    arrays['datetime64'] = [
        np.array(['2007-07-13T01:23:34.123456789',
                  None,
                  '2010-08-13T05:46:57.437699912'],
                 dtype='datetime64[ms]'),
        None,
        None,
        np.array(['2007-07-13T02',
                  None,
                  '2010-08-13T05:46:57.437699912'],
                 dtype='datetime64[ms]'),
    ]

    df = pd.DataFrame(arrays)
    schema = pa.schema(fields)

    filename = tmpdir.join('pandas_roundtrip.parquet')
    arrow_table = pa.Table.from_pandas(df, schema=schema)

    _write_table(arrow_table, filename.strpath, version="2.0",
                 coerce_timestamps='us')
    table_read = _read_table(filename.strpath)
    df_read = table_read.to_pandas()

    df_expected = df.copy()
    for i, x in enumerate(df_expected['datetime64']):
        if isinstance(x, np.ndarray):
            df_expected['datetime64'][i] = x.astype('M8[us]')

    tm.assert_frame_equal(df_expected, df_read)

    with pytest.raises(ValueError):
        _write_table(arrow_table, filename.strpath, version="2.0",
                     coerce_timestamps='unknown')
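
A related behavior, sketched here rather than taken from the test above: when
coercing to a coarser unit would drop precision, the writer raises unless
truncation is explicitly allowed.

import pyarrow.parquet as pq

t = pa.table({'ts': pa.array([1001], type=pa.timestamp('us'))})
# pq.write_table(t, 'out.parquet', coerce_timestamps='ms')  # would raise
pq.write_table(t, 'out.parquet', coerce_timestamps='ms',
               allow_truncated_timestamps=True)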
Example #31
def _parquet_schema(dataframe: pd.DataFrame,
                    custom_redshift_columns: dict = None):
    """ Translates pandas dtypes to PyArrow types and creates a Schema from them

    Args:
        dataframe (pd.DataFrame): Dataframe to pull the schema of
        custom_redshift_columns (dict, Optional): 
            This dictionary contains custom column data type definitions for redshift.
            The params should be formatted as follows:
                - column name (str)
                - data type (str)

    Returns:
        PyArrow Schema of the given dataframe
        Potentially modified Dataframe
    """
    fields = []
    for col, dtype in dataframe.dtypes.items():
        dtype = dtype.name
        if dtype == 'object':
            if custom_redshift_columns:
                # Detect if the Pandas object column contains Python decimal objects.
                if "[Decimal(" in str(dataframe[col].values)[:9]:
                    # If Python decimal objects are present, parse out the precision and scale
                    # from the custom_redshift_columns dictionary to use when converting
                    # to PyArrow's decimal128 data type.
                    s = custom_redshift_columns[col]
                    precision = int(s[s.find('DECIMAL(') +
                                      len('DECIMAL('):s.rfind(',')].strip())
                    scale = int(s[s.find(',') + len(','):s.rfind(')')].strip())
                    pa_type = pa.decimal128(precision=precision, scale=scale)
                else:
                    pa_type = pa.string()
            else:
                pa_type = pa.string()
        elif dtype.startswith('int32'):
            pa_type = pa.int32()
        elif dtype.startswith('int64'):
            pa_type = pa.int64()
        elif dtype.startswith('int8'):
            pa_type = pa.int8()
        elif dtype.startswith('Int32'):
            dataframe = dataframe.astype({col: 'object'})
            pa_type = pa.int32()
        elif dtype.startswith('Int64'):
            dataframe = dataframe.astype({col: 'object'})
            pa_type = pa.int64()
        elif dtype.startswith('float32'):
            pa_type = pa.float32()
        elif dtype.startswith('float64'):
            pa_type = pa.float64()
        elif dtype.startswith('float16'):
            pa_type = pa.float16()
        elif dtype.startswith('datetime'):
            pa_type = pa.timestamp('ns')
        elif dtype.startswith('date'):
            pa_type = pa.date64()
        elif dtype.startswith('category'):
            pa_type = pa.string()
        elif dtype == 'bool':
            pa_type = pa.bool_()
        else:
            raise NotImplementedError(
                f"Error: {dtype} is not a datatype which can be mapped to Parquet using s3parq."
            )
        fields.append(pa.field(col, pa_type))

    return (pa.schema(fields=fields), dataframe)
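
A minimal usage sketch for the helper above (column names illustrative):

df = pd.DataFrame({'id': pd.Series([1, 2], dtype='int64'),
                   'ts': pd.to_datetime(['2020-01-01', '2020-01-02'])})
schema, df = _parquet_schema(df)
# -> schema fields: id: int64, ts: timestamp[ns]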
Example #32
def test_type_timestamp_with_tz():
    tz = 'America/Los_Angeles'
    t = pa.timestamp('ns', tz=tz)
    assert t.unit == 'ns'
    assert t.tz == tz
Example #33
integer_types = st.one_of(signed_integer_types, unsigned_integer_types)

floating_types = st.sampled_from([pa.float16(), pa.float32(), pa.float64()])
decimal_type = st.builds(pa.decimal128,
                         precision=st.integers(min_value=0, max_value=38),
                         scale=st.integers(min_value=0, max_value=38))
numeric_types = st.one_of(integer_types, floating_types, decimal_type)

date_types = st.sampled_from([pa.date32(), pa.date64()])
time_types = st.sampled_from(
    [pa.time32('s'),
     pa.time32('ms'),
     pa.time64('us'),
     pa.time64('ns')])
timestamp_types = st.sampled_from([
    pa.timestamp('s'),
    pa.timestamp('ms'),
    pa.timestamp('us'),
    pa.timestamp('ns')
])
temporal_types = st.one_of(date_types, time_types, timestamp_types)

primitive_types = st.one_of(null_type, bool_type, binary_type, string_type,
                            numeric_types, temporal_types)

metadata = st.dictionaries(st.text(), st.text())


@st.defines_strategy
def fields(type_strategy=primitive_types):
    return st.builds(pa.field,
def generate_type_mapper(
    pd_boolean=None,
    pd_integer=None,
    pd_string=None,
    pd_date_type=None,
    pd_timestamp_type=None,
):
    """Specifies the pyarrow data types mapping to corresponding Pandas data types.

    Args:
        pd_boolean: if not None, use the new Pandas boolean type. Defaults to None.
        pd_integer: if not None, use the new Pandas nullable integer type rather than
            defaulting to floats. Defaults to None.
        pd_string: if not None, use the new Pandas str type. Defaults to None.
        pd_date_type: Defaults to None.
        pd_timestamp_type: Defaults to None.

    Returns:
        Type mappings between pyarrow and pandas data types.
    """
    tm = {}
    if pd_boolean:
        bool_map = {pa.bool_(): pd.BooleanDtype()}
        tm = {**tm, **bool_map}
    if pd_string:
        string_map = {pa.string(): pd.StringDtype()}
        tm = {**tm, **string_map}

    if pd_integer:
        int_map = {
            pa.int8(): pd.Int64Dtype(),
            pa.int16(): pd.Int64Dtype(),
            pa.int32(): pd.Int64Dtype(),
            pa.int64(): pd.Int64Dtype(),
            pa.uint8(): pd.Int64Dtype(),
            pa.uint16(): pd.Int64Dtype(),
            pa.uint32(): pd.Int64Dtype(),
            pa.uint64(): pd.Int64Dtype(),
        }
        tm = {**tm, **int_map}
    else:
        # Keys and values are deliberately left uninstantiated (no
        # parentheses), so types_mapper finds no match for these types and
        # falls back to the default numpy-based conversion
        float_map = {
            pa.int8: np.float64,
            pa.int16: np.float64,
            pa.int32: np.float64,
            pa.int64: np.float64,
            pa.uint8: np.float64,
            pa.uint16: np.float64,
            pa.uint32: np.float64,
            pa.uint64: np.float64,
        }
        tm = {**tm, **float_map}

    if pd_date_type == "pd_period":
        date_map = {pa.date64(): pd.PeriodDtype("ms")}
        tm = {**tm, **date_map}

    if pd_timestamp_type == "pd_period":
        datetime_map = {
            pa.timestamp("s"): pd.PeriodDtype("s"),
            pa.timestamp("ms"): pd.PeriodDtype("ms"),
            pa.timestamp("us"): pd.PeriodDtype("us"),
            pa.timestamp("ns"): pd.PeriodDtype("ns"),
        }
        tm = {**tm, **datetime_map}
    if tm:
        return tm.get
    else:
        return None
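
The returned tm.get is shaped to be passed directly to Table.to_pandas as its
types_mapper callback; types the mapping does not cover fall back to the
default conversion. A usage sketch:

mapper = generate_type_mapper(pd_integer=True, pd_string=True)
table = pa.table({'n': pa.array([1, None], type=pa.int64()),
                  's': pa.array(['a', None], type=pa.string())})
df = table.to_pandas(types_mapper=mapper)
# df['n'] uses pd.Int64Dtype(), df['s'] uses pd.StringDtype()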
Example #35
        "Trying to update an index with the wrong column. Got `another_col` but expected `col`"
    )


@pytest.mark.parametrize(
    "dtype",
    [
        pa.binary(),
        pa.bool_(),
        pa.date32(),
        pa.float32(),
        pa.float64(),
        pa.int64(),
        pa.int8(),
        pa.string(),
        pa.timestamp("ns"),
    ],
)
def test_index_empty(store, dtype):
    storage_key = "dataset_uuid/some_index.parquet"
    index1 = ExplicitSecondaryIndex(column="col",
                                    index_dct={},
                                    dtype=dtype,
                                    index_storage_key=storage_key)
    key1 = index1.store(store, "dataset_uuid")

    index2 = ExplicitSecondaryIndex(column="col",
                                    index_storage_key=key1).load(store)
    assert index1 == index2

    index3 = pickle.loads(pickle.dumps(index1))
    assert index1 == index3
Example #36
    def __init__(  # pylint: disable=too-many-locals,too-many-branches
        self,
        data: DbapiResult,
        cursor_description: DbapiDescription,
        db_engine_spec: Type[db_engine_specs.BaseEngineSpec],
    ):
        self.db_engine_spec = db_engine_spec
        data = data or []
        column_names: List[str] = []
        pa_data: List[pa.Array] = []
        deduped_cursor_desc: List[Tuple[Any, ...]] = []
        numpy_dtype: List[Tuple[str, ...]] = []
        stringified_arr: np.ndarray

        if cursor_description:
            # get deduped list of column names
            column_names = dedup([col[0] for col in cursor_description])

            # fix cursor descriptor with the deduped names
            deduped_cursor_desc = [
                tuple([column_name, *list(description)[1:]]) for column_name,
                description in zip(column_names, cursor_description)
            ]

            # generate numpy structured array dtype
            numpy_dtype = [(column_name, "object")
                           for column_name in column_names]

        # only do expensive recasting if datatype is not standard list of tuples
        if data and (not isinstance(data, list)
                     or not isinstance(data[0], tuple)):
            data = [tuple(row) for row in data]
        array = np.array(data, dtype=numpy_dtype)
        if array.size > 0:
            for column in column_names:
                try:
                    pa_data.append(pa.array(array[column].tolist()))
                except (
                        pa.lib.ArrowInvalid,
                        pa.lib.ArrowTypeError,
                        pa.lib.ArrowNotImplementedError,
                        TypeError,  # this is super hacky,
                        # https://issues.apache.org/jira/browse/ARROW-7855
                ):
                    # attempt serialization of values as strings
                    stringified_arr = stringify_values(array[column])
                    pa_data.append(pa.array(stringified_arr.tolist()))

        if pa_data:  # pylint: disable=too-many-nested-blocks
            for i, column in enumerate(column_names):
                if pa.types.is_nested(pa_data[i].type):
                    # TODO: revisit nested column serialization once nested types
                    #  are added as a natively supported column type in Superset
                    #  (superset.utils.core.DbColumnType).
                    stringified_arr = stringify_values(array[column])
                    pa_data[i] = pa.array(stringified_arr.tolist())

                elif pa.types.is_temporal(pa_data[i].type):
                    # workaround for bug converting
                    # `psycopg2.tz.FixedOffsetTimezone` tzinfo values.
                    # related: https://issues.apache.org/jira/browse/ARROW-5248
                    sample = self.first_nonempty(array[column])
                    if sample and isinstance(sample, datetime.datetime):
                        try:
                            if sample.tzinfo:
                                tz = sample.tzinfo
                                series = pd.Series(array[column],
                                                   dtype="datetime64[ns]")
                                series = pd.to_datetime(series).dt.tz_localize(
                                    tz)
                                pa_data[i] = pa.Array.from_pandas(
                                    series, type=pa.timestamp("ns", tz=tz))
                        except Exception as ex:  # pylint: disable=broad-except
                            logger.exception(ex)

        self.table = pa.Table.from_arrays(pa_data, names=column_names)
        self._type_dict: Dict[str, Any] = {}
        try:
            # The driver may not be passing a cursor.description
            self._type_dict = {
                col: db_engine_spec.get_datatype(deduped_cursor_desc[i][1])
                for i, col in enumerate(column_names) if deduped_cursor_desc
            }
        except Exception as ex:  # pylint: disable=broad-except
            logger.exception(ex)
Example #37
    np.testing.assert_array_equal(narr[:6], arr[:6].to_numpy())
    np.testing.assert_array_equal(narr[2:], arr[2:].to_numpy())
    np.testing.assert_array_equal(narr[2:6], arr[2:6].to_numpy())


@pytest.mark.parametrize(
    ('type', 'expected'),
    [(pa.null(), 'empty'), (pa.bool_(), 'bool'), (pa.int8(), 'int8'),
     (pa.int16(), 'int16'), (pa.int32(), 'int32'), (pa.int64(), 'int64'),
     (pa.uint8(), 'uint8'), (pa.uint16(), 'uint16'), (pa.uint32(), 'uint32'),
     (pa.uint64(), 'uint64'), (pa.float16(), 'float16'),
     (pa.float32(), 'float32'), (pa.float64(), 'float64'),
     (pa.date32(), 'date'), (pa.date64(), 'date'), (pa.binary(), 'bytes'),
     (pa.binary(length=4), 'bytes'), (pa.string(), 'unicode'),
     (pa.list_(pa.list_(pa.int16())), 'list[list[int16]]'),
     (pa.decimal128(18, 3), 'decimal'), (pa.timestamp('ms'), 'datetime'),
     (pa.timestamp('us', 'UTC'), 'datetimetz'), (pa.time32('s'), 'time'),
     (pa.time64('us'), 'time')])
def test_logical_type(type, expected):
    assert get_logical_type(type) == expected


def test_array_uint64_from_py_over_range():
    arr = pa.array([2**63], type=pa.uint64())
    expected = pa.array(np.array([2**63], dtype='u8'))
    assert arr.equals(expected)


def test_array_conversions_no_sentinel_values():
    arr = np.array([1, 2, 3, 4], dtype='int8')
    refcount = sys.getrefcount(arr)
    'total_amt': 'total_amount',
    'tpep_dropoff_datetime': 'dropoff_datetime',
    'tpep_pickup_datetime': 'pickup_datetime',
    'trip_distance': 'trip_distance',
    'trip_dropoff_datetime': 'dropoff_datetime',
    'trip_pickup_datetime': 'pickup_datetime',
    'vendor_id': 'vendor',
    'vendor_name': 'vendor',
    'vendorid': 'vendor',
    'trip_type': 'trip_type',
    'lpep_dropoff_datetime': 'dropoff_datetime',
    'lpep_pickup_datetime': 'pickup_datetime'
}

arrow_schema = pa.schema([
    ('pickup_datetime', pa.timestamp('ns')),
    ('dropoff_datetime', pa.timestamp('ns')),
    ('store_and_forward', pa.int8()),
    ('passenger_count', pa.int8()),
    ('trip_distance', pa.float32()),
    ('fare_amount', pa.float32()),
    ('tip_amount', pa.float32()),
    ('total_amount', pa.float32()),
    ('payment_type', pa.string()),
    ('trip_type', pa.string()),
    ('company', pa.string()),
    ('trip_duration_minutes', pa.float32()),
    ('year', pa.int16()),
    ('pickup_borough', pa.string()),
    ('pickup_zone', pa.string()),
    ('pickup_location_id', pa.int16()),
Example #39
def test_timestamp_restore_timezone():
    # ARROW-5888, restore timezone from serialized metadata
    ty = pa.timestamp('ms', tz='America/New_York')
    arr = pa.array([1, 2, 3], type=ty)
    t = pa.table([arr], names=['f0'])
    _check_roundtrip(t)
Example #40
def test_date_time_types(tempdir):
    t1 = pa.date32()
    data1 = np.array([17259, 17260, 17261], dtype='int32')
    a1 = pa.array(data1, type=t1)

    t2 = pa.date64()
    data2 = data1.astype('int64') * 86400000
    a2 = pa.array(data2, type=t2)

    t3 = pa.timestamp('us')
    start = pd.Timestamp('2001-01-01').value / 1000
    data3 = np.array([start, start + 1, start + 2], dtype='int64')
    a3 = pa.array(data3, type=t3)

    t4 = pa.time32('ms')
    data4 = np.arange(3, dtype='i4')
    a4 = pa.array(data4, type=t4)

    t5 = pa.time64('us')
    a5 = pa.array(data4.astype('int64'), type=t5)

    t6 = pa.time32('s')
    a6 = pa.array(data4, type=t6)

    ex_t6 = pa.time32('ms')
    ex_a6 = pa.array(data4 * 1000, type=ex_t6)

    t7 = pa.timestamp('ns')
    start = pd.Timestamp('2001-01-01').value
    data7 = np.array([start, start + 1000, start + 2000], dtype='int64')
    a7 = pa.array(data7, type=t7)

    table = pa.Table.from_arrays([a1, a2, a3, a4, a5, a6, a7], [
        'date32', 'date64', 'timestamp[us]', 'time32[s]', 'time64[us]',
        'time32_from64[s]', 'timestamp[ns]'
    ])

    # date64 as date32
    # time32[s] to time32[ms]
    expected = pa.Table.from_arrays([a1, a1, a3, a4, a5, ex_a6, a7], [
        'date32', 'date64', 'timestamp[us]', 'time32[s]', 'time64[us]',
        'time32_from64[s]', 'timestamp[ns]'
    ])

    _check_roundtrip(table, expected=expected, version='2.6')

    t0 = pa.timestamp('ms')
    data0 = np.arange(4, dtype='int64')
    a0 = pa.array(data0, type=t0)

    t1 = pa.timestamp('us')
    data1 = np.arange(4, dtype='int64')
    a1 = pa.array(data1, type=t1)

    t2 = pa.timestamp('ns')
    data2 = np.arange(4, dtype='int64')
    a2 = pa.array(data2, type=t2)

    table = pa.Table.from_arrays([a0, a1, a2], ['ts[ms]', 'ts[us]', 'ts[ns]'])
    expected = pa.Table.from_arrays([a0, a1, a2],
                                    ['ts[ms]', 'ts[us]', 'ts[ns]'])

    # int64 for all timestamps supported by default
    filename = tempdir / 'int64_timestamps.parquet'
    _write_table(table, filename, version='2.6')
    parquet_schema = pq.ParquetFile(filename).schema
    for i in range(3):
        assert parquet_schema.column(i).physical_type == 'INT64'
    read_table = _read_table(filename)
    assert read_table.equals(expected)

    t0_ns = pa.timestamp('ns')
    data0_ns = np.array(data0 * 1000000, dtype='int64')
    a0_ns = pa.array(data0_ns, type=t0_ns)

    t1_ns = pa.timestamp('ns')
    data1_ns = np.array(data1 * 1000, dtype='int64')
    a1_ns = pa.array(data1_ns, type=t1_ns)

    expected = pa.Table.from_arrays([a0_ns, a1_ns, a2],
                                    ['ts[ms]', 'ts[us]', 'ts[ns]'])

    # int96 nanosecond timestamps produced upon request
    filename = tempdir / 'explicit_int96_timestamps.parquet'
    _write_table(table,
                 filename,
                 version='2.6',
                 use_deprecated_int96_timestamps=True)
    parquet_schema = pq.ParquetFile(filename).schema
    for i in range(3):
        assert parquet_schema.column(i).physical_type == 'INT96'
    read_table = _read_table(filename)
    assert read_table.equals(expected)

    # int96 nanosecond timestamps implied by flavor 'spark'
    filename = tempdir / 'spark_int96_timestamps.parquet'
    _write_table(table, filename, version='2.6', flavor='spark')
    parquet_schema = pq.ParquetFile(filename).schema
    for i in range(3):
        assert parquet_schema.column(i).physical_type == 'INT96'
    read_table = _read_table(filename)
    assert read_table.equals(expected)
Example #41
import vaex.utils

supported_arrow_array_types = (pa.Array, pa.ChunkedArray)
supported_array_types = (np.ndarray, ) + supported_arrow_array_types
string_types = [pa.string(), pa.large_string()]
_type_names_int = [
    "int8", "int16", "int32", "int64", "uint8", "uint16", "uint32", "uint64"
]
_type_names = ["float64", "float32"] + _type_names_int
map_arrow_to_numpy = {
    getattr(pa, name)(): np.dtype(name)
    for name in _type_names
}
map_arrow_to_numpy[pa.bool_()] = np.dtype("?")
for unit in 's ms us ns'.split():
    map_arrow_to_numpy[pa.timestamp(unit)] = np.dtype(f"datetime64[{unit}]")


def full(n, value, dtype):
    from .datatype import DataType
    dtype = DataType(dtype)
    values = np.full(n, value, dtype=dtype.numpy)
    if dtype.is_arrow:
        return pa.array(values)
    else:
        return values


def is_arrow_array(ar):
    return isinstance(ar, supported_arrow_array_types)
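
A quick sketch of the timestamp entries registered in the map above:

assert map_arrow_to_numpy[pa.timestamp('ms')] == np.dtype('datetime64[ms]')
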
def test_generate_from_meta():
    md = Metadata.from_dict({
        "name":
        "test_table",
        "file_format":
        "test-format",
        "columns": [
            {
                "name": "my_int",
                "type": "int64",
                "description": "This is an integer",
                "nullable": False,
            },
            {
                "name": "my_double",
                "type": "float64",
                "nullable": True
            },
            {
                "name": "my_date",
                "type": "date64"
            },
            {
                "name": "my_decimal",
                "type": "decimal128(10,2)"
            },
            {
                "name": "my_timestamp",
                "type": "timestamp(s)",
                "description": "Partition column",
            },
        ],
        "partitions": ["my_timestamp"],
    })

    ac = ArrowConverter()
    assert isinstance(ac.options, BaseConverterOptions)

    schema1 = ac.generate_from_meta(md)
    schema2 = ac.generate_from_meta(md, False)

    assert isinstance(schema1, pa.Schema)
    assert isinstance(schema2, pa.Schema)

    expected_names = ["my_int", "my_double", "my_date", "my_decimal"]
    expected_types = [
        pa.int64(),
        pa.float64(),
        pa.date64(),
        pa.decimal128(10, 2)
    ]
    assert schema1.names == expected_names

    checks1 = [a.equals(e) for a, e in zip(schema1.types, expected_types)]
    assert all(checks1)

    # Do schema2 assertions
    expected_names.append("my_timestamp")
    expected_types.append(pa.timestamp("s"))

    assert schema2.names == expected_names

    checks2 = [a.equals(e) for a, e in zip(schema2.types, expected_types)]
    assert all(checks2)

    # Also check specific type properties
    assert schema2.field("my_decimal").type.precision == 10
    assert schema2.field("my_decimal").type.scale == 2
    assert schema2.field("my_timestamp").type.unit == "s"
 ("int32", pa.int32()),
 ("int64", pa.int64()),
 ("uint8", pa.uint8()),
 ("uint16", pa.uint16()),
 ("uint32", pa.uint32()),
 ("uint64", pa.uint64()),
 ("float16", pa.float16()),
 ("float32", pa.float32()),
 ("float64", pa.float64()),
 ("decimal128(38,1)", pa.decimal128(38, 1)),
 ("decimal128(1,2)", pa.decimal128(1, 2)),
 ("time32(s)", pa.time32("s")),
 ("time32(ms)", pa.time32("ms")),
 ("time64(us)", pa.time64("us")),
 ("time64(ns)", pa.time64("ns")),
 ("timestamp(s)", pa.timestamp("s")),
 ("timestamp(ms)", pa.timestamp("ms")),
 ("timestamp(us)", pa.timestamp("us")),
 ("timestamp(ns)", pa.timestamp("ns")),
 ("date32", pa.date32()),
 ("date64", pa.date64()),
 ("string", pa.string()),
 ("large_string", pa.large_string()),
 ("utf8", pa.utf8()),
 ("large_utf8", pa.large_utf8()),
 ("binary", pa.binary()),
 ("binary(128)", pa.binary(128)),
 ("large_binary", pa.large_binary()),
 ("struct<num:int64>", pa.struct([("num", pa.int64())])),
 ("list<int64>", pa.list_(pa.int64())),
 ("list_<list<int64>>", pa.list_(pa.list_(pa.int64()))),
Example #44
def test_get_eq_func():
    for t in [
            pa.int8(),
            pa.int16(),
            pa.int32(),
            pa.int64(),
            pa.uint8(),
            pa.uint16(),
            pa.uint32(),
            pa.uint64(),
    ]:
        assert not get_eq_func(t)(0, 1)
        assert not get_eq_func(t)(None, 1)
        assert get_eq_func(t)(1, 1)
        assert get_eq_func(t)(None, None)
    t = pa.null()
    assert get_eq_func(t)("0", "1")
    assert get_eq_func(t)(None, "1")
    assert get_eq_func(t)("1", "1")
    assert get_eq_func(t)(None, None)
    t = pa.string()
    assert not get_eq_func(t)("0", "1")
    assert not get_eq_func(t)(None, "1")
    assert get_eq_func(t)("1", "1")
    assert get_eq_func(t)(None, None)
    t = pa.bool_()
    assert not get_eq_func(t)(False, True)
    assert not get_eq_func(t)(None, False)
    assert not get_eq_func(t)(None, True)
    assert get_eq_func(t)(True, True)
    assert get_eq_func(t)(False, False)
    assert get_eq_func(t)(None, None)
    for t in [pa.float16(), pa.float32(), pa.float64()]:
        assert not get_eq_func(t)(0.0, 1.1)
        assert get_eq_func(t)(1.1, 1.1)
        assert get_eq_func(t)(None, float("nan"))
        for n in [None, float("nan"), float("inf"), float("-inf")]:
            assert not get_eq_func(t)(n, 1.1)
            assert get_eq_func(t)(n, n)
    for t in [pa.timestamp("ns")]:
        for n in [None, pd.NaT]:
            assert not get_eq_func(t)(datetime(2020, 1, 1, 0),
                                      datetime(2020, 1, 1, 1))
            assert not get_eq_func(t)(n, datetime(2020, 1, 1, 1))
            assert get_eq_func(t)(datetime(2020, 1, 1, 1),
                                  datetime(2020, 1, 1, 1))
            assert get_eq_func(t)(n, n)
    assert get_eq_func(pa.timestamp("ns"))(None, pd.NaT)
    for t in [pa.date32()]:
        for n in [None, pd.NaT]:
            assert get_eq_func(t)(datetime(2020, 1, 1, 0),
                                  datetime(2020, 1, 1, 1))
            assert not get_eq_func(t)(datetime(2020, 1, 1), datetime(
                2020, 1, 2).date())
            assert not get_eq_func(t)(n, datetime(2020, 1, 1, 1))
            assert get_eq_func(t)(datetime(2020, 1, 1).date(),
                                  datetime(2020, 1, 1, 1))
            assert get_eq_func(t)(n, n)
    t = pa.struct([pa.field("a", pa.int32())])
    assert not get_eq_func(t)(dict(a=0), dict(a=1))
    assert not get_eq_func(t)(None, dict(a=1))
    assert get_eq_func(t)(dict(a=1), dict(a=1))
    assert get_eq_func(t)(None, None)
    t = pa.list_(pa.int32())
    assert not get_eq_func(t)([0], [1])
    assert not get_eq_func(t)(None, [1])
    assert get_eq_func(t)([1], [1])
    assert get_eq_func(t)(None, None)
Example #45
def test_is_datetime():
    assert is_datetime(pyarrow.timestamp("us", tz=None))
    assert not is_datetime(pyarrow.timestamp("ms", tz=None))
    assert not is_datetime(pyarrow.timestamp("us", tz="UTC"))
    assert not is_datetime(pyarrow.string())
Example #48
def test_in_expr_todo():
    import pyarrow.gandiva as gandiva
    # TODO: Implement reasonable support for timestamp, time & date.
    # Current exceptions:
    # pyarrow.lib.ArrowException: ExpressionValidationError:
    # Evaluation expression for IN clause returns XXXX values are of typeXXXX

    # binary
    arr = pa.array([b"ga", b"an", b"nd", b"di", b"iv", b"va"])
    table = pa.Table.from_arrays([arr], ["a"])

    builder = gandiva.TreeExprBuilder()
    node_a = builder.make_field(table.schema.field("a"))
    cond = builder.make_in_expression(node_a, [b'an', b'nd'], pa.binary())
    condition = builder.make_condition(cond)

    filter = gandiva.make_filter(table.schema, condition)
    result = filter.evaluate(table.to_batches()[0], pa.default_memory_pool())
    assert list(result.to_array()) == [1, 2]

    # timestamp
    datetime_1 = datetime.datetime.utcfromtimestamp(1542238951.621877)
    datetime_2 = datetime.datetime.utcfromtimestamp(1542238911.621877)
    datetime_3 = datetime.datetime.utcfromtimestamp(1542238051.621877)

    arr = pa.array([datetime_1, datetime_2, datetime_3])
    table = pa.Table.from_arrays([arr], ["a"])

    builder = gandiva.TreeExprBuilder()
    node_a = builder.make_field(table.schema.field("a"))
    cond = builder.make_in_expression(node_a, [datetime_2], pa.timestamp('ms'))
    condition = builder.make_condition(cond)

    filter = gandiva.make_filter(table.schema, condition)
    result = filter.evaluate(table.to_batches()[0], pa.default_memory_pool())
    assert list(result.to_array()) == [1]

    # time
    time_1 = datetime_1.time()
    time_2 = datetime_2.time()
    time_3 = datetime_3.time()

    arr = pa.array([time_1, time_2, time_3])
    table = pa.Table.from_arrays([arr], ["a"])

    builder = gandiva.TreeExprBuilder()
    node_a = builder.make_field(table.schema.field("a"))
    # time64 only supports 'us'/'ns' units; the array built above is time64[us]
    cond = builder.make_in_expression(node_a, [time_2], pa.time64('us'))
    condition = builder.make_condition(cond)

    filter = gandiva.make_filter(table.schema, condition)
    result = filter.evaluate(table.to_batches()[0], pa.default_memory_pool())
    assert list(result.to_array()) == [1]

    # date
    date_1 = datetime_1.date()
    date_2 = datetime_2.date()
    date_3 = datetime_3.date()

    arr = pa.array([date_1, date_2, date_3])
    table = pa.Table.from_arrays([arr], ["a"])

    builder = gandiva.TreeExprBuilder()
    node_a = builder.make_field(table.schema.field("a"))
    cond = builder.make_in_expression(node_a, [date_2], pa.date32())
    condition = builder.make_condition(cond)

    filter = gandiva.make_filter(table.schema, condition)
    result = filter.evaluate(table.to_batches()[0], pa.default_memory_pool())
    assert list(result.to_array()) == [1]
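The same builder API also drives projections, not just filters. A minimal sketch in the spirit of the test above (my own example, not taken from the source; it assumes the "add" function is registered for float64 in Gandiva, which it is in the stock function registry):

def gandiva_projection_sketch():
    import pyarrow.gandiva as gandiva

    table = pa.Table.from_arrays(
        [pa.array([1.0, 2.0]), pa.array([3.0, 4.0])], ["a", "b"])

    builder = gandiva.TreeExprBuilder()
    node_a = builder.make_field(table.schema.field("a"))
    node_b = builder.make_field(table.schema.field("b"))
    add = builder.make_function("add", [node_a, node_b], pa.float64())
    expr = builder.make_expression(add, pa.field("a_plus_b", pa.float64()))

    projector = gandiva.make_projector(
        table.schema, [expr], pa.default_memory_pool())
    result, = projector.evaluate(table.to_batches()[0])
    assert result.to_pylist() == [4.0, 6.0]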
Example #49
def _convert_data_with_schema(data, schema, date_format=None, field_aliases=None):
    column_data = {}
    array_data = []
    schema_names = []
    for row in data:
        for column in schema.names:
            _col = column_data.get(column, [])
            _col.append(row.get(column))
            column_data[column] = _col
    for column in schema:
        _col = column_data.get(column.name)
        if isinstance(column.type, pa.lib.TimestampType):
            _converted_col = []
            for t in _col:
                try:
                    _converted_col.append(pd.to_datetime(t, format=date_format))
                except pd.errors.OutOfBoundsDatetime:
                    _converted_col.append(pd.Timestamp.max)
            array_data.append(pa.Array.from_pandas(pd.to_datetime(_converted_col), type=pa.timestamp('ns')))
        elif column.type.id == pa.date32().id:
            # list() materializes the map object, which is lazy on Python 3
            _converted_col = list(map(_date_converter, _col))
            array_data.append(pa.array(_converted_col, type=pa.date32()))
        # Float types are ambiguous for conversions, need to specify the exact type
        elif column.type.id == pa.float64().id:
            array_data.append(pa.array(_col, type=pa.float64()))
        elif column.type.id == pa.float32().id:
            # Python doesn't have a native float32 type
            # and PyArrow cannot cast float64 -> float32
            _col = pd.to_numeric(_col, downcast='float')
            array_data.append(pa.Array.from_pandas(_col, type=pa.float32()))
        elif column.type.id == pa.int32().id:
            # PyArrow 0.8.0 can cast int64 -> int32
            _col64 = pa.array(_col, type=pa.int64())
            array_data.append(_col64.cast(pa.int32()))
        elif column.type.id == pa.bool_().id:
            _col = list(map(_boolean_converter, _col))
            array_data.append(pa.array(_col, type=column.type))
        else:
            array_data.append(pa.array(_col, type=column.type))
        if isinstance(field_aliases, dict):
            schema_names.append(field_aliases.get(column.name, column.name))
        else:
            schema_names.append(column.name)
    return pa.RecordBatch.from_arrays(array_data, schema_names)
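A possible invocation, for orientation (illustrative only: the schema, column names, and values are made up, and the date/boolean branches are avoided because _date_converter and _boolean_converter are not shown above):

schema = pa.schema([
    pa.field('created_at', pa.timestamp('ns')),
    pa.field('score', pa.float64()),
    pa.field('label', pa.string()),
])
rows = [
    {'created_at': '2018-01-01 00:00:00', 'score': 1.5, 'label': 'a'},
    {'created_at': '2018-01-02 12:30:00', 'score': 2.5, 'label': 'b'},
]
batch = _convert_data_with_schema(rows, schema)
assert batch.num_rows == 2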
Example #50
def pyarrow_timestamp():
    return pyarrow.timestamp("us", tz="UTC")
Example #51
def test_cell_is_null_timestamp():
    _assert_condition_mask(
        {"A": pa.array([datetime.datetime.now(), None], pa.timestamp("ns"))},
        CELL("is_null", "A"),
        "01",
    )
Example #52
def read_type(doc):
    t = doc[TYPE]

    if PARAM in doc:
        tp = doc[PARAM]
    else:
        tp = None

    if t == 'null':
        return pyarrow.null()

    if t == 'bool':
        return pyarrow.bool_()

    if t == 'int8':
        return pyarrow.int8()

    if t == 'int16':
        return pyarrow.int16()

    if t == 'int32':
        return pyarrow.int32()

    if t == 'int64':
        return pyarrow.int64()

    if t == 'uint8':
        return pyarrow.uint8()

    if t == 'uint16':
        return pyarrow.uint16()

    if t == 'uint32':
        return pyarrow.uint32()

    if t == 'uint64':
        return pyarrow.uint64()

    if t == 'float16':
        return pyarrow.float16()

    if t == 'float32':
        return pyarrow.float32()

    if t == 'float64':
        return pyarrow.float64()

    if t == 'date[d]':
        return pyarrow.date32()

    if t == 'date[ms]':
        return pyarrow.date64()

    if t == 'timestamp[s]':
        return pyarrow.timestamp('s')

    if t == 'timestamp[ms]':
        return pyarrow.timestamp('ms')

    if t == 'timestamp[us]':
        return pyarrow.timestamp('us')

    if t == 'timestamp[ns]':
        return pyarrow.timestamp('ns')

    if t == 'time[s]':
        return pyarrow.time32('s')

    if t == 'time[ms]':
        return pyarrow.time32('ms')

    if t == 'time[us]':
        return pyarrow.time64('us')

    if t == 'time[ns]':
        return pyarrow.time64('ns')

    if t == 'utf8':
        return pyarrow.utf8()

    if t == 'bytes':
        return pyarrow.binary()

    if t == 'factor':
        if tp is None:
            index_type = pyarrow.int32()
            dict_type = pyarrow.utf8()
        else:
            index_type = read_type(tp[INDEX])
            dict_type = read_type(tp[DICT])
        return pyarrow.dictionary(index_type, dict_type, False)

    if t == 'ordered':
        if tp is None:
            index_type = pyarrow.int32()
            dict_type = pyarrow.utf8()
        else:
            index_type = read_type(tp[INDEX])
            dict_type = read_type(tp[DICT])
        return pyarrow.dictionary(index_type, dict_type, True)

    if t == 'opaque':
        return pyarrow.binary(tp)

    if t == 'list':
        return pyarrow.list_(read_type(tp))

    if t == 'struct':
        return pyarrow.struct(
            [pyarrow.field(f[NAME], read_type(f)) for f in tp])

    raise ValueError(f'{t} is not a supported BSON DataFrame type')
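A sketch of how read_type might be driven. The module constants it references are not defined in the snippet; the string values below are guesses, not taken from the source:

# Hypothetical values for the constants referenced above.
TYPE, PARAM, INDEX, DICT, NAME = 'type', 'param', 'index', 'dict', 'name'

assert read_type({'type': 'timestamp[ms]'}) == pyarrow.timestamp('ms')
assert read_type({'type': 'list', 'param': {'type': 'int32'}}) == \
    pyarrow.list_(pyarrow.int32())
assert read_type({'type': 'factor'}) == \
    pyarrow.dictionary(pyarrow.int32(), pyarrow.utf8(), False)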
Example #53
    arr3 = pa.array(np.array([1, np.nan, 2, 3, np.nan, 4], dtype='float32'),
                    type='float32')
    assert arr3.type == 'float32'
    assert arr3.null_count == 0


def test_array_from_numpy_datetimeD():
    arr = np.array([None, datetime.date(2017, 4, 4)], dtype='datetime64[D]')

    result = pa.array(arr)
    expected = pa.array([None, datetime.date(2017, 4, 4)], type=pa.date32())
    assert result.equals(expected)


@pytest.mark.parametrize(('dtype', 'type'),
                         [('datetime64[s]', pa.timestamp('s')),
                          ('datetime64[ms]', pa.timestamp('ms')),
                          ('datetime64[us]', pa.timestamp('us')),
                          ('datetime64[ns]', pa.timestamp('ns'))])
def test_array_from_numpy_datetime(dtype, type):
    data = [
        None,
        datetime.datetime(2017, 4, 4, 12, 11, 10),
        datetime.datetime(2018, 1, 1, 0, 2, 0)
    ]

    # from numpy array
    arr = pa.array(np.array(data, dtype=dtype))
    expected = pa.array(data, type=type)
    assert arr.equals(expected)
Example #54
    def test_complex_as_arrow(self, arrow_cursor):
        table = arrow_cursor.execute("""
            SELECT
              col_boolean
              ,col_tinyint
              ,col_smallint
              ,col_int
              ,col_bigint
              ,col_float
              ,col_double
              ,col_string
              ,col_varchar
              ,col_timestamp
              ,CAST(col_timestamp AS time) AS col_time
              ,col_date
              ,col_binary
              ,col_array
              ,CAST(col_array AS json) AS col_array_json
              ,col_map
              ,CAST(col_map AS json) AS col_map_json
              ,col_struct
              ,col_decimal
            FROM one_row_complex
            """).as_arrow()
        assert table.shape[0] == 1
        assert table.shape[1] == 19
        assert table.schema == pa.schema([
            pa.field("col_boolean", pa.bool_()),
            pa.field("col_tinyint", pa.int8()),
            pa.field("col_smallint", pa.int16()),
            pa.field("col_int", pa.int32()),
            pa.field("col_bigint", pa.int64()),
            pa.field("col_float", pa.float32()),
            pa.field("col_double", pa.float64()),
            pa.field("col_string", pa.string()),
            pa.field("col_varchar", pa.string()),
            pa.field("col_timestamp", pa.timestamp("ms")),
            pa.field("col_time", pa.string()),
            pa.field("col_date", pa.timestamp("ms")),
            pa.field("col_binary", pa.string()),
            pa.field("col_array", pa.string()),
            pa.field("col_array_json", pa.string()),
            pa.field("col_map", pa.string()),
            pa.field("col_map_json", pa.string()),
            pa.field("col_struct", pa.string()),
            pa.field("col_decimal", pa.string()),
        ])
        assert [row for row in zip(*table.to_pydict().values())] == [(
            True,
            127,
            32767,
            2147483647,
            9223372036854775807,
            0.5,
            0.25,
            "a string",
            "varchar",
            datetime(2017, 1, 1, 0, 0, 0),
            "00:00:00.000",
            datetime(2017, 1, 2, 0, 0, 0),
            "31 32 33",
            "[1, 2]",
            "[1,2]",
            "{1=2, 3=4}",
            '{"1":2,"3":4}',
            "{a=1, b=2}",
            "0.1",
        )]
Example #55
def test_date_time_types():
    t1 = pa.date32()
    data1 = np.array([17259, 17260, 17261], dtype='int32')
    a1 = pa.Array.from_pandas(data1, type=t1)

    t2 = pa.date64()
    data2 = data1.astype('int64') * 86400000
    a2 = pa.Array.from_pandas(data2, type=t2)

    t3 = pa.timestamp('us')
    start = pd.Timestamp('2000-01-01').value // 1000  # ns -> us
    data3 = np.array([start, start + 1, start + 2], dtype='int64')
    a3 = pa.Array.from_pandas(data3, type=t3)

    t4 = pa.time32('ms')
    data4 = np.arange(3, dtype='i4')
    a4 = pa.Array.from_pandas(data4, type=t4)

    t5 = pa.time64('us')
    a5 = pa.Array.from_pandas(data4.astype('int64'), type=t5)

    t6 = pa.time32('s')
    a6 = pa.Array.from_pandas(data4, type=t6)

    ex_t6 = pa.time32('ms')
    ex_a6 = pa.Array.from_pandas(data4 * 1000, type=ex_t6)

    t7 = pa.timestamp('ns')
    start = pd.Timestamp('2001-01-01').value
    data7 = np.array([start, start + 1000, start + 2000], dtype='int64')
    a7 = pa.Array.from_pandas(data7, type=t7)

    t7_us = pa.timestamp('us')
    start = pd.Timestamp('2001-01-01').value
    data7_us = np.array([start, start + 1000, start + 2000],
                        dtype='int64') // 1000
    a7_us = pa.Array.from_pandas(data7_us, type=t7_us)

    table = pa.Table.from_arrays([a1, a2, a3, a4, a5, a6, a7], [
        'date32', 'date64', 'timestamp[us]', 'time32[s]', 'time64[us]',
        'time32_from64[s]', 'timestamp[ns]'
    ])

    # date64 as date32
    # time32[s] to time32[ms]
    # 'timestamp[ns]' to 'timestamp[us]'
    expected = pa.Table.from_arrays([a1, a1, a3, a4, a5, ex_a6, a7_us], [
        'date32', 'date64', 'timestamp[us]', 'time32[s]', 'time64[us]',
        'time32_from64[s]', 'timestamp[ns]'
    ])

    _check_roundtrip(table, expected=expected, version='2.0')

    # date64 as date32
    # time32[s] to time32[ms]
    # 'timestamp[ns]' is saved as INT96 timestamp
    expected = pa.Table.from_arrays([a1, a1, a3, a4, a5, ex_a6, a7], [
        'date32', 'date64', 'timestamp[us]', 'time32[s]', 'time64[us]',
        'time32_from64[s]', 'timestamp[ns]'
    ])

    _check_roundtrip(table,
                     expected=expected,
                     version='2.0',
                     use_deprecated_int96_timestamps=True)

    # Check that setting flavor to 'spark' uses int96 timestamps
    _check_roundtrip(table, expected=expected, version='2.0', flavor='spark')

    # Unsupported stuff
    def _assert_unsupported(array):
        table = pa.Table.from_arrays([array], ['unsupported'])
        buf = io.BytesIO()

        with pytest.raises(NotImplementedError):
            _write_table(table, buf, version="2.0")

    t7 = pa.time64('ns')
    a7 = pa.Array.from_pandas(data4.astype('int64'), type=t7)

    _assert_unsupported(a7)
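The INT96 behavior exercised above comes down to a single writer flag. A condensed standalone sketch of the write path (my own version, since the `_check_roundtrip` and `_write_table` helpers are not shown here):

import io

import pyarrow as pa
import pyarrow.parquet as pq

arr = pa.array([978307200000000000, 978307200000001000],  # 2001-01-01 in ns
               type=pa.timestamp('ns'))
table = pa.Table.from_arrays([arr], ['ts'])

buf = io.BytesIO()
# INT96 is a deprecated physical type kept for Spark/Impala compatibility;
# it preserves nanosecond precision, which the plain version='2.0' path in
# the test above otherwise stores as microseconds.
pq.write_table(table, buf, version='2.0',
               use_deprecated_int96_timestamps=True)
result = pq.read_table(io.BytesIO(buf.getvalue()))
assert result.column('ts').type == pa.timestamp('ns')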
Example #56
def pyarrow_datetime():
    return pyarrow.timestamp("us", tz=None)
Example #57
def as_column(arbitrary, nan_as_null=True):
    """Create a Column from an arbitrary object

    Currently supported inputs are:

    * ``Column``
    * ``Buffer``
    * numba device array
    * numpy array
    * pandas.Categorical

    Returns
    -------
    result : subclass of TypedColumnBase
        - CategoricalColumn for pandas.Categorical input.
        - NumericalColumn for all other inputs.
    """
    from . import numerical, categorical, datetime

    if isinstance(arbitrary, Column):
        if not isinstance(arbitrary, TypedColumnBase):
            # interpret as numeric
            data = arbitrary.view(numerical.NumericalColumn,
                                  dtype=arbitrary.dtype)
        else:
            data = arbitrary

    elif isinstance(arbitrary, Buffer):
        data = numerical.NumericalColumn(data=arbitrary, dtype=arbitrary.dtype)

    elif cuda.devicearray.is_cuda_ndarray(arbitrary):
        data = as_column(Buffer(arbitrary))
        if (data.dtype in [np.float16, np.float32, np.float64]
                and arbitrary.size > 0):
            if nan_as_null:
                mask = cudautils.mask_from_devary(arbitrary)
                data = data.set_mask(mask)

    elif isinstance(arbitrary, np.ndarray):
        if arbitrary.dtype.kind == 'M':
            data = datetime.DatetimeColumn.from_numpy(arbitrary)
        else:
            data = as_column(rmm.to_device(arbitrary), nan_as_null=nan_as_null)

    elif isinstance(arbitrary, pa.Array):
        if isinstance(arbitrary, pa.StringArray):
            raise NotImplementedError("Strings are not yet supported")
        elif isinstance(arbitrary, pa.NullArray):
            pamask = Buffer(np.empty(0, dtype='int8'))
            padata = Buffer(np.empty(0,
                                     dtype=arbitrary.type.to_pandas_dtype()))
            data = numerical.NumericalColumn(
                data=padata,
                mask=pamask,
                null_count=0,
                dtype=np.dtype(arbitrary.type.to_pandas_dtype()))
        elif isinstance(arbitrary, pa.DictionaryArray):
            if arbitrary.buffers()[0]:
                pamask = Buffer(np.array(arbitrary.buffers()[0]))
            else:
                pamask = None
            padata = Buffer(
                np.array(arbitrary.buffers()[1]).view(
                    arbitrary.indices.type.to_pandas_dtype()))
            data = categorical.CategoricalColumn(
                data=padata,
                mask=pamask,
                null_count=arbitrary.null_count,
                categories=arbitrary.dictionary.to_pylist(),
                ordered=arbitrary.type.ordered,
            )
        elif isinstance(arbitrary, pa.TimestampArray):
            arbitrary = arbitrary.cast(pa.timestamp('ms'))
            if arbitrary.buffers()[0]:
                pamask = Buffer(np.array(arbitrary.buffers()[0]))
            else:
                pamask = None
            padata = Buffer(
                np.array(arbitrary.buffers()[1]).view(np.dtype('M8[ms]')))
            data = datetime.DatetimeColumn(data=padata,
                                           mask=pamask,
                                           null_count=arbitrary.null_count,
                                           dtype=np.dtype('M8[ms]'))
        elif isinstance(arbitrary, pa.Date64Array):
            if arbitrary.buffers()[0]:
                pamask = Buffer(np.array(arbitrary.buffers()[0]))
            else:
                pamask = None
            padata = Buffer(
                np.array(arbitrary.buffers()[1]).view(np.dtype('M8[ms]')))
            data = datetime.DatetimeColumn(data=padata,
                                           mask=pamask,
                                           null_count=arbitrary.null_count,
                                           dtype=np.dtype('M8[ms]'))
        elif isinstance(arbitrary, pa.Date32Array):
            # No equivalent np dtype and not yet supported
            warnings.warn(
                "Date32 values are not yet supported so this will "
                "be typecast to a Date64 value", UserWarning)
            arbitrary = arbitrary.cast(pa.date64())
            data = as_column(arbitrary)
        elif isinstance(arbitrary, pa.BooleanArray):
            # Arrow uses 1 bit per value while we use int8
            dtype = np.dtype(np.bool_)  # np.bool was removed from NumPy
            arbitrary = arbitrary.cast(pa.int8())
            if arbitrary.buffers()[0]:
                pamask = Buffer(np.array(arbitrary.buffers()[0]))
            else:
                pamask = None
            padata = Buffer(np.array(arbitrary.buffers()[1]).view(dtype))
            data = numerical.NumericalColumn(data=padata,
                                             mask=pamask,
                                             null_count=arbitrary.null_count,
                                             dtype=dtype)
        else:
            if arbitrary.buffers()[0]:
                pamask = Buffer(np.array(arbitrary.buffers()[0]))
            else:
                pamask = None
            padata = Buffer(
                np.array(arbitrary.buffers()[1]).view(
                    np.dtype(arbitrary.type.to_pandas_dtype())))
            data = numerical.NumericalColumn(
                data=padata,
                mask=pamask,
                null_count=arbitrary.null_count,
                dtype=np.dtype(arbitrary.type.to_pandas_dtype()))

    elif isinstance(arbitrary, (pd.Series, pd.Categorical)):
        if pd.api.types.is_categorical_dtype(arbitrary):
            data = as_column(pa.array(arbitrary, from_pandas=True))
        else:
            data = as_column(pa.array(arbitrary, from_pandas=nan_as_null))

    elif np.isscalar(arbitrary) and not isinstance(arbitrary, memoryview):
        if hasattr(arbitrary, 'dtype'):
            data_type = _gdf.np_to_pa_dtype(arbitrary.dtype)
            if data_type in (pa.date64(), pa.date32()):
                # PyArrow can't construct date64 or date32 arrays from np
                # datetime types
                arbitrary = arbitrary.astype('int64')
            data = as_column(pa.array([arbitrary], type=data_type))
        else:
            data = as_column(pa.array([arbitrary]))

    elif isinstance(arbitrary, memoryview):
        data = as_column(np.array(arbitrary))

    else:
        try:
            data = as_column(memoryview(arbitrary))
        except TypeError:
            data = as_column(pa.array(arbitrary))

    return data
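Typical inputs, for orientation (cudf-internal API; these calls assume a CUDA-capable environment and are illustrative only, not from the source):

# NaNs in float input become nulls because nan_as_null defaults to True.
num_col = as_column(np.array([1.0, np.nan, 2.0]))

# pa.TimestampArray input is normalized to millisecond resolution above.
ts_col = as_column(pa.array([pd.Timestamp('2018-01-01'),
                             pd.Timestamp('2018-01-02')]))

# A numpy scalar takes the np.isscalar branch and yields a 1-element column.
one = as_column(np.float32(7.0))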
Example #58
@pytest.mark.parametrize(
    ('type', 'expected'), [
        (pa.int64(), 'int64'),
        (pa.uint8(), 'uint8'),
        (pa.uint16(), 'uint16'),
        (pa.uint32(), 'uint32'),
        (pa.uint64(), 'uint64'),
        (pa.float16(), 'float16'),
        (pa.float32(), 'float32'),
        (pa.float64(), 'float64'),
        (pa.date32(), 'date'),
        (pa.date64(), 'date'),
        (pa.binary(), 'bytes'),
        (pa.binary(length=4), 'bytes'),
        (pa.string(), 'unicode'),
        (pa.list_(pa.list_(pa.int16())), 'list[list[int16]]'),
        (pa.decimal128(18, 3), 'decimal'),
        (pa.timestamp('ms'), 'datetime'),
        (pa.timestamp('us', 'UTC'), 'datetimetz'),
        (pa.time32('s'), 'time'),
        (pa.time64('us'), 'time')
    ]
)
def test_logical_type(type, expected):
    assert get_logical_type(type) == expected


def test_array_uint64_from_py_over_range():
    arr = pa.array([2 ** 63], type=pa.uint64())
    expected = pa.array(np.array([2 ** 63], dtype='u8'))
    assert arr.equals(expected)

Example #59
    def test_complex_unload_as_arrow(self, arrow_cursor):
        # NOT_SUPPORTED: Unsupported Hive type: time
        # NOT_SUPPORTED: Unsupported Hive type: json
        table = arrow_cursor.execute("""
            SELECT
              col_boolean
              ,col_tinyint
              ,col_smallint
              ,col_int
              ,col_bigint
              ,col_float
              ,col_double
              ,col_string
              ,col_varchar
              ,col_timestamp
              ,col_date
              ,col_binary
              ,col_array
              ,col_map
              ,col_struct
              ,col_decimal
            FROM one_row_complex
            """).as_arrow()
        assert table.shape[0] == 1
        assert table.shape[1] == 16
        assert table.schema == pa.schema([
            pa.field("col_boolean", pa.bool_()),
            pa.field("col_tinyint", pa.int32()),
            pa.field("col_smallint", pa.int32()),
            pa.field("col_int", pa.int32()),
            pa.field("col_bigint", pa.int64()),
            pa.field("col_float", pa.float32()),
            pa.field("col_double", pa.float64()),
            pa.field("col_string", pa.string()),
            pa.field("col_varchar", pa.string()),
            pa.field("col_timestamp", pa.timestamp("ns")),
            pa.field("col_date", pa.date32()),
            pa.field("col_binary", pa.binary()),
            pa.field("col_array",
                     pa.list_(pa.field("array_element", pa.int32()))),
            pa.field("col_map",
                     pa.map_(pa.int32(), pa.field("entries", pa.int32()))),
            pa.field(
                "col_struct",
                pa.struct(
                    [pa.field("a", pa.int32()),
                     pa.field("b", pa.int32())]),
            ),
            pa.field("col_decimal", pa.decimal128(10, 1)),
        ])
        assert [row for row in zip(*table.to_pydict().values())] == [(
            True,
            127,
            32767,
            2147483647,
            9223372036854775807,
            0.5,
            0.25,
            "a string",
            "varchar",
            pd.Timestamp(2017, 1, 1, 0, 0, 0),
            datetime(2017, 1, 2).date(),
            b"123",
            [1, 2],
            [(1, 2), (3, 4)],
            {
                "a": 1,
                "b": 2
            },
            Decimal("0.1"),
        )]
Example #60
def test_make_column_timestamp_interpret_local_datetime_as_utc():
    column = make_column("A",
                         [datetime.datetime(2021, 4, 8, 13, 39, 1, 123456)])
    assert column.array.type == pa.timestamp("ns")  # no TZ info
    assert column.array.cast(pa.int64()) == pa.array([1617889141123456000])
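The expected integer can be cross-checked by hand: interpreting the naive datetime as UTC and counting nanoseconds since the epoch gives the same value (an illustrative aside, not part of the test; integer arithmetic avoids float precision loss):

dt = datetime.datetime(2021, 4, 8, 13, 39, 1, 123456,
                       tzinfo=datetime.timezone.utc)
delta = dt - datetime.datetime(1970, 1, 1, tzinfo=datetime.timezone.utc)
nanos = (delta.days * 86400 + delta.seconds) * 10**9 \
    + delta.microseconds * 1000
assert nanos == 1617889141123456000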