Python time64 Exemples, pyarrow.time64 Python Exemples

Exemple #1

0

Afficher le fichier

Fichier : jvm.py Projet : rok/arrow

def _from_jvm_time_type(jvm_type):
    """
    Convert a JVM time type to its Python equivalent.

    Parameters
    ----------
    jvm_type: org.apache.arrow.vector.types.pojo.ArrowType$Time

    Returns
    -------
    typ: pyarrow.DataType
    """
    time_unit = jvm_type.getUnit().toString()
    if time_unit == 'SECOND':
        assert jvm_type.bitWidth == 32
        return pa.time32('s')
    elif time_unit == 'MILLISECOND':
        assert jvm_type.bitWidth == 32
        return pa.time32('ms')
    elif time_unit == 'MICROSECOND':
        assert jvm_type.bitWidth == 64
        return pa.time64('us')
    elif time_unit == 'NANOSECOND':
        assert jvm_type.bitWidth == 64
        return pa.time64('ns')

Exemple #2

0

Afficher le fichier

Fichier : test_types.py Projet : rok/arrow

def test_time64_units():
    for valid_unit in ('us', 'ns'):
        ty = pa.time64(valid_unit)
        assert ty.unit == valid_unit

    for invalid_unit in ('m', 's', 'ms'):
        error_msg = 'Invalid TimeUnit for time64: {}'.format(invalid_unit)
        with pytest.raises(ValueError, match=error_msg):
            pa.time64(invalid_unit)

Exemple #3

0

Afficher le fichier

Fichier : test_parquet.py Projet : marklavrynenko-original/arrow

def test_date_time_types():
    t1 = pa.date32()
    data1 = np.array([17259, 17260, 17261], dtype='int32')
    a1 = pa.Array.from_pandas(data1, type=t1)

    t2 = pa.date64()
    data2 = data1.astype('int64') * 86400000
    a2 = pa.Array.from_pandas(data2, type=t2)

    t3 = pa.timestamp('us')
    start = pd.Timestamp('2000-01-01').value / 1000
    data3 = np.array([start, start + 1, start + 2], dtype='int64')
    a3 = pa.Array.from_pandas(data3, type=t3)

    t4 = pa.time32('ms')
    data4 = np.arange(3, dtype='i4')
    a4 = pa.Array.from_pandas(data4, type=t4)

    t5 = pa.time64('us')
    a5 = pa.Array.from_pandas(data4.astype('int64'), type=t5)

    t6 = pa.time32('s')
    a6 = pa.Array.from_pandas(data4, type=t6)

    ex_t6 = pa.time32('ms')
    ex_a6 = pa.Array.from_pandas(data4 * 1000, type=ex_t6)

    table = pa.Table.from_arrays([a1, a2, a3, a4, a5, a6],
                                 ['date32', 'date64', 'timestamp[us]',
                                  'time32[s]', 'time64[us]',
                                  'time32_from64[s]'])

    # date64 as date32
    # time32[s] to time32[ms]
    expected = pa.Table.from_arrays([a1, a1, a3, a4, a5, ex_a6],
                                    ['date32', 'date64', 'timestamp[us]',
                                     'time32[s]', 'time64[us]',
                                     'time32_from64[s]'])

    _check_roundtrip(table, expected=expected, version='2.0')

    # Unsupported stuff
    def _assert_unsupported(array):
        table = pa.Table.from_arrays([array], ['unsupported'])
        buf = io.BytesIO()

        with pytest.raises(NotImplementedError):
            _write_table(table, buf, version="2.0")

    t7 = pa.time64('ns')
    a7 = pa.Array.from_pandas(data4.astype('int64'), type=t7)

    _assert_unsupported(a7)

Exemple #4

0

Afficher le fichier

Fichier : test_convert_pandas.py Projet : NonVolatileComputing/arrow

    def test_arrow_time_to_pandas(self):
        pytimes = [time(1, 2, 3, 1356),
                   time(4, 5, 6, 1356),
                   time(0, 0, 0)]

        expected = np.array(pytimes[:2] + [None])
        expected_ms = np.array([x.replace(microsecond=1000)
                                for x in pytimes[:2]] +
                               [None])
        expected_s = np.array([x.replace(microsecond=0)
                               for x in pytimes[:2]] +
                              [None])

        arr = np.array([_pytime_to_micros(v) for v in pytimes],
                       dtype='int64')
        arr = np.array([_pytime_to_micros(v) for v in pytimes],
                       dtype='int64')

        null_mask = np.array([False, False, True], dtype=bool)

        a1 = pa.array(arr, mask=null_mask, type=pa.time64('us'))
        a2 = pa.array(arr * 1000, mask=null_mask,
                      type=pa.time64('ns'))

        a3 = pa.array((arr / 1000).astype('i4'), mask=null_mask,
                      type=pa.time32('ms'))
        a4 = pa.array((arr / 1000000).astype('i4'), mask=null_mask,
                      type=pa.time32('s'))

        names = ['time64[us]', 'time64[ns]', 'time32[ms]', 'time32[s]']
        batch = pa.RecordBatch.from_arrays([a1, a2, a3, a4], names)
        arr = a1.to_pandas()
        assert (arr == expected).all()

        arr = a2.to_pandas()
        assert (arr == expected).all()

        arr = a3.to_pandas()
        assert (arr == expected_ms).all()

        arr = a4.to_pandas()
        assert (arr == expected_s).all()

        df = batch.to_pandas()
        expected_df = pd.DataFrame({'time64[us]': expected,
                                    'time64[ns]': expected,
                                    'time32[ms]': expected_ms,
                                    'time32[s]': expected_s},
                                   columns=names)

        tm.assert_frame_equal(df, expected_df)

Exemple #5

0

Afficher le fichier

Fichier : test_array.py Projet : CodingCat/arrow

def test_cast_time64_to_int():
    arr = pa.array(np.array([0, 1, 2], dtype='int64'),
                   type=pa.time64('us'))
    expected = pa.array([0, 1, 2], type='i8')

    result = arr.cast('i8')
    assert result.equals(expected)

Exemple #6

0

Afficher le fichier

Fichier : test_schema.py Projet : NonVolatileComputing/arrow

def test_type_schema_pickling():
    cases = [
        pa.int8(),
        pa.string(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.string()),
        pa.struct([
            pa.field('a', 'int8'),
            pa.field('b', 'string')
        ]),
        pa.time32('s'),
        pa.time64('us'),
        pa.date32(),
        pa.date64(),
        pa.timestamp('ms'),
        pa.timestamp('ns'),
        pa.decimal(12, 2),
        pa.field('a', 'string', metadata={b'foo': b'bar'})
    ]

    for val in cases:
        roundtripped = pickle.loads(pickle.dumps(val))
        assert val == roundtripped

    fields = []
    for i, f in enumerate(cases):
        if isinstance(f, pa.Field):
            fields.append(f)
        else:
            fields.append(pa.field('_f{}'.format(i), f))

    schema = pa.schema(fields, metadata={b'foo': b'bar'})
    roundtripped = pickle.loads(pickle.dumps(schema))
    assert schema == roundtripped

Exemple #7

0

Afficher le fichier

Fichier : test_schema.py Projet : giantwhale/arrow

def test_time_types():
    t1 = pa.time32('s')
    t2 = pa.time32('ms')
    t3 = pa.time64('us')
    t4 = pa.time64('ns')

    assert t1.unit == 's'
    assert t2.unit == 'ms'
    assert t3.unit == 'us'
    assert t4.unit == 'ns'

    assert str(t1) == 'time32[s]'
    assert str(t4) == 'time64[ns]'

    with pytest.raises(ValueError):
        pa.time32('us')

    with pytest.raises(ValueError):
        pa.time64('s')

Exemple #8

0

Afficher le fichier

Fichier : test_schema.py Projet : giantwhale/arrow

def test_type_for_alias():
    cases = [
        ('i1', pa.int8()),
        ('int8', pa.int8()),
        ('i2', pa.int16()),
        ('int16', pa.int16()),
        ('i4', pa.int32()),
        ('int32', pa.int32()),
        ('i8', pa.int64()),
        ('int64', pa.int64()),
        ('u1', pa.uint8()),
        ('uint8', pa.uint8()),
        ('u2', pa.uint16()),
        ('uint16', pa.uint16()),
        ('u4', pa.uint32()),
        ('uint32', pa.uint32()),
        ('u8', pa.uint64()),
        ('uint64', pa.uint64()),
        ('f4', pa.float32()),
        ('float32', pa.float32()),
        ('f8', pa.float64()),
        ('float64', pa.float64()),
        ('date32', pa.date32()),
        ('date64', pa.date64()),
        ('string', pa.string()),
        ('str', pa.string()),
        ('binary', pa.binary()),
        ('time32[s]', pa.time32('s')),
        ('time32[ms]', pa.time32('ms')),
        ('time64[us]', pa.time64('us')),
        ('time64[ns]', pa.time64('ns')),
        ('timestamp[s]', pa.timestamp('s')),
        ('timestamp[ms]', pa.timestamp('ms')),
        ('timestamp[us]', pa.timestamp('us')),
        ('timestamp[ns]', pa.timestamp('ns')),
    ]

    for val, expected in cases:
        assert pa.type_for_alias(val) == expected

Exemple #9

0

Afficher le fichier

Fichier : test_convert_pandas.py Projet : NonVolatileComputing/arrow

    def test_pytime_from_pandas(self):
        pytimes = [time(1, 2, 3, 1356),
                   time(4, 5, 6, 1356)]

        # microseconds
        t1 = pa.time64('us')

        aobjs = np.array(pytimes + [None], dtype=object)
        parr = pa.array(aobjs)
        assert parr.type == t1
        assert parr[0].as_py() == pytimes[0]
        assert parr[1].as_py() == pytimes[1]
        assert parr[2] is pa.NA

        # DataFrame
        df = pd.DataFrame({'times': aobjs})
        batch = pa.RecordBatch.from_pandas(df)
        assert batch[0].equals(parr)

        # Test ndarray of int64 values
        arr = np.array([_pytime_to_micros(v) for v in pytimes],
                       dtype='int64')

        a1 = pa.array(arr, type=pa.time64('us'))
        assert a1[0].as_py() == pytimes[0]

        a2 = pa.array(arr * 1000, type=pa.time64('ns'))
        assert a2[0].as_py() == pytimes[0]

        a3 = pa.array((arr / 1000).astype('i4'),
                      type=pa.time32('ms'))
        assert a3[0].as_py() == pytimes[0].replace(microsecond=1000)

        a4 = pa.array((arr / 1000000).astype('i4'),
                      type=pa.time32('s'))
        assert a4[0].as_py() == pytimes[0].replace(microsecond=0)

Exemple #10

0

Afficher le fichier

Fichier : test_types.py Projet : giantwhale/arrow

def test_types_hashable():
    types = [
        pa.null(),
        pa.int32(),
        pa.time32('s'),
        pa.time64('us'),
        pa.date32(),
        pa.timestamp('us'),
        pa.string(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.int32()),
        pa.struct([pa.field('a', pa.int32()),
                   pa.field('b', pa.int8()),
                   pa.field('c', pa.string())])
    ]

    in_dict = {}
    for i, type_ in enumerate(types):
        assert hash(type_) == hash(type_)
        in_dict[type_] = i
        assert in_dict[type_] == i

Exemple #11

0

Afficher le fichier

Fichier : test_types.py Projet : rok/arrow

def get_many_types():
    # returning them from a function is required because of pa.dictionary
    # type holds a pyarrow array and test_array.py::test_toal_bytes_allocated
    # checks that the default memory pool has zero allocated bytes
    return (
        pa.null(),
        pa.bool_(),
        pa.int32(),
        pa.time32('s'),
        pa.time64('us'),
        pa.date32(),
        pa.timestamp('us'),
        pa.timestamp('us', tz='UTC'),
        pa.timestamp('us', tz='Europe/Paris'),
        pa.float16(),
        pa.float32(),
        pa.float64(),
        pa.decimal128(19, 4),
        pa.string(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.int32()),
        pa.struct([pa.field('a', pa.int32()),
                   pa.field('b', pa.int8()),
                   pa.field('c', pa.string())]),
        pa.struct([pa.field('a', pa.int32(), nullable=False),
                   pa.field('b', pa.int8(), nullable=False),
                   pa.field('c', pa.string())]),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())], mode=pa.lib.UnionMode_DENSE),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())], mode=pa.lib.UnionMode_SPARSE),
        pa.union([pa.field('a', pa.binary(10), nullable=False),
                  pa.field('b', pa.string())], mode=pa.lib.UnionMode_SPARSE),
        pa.dictionary(pa.int32(), pa.string())
    )

Exemple #12

0

Afficher le fichier

Fichier : test_types.py Projet : giantwhale/arrow

def test_is_temporal_date_time_timestamp():
    date_types = [pa.date32(), pa.date64()]
    time_types = [pa.time32('s'), pa.time64('ns')]
    timestamp_types = [pa.timestamp('ms')]

    for case in date_types + time_types + timestamp_types:
        assert types.is_temporal(case)

    for case in date_types:
        assert types.is_date(case)
        assert not types.is_time(case)
        assert not types.is_timestamp(case)

    for case in time_types:
        assert types.is_time(case)
        assert not types.is_date(case)
        assert not types.is_timestamp(case)

    for case in timestamp_types:
        assert types.is_timestamp(case)
        assert not types.is_date(case)
        assert not types.is_time(case)

    assert not types.is_temporal(pa.int32())

Exemple #13

0

Afficher le fichier

Fichier : test_types.py Projet : giantwhale/arrow

                   pa.field('b', pa.int8()),
                   pa.field('c', pa.string())])
    ]

    in_dict = {}
    for i, type_ in enumerate(types):
        assert hash(type_) == hash(type_)
        in_dict[type_] = i
        assert in_dict[type_] == i


@pytest.mark.parametrize('t,check_func', [
    (pa.date32(), types.is_date32),
    (pa.date64(), types.is_date64),
    (pa.time32('s'), types.is_time32),
    (pa.time64('ns'), types.is_time64),
    (pa.int8(), types.is_int8),
    (pa.int16(), types.is_int16),
    (pa.int32(), types.is_int32),
    (pa.int64(), types.is_int64),
    (pa.uint8(), types.is_uint8),
    (pa.uint16(), types.is_uint16),
    (pa.uint32(), types.is_uint32),
    (pa.uint64(), types.is_uint64),
    (pa.float16(), types.is_float16),
    (pa.float32(), types.is_float32),
    (pa.float64(), types.is_float64)
])
def test_exact_primitive_types(t, check_func):
    assert check_func(t)

Exemple #14

0

Afficher le fichier

Fichier : strategies.py Projet : emkornfield/arrow

])
decimal_type = st.builds(
    pa.decimal128,
    precision=st.integers(min_value=1, max_value=38),
    scale=st.integers(min_value=1, max_value=38)
)
numeric_types = st.one_of(integer_types, floating_types, decimal_type)

date_types = st.sampled_from([
    pa.date32(),
    pa.date64()
])
time_types = st.sampled_from([
    pa.time32('s'),
    pa.time32('ms'),
    pa.time64('us'),
    pa.time64('ns')
])
timestamp_types = st.builds(
    pa.timestamp,
    unit=st.sampled_from(['s', 'ms', 'us', 'ns']),
    tz=tzst.timezones()
)
temporal_types = st.one_of(date_types, time_types, timestamp_types)

primitive_types = st.one_of(
    null_type,
    bool_type,
    binary_type,
    string_type,
    numeric_types,

Exemple #15

0

Afficher le fichier

Fichier : test_jvm.py Projet : emkornfield/arrow

 (pa.null(), '{"name":"null"}'),
 (pa.bool_(), '{"name":"bool"}'),
 (pa.int8(), '{"name":"int","bitWidth":8,"isSigned":true}'),
 (pa.int16(), '{"name":"int","bitWidth":16,"isSigned":true}'),
 (pa.int32(), '{"name":"int","bitWidth":32,"isSigned":true}'),
 (pa.int64(), '{"name":"int","bitWidth":64,"isSigned":true}'),
 (pa.uint8(), '{"name":"int","bitWidth":8,"isSigned":false}'),
 (pa.uint16(), '{"name":"int","bitWidth":16,"isSigned":false}'),
 (pa.uint32(), '{"name":"int","bitWidth":32,"isSigned":false}'),
 (pa.uint64(), '{"name":"int","bitWidth":64,"isSigned":false}'),
 (pa.float16(), '{"name":"floatingpoint","precision":"HALF"}'),
 (pa.float32(), '{"name":"floatingpoint","precision":"SINGLE"}'),
 (pa.float64(), '{"name":"floatingpoint","precision":"DOUBLE"}'),
 (pa.time32('s'), '{"name":"time","unit":"SECOND","bitWidth":32}'),
 (pa.time32('ms'), '{"name":"time","unit":"MILLISECOND","bitWidth":32}'),
 (pa.time64('us'), '{"name":"time","unit":"MICROSECOND","bitWidth":64}'),
 (pa.time64('ns'), '{"name":"time","unit":"NANOSECOND","bitWidth":64}'),
 (pa.timestamp('s'), '{"name":"timestamp","unit":"SECOND",'
     '"timezone":null}'),
 (pa.timestamp('ms'), '{"name":"timestamp","unit":"MILLISECOND",'
     '"timezone":null}'),
 (pa.timestamp('us'), '{"name":"timestamp","unit":"MICROSECOND",'
     '"timezone":null}'),
 (pa.timestamp('ns'), '{"name":"timestamp","unit":"NANOSECOND",'
     '"timezone":null}'),
 (pa.timestamp('ns', tz='UTC'), '{"name":"timestamp","unit":"NANOSECOND"'
     ',"timezone":"UTC"}'),
 (pa.timestamp('ns', tz='Europe/Paris'), '{"name":"timestamp",'
     '"unit":"NANOSECOND","timezone":"Europe/Paris"}'),
 (pa.date32(), '{"name":"date","unit":"DAY"}'),
 (pa.date64(), '{"name":"date","unit":"MILLISECOND"}'),

Exemple #16

0

Afficher le fichier

Fichier : test_parquet.py Projet : zhouyonglong/Apache-Arrow

def test_date_time_types():
    t1 = pa.date32()
    data1 = np.array([17259, 17260, 17261], dtype='int32')
    a1 = pa.Array.from_pandas(data1, type=t1)

    t2 = pa.date64()
    data2 = data1.astype('int64') * 86400000
    a2 = pa.Array.from_pandas(data2, type=t2)

    t3 = pa.timestamp('us')
    start = pd.Timestamp('2000-01-01').value / 1000
    data3 = np.array([start, start + 1, start + 2], dtype='int64')
    a3 = pa.Array.from_pandas(data3, type=t3)

    t4 = pa.time32('ms')
    data4 = np.arange(3, dtype='i4')
    a4 = pa.Array.from_pandas(data4, type=t4)

    t5 = pa.time64('us')
    a5 = pa.Array.from_pandas(data4.astype('int64'), type=t5)

    t6 = pa.time32('s')
    a6 = pa.Array.from_pandas(data4, type=t6)

    ex_t6 = pa.time32('ms')
    ex_a6 = pa.Array.from_pandas(data4 * 1000, type=ex_t6)

    t7 = pa.timestamp('ns')
    start = pd.Timestamp('2001-01-01').value
    data7 = np.array([start, start + 1000, start + 2000],
                     dtype='int64')
    a7 = pa.Array.from_pandas(data7, type=t7)

    t7_us = pa.timestamp('us')
    start = pd.Timestamp('2001-01-01').value
    data7_us = np.array([start, start + 1000, start + 2000],
                        dtype='int64') // 1000
    a7_us = pa.Array.from_pandas(data7_us, type=t7_us)

    table = pa.Table.from_arrays([a1, a2, a3, a4, a5, a6, a7],
                                 ['date32', 'date64', 'timestamp[us]',
                                  'time32[s]', 'time64[us]',
                                  'time32_from64[s]',
                                  'timestamp[ns]'])

    # date64 as date32
    # time32[s] to time32[ms]
    # 'timestamp[ns]' to 'timestamp[us]'
    expected = pa.Table.from_arrays([a1, a1, a3, a4, a5, ex_a6, a7_us],
                                    ['date32', 'date64', 'timestamp[us]',
                                     'time32[s]', 'time64[us]',
                                     'time32_from64[s]',
                                     'timestamp[ns]'])

    _check_roundtrip(table, expected=expected, version='2.0')

    # date64 as date32
    # time32[s] to time32[ms]
    # 'timestamp[ns]' is saved as INT96 timestamp
    expected = pa.Table.from_arrays([a1, a1, a3, a4, a5, ex_a6, a7],
                                    ['date32', 'date64', 'timestamp[us]',
                                     'time32[s]', 'time64[us]',
                                     'time32_from64[s]',
                                     'timestamp[ns]'])

    _check_roundtrip(table, expected=expected, version='2.0',
                     use_deprecated_int96_timestamps=True)

    # Check that setting flavor to 'spark' uses int96 timestamps
    _check_roundtrip(table, expected=expected, version='2.0',
                     flavor='spark')

    # Unsupported stuff
    def _assert_unsupported(array):
        table = pa.Table.from_arrays([array], ['unsupported'])
        buf = io.BytesIO()

        with pytest.raises(NotImplementedError):
            _write_table(table, buf, version="2.0")

    t7 = pa.time64('ns')
    a7 = pa.Array.from_pandas(data4.astype('int64'), type=t7)

    _assert_unsupported(a7)

Exemple #17

0

Afficher le fichier

    ARROW_SCALAR_IDS_TO_BQ = {
        # https://arrow.apache.org/docs/python/api/datatypes.html#type-classes
        pyarrow.bool_().id: "BOOL",
        pyarrow.int8().id: "INT64",
        pyarrow.int16().id: "INT64",
        pyarrow.int32().id: "INT64",
        pyarrow.int64().id: "INT64",
        pyarrow.uint8().id: "INT64",
        pyarrow.uint16().id: "INT64",
        pyarrow.uint32().id: "INT64",
        pyarrow.uint64().id: "INT64",
        pyarrow.float16().id: "FLOAT64",
        pyarrow.float32().id: "FLOAT64",
        pyarrow.float64().id: "FLOAT64",
        pyarrow.time32("ms").id: "TIME",
        pyarrow.time64("ns").id: "TIME",
        pyarrow.timestamp("ns").id: "TIMESTAMP",
        pyarrow.date32().id: "DATE",
        pyarrow.date64().id: "DATETIME",  # because millisecond resolution
        pyarrow.binary().id: "BYTES",
        pyarrow.string().id: "STRING",  # also alias for pyarrow.utf8()
        pyarrow.decimal128(38, scale=9).id: "NUMERIC",
        # The exact decimal's scale and precision are not important, as only
        # the type ID matters, and it's the same for all decimal128 instances.
    }

else:  # pragma: NO COVER
    BQ_TO_ARROW_SCALARS = {}  # pragma: NO COVER
    ARROW_SCALAR_IDS_TO_BQ = {}  # pragma: NO_COVER

Exemple #18

0

Afficher le fichier

Fichier : pyarrow.py Projet : tamasgal/awkward-1.0

    )

    # order is important; _string_like[:2] vs _string_like[::2]
    _string_like = (
        pyarrow.string(),
        pyarrow.large_string(),
        pyarrow.binary(),
        pyarrow.large_binary(),
    )

    _pyarrow_to_numpy_dtype = {
        pyarrow.date32(): (True, np.dtype("M8[D]")),
        pyarrow.date64(): (False, np.dtype("M8[ms]")),
        pyarrow.time32("s"): (True, np.dtype("M8[s]")),
        pyarrow.time32("ms"): (True, np.dtype("M8[ms]")),
        pyarrow.time64("us"): (False, np.dtype("M8[us]")),
        pyarrow.time64("ns"): (False, np.dtype("M8[ns]")),
        pyarrow.timestamp("s"): (False, np.dtype("M8[s]")),
        pyarrow.timestamp("ms"): (False, np.dtype("M8[ms]")),
        pyarrow.timestamp("us"): (False, np.dtype("M8[us]")),
        pyarrow.timestamp("ns"): (False, np.dtype("M8[ns]")),
        pyarrow.duration("s"): (False, np.dtype("m8[s]")),
        pyarrow.duration("ms"): (False, np.dtype("m8[ms]")),
        pyarrow.duration("us"): (False, np.dtype("m8[us]")),
        pyarrow.duration("ns"): (False, np.dtype("m8[ns]")),
    }

if not ak._v2._util.numpy_at_least("1.17.0"):

    def packbits(bytearray, lsb_order=True):
        if lsb_order:

Exemple #19

0

Afficher le fichier

def test_in_expr_todo():
    import pyarrow.gandiva as gandiva
    # TODO: Implement reasonable support for timestamp, time & date.
    # Current exceptions:
    # pyarrow.lib.ArrowException: ExpressionValidationError:
    # Evaluation expression for IN clause returns XXXX values are of typeXXXX

    # binary
    arr = pa.array([b"ga", b"an", b"nd", b"di", b"iv", b"va"])
    table = pa.Table.from_arrays([arr], ["a"])

    builder = gandiva.TreeExprBuilder()
    node_a = builder.make_field(table.schema.field("a"))
    cond = builder.make_in_expression(node_a, [b'an', b'nd'], pa.binary())
    condition = builder.make_condition(cond)

    filter = gandiva.make_filter(table.schema, condition)
    result = filter.evaluate(table.to_batches()[0], pa.default_memory_pool())
    assert result.to_array().equals(pa.array([1, 2], type=pa.uint32()))

    # timestamp
    datetime_1 = datetime.datetime.utcfromtimestamp(1542238951.621877)
    datetime_2 = datetime.datetime.utcfromtimestamp(1542238911.621877)
    datetime_3 = datetime.datetime.utcfromtimestamp(1542238051.621877)

    arr = pa.array([datetime_1, datetime_2, datetime_3])
    table = pa.Table.from_arrays([arr], ["a"])

    builder = gandiva.TreeExprBuilder()
    node_a = builder.make_field(table.schema.field("a"))
    cond = builder.make_in_expression(node_a, [datetime_2], pa.timestamp('ms'))
    condition = builder.make_condition(cond)

    filter = gandiva.make_filter(table.schema, condition)
    result = filter.evaluate(table.to_batches()[0], pa.default_memory_pool())
    assert list(result.to_array()) == [1]

    # time
    time_1 = datetime_1.time()
    time_2 = datetime_2.time()
    time_3 = datetime_3.time()

    arr = pa.array([time_1, time_2, time_3])
    table = pa.Table.from_arrays([arr], ["a"])

    builder = gandiva.TreeExprBuilder()
    node_a = builder.make_field(table.schema.field("a"))
    cond = builder.make_in_expression(node_a, [time_2], pa.time64('ms'))
    condition = builder.make_condition(cond)

    filter = gandiva.make_filter(table.schema, condition)
    result = filter.evaluate(table.to_batches()[0], pa.default_memory_pool())
    assert list(result.to_array()) == [1]

    # date
    date_1 = datetime_1.date()
    date_2 = datetime_2.date()
    date_3 = datetime_3.date()

    arr = pa.array([date_1, date_2, date_3])
    table = pa.Table.from_arrays([arr], ["a"])

    builder = gandiva.TreeExprBuilder()
    node_a = builder.make_field(table.schema.field("a"))
    cond = builder.make_in_expression(node_a, [date_2], pa.date32())
    condition = builder.make_condition(cond)

    filter = gandiva.make_filter(table.schema, condition)
    result = filter.evaluate(table.to_batches()[0], pa.default_memory_pool())
    assert list(result.to_array()) == [1]

Exemple #20

0

Afficher le fichier

Fichier : _pandas_helpers.py Projet : GoogleCloudPlatform/gcloud-python

def pyarrow_time():
    return pyarrow.time64("us")

Exemple #21

0

Afficher le fichier

 (pa.bool_(), '{"name":"bool"}'),
 (pa.int8(), '{"name":"int","bitWidth":8,"isSigned":true}'),
 (pa.int16(), '{"name":"int","bitWidth":16,"isSigned":true}'),
 (pa.int32(), '{"name":"int","bitWidth":32,"isSigned":true}'),
 (pa.int64(), '{"name":"int","bitWidth":64,"isSigned":true}'),
 (pa.uint8(), '{"name":"int","bitWidth":8,"isSigned":false}'),
 (pa.uint16(), '{"name":"int","bitWidth":16,"isSigned":false}'),
 (pa.uint32(), '{"name":"int","bitWidth":32,"isSigned":false}'),
 (pa.uint64(), '{"name":"int","bitWidth":64,"isSigned":false}'),
 (pa.float16(), '{"name":"floatingpoint","precision":"HALF"}'),
 (pa.float32(), '{"name":"floatingpoint","precision":"SINGLE"}'),
 (pa.float64(), '{"name":"floatingpoint","precision":"DOUBLE"}'),
 (pa.time32('s'), '{"name":"time","unit":"SECOND","bitWidth":32}'),
 (pa.time32('ms'),
  '{"name":"time","unit":"MILLISECOND","bitWidth":32}'),
 (pa.time64('us'),
  '{"name":"time","unit":"MICROSECOND","bitWidth":64}'),
 (pa.time64('ns'), '{"name":"time","unit":"NANOSECOND","bitWidth":64}'),
 (pa.timestamp('s'), '{"name":"timestamp","unit":"SECOND",'
  '"timezone":null}'),
 (pa.timestamp('ms'), '{"name":"timestamp","unit":"MILLISECOND",'
  '"timezone":null}'),
 (pa.timestamp('us'), '{"name":"timestamp","unit":"MICROSECOND",'
  '"timezone":null}'),
 (pa.timestamp('ns'), '{"name":"timestamp","unit":"NANOSECOND",'
  '"timezone":null}'),
 (pa.timestamp('ns',
               tz='UTC'), '{"name":"timestamp","unit":"NANOSECOND"'
  ',"timezone":"UTC"}'),
 (pa.timestamp('ns', tz='Europe/Paris'), '{"name":"timestamp",'
  '"unit":"NANOSECOND","timezone":"Europe/Paris"}'),

Exemple #22

0

Afficher le fichier

 "INT64",
 pyarrow.uint16().id:
 "INT64",
 pyarrow.uint32().id:
 "INT64",
 pyarrow.uint64().id:
 "INT64",
 pyarrow.float16().id:
 "FLOAT64",
 pyarrow.float32().id:
 "FLOAT64",
 pyarrow.float64().id:
 "FLOAT64",
 pyarrow.time32("ms").id:
 "TIME",
 pyarrow.time64("ns").id:
 "TIME",
 pyarrow.timestamp("ns").id:
 "TIMESTAMP",
 pyarrow.date32().id:
 "DATE",
 pyarrow.date64().id:
 "DATETIME",  # because millisecond resolution
 pyarrow.binary().id:
 "BYTES",
 pyarrow.string().id:
 "STRING",  # also alias for pyarrow.utf8()
 pyarrow.decimal128(38, scale=9).id:
 "NUMERIC",
 # The exact decimal's scale and precision are not important, as only
 # the type ID matters, and it's the same for all decimal128 instances.

Exemple #23

0

Afficher le fichier

Fichier : pandas_examples.py Projet : emkornfield/arrow

def dataframe_with_lists(include_index=False, parquet_compatible=False):
    """
    Dataframe with list columns of every possible primtive type.

    Returns
    -------
    df: pandas.DataFrame
    schema: pyarrow.Schema
        Arrow schema definition that is in line with the constructed df.
    parquet_compatible: bool
        Exclude types not supported by parquet
    """
    arrays = OrderedDict()
    fields = []

    fields.append(pa.field('int64', pa.list_(pa.int64())))
    arrays['int64'] = [
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
        [0, 1, 2, 3, 4],
        None,
        [],
        np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9] * 2,
                 dtype=np.int64)[::2]
    ]
    fields.append(pa.field('double', pa.list_(pa.float64())))
    arrays['double'] = [
        [0., 1., 2., 3., 4., 5., 6., 7., 8., 9.],
        [0., 1., 2., 3., 4.],
        None,
        [],
        np.array([0., 1., 2., 3., 4., 5., 6., 7., 8., 9.] * 2)[::2],
    ]
    fields.append(pa.field('bytes_list', pa.list_(pa.binary())))
    arrays['bytes_list'] = [
        [b"1", b"f"],
        None,
        [b"1"],
        [b"1", b"2", b"3"],
        [],
    ]
    fields.append(pa.field('str_list', pa.list_(pa.string())))
    arrays['str_list'] = [
        [u"1", u"ä"],
        None,
        [u"1"],
        [u"1", u"2", u"3"],
        [],
    ]

    date_data = [
        [],
        [date(2018, 1, 1), date(2032, 12, 30)],
        [date(2000, 6, 7)],
        None,
        [date(1969, 6, 9), date(1972, 7, 3)]
    ]
    time_data = [
        [time(23, 11, 11), time(1, 2, 3), time(23, 59, 59)],
        [],
        [time(22, 5, 59)],
        None,
        [time(0, 0, 0), time(18, 0, 2), time(12, 7, 3)]
    ]

    temporal_pairs = [
        (pa.date32(), date_data),
        (pa.date64(), date_data),
        (pa.time32('s'), time_data),
        (pa.time32('ms'), time_data),
        (pa.time64('us'), time_data)
    ]
    if not parquet_compatible:
        temporal_pairs += [
            (pa.time64('ns'), time_data),
        ]

    for value_type, data in temporal_pairs:
        field_name = '{}_list'.format(value_type)
        field_type = pa.list_(value_type)
        field = pa.field(field_name, field_type)
        fields.append(field)
        arrays[field_name] = data

    if include_index:
        fields.append(pa.field('__index_level_0__', pa.int64()))

    df = pd.DataFrame(arrays)
    schema = pa.schema(fields)

    return df, schema

Exemple #24

0

Afficher le fichier

def test_cast_time64_to_int():
    arr = pa.array(np.array([0, 1, 2], dtype='int64'), type=pa.time64('us'))
    expected = pa.array([0, 1, 2], type='i8')

    result = arr.cast('i8')
    assert result.equals(expected)

Exemple #25

0

Afficher le fichier

    np.testing.assert_array_equal(narr[2:6], arr[2:6].to_numpy())


@pytest.mark.parametrize(
    ('type', 'expected'),
    [(pa.null(), 'empty'), (pa.bool_(), 'bool'), (pa.int8(), 'int8'),
     (pa.int16(), 'int16'), (pa.int32(), 'int32'), (pa.int64(), 'int64'),
     (pa.uint8(), 'uint8'), (pa.uint16(), 'uint16'), (pa.uint32(), 'uint32'),
     (pa.uint64(), 'uint64'), (pa.float16(), 'float16'),
     (pa.float32(), 'float32'), (pa.float64(), 'float64'),
     (pa.date32(), 'date'), (pa.date64(), 'date'), (pa.binary(), 'bytes'),
     (pa.binary(length=4), 'bytes'), (pa.string(), 'unicode'),
     (pa.list_(pa.list_(pa.int16())), 'list[list[int16]]'),
     (pa.decimal128(18, 3), 'decimal'), (pa.timestamp('ms'), 'datetime'),
     (pa.timestamp('us', 'UTC'), 'datetimetz'), (pa.time32('s'), 'time'),
     (pa.time64('us'), 'time')])
def test_logical_type(type, expected):
    assert get_logical_type(type) == expected


def test_array_uint64_from_py_over_range():
    arr = pa.array([2**63], type=pa.uint64())
    expected = pa.array(np.array([2**63], dtype='u8'))
    assert arr.equals(expected)


def test_array_conversions_no_sentinel_values():
    arr = np.array([1, 2, 3, 4], dtype='int8')
    refcount = sys.getrefcount(arr)
    arr2 = pa.array(arr)  # noqa
    assert sys.getrefcount(arr) == (refcount + 1)

Exemple #26

0

Afficher le fichier

Fichier : test_array.py Projet : CodingCat/arrow

        (pa.uint32(), 'uint32'),
        (pa.uint64(), 'uint64'),
        (pa.float16(), 'float16'),
        (pa.float32(), 'float32'),
        (pa.float64(), 'float64'),
        (pa.date32(), 'date'),
        (pa.date64(), 'date'),
        (pa.binary(), 'bytes'),
        (pa.binary(length=4), 'bytes'),
        (pa.string(), 'unicode'),
        (pa.list_(pa.list_(pa.int16())), 'list[list[int16]]'),
        (pa.decimal128(18, 3), 'decimal'),
        (pa.timestamp('ms'), 'datetime'),
        (pa.timestamp('us', 'UTC'), 'datetimetz'),
        (pa.time32('s'), 'time'),
        (pa.time64('us'), 'time')
    ]
)
def test_logical_type(type, expected):
    assert get_logical_type(type) == expected


def test_array_uint64_from_py_over_range():
    arr = pa.array([2 ** 63], type=pa.uint64())
    expected = pa.array(np.array([2 ** 63], dtype='u8'))
    assert arr.equals(expected)


def test_array_conversions_no_sentinel_values():
    arr = np.array([1, 2, 3, 4], dtype='int8')
    refcount = sys.getrefcount(arr)

Exemple #27

0

Afficher le fichier

unsigned_integer_types = st.sampled_from(
    [pa.uint8(), pa.uint16(),
     pa.uint32(), pa.uint64()])
integer_types = st.one_of(signed_integer_types, unsigned_integer_types)

floating_types = st.sampled_from([pa.float16(), pa.float32(), pa.float64()])
decimal_type = st.builds(pa.decimal128,
                         precision=st.integers(min_value=1, max_value=38),
                         scale=st.integers(min_value=1, max_value=38))
numeric_types = st.one_of(integer_types, floating_types, decimal_type)

date_types = st.sampled_from([pa.date32(), pa.date64()])
time_types = st.sampled_from(
    [pa.time32('s'),
     pa.time32('ms'),
     pa.time64('us'),
     pa.time64('ns')])
timestamp_types = st.builds(pa.timestamp,
                            unit=st.sampled_from(['s', 'ms', 'us', 'ns']),
                            tz=tzst.timezones())
duration_types = st.builds(pa.duration,
                           st.sampled_from(['s', 'ms', 'us', 'ns']))
temporal_types = st.one_of(date_types, time_types, timestamp_types,
                           duration_types)

primitive_types = st.one_of(null_type, bool_type, binary_type, string_type,
                            large_binary_type, large_string_type,
                            numeric_types, temporal_types)

metadata = st.dictionaries(st.text(), st.text())

Exemple #28

0

Afficher le fichier

Fichier : test_types.py Projet : thedrow/arrow-1

# specific language governing permissions and limitations
# under the License.

import pickle

import pytest

import pyarrow as pa
import pyarrow.types as types

MANY_TYPES = [
    pa.null(),
    pa.bool_(),
    pa.int32(),
    pa.time32('s'),
    pa.time64('us'),
    pa.date32(),
    pa.timestamp('us'),
    pa.timestamp('us', tz='UTC'),
    pa.timestamp('us', tz='Europe/Paris'),
    pa.float16(),
    pa.float32(),
    pa.float64(),
    pa.decimal128(19, 4),
    pa.string(),
    pa.binary(),
    pa.binary(10),
    pa.list_(pa.int32()),
    pa.struct([
        pa.field('a', pa.int32()),
        pa.field('b', pa.int8()),

Exemple #29

0

Afficher le fichier

def test_date_time_types(tempdir):
    t1 = pa.date32()
    data1 = np.array([17259, 17260, 17261], dtype='int32')
    a1 = pa.array(data1, type=t1)

    t2 = pa.date64()
    data2 = data1.astype('int64') * 86400000
    a2 = pa.array(data2, type=t2)

    t3 = pa.timestamp('us')
    start = pd.Timestamp('2001-01-01').value / 1000
    data3 = np.array([start, start + 1, start + 2], dtype='int64')
    a3 = pa.array(data3, type=t3)

    t4 = pa.time32('ms')
    data4 = np.arange(3, dtype='i4')
    a4 = pa.array(data4, type=t4)

    t5 = pa.time64('us')
    a5 = pa.array(data4.astype('int64'), type=t5)

    t6 = pa.time32('s')
    a6 = pa.array(data4, type=t6)

    ex_t6 = pa.time32('ms')
    ex_a6 = pa.array(data4 * 1000, type=ex_t6)

    t7 = pa.timestamp('ns')
    start = pd.Timestamp('2001-01-01').value
    data7 = np.array([start, start + 1000, start + 2000], dtype='int64')
    a7 = pa.array(data7, type=t7)

    table = pa.Table.from_arrays([a1, a2, a3, a4, a5, a6, a7], [
        'date32', 'date64', 'timestamp[us]', 'time32[s]', 'time64[us]',
        'time32_from64[s]', 'timestamp[ns]'
    ])

    # date64 as date32
    # time32[s] to time32[ms]
    expected = pa.Table.from_arrays([a1, a1, a3, a4, a5, ex_a6, a7], [
        'date32', 'date64', 'timestamp[us]', 'time32[s]', 'time64[us]',
        'time32_from64[s]', 'timestamp[ns]'
    ])

    _check_roundtrip(table, expected=expected, version='2.0')

    t0 = pa.timestamp('ms')
    data0 = np.arange(4, dtype='int64')
    a0 = pa.array(data0, type=t0)

    t1 = pa.timestamp('us')
    data1 = np.arange(4, dtype='int64')
    a1 = pa.array(data1, type=t1)

    t2 = pa.timestamp('ns')
    data2 = np.arange(4, dtype='int64')
    a2 = pa.array(data2, type=t2)

    table = pa.Table.from_arrays([a0, a1, a2], ['ts[ms]', 'ts[us]', 'ts[ns]'])
    expected = pa.Table.from_arrays([a0, a1, a2],
                                    ['ts[ms]', 'ts[us]', 'ts[ns]'])

    # int64 for all timestamps supported by default
    filename = tempdir / 'int64_timestamps.parquet'
    _write_table(table, filename, version='2.0')
    parquet_schema = pq.ParquetFile(filename).schema
    for i in range(3):
        assert parquet_schema.column(i).physical_type == 'INT64'
    read_table = _read_table(filename)
    assert read_table.equals(expected)

    t0_ns = pa.timestamp('ns')
    data0_ns = np.array(data0 * 1000000, dtype='int64')
    a0_ns = pa.array(data0_ns, type=t0_ns)

    t1_ns = pa.timestamp('ns')
    data1_ns = np.array(data1 * 1000, dtype='int64')
    a1_ns = pa.array(data1_ns, type=t1_ns)

    expected = pa.Table.from_arrays([a0_ns, a1_ns, a2],
                                    ['ts[ms]', 'ts[us]', 'ts[ns]'])

    # int96 nanosecond timestamps produced upon request
    filename = tempdir / 'explicit_int96_timestamps.parquet'
    _write_table(table,
                 filename,
                 version='2.0',
                 use_deprecated_int96_timestamps=True)
    parquet_schema = pq.ParquetFile(filename).schema
    for i in range(3):
        assert parquet_schema.column(i).physical_type == 'INT96'
    read_table = _read_table(filename)
    assert read_table.equals(expected)

    # int96 nanosecond timestamps implied by flavor 'spark'
    filename = tempdir / 'spark_int96_timestamps.parquet'
    _write_table(table, filename, version='2.0', flavor='spark')
    parquet_schema = pq.ParquetFile(filename).schema
    for i in range(3):
        assert parquet_schema.column(i).physical_type == 'INT96'
    read_table = _read_table(filename)
    assert read_table.equals(expected)

Exemple #30

0

Afficher le fichier

Fichier : test_parquet.py Projet : NonVolatileComputing/arrow

def test_date_time_types():
    t1 = pa.date32()
    data1 = np.array([17259, 17260, 17261], dtype='int32')
    a1 = pa.array(data1, type=t1)

    t2 = pa.date64()
    data2 = data1.astype('int64') * 86400000
    a2 = pa.array(data2, type=t2)

    t3 = pa.timestamp('us')
    start = pd.Timestamp('2000-01-01').value / 1000
    data3 = np.array([start, start + 1, start + 2], dtype='int64')
    a3 = pa.array(data3, type=t3)

    t4 = pa.time32('ms')
    data4 = np.arange(3, dtype='i4')
    a4 = pa.array(data4, type=t4)

    t5 = pa.time64('us')
    a5 = pa.array(data4.astype('int64'), type=t5)

    t6 = pa.time32('s')
    a6 = pa.array(data4, type=t6)

    ex_t6 = pa.time32('ms')
    ex_a6 = pa.array(data4 * 1000, type=ex_t6)

    t7 = pa.timestamp('ns')
    start = pd.Timestamp('2001-01-01').value
    data7 = np.array([start, start + 1000, start + 2000],
                     dtype='int64')
    a7 = pa.array(data7, type=t7)

    t7_us = pa.timestamp('us')
    start = pd.Timestamp('2001-01-01').value
    data7_us = np.array([start, start + 1000, start + 2000],
                        dtype='int64') // 1000
    a7_us = pa.array(data7_us, type=t7_us)

    table = pa.Table.from_arrays([a1, a2, a3, a4, a5, a6, a7],
                                 ['date32', 'date64', 'timestamp[us]',
                                  'time32[s]', 'time64[us]',
                                  'time32_from64[s]',
                                  'timestamp[ns]'])

    # date64 as date32
    # time32[s] to time32[ms]
    # 'timestamp[ns]' to 'timestamp[us]'
    expected = pa.Table.from_arrays([a1, a1, a3, a4, a5, ex_a6, a7_us],
                                    ['date32', 'date64', 'timestamp[us]',
                                     'time32[s]', 'time64[us]',
                                     'time32_from64[s]',
                                     'timestamp[ns]'])

    _check_roundtrip(table, expected=expected, version='2.0')

    # date64 as date32
    # time32[s] to time32[ms]
    # 'timestamp[ns]' is saved as INT96 timestamp
    expected = pa.Table.from_arrays([a1, a1, a3, a4, a5, ex_a6, a7],
                                    ['date32', 'date64', 'timestamp[us]',
                                     'time32[s]', 'time64[us]',
                                     'time32_from64[s]',
                                     'timestamp[ns]'])

    _check_roundtrip(table, expected=expected, version='2.0',
                     use_deprecated_int96_timestamps=True)

    # Check that setting flavor to 'spark' uses int96 timestamps
    _check_roundtrip(table, expected=expected, version='2.0',
                     flavor='spark')

    # Unsupported stuff
    def _assert_unsupported(array):
        table = pa.Table.from_arrays([array], ['unsupported'])
        buf = io.BytesIO()

        with pytest.raises(NotImplementedError):
            _write_table(table, buf, version="2.0")

    t7 = pa.time64('ns')
    a7 = pa.array(data4.astype('int64'), type=t7)

    _assert_unsupported(a7)

Exemple #31

0

Afficher le fichier

def main():
    # https://arrow.apache.org/docs/python/api/datatypes.html
    my_schema = pa.schema([
        # skip null

        ('c_bool', pa.bool_()),

        ('c_int8', pa.int8()),
        ('c_int16', pa.int16()),
        ('c_int32', pa.int32()),
        ('c_int64', pa.int64()),

        ('c_uint8', pa.uint8()),
        ('c_uint16', pa.uint16()),
        ('c_uint32', pa.uint32()),
        ('c_uint64', pa.uint64()),

        # skip ('c_float16', pa.float16()),
        ('c_float32', pa.float32()),
        ('c_float64', pa.float64()),

        ('c_time32', pa.time32('ms')),
        ('c_time64', pa.time64('ns')),
        ('c_timestamp', pa.timestamp('ms')),
        ('c_date32', pa.date32()),
        ('c_date64', pa.date64()),

        # skip binary

        ('c_string', pa.string()),

        # skip utf8
        # skip large_binary
        # skip large_string
        # skip large_utf8

        ('c_decimal128_8_3', pa.decimal128(8, 3))

        # skip list_
        # skip  large_list
        # skip struct
        # skip dictionary
        # skip field
        # skip schema
        # skip from_numpy_dtype
    ])

    c_bool = pa.array([False, True, False], type=pa.bool_())

    c_int8 = pa.array([1, 2, 3], type=pa.int8())
    c_int16 = pa.array([1, 2, 3], type=pa.int16())
    c_int32 = pa.array([1, 2, 3], type=pa.int32())
    c_int64 = pa.array([1, 2, 3], type=pa.int64())

    c_uint8 = pa.array([1, 2, 3], type=pa.uint8())
    c_uint16 = pa.array([1, 2, 3], type=pa.uint16())
    c_uint32 = pa.array([1, 2, 3], type=pa.uint32())
    c_uint64 = pa.array([1, 2, 3], type=pa.uint64())

    # c_float16 = pa.array([np.float16(1.0), np.float16(2.0), np.float16(3.0)], type=pa.float16())
    c_float32 = pa.array([1.0, 2.0, 3.0], type=pa.float32())
    c_float64 = pa.array([1.0, 2.0, 3.0], type=pa.float64())

    c_time32 = pa.array([1, 2, 3], type=pa.time32('ms'))
    c_time64 = pa.array([1, 2, 3], type=pa.time64('ns'))
    c_timestamp = pa.array([
        datetime(2019, 9, 3, 9, 0, 0),
        datetime(2019, 9, 3, 10, 0, 0),
        datetime(2019, 9, 3, 11, 0, 0)
    ], type=pa.timestamp('ms'))
    c_date32 = pa.array([
        datetime(2019, 9, 3, 9, 0, 0),
        datetime(2019, 9, 3, 10, 0, 0),
        datetime(2019, 9, 3, 11, 0, 0)
    ], type=pa.date32())
    c_date64 = pa.array([
        datetime(2019, 9, 3, 9, 0, 0),
        datetime(2019, 9, 3, 10, 0, 0),
        datetime(2019, 9, 3, 11, 0, 0)
    ], type=pa.date64())

    c_string = pa.array(
        ['*****@*****.**', '*****@*****.**', '*****@*****.**'],
        type=pa.string()
    )

    c_decimal128_8_3 = pa.array([1, 2, 3], type=pa.decimal128(8, 3))

    batch = pa.RecordBatch.from_arrays(
        [c_bool,
         c_int8, c_int16, c_int32, c_int64,
         c_uint8, c_uint16, c_uint32, c_uint64,
         # c_float16,
         c_float32, c_float64,
         c_time32, c_time64, c_timestamp, c_date32, c_date64,
         c_string,
         c_decimal128_8_3
         ],
        schema=my_schema
    )

    table = pa.Table.from_batches([batch])
    pq.write_table(table, 'example.parquet')

Exemple #32

0

Afficher le fichier

Fichier : __init__.py Projet : tnir/pandas

        pa.uint8(), pa.uint16(),
        pa.uint32(), pa.uint64()
    ]
    SIGNED_INT_PYARROW_DTYPES = [
        pa.uint8(), pa.int16(),
        pa.int32(), pa.uint64()
    ]
    ALL_INT_PYARROW_DTYPES = UNSIGNED_INT_PYARROW_DTYPES + SIGNED_INT_PYARROW_DTYPES

    FLOAT_PYARROW_DTYPES = [pa.float32(), pa.float64()]
    STRING_PYARROW_DTYPES = [pa.string(), pa.utf8()]

    TIME_PYARROW_DTYPES = [
        pa.time32("s"),
        pa.time32("ms"),
        pa.time64("us"),
        pa.time64("ns"),
    ]
    DATE_PYARROW_DTYPES = [pa.date32(), pa.date64()]
    DATETIME_PYARROW_DTYPES = [
        pa.timestamp(unit=unit, tz=tz) for unit in ["s", "ms", "us", "ns"]
        for tz in [None, "UTC", "US/Pacific", "US/Eastern"]
    ]
    TIMEDELTA_PYARROW_DTYPES = [
        pa.duration(unit) for unit in ["s", "ms", "us", "ns"]
    ]

    BOOL_PYARROW_DTYPES = [pa.bool_()]

    # TODO: Add container like pyarrow types:
    #  https://arrow.apache.org/docs/python/api/datatypes.html#factory-functions

Exemple #33

0

Afficher le fichier

Fichier : test_arrow_converter.py Projet : uk-gov-mirror/moj-analytical-services.mojap-metadata

 ("int8", pa.int8()),
 ("int16", pa.int16()),
 ("int32", pa.int32()),
 ("int64", pa.int64()),
 ("uint8", pa.uint8()),
 ("uint16", pa.uint16()),
 ("uint32", pa.uint32()),
 ("uint64", pa.uint64()),
 ("float16", pa.float16()),
 ("float32", pa.float32()),
 ("float64", pa.float64()),
 ("decimal128(38,1)", pa.decimal128(38, 1)),
 ("decimal128(1,2)", pa.decimal128(1, 2)),
 ("time32(s)", pa.time32("s")),
 ("time32(ms)", pa.time32("ms")),
 ("time64(us)", pa.time64("us")),
 ("time64(ns)", pa.time64("ns")),
 ("timestamp(s)", pa.timestamp("s")),
 ("timestamp(ms)", pa.timestamp("ms")),
 ("timestamp(us)", pa.timestamp("us")),
 ("timestamp(ns)", pa.timestamp("ns")),
 ("date32", pa.date32()),
 ("date64", pa.date64()),
 ("string", pa.string()),
 ("large_string", pa.large_string()),
 ("utf8", pa.utf8()),
 ("large_utf8", pa.large_utf8()),
 ("binary", pa.binary()),
 ("binary(128)", pa.binary(128)),
 ("large_binary", pa.large_binary()),
 ("struct<num:int64>", pa.struct([("num", pa.int64())])),

Exemple #34

0

Afficher le fichier

def pyarrow_time():
    return pyarrow.time64("us")