def test_sequence_timestamp_from_int_with_unit():
    data = [1]

    s = pa.timestamp('s')
    ms = pa.timestamp('ms')
    us = pa.timestamp('us')
    ns = pa.timestamp('ns')

    arr_s = pa.array(data, type=s)
    assert len(arr_s) == 1
    assert arr_s.type == s
    assert str(arr_s[0]) == "Timestamp('1970-01-01 00:00:01')"

    arr_ms = pa.array(data, type=ms)
    assert len(arr_ms) == 1
    assert arr_ms.type == ms
    assert str(arr_ms[0]) == "Timestamp('1970-01-01 00:00:00.001000')"

    arr_us = pa.array(data, type=us)
    assert len(arr_us) == 1
    assert arr_us.type == us
    assert str(arr_us[0]) == "Timestamp('1970-01-01 00:00:00.000001')"

    arr_ns = pa.array(data, type=ns)
    assert len(arr_ns) == 1
    assert arr_ns.type == ns
    assert str(arr_ns[0]) == "Timestamp('1970-01-01 00:00:00.000000001')"

    class CustomClass():
        pass

    # each unsupported value needs its own raises block; statements after
    # the first raise inside a single block would never execute
    for ty in [ns, pa.date32(), pa.date64()]:
        with pytest.raises(pa.ArrowException):
            pa.array([1, CustomClass()], type=ty)

    def test_timestamps_notimezone_nulls(self):
        df = pd.DataFrame({
            'datetime64': np.array([
                '2007-07-13T01:23:34.123',
                None,
                '2010-08-13T05:46:57.437'],
                dtype='datetime64[ms]')
            })
        field = pa.field('datetime64', pa.timestamp('ms'))
        schema = pa.schema([field])
        self._check_pandas_roundtrip(
            df,
            timestamps_to_ms=True,
            expected_schema=schema,
        )

        df = pd.DataFrame({
            'datetime64': np.array([
                '2007-07-13T01:23:34.123456789',
                None,
                '2010-08-13T05:46:57.437699912'],
                dtype='datetime64[ns]')
            })
        field = pa.field('datetime64', pa.timestamp('ns'))
        schema = pa.schema([field])
        self._check_pandas_roundtrip(
            df,
            timestamps_to_ms=False,
            expected_schema=schema,
        )
Example #3
def test_sequence_timestamp_with_unit():
    data = [
        datetime.datetime(2007, 7, 13, 1, 23, 34, 123456),
    ]

    s = pa.timestamp('s')
    ms = pa.timestamp('ms')
    us = pa.timestamp('us')
    ns = pa.timestamp('ns')

    arr_s = pa.array(data, type=s)
    assert len(arr_s) == 1
    assert arr_s.type == s
    assert arr_s[0].as_py() == datetime.datetime(2007, 7, 13, 1,
                                                 23, 34, 0)

    arr_ms = pa.array(data, type=ms)
    assert len(arr_ms) == 1
    assert arr_ms.type == ms
    assert arr_ms[0].as_py() == datetime.datetime(2007, 7, 13, 1,
                                                  23, 34, 123000)

    arr_us = pa.array(data, type=us)
    assert len(arr_us) == 1
    assert arr_us.type == us
    assert arr_us[0].as_py() == datetime.datetime(2007, 7, 13, 1,
                                                  23, 34, 123456)

    arr_ns = pa.array(data, type=ns)
    assert len(arr_ns) == 1
    assert arr_ns.type == ns
    assert arr_ns[0].as_py() == datetime.datetime(2007, 7, 13, 1,
                                                  23, 34, 123456)
def test_type_schema_pickling():
    cases = [
        pa.int8(),
        pa.string(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.string()),
        pa.struct([
            pa.field('a', 'int8'),
            pa.field('b', 'string')
        ]),
        pa.time32('s'),
        pa.time64('us'),
        pa.date32(),
        pa.date64(),
        pa.timestamp('ms'),
        pa.timestamp('ns'),
        pa.decimal128(12, 2),
        pa.field('a', 'string', metadata={b'foo': b'bar'})
    ]

    for val in cases:
        roundtripped = pickle.loads(pickle.dumps(val))
        assert val == roundtripped

    fields = []
    for i, f in enumerate(cases):
        if isinstance(f, pa.Field):
            fields.append(f)
        else:
            fields.append(pa.field('_f{}'.format(i), f))

    schema = pa.schema(fields, metadata={b'foo': b'bar'})
    roundtripped = pickle.loads(pickle.dumps(schema))
    assert schema == roundtripped
Example #5
def test_cast_timestamp_unit():
    # ARROW-1680
    val = datetime.datetime.now()
    s = pd.Series([val])
    s_nyc = s.dt.tz_localize('tzlocal()').dt.tz_convert('America/New_York')

    us_with_tz = pa.timestamp('us', tz='America/New_York')

    arr = pa.Array.from_pandas(s_nyc, type=us_with_tz)

    # ARROW-1906
    assert arr.type == us_with_tz

    arr2 = pa.Array.from_pandas(s, type=pa.timestamp('us'))

    assert arr[0].as_py() == s_nyc[0]
    assert arr2[0].as_py() == s[0]

    # Disallow truncation
    arr = pa.array([123123], type='int64').cast(pa.timestamp('ms'))
    expected = pa.array([123], type='int64').cast(pa.timestamp('s'))

    target = pa.timestamp('s')
    with pytest.raises(ValueError):
        arr.cast(target)

    result = arr.cast(target, safe=False)
    assert result.equals(expected)
Example #6
def test_cast_from_null():
    in_data = [None] * 3
    in_type = pa.null()
    out_types = [
        pa.null(),
        pa.uint8(),
        pa.float16(),
        pa.utf8(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.int16()),
        pa.decimal128(19, 4),
        pa.timestamp('us'),
        pa.timestamp('us', tz='UTC'),
        pa.timestamp('us', tz='Europe/Paris'),
        pa.struct([pa.field('a', pa.int32()),
                   pa.field('b', pa.list_(pa.int8())),
                   pa.field('c', pa.string())]),
        ]
    for out_type in out_types:
        _check_cast_case((in_data, in_type, in_data, out_type))

    out_types = [
        pa.dictionary(pa.int32(), pa.string()),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())], mode=pa.lib.UnionMode_DENSE),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())], mode=pa.lib.UnionMode_SPARSE),
        ]
    in_arr = pa.array(in_data, type=pa.null())
    for out_type in out_types:
        with pytest.raises(NotImplementedError):
            in_arr.cast(out_type)
Example #7
def test_timestamp():
    for unit in ('s', 'ms', 'us', 'ns'):
        for tz in (None, 'UTC', 'Europe/Paris'):
            ty = pa.timestamp(unit, tz=tz)
            assert ty.unit == unit
            assert ty.tz == tz

    for invalid_unit in ('m', 'arbit', 'rary'):
        with pytest.raises(ValueError, match='Invalid TimeUnit string'):
            pa.timestamp(invalid_unit)
Example #8
def test_type_from_numpy_dtype_timestamps():
    cases = [
        (np.dtype('datetime64[s]'), pa.timestamp('s')),
        (np.dtype('datetime64[ms]'), pa.timestamp('ms')),
        (np.dtype('datetime64[us]'), pa.timestamp('us')),
        (np.dtype('datetime64[ns]'), pa.timestamp('ns'))
    ]

    for dt, pt in cases:
        result = pa.from_numpy_dtype(dt)
        assert result == pt
Example #9
def test_cast_timestamp_to_int():
    arr = pa.array(np.array([0, 1, 2], dtype='int64'),
                   type=pa.timestamp('us'))
    expected = pa.array([0, 1, 2], type='i8')

    result = arr.cast('i8')
    assert result.equals(expected)
Example #10
    def test_timestamp(self):
        import pandas as pd
        arr = pd.date_range('2000-01-01 12:34:56', periods=10).values

        units = ['ns', 'us', 'ms', 's']

        for i, unit in enumerate(units):
            dtype = 'datetime64[{0}]'.format(unit)
            arrow_arr = pa.Array.from_pandas(arr.astype(dtype))
            expected = pd.Timestamp('2000-01-01 12:34:56')

            assert arrow_arr[0].as_py() == expected
            assert arrow_arr[0].value * 1000**i == expected.value

            tz = 'America/New_York'
            arrow_type = pa.timestamp(unit, tz=tz)

            dtype = 'datetime64[{0}]'.format(unit)
            arrow_arr = pa.Array.from_pandas(arr.astype(dtype),
                                             type=arrow_type)
            expected = (pd.Timestamp('2000-01-01 12:34:56')
                        .tz_localize('utc')
                        .tz_convert(tz))

            assert arrow_arr[0].as_py() == expected
            assert arrow_arr[0].value * 1000**i == expected.value
Example #11
def test_type_to_pandas_dtype():
    M8_ns = np.dtype('datetime64[ns]')
    cases = [
        (pa.null(), np.float64),
        (pa.bool_(), np.bool_),
        (pa.int8(), np.int8),
        (pa.int16(), np.int16),
        (pa.int32(), np.int32),
        (pa.int64(), np.int64),
        (pa.uint8(), np.uint8),
        (pa.uint16(), np.uint16),
        (pa.uint32(), np.uint32),
        (pa.uint64(), np.uint64),
        (pa.float16(), np.float16),
        (pa.float32(), np.float32),
        (pa.float64(), np.float64),
        (pa.date32(), M8_ns),
        (pa.date64(), M8_ns),
        (pa.timestamp('ms'), M8_ns),
        (pa.binary(), np.object_),
        (pa.binary(12), np.object_),
        (pa.string(), np.object_),
        (pa.list_(pa.int8()), np.object_),
    ]
    for arrow_type, numpy_type in cases:
        assert arrow_type.to_pandas_dtype() == numpy_type
def _add_any_metadata(table, pandas_metadata):
    modified_columns = {}

    schema = table.schema

    # Add time zones
    for i, col_meta in enumerate(pandas_metadata['columns']):
        if col_meta['pandas_type'] == 'datetimetz':
            col = table[i]
            converted = col.to_pandas()
            tz = col_meta['metadata']['timezone']
            tz_aware_type = pa.timestamp('ns', tz=tz)
            with_metadata = pa.Array.from_pandas(converted.values,
                                                 type=tz_aware_type)

            field = pa.field(schema[i].name, tz_aware_type)
            modified_columns[i] = pa.Column.from_array(field,
                                                       with_metadata)

    if len(modified_columns) > 0:
        columns = []
        for i in range(len(table.schema)):
            if i in modified_columns:
                columns.append(modified_columns[i])
            else:
                columns.append(table[i])
        return pa.Table.from_arrays(columns)
    else:
        return table
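
The helper above reads only two keys per column entry. A minimal sketch of the
pandas_metadata shape it expects (field names inferred from the code itself,
not from a documented spec):

pandas_metadata = {
    'columns': [
        # one entry per table column, in column order
        {'pandas_type': 'datetimetz',
         'metadata': {'timezone': 'America/New_York'}},
    ],
}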
Example #13
def test_datetime_subclassing():
    class MyDate(datetime.date):
        pass
    data = [
        MyDate(2007, 7, 13),
    ]
    date_type = pa.date32()
    arr_date = pa.array(data, type=date_type)
    assert len(arr_date) == 1
    assert arr_date.type == date_type
    assert arr_date[0].as_py() == datetime.date(2007, 7, 13)

    class MyDatetime(datetime.datetime):
        pass

    data = [
        MyDatetime(2007, 7, 13, 1, 23, 34, 123456),
    ]

    s = pa.timestamp('s')
    ms = pa.timestamp('ms')
    us = pa.timestamp('us')
    ns = pa.timestamp('ns')

    arr_s = pa.array(data, type=s)
    assert len(arr_s) == 1
    assert arr_s.type == s
    assert arr_s[0].as_py() == datetime.datetime(2007, 7, 13, 1,
                                                 23, 34, 0)

    arr_ms = pa.array(data, type=ms)
    assert len(arr_ms) == 1
    assert arr_ms.type == ms
    assert arr_ms[0].as_py() == datetime.datetime(2007, 7, 13, 1,
                                                  23, 34, 123000)

    arr_us = pa.array(data, type=us)
    assert len(arr_us) == 1
    assert arr_us.type == us
    assert arr_us[0].as_py() == datetime.datetime(2007, 7, 13, 1,
                                                  23, 34, 123456)

    arr_ns = pa.array(data, type=ns)
    assert len(arr_ns) == 1
    assert arr_ns.type == ns
    assert arr_ns[0].as_py() == datetime.datetime(2007, 7, 13, 1,
                                                  23, 34, 123456)
Example #14
@pytest.mark.parametrize('unit', ['s', 'ms', 'us', 'ns'])
def test_timestamp_units_from_list(unit):
    x = np.datetime64('2017-01-01 01:01:01.111111111', unit)
    a1 = pa.array([x])
    a2 = pa.array([x], type=pa.timestamp(unit))

    assert a1.type == a2.type
    assert a1.type.unit == unit
    assert a1[0] == a2[0]
def dataframe_with_arrays(include_index=False):
    """
    DataFrame with numpy array columns of every possible primitive type.

    Returns
    -------
    df: pandas.DataFrame
    schema: pyarrow.Schema
        Arrow schema definition that is in line with the constructed df.
    """
    dtypes = [('i1', pa.int8()), ('i2', pa.int16()),
              ('i4', pa.int32()), ('i8', pa.int64()),
              ('u1', pa.uint8()), ('u2', pa.uint16()),
              ('u4', pa.uint32()), ('u8', pa.uint64()),
              ('f4', pa.float32()), ('f8', pa.float64())]

    arrays = OrderedDict()
    fields = []
    for dtype, arrow_dtype in dtypes:
        fields.append(pa.field(dtype, pa.list_(arrow_dtype)))
        arrays[dtype] = [
            np.arange(10, dtype=dtype),
            np.arange(5, dtype=dtype),
            None,
            np.arange(1, dtype=dtype)
        ]

    fields.append(pa.field('str', pa.list_(pa.string())))
    arrays['str'] = [
        np.array([u"1", u"ä"], dtype="object"),
        None,
        np.array([u"1"], dtype="object"),
        np.array([u"1", u"2", u"3"], dtype="object")
    ]

    fields.append(pa.field('datetime64', pa.list_(pa.timestamp('ms'))))
    arrays['datetime64'] = [
        np.array(['2007-07-13T01:23:34.123456789',
                  None,
                  '2010-08-13T05:46:57.437699912'],
                 dtype='datetime64[ms]'),
        None,
        None,
        np.array(['2007-07-13T02',
                  None,
                  '2010-08-13T05:46:57.437699912'],
                 dtype='datetime64[ms]'),
    ]

    if include_index:
        fields.append(pa.field('__index_level_0__', pa.int64()))
    df = pd.DataFrame(arrays)
    schema = pa.schema(fields)

    return df, schema
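
A hedged usage sketch for the helper above: the DataFrame and schema are built
to line up, so both can be handed to Table.from_pandas together.

df, schema = dataframe_with_arrays()
table = pa.Table.from_pandas(df, schema=schema, preserve_index=False)
# each column arrives as list<...> of the declared value type,
# e.g. 'datetime64' as list<timestamp[ms]>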
Example #16
    def test_simple_timestamps(self):
        # Infer a timestamp column
        rows = b"a,b\n1970,1970-01-01\n1989,1989-07-14\n"
        table = self.read_bytes(rows)
        schema = pa.schema([('a', pa.int64()),
                            ('b', pa.timestamp('s'))])
        assert table.schema == schema
        assert table.to_pydict() == {
            'a': [1970, 1989],
            'b': [datetime(1970, 1, 1), datetime(1989, 7, 14)],
        }
Example #17
@pytest.mark.parametrize('unit', ['s', 'ms', 'us', 'ns'])
def test_array_from_list_of_timestamps(unit):
    n = np.datetime64('NaT', unit)
    x = np.datetime64('2017-01-01 01:01:01.111111111', unit)
    y = np.datetime64('2018-11-22 12:24:48.111111111', unit)

    a1 = pa.array([n, x, y])
    a2 = pa.array([n, x, y], type=pa.timestamp(unit))

    assert a1.type == a2.type
    assert a1.type.unit == unit
    assert a1[0] == a2[0]
def test_date_time_types():
    t1 = pa.date32()
    data1 = np.array([17259, 17260, 17261], dtype='int32')
    a1 = pa.Array.from_pandas(data1, type=t1)

    t2 = pa.date64()
    data2 = data1.astype('int64') * 86400000
    a2 = pa.Array.from_pandas(data2, type=t2)

    t3 = pa.timestamp('us')
    start = pd.Timestamp('2000-01-01').value / 1000
    data3 = np.array([start, start + 1, start + 2], dtype='int64')
    a3 = pa.Array.from_pandas(data3, type=t3)

    t4 = pa.time32('ms')
    data4 = np.arange(3, dtype='i4')
    a4 = pa.Array.from_pandas(data4, type=t4)

    t5 = pa.time64('us')
    a5 = pa.Array.from_pandas(data4.astype('int64'), type=t5)

    t6 = pa.time32('s')
    a6 = pa.Array.from_pandas(data4, type=t6)

    ex_t6 = pa.time32('ms')
    ex_a6 = pa.Array.from_pandas(data4 * 1000, type=ex_t6)

    table = pa.Table.from_arrays([a1, a2, a3, a4, a5, a6],
                                 ['date32', 'date64', 'timestamp[us]',
                                  'time32[s]', 'time64[us]',
                                  'time32_from64[s]'])

    # date64 as date32
    # time32[s] to time32[ms]
    expected = pa.Table.from_arrays([a1, a1, a3, a4, a5, ex_a6],
                                    ['date32', 'date64', 'timestamp[us]',
                                     'time32[s]', 'time64[us]',
                                     'time32_from64[s]'])

    _check_roundtrip(table, expected=expected, version='2.0')

    # Unsupported stuff
    def _assert_unsupported(array):
        table = pa.Table.from_arrays([array], ['unsupported'])
        buf = io.BytesIO()

        with pytest.raises(NotImplementedError):
            _write_table(table, buf, version="2.0")

    t7 = pa.time64('ns')
    a7 = pa.Array.from_pandas(data4.astype('int64'), type=t7)

    _assert_unsupported(a7)
Example #19
def test_type_for_alias():
    cases = [
        ('i1', pa.int8()),
        ('int8', pa.int8()),
        ('i2', pa.int16()),
        ('int16', pa.int16()),
        ('i4', pa.int32()),
        ('int32', pa.int32()),
        ('i8', pa.int64()),
        ('int64', pa.int64()),
        ('u1', pa.uint8()),
        ('uint8', pa.uint8()),
        ('u2', pa.uint16()),
        ('uint16', pa.uint16()),
        ('u4', pa.uint32()),
        ('uint32', pa.uint32()),
        ('u8', pa.uint64()),
        ('uint64', pa.uint64()),
        ('f4', pa.float32()),
        ('float32', pa.float32()),
        ('f8', pa.float64()),
        ('float64', pa.float64()),
        ('date32', pa.date32()),
        ('date64', pa.date64()),
        ('string', pa.string()),
        ('str', pa.string()),
        ('binary', pa.binary()),
        ('time32[s]', pa.time32('s')),
        ('time32[ms]', pa.time32('ms')),
        ('time64[us]', pa.time64('us')),
        ('time64[ns]', pa.time64('ns')),
        ('timestamp[s]', pa.timestamp('s')),
        ('timestamp[ms]', pa.timestamp('ms')),
        ('timestamp[us]', pa.timestamp('us')),
        ('timestamp[ns]', pa.timestamp('ns')),
    ]

    for val, expected in cases:
        assert pa.type_for_alias(val) == expected
Example #20
File: jvm.py, Project: rok/arrow
def _from_jvm_timestamp_type(jvm_type):
    """
    Convert a JVM timestamp type to its Python equivalent.

    Parameters
    ----------
    jvm_type: org.apache.arrow.vector.types.pojo.ArrowType$Timestamp

    Returns
    -------
    typ: pyarrow.DataType
    """
    time_unit = jvm_type.getUnit().toString()
    timezone = jvm_type.getTimezone()
    if time_unit == 'SECOND':
        return pa.timestamp('s', tz=timezone)
    elif time_unit == 'MILLISECOND':
        return pa.timestamp('ms', tz=timezone)
    elif time_unit == 'MICROSECOND':
        return pa.timestamp('us', tz=timezone)
    elif time_unit == 'NANOSECOND':
        return pa.timestamp('ns', tz=timezone)
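
The unit dispatch above is a plain if/elif chain; an equivalent lookup-table
sketch (behavior assumed identical for the four units) keeps the mapping in
one place:

_JVM_TIME_UNITS = {'SECOND': 's', 'MILLISECOND': 'ms',
                   'MICROSECOND': 'us', 'NANOSECOND': 'ns'}

def _from_jvm_timestamp_type_compact(jvm_type):
    unit = _JVM_TIME_UNITS[jvm_type.getUnit().toString()]
    return pa.timestamp(unit, tz=jvm_type.getTimezone())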
    def test_timestamps_notimezone_no_nulls(self):
        df = pd.DataFrame({
            'datetime64': np.array([
                '2007-07-13T01:23:34.123456789',
                '2006-01-13T12:34:56.432539784',
                '2010-08-13T05:46:57.437699912'],
                dtype='datetime64[ns]')
        })
        field = pa.field('datetime64', pa.timestamp('ns'))
        schema = pa.schema([field])
        self._check_pandas_roundtrip(
            df,
            expected_schema=schema,
        )
Example #22
def get_datetimetz_type(values, dtype, type_):
    if values.dtype.type != np.datetime64:
        return values, type_

    if _pandas_api.is_datetimetz(dtype) and type_ is None:
        # If no user type passed, construct a tz-aware timestamp type
        tz = dtype.tz
        unit = dtype.unit
        type_ = pa.timestamp(unit, tz)
    elif type_ is None:
        # Trust the NumPy dtype
        type_ = pa.from_numpy_dtype(values.dtype)

    return values, type_
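
A sketch of what the tz-aware branch computes: pd.DatetimeTZDtype carries the
unit and tz attributes the code reads, and pyarrow normalizes tzinfo objects
to their string names, so the equality below is expected to hold.

dtype = pd.DatetimeTZDtype(unit='ns', tz='UTC')
assert pa.timestamp(dtype.unit, dtype.tz) == pa.timestamp('ns', tz='UTC')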
Example #23
def get_many_types():
    # Returning them from a function is required because the pa.dictionary
    # type holds a pyarrow array, and test_array.py::test_total_bytes_allocated
    # checks that the default memory pool has zero allocated bytes.
    return (
        pa.null(),
        pa.bool_(),
        pa.int32(),
        pa.time32('s'),
        pa.time64('us'),
        pa.date32(),
        pa.timestamp('us'),
        pa.timestamp('us', tz='UTC'),
        pa.timestamp('us', tz='Europe/Paris'),
        pa.float16(),
        pa.float32(),
        pa.float64(),
        pa.decimal128(19, 4),
        pa.string(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.int32()),
        pa.struct([pa.field('a', pa.int32()),
                   pa.field('b', pa.int8()),
                   pa.field('c', pa.string())]),
        pa.struct([pa.field('a', pa.int32(), nullable=False),
                   pa.field('b', pa.int8(), nullable=False),
                   pa.field('c', pa.string())]),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())], mode=pa.lib.UnionMode_DENSE),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())], mode=pa.lib.UnionMode_SPARSE),
        pa.union([pa.field('a', pa.binary(10), nullable=False),
                  pa.field('b', pa.string())], mode=pa.lib.UnionMode_SPARSE),
        pa.dictionary(pa.int32(), pa.string())
    )
Example #24
    def test_timestamp(self):
        data = [
            datetime.datetime(2007, 7, 13, 1, 23, 34, 123456),
            None,
            datetime.datetime(2006, 1, 13, 12, 34, 56, 432539),
            datetime.datetime(2010, 8, 13, 5, 46, 57, 437699),
        ]
        arr = pyarrow.from_pylist(data)
        assert len(arr) == 4
        assert arr.type == pyarrow.timestamp()
        assert arr.null_count == 1
        assert arr[0].as_py() == datetime.datetime(2007, 7, 13, 1, 23, 34, 123456)
        assert arr[1].as_py() is None
        assert arr[2].as_py() == datetime.datetime(2006, 1, 13, 12, 34, 56, 432539)
        assert arr[3].as_py() == datetime.datetime(2010, 8, 13, 5, 46, 57, 437699)
    def test_timestamps_notimezone_no_nulls(self):
        df = pd.DataFrame({
            'datetime64': np.array([
                '2007-07-13T01:23:34.123',
                '2006-01-13T12:34:56.432',
                '2010-08-13T05:46:57.437'],
                dtype='datetime64[ms]')
            })
        field = A.Field.from_py('datetime64', A.timestamp('ms'))
        schema = A.Schema.from_fields([field])
        self._check_pandas_roundtrip(df, timestamps_to_ms=True,
                                     expected_schema=schema)

        df = pd.DataFrame({
            'datetime64': np.array([
                '2007-07-13T01:23:34.123456789',
                '2006-01-13T12:34:56.432539784',
                '2010-08-13T05:46:57.437699912'],
                dtype='datetime64[ns]')
            })
        field = A.Field.from_py('datetime64', A.timestamp('ns'))
        schema = A.Schema.from_fields([field])
        self._check_pandas_roundtrip(df, timestamps_to_ms=False,
                                     expected_schema=schema)
def get_datetimetz_type(values, dtype, type_):
    from pyarrow.compat import DatetimeTZDtype

    if values.dtype.type != np.datetime64:
        return values, type_

    if isinstance(dtype, DatetimeTZDtype):
        tz = dtype.tz
        unit = dtype.unit
        type_ = pa.timestamp(unit, tz)
    elif type_ is None:
        # Trust the NumPy dtype
        type_ = pa.from_numpy_dtype(values.dtype)

    return values, type_
Example #27
def test_from_numpy_dtype():
    cases = [
        (np.dtype('bool'), pa.bool_()),
        (np.dtype('int8'), pa.int8()),
        (np.dtype('int16'), pa.int16()),
        (np.dtype('int32'), pa.int32()),
        (np.dtype('int64'), pa.int64()),
        (np.dtype('uint8'), pa.uint8()),
        (np.dtype('uint16'), pa.uint16()),
        (np.dtype('uint32'), pa.uint32()),
        (np.dtype('float16'), pa.float16()),
        (np.dtype('float32'), pa.float32()),
        (np.dtype('float64'), pa.float64()),
        (np.dtype('U'), pa.string()),
        (np.dtype('S'), pa.binary()),
        (np.dtype('datetime64[s]'), pa.timestamp('s')),
        (np.dtype('datetime64[ms]'), pa.timestamp('ms')),
        (np.dtype('datetime64[us]'), pa.timestamp('us')),
        (np.dtype('datetime64[ns]'), pa.timestamp('ns'))
    ]

    for dt, pt in cases:
        result = pa.from_numpy_dtype(dt)
        assert result == pt

    # Things convertible to numpy dtypes work
    assert pa.from_numpy_dtype('U') == pa.string()
    assert pa.from_numpy_dtype(np.unicode) == pa.string()
    assert pa.from_numpy_dtype('int32') == pa.int32()
    assert pa.from_numpy_dtype(bool) == pa.bool_()

    with pytest.raises(NotImplementedError):
        pa.from_numpy_dtype(np.dtype('O'))

    with pytest.raises(TypeError):
        pa.from_numpy_dtype('not_convertible_to_dtype')
Example #28
def _add_any_metadata(table, pandas_metadata):
    modified_columns = {}

    schema = table.schema

    index_columns = pandas_metadata['index_columns']
    n_index_levels = len(index_columns)
    n_columns = len(pandas_metadata['columns']) - n_index_levels

    # Add time zones
    for i, col_meta in enumerate(pandas_metadata['columns']):

        raw_name = col_meta.get('field_name')
        if not raw_name:
            # deal with metadata written with arrow < 0.8
            raw_name = col_meta['name']
            if i >= n_columns:
                # index columns
                raw_name = index_columns[i - n_columns]
            if raw_name is None:
                raw_name = 'None'

        idx = schema.get_field_index(raw_name)
        if idx != -1:
            if col_meta['pandas_type'] == 'datetimetz':
                col = table[idx]
                converted = col.to_pandas()
                tz = col_meta['metadata']['timezone']
                tz_aware_type = pa.timestamp('ns', tz=tz)
                with_metadata = pa.Array.from_pandas(converted.values,
                                                     type=tz_aware_type)

                field = pa.field(schema[idx].name, tz_aware_type)
                modified_columns[idx] = pa.Column.from_array(field,
                                                             with_metadata)

    if len(modified_columns) > 0:
        columns = []
        for i in range(len(table.schema)):
            if i in modified_columns:
                columns.append(modified_columns[i])
            else:
                columns.append(table[i])
        return pa.Table.from_arrays(columns)
    else:
        return table
Example #29
def test_sequence_numpy_timestamp():
    data = [
        np.datetime64(datetime.datetime(2007, 7, 13, 1, 23, 34, 123456)),
        None,
        np.datetime64(datetime.datetime(2006, 1, 13, 12, 34, 56, 432539)),
        np.datetime64(datetime.datetime(2010, 8, 13, 5, 46, 57, 437699))
    ]
    arr = pa.array(data)
    assert len(arr) == 4
    assert arr.type == pa.timestamp('us')
    assert arr.null_count == 1
    assert arr[0].as_py() == datetime.datetime(2007, 7, 13, 1,
                                               23, 34, 123456)
    assert arr[1].as_py() is None
    assert arr[2].as_py() == datetime.datetime(2006, 1, 13, 12,
                                               34, 56, 432539)
    assert arr[3].as_py() == datetime.datetime(2010, 8, 13, 5,
                                               46, 57, 437699)
def test_coerce_timestamps(tmpdir):
    from collections import OrderedDict
    # ARROW-622
    arrays = OrderedDict()
    fields = [pa.field('datetime64',
                       pa.list_(pa.timestamp('ms')))]
    arrays['datetime64'] = [
        np.array(['2007-07-13T01:23:34.123456789',
                  None,
                  '2010-08-13T05:46:57.437699912'],
                 dtype='datetime64[ms]'),
        None,
        None,
        np.array(['2007-07-13T02',
                  None,
                  '2010-08-13T05:46:57.437699912'],
                 dtype='datetime64[ms]'),
    ]

    df = pd.DataFrame(arrays)
    schema = pa.schema(fields)

    filename = tmpdir.join('pandas_roundtrip.parquet')
    arrow_table = pa.Table.from_pandas(df, schema=schema)

    _write_table(arrow_table, filename.strpath, version="2.0",
                 coerce_timestamps='us')
    table_read = _read_table(filename.strpath)
    df_read = table_read.to_pandas()

    df_expected = df.copy()
    for i, x in enumerate(df_expected['datetime64']):
        if isinstance(x, np.ndarray):
            df_expected['datetime64'][i] = x.astype('M8[us]')

    tm.assert_frame_equal(df_expected, df_read)

    with pytest.raises(ValueError):
        _write_table(arrow_table, filename.strpath, version="2.0",
                     coerce_timestamps='unknown')
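
A related behavior, sketched here rather than taken from the test above: when
coercing to a coarser unit would drop precision, the writer raises unless
truncation is explicitly allowed.

import pyarrow.parquet as pq

t = pa.table({'ts': pa.array([1001], type=pa.timestamp('us'))})
# pq.write_table(t, 'out.parquet', coerce_timestamps='ms')  # would raise
pq.write_table(t, 'out.parquet', coerce_timestamps='ms',
               allow_truncated_timestamps=True)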
Example #31
def _parquet_schema(dataframe: pd.DataFrame,
                    custom_redshift_columns: dict = None):
    """ Translates pandas dtypes to PyArrow types and creates a Schema from them

    Args:
        dataframe (pd.DataFrame): Dataframe to pull the schema of
        custom_redshift_columns (dict, Optional): 
            This dictionary contains custom column data type definitions for redshift.
            The params should be formatted as follows:
                - column name (str)
                - data type (str)

    Returns:
        PyArrow Schema of the given dataframe
        Potentially modified Dataframe
    """
    fields = []
    for col, dtype in dataframe.dtypes.items():
        dtype = dtype.name
        if dtype == 'object':
            if custom_redshift_columns:
                # Detect if the Pandas object column contains Python decimal objects.
                if "[Decimal(" in str(dataframe[col].values)[:9]:
                    # If Python decimal objects are present, parse out the precision and scale
                    # from the custom_redshift_columns dictionary to use when converting
                    # to PyArrow's decimal128 data type.
                    s = custom_redshift_columns[col]
                    precision = int(s[s.find('DECIMAL(') +
                                      len('DECIMAL('):s.rfind(',')].strip())
                    scale = int(s[s.find(',') + len(','):s.rfind(')')].strip())
                    pa_type = pa.decimal128(precision=precision, scale=scale)
                else:
                    pa_type = pa.string()
            else:
                pa_type = pa.string()
        elif dtype.startswith('int32'):
            pa_type = pa.int32()
        elif dtype.startswith('int64'):
            pa_type = pa.int64()
        elif dtype.startswith('int8'):
            pa_type = pa.int8()
        elif dtype.startswith('Int32'):
            dataframe = dataframe.astype({col: 'object'})
            pa_type = pa.int32()
        elif dtype.startswith('Int64'):
            dataframe = dataframe.astype({col: 'object'})
            pa_type = pa.int64()
        elif dtype.startswith('float32'):
            pa_type = pa.float32()
        elif dtype.startswith('float64'):
            pa_type = pa.float64()
        elif dtype.startswith('float16'):
            pa_type = pa.float16()
        elif dtype.startswith('datetime'):
            pa_type = pa.timestamp('ns')
        elif dtype.startswith('date'):
            pa_type = pa.date64()
        elif dtype.startswith('category'):
            pa_type = pa.string()
        elif dtype == 'bool':
            pa_type = pa.bool_()
        else:
            raise NotImplementedError(
                f"Error: {dtype} is not a datatype which can be mapped to Parquet using s3parq."
            )
        fields.append(pa.field(col, pa_type))

    return (pa.schema(fields=fields), dataframe)
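
A minimal usage sketch for the helper above (column names illustrative):

df = pd.DataFrame({'id': pd.Series([1, 2], dtype='int64'),
                   'ts': pd.to_datetime(['2020-01-01', '2020-01-02'])})
schema, df = _parquet_schema(df)
# -> schema fields: id: int64, ts: timestamp[ns]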
Example #32
def test_type_timestamp_with_tz():
    tz = 'America/Los_Angeles'
    t = pa.timestamp('ns', tz=tz)
    assert t.unit == 'ns'
    assert t.tz == tz
Example #33
integer_types = st.one_of(signed_integer_types, unsigned_integer_types)

floating_types = st.sampled_from([pa.float16(), pa.float32(), pa.float64()])
decimal_type = st.builds(pa.decimal128,
                         precision=st.integers(min_value=0, max_value=38),
                         scale=st.integers(min_value=0, max_value=38))
numeric_types = st.one_of(integer_types, floating_types, decimal_type)

date_types = st.sampled_from([pa.date32(), pa.date64()])
time_types = st.sampled_from(
    [pa.time32('s'),
     pa.time32('ms'),
     pa.time64('us'),
     pa.time64('ns')])
timestamp_types = st.sampled_from([
    pa.timestamp('s'),
    pa.timestamp('ms'),
    pa.timestamp('us'),
    pa.timestamp('ns')
])
temporal_types = st.one_of(date_types, time_types, timestamp_types)

primitive_types = st.one_of(null_type, bool_type, binary_type, string_type,
                            numeric_types, temporal_types)

metadata = st.dictionaries(st.text(), st.text())


@st.defines_strategy
def fields(type_strategy=primitive_types):
    return st.builds(pa.field,
def generate_type_mapper(
    pd_boolean=None,
    pd_integer=None,
    pd_string=None,
    pd_date_type=None,
    pd_timestamp_type=None,
):
    """Specifies the pyarrow data types mapping to corresponding Pandas data types.

    Args:
        pd_boolean: if not None, use the new Pandas boolean type. Defaults to None.
        pd_integer: if not None, use the new Pandas nullable integer type rather than
            defaulting to floats. Defaults to None.
        pd_string: if not None, use the new Pandas str type. Defaults to None.
        pd_date_type: Defaults to None.
        pd_timestamp_type: Defaults to None.

    Returns:
        Type mappings between pyarrow and pandas data types.
    """
    tm = {}
    if pd_boolean:
        bool_map = {pa.bool_(): pd.BooleanDtype()}
        tm = {**tm, **bool_map}
    if pd_string:
        string_map = {pa.string(): pd.StringDtype()}
        tm = {**tm, **string_map}

    if pd_integer:
        int_map = {
            pa.int8(): pd.Int64Dtype(),
            pa.int16(): pd.Int64Dtype(),
            pa.int32(): pd.Int64Dtype(),
            pa.int64(): pd.Int64Dtype(),
            pa.uint8(): pd.Int64Dtype(),
            pa.uint16(): pd.Int64Dtype(),
            pa.uint32(): pd.Int64Dtype(),
            pa.uint64(): pd.Int64Dtype(),
        }
        tm = {**tm, **int_map}
    else:
        # Keys and values are deliberately left uninstantiated (no
        # parentheses), so types_mapper finds no match for these types and
        # falls back to the default numpy-based conversion
        float_map = {
            pa.int8: np.float64,
            pa.int16: np.float64,
            pa.int32: np.float64,
            pa.int64: np.float64,
            pa.uint8: np.float64,
            pa.uint16: np.float64,
            pa.uint32: np.float64,
            pa.uint64: np.float64,
        }
        tm = {**tm, **float_map}

    if pd_date_type == "pd_period":
        date_map = {pa.date64(): pd.PeriodDtype("ms")}
        tm = {**tm, **date_map}

    if pd_timestamp_type == "pd_period":
        datetime_map = {
            pa.timestamp("s"): pd.PeriodDtype("s"),
            pa.timestamp("ms"): pd.PeriodDtype("ms"),
            pa.timestamp("us"): pd.PeriodDtype("us"),
            pa.timestamp("ns"): pd.PeriodDtype("ns"),
        }
        tm = {**tm, **datetime_map}
    if tm:
        return tm.get
    else:
        return None
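
The returned tm.get is shaped to be passed directly to Table.to_pandas as its
types_mapper callback; types the mapping does not cover fall back to the
default conversion. A usage sketch:

mapper = generate_type_mapper(pd_integer=True, pd_string=True)
table = pa.table({'n': pa.array([1, None], type=pa.int64()),
                  's': pa.array(['a', None], type=pa.string())})
df = table.to_pandas(types_mapper=mapper)
# df['n'] uses pd.Int64Dtype(), df['s'] uses pd.StringDtype()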
Example #35
        "Trying to update an index with the wrong column. Got `another_col` but expected `col`"
    )


@pytest.mark.parametrize(
    "dtype",
    [
        pa.binary(),
        pa.bool_(),
        pa.date32(),
        pa.float32(),
        pa.float64(),
        pa.int64(),
        pa.int8(),
        pa.string(),
        pa.timestamp("ns"),
    ],
)
def test_index_empty(store, dtype):
    storage_key = "dataset_uuid/some_index.parquet"
    index1 = ExplicitSecondaryIndex(column="col",
                                    index_dct={},
                                    dtype=dtype,
                                    index_storage_key=storage_key)
    key1 = index1.store(store, "dataset_uuid")

    index2 = ExplicitSecondaryIndex(column="col",
                                    index_storage_key=key1).load(store)
    assert index1 == index2

    index3 = pickle.loads(pickle.dumps(index1))
    assert index1 == index3
Example #36
    def __init__(  # pylint: disable=too-many-locals,too-many-branches
        self,
        data: DbapiResult,
        cursor_description: DbapiDescription,
        db_engine_spec: Type[db_engine_specs.BaseEngineSpec],
    ):
        self.db_engine_spec = db_engine_spec
        data = data or []
        column_names: List[str] = []
        pa_data: List[pa.Array] = []
        deduped_cursor_desc: List[Tuple[Any, ...]] = []
        numpy_dtype: List[Tuple[str, ...]] = []
        stringified_arr: np.ndarray

        if cursor_description:
            # get deduped list of column names
            column_names = dedup([col[0] for col in cursor_description])

            # fix cursor descriptor with the deduped names
            deduped_cursor_desc = [
                tuple([column_name, *list(description)[1:]]) for column_name,
                description in zip(column_names, cursor_description)
            ]

            # generate numpy structured array dtype
            numpy_dtype = [(column_name, "object")
                           for column_name in column_names]

        # only do expensive recasting if datatype is not standard list of tuples
        if data and (not isinstance(data, list)
                     or not isinstance(data[0], tuple)):
            data = [tuple(row) for row in data]
        array = np.array(data, dtype=numpy_dtype)
        if array.size > 0:
            for column in column_names:
                try:
                    pa_data.append(pa.array(array[column].tolist()))
                except (
                        pa.lib.ArrowInvalid,
                        pa.lib.ArrowTypeError,
                        pa.lib.ArrowNotImplementedError,
                        TypeError,  # this is super hacky,
                        # https://issues.apache.org/jira/browse/ARROW-7855
                ):
                    # attempt serialization of values as strings
                    stringified_arr = stringify_values(array[column])
                    pa_data.append(pa.array(stringified_arr.tolist()))

        if pa_data:  # pylint: disable=too-many-nested-blocks
            for i, column in enumerate(column_names):
                if pa.types.is_nested(pa_data[i].type):
                    # TODO: revisit nested column serialization once nested types
                    #  are added as a natively supported column type in Superset
                    #  (superset.utils.core.DbColumnType).
                    stringified_arr = stringify_values(array[column])
                    pa_data[i] = pa.array(stringified_arr.tolist())

                elif pa.types.is_temporal(pa_data[i].type):
                    # workaround for bug converting
                    # `psycopg2.tz.FixedOffsetTimezone` tzinfo values.
                    # related: https://issues.apache.org/jira/browse/ARROW-5248
                    sample = self.first_nonempty(array[column])
                    if sample and isinstance(sample, datetime.datetime):
                        try:
                            if sample.tzinfo:
                                tz = sample.tzinfo
                                series = pd.Series(array[column],
                                                   dtype="datetime64[ns]")
                                series = pd.to_datetime(series).dt.tz_localize(
                                    tz)
                                pa_data[i] = pa.Array.from_pandas(
                                    series, type=pa.timestamp("ns", tz=tz))
                        except Exception as ex:  # pylint: disable=broad-except
                            logger.exception(ex)

        self.table = pa.Table.from_arrays(pa_data, names=column_names)
        self._type_dict: Dict[str, Any] = {}
        try:
            # The driver may not be passing a cursor.description
            self._type_dict = {
                col: db_engine_spec.get_datatype(deduped_cursor_desc[i][1])
                for i, col in enumerate(column_names) if deduped_cursor_desc
            }
        except Exception as ex:  # pylint: disable=broad-except
            logger.exception(ex)
Example #37
    np.testing.assert_array_equal(narr[:6], arr[:6].to_numpy())
    np.testing.assert_array_equal(narr[2:], arr[2:].to_numpy())
    np.testing.assert_array_equal(narr[2:6], arr[2:6].to_numpy())


@pytest.mark.parametrize(
    ('type', 'expected'),
    [(pa.null(), 'empty'), (pa.bool_(), 'bool'), (pa.int8(), 'int8'),
     (pa.int16(), 'int16'), (pa.int32(), 'int32'), (pa.int64(), 'int64'),
     (pa.uint8(), 'uint8'), (pa.uint16(), 'uint16'), (pa.uint32(), 'uint32'),
     (pa.uint64(), 'uint64'), (pa.float16(), 'float16'),
     (pa.float32(), 'float32'), (pa.float64(), 'float64'),
     (pa.date32(), 'date'), (pa.date64(), 'date'), (pa.binary(), 'bytes'),
     (pa.binary(length=4), 'bytes'), (pa.string(), 'unicode'),
     (pa.list_(pa.list_(pa.int16())), 'list[list[int16]]'),
     (pa.decimal128(18, 3), 'decimal'), (pa.timestamp('ms'), 'datetime'),
     (pa.timestamp('us', 'UTC'), 'datetimetz'), (pa.time32('s'), 'time'),
     (pa.time64('us'), 'time')])
def test_logical_type(type, expected):
    assert get_logical_type(type) == expected


def test_array_uint64_from_py_over_range():
    arr = pa.array([2**63], type=pa.uint64())
    expected = pa.array(np.array([2**63], dtype='u8'))
    assert arr.equals(expected)


def test_array_conversions_no_sentinel_values():
    arr = np.array([1, 2, 3, 4], dtype='int8')
    refcount = sys.getrefcount(arr)
    'total_amt': 'total_amount',
    'tpep_dropoff_datetime': 'dropoff_datetime',
    'tpep_pickup_datetime': 'pickup_datetime',
    'trip_distance': 'trip_distance',
    'trip_dropoff_datetime': 'dropoff_datetime',
    'trip_pickup_datetime': 'pickup_datetime',
    'vendor_id': 'vendor',
    'vendor_name': 'vendor',
    'vendorid': 'vendor',
    'trip_type': 'trip_type',
    'lpep_dropoff_datetime': 'dropoff_datetime',
    'lpep_pickup_datetime': 'pickup_datetime'
}

arrow_schema = pa.schema([
    ('pickup_datetime', pa.timestamp('ns')),
    ('dropoff_datetime', pa.timestamp('ns')),
    ('store_and_forward', pa.int8()),
    ('passenger_count', pa.int8()),
    ('trip_distance', pa.float32()),
    ('fare_amount', pa.float32()),
    ('tip_amount', pa.float32()),
    ('total_amount', pa.float32()),
    ('payment_type', pa.string()),
    ('trip_type', pa.string()),
    ('company', pa.string()),
    ('trip_duration_minutes', pa.float32()),
    ('year', pa.int16()),
    ('pickup_borough', pa.string()),
    ('pickup_zone', pa.string()),
    ('pickup_location_id', pa.int16()),
Example #39
def test_timestamp_restore_timezone():
    # ARROW-5888, restore timezone from serialized metadata
    ty = pa.timestamp('ms', tz='America/New_York')
    arr = pa.array([1, 2, 3], type=ty)
    t = pa.table([arr], names=['f0'])
    _check_roundtrip(t)
Example #40
def test_date_time_types(tempdir):
    t1 = pa.date32()
    data1 = np.array([17259, 17260, 17261], dtype='int32')
    a1 = pa.array(data1, type=t1)

    t2 = pa.date64()
    data2 = data1.astype('int64') * 86400000
    a2 = pa.array(data2, type=t2)

    t3 = pa.timestamp('us')
    start = pd.Timestamp('2001-01-01').value / 1000
    data3 = np.array([start, start + 1, start + 2], dtype='int64')
    a3 = pa.array(data3, type=t3)

    t4 = pa.time32('ms')
    data4 = np.arange(3, dtype='i4')
    a4 = pa.array(data4, type=t4)

    t5 = pa.time64('us')
    a5 = pa.array(data4.astype('int64'), type=t5)

    t6 = pa.time32('s')
    a6 = pa.array(data4, type=t6)

    ex_t6 = pa.time32('ms')
    ex_a6 = pa.array(data4 * 1000, type=ex_t6)

    t7 = pa.timestamp('ns')
    start = pd.Timestamp('2001-01-01').value
    data7 = np.array([start, start + 1000, start + 2000], dtype='int64')
    a7 = pa.array(data7, type=t7)

    table = pa.Table.from_arrays([a1, a2, a3, a4, a5, a6, a7], [
        'date32', 'date64', 'timestamp[us]', 'time32[s]', 'time64[us]',
        'time32_from64[s]', 'timestamp[ns]'
    ])

    # date64 as date32
    # time32[s] to time32[ms]
    expected = pa.Table.from_arrays([a1, a1, a3, a4, a5, ex_a6, a7], [
        'date32', 'date64', 'timestamp[us]', 'time32[s]', 'time64[us]',
        'time32_from64[s]', 'timestamp[ns]'
    ])

    _check_roundtrip(table, expected=expected, version='2.6')

    t0 = pa.timestamp('ms')
    data0 = np.arange(4, dtype='int64')
    a0 = pa.array(data0, type=t0)

    t1 = pa.timestamp('us')
    data1 = np.arange(4, dtype='int64')
    a1 = pa.array(data1, type=t1)

    t2 = pa.timestamp('ns')
    data2 = np.arange(4, dtype='int64')
    a2 = pa.array(data2, type=t2)

    table = pa.Table.from_arrays([a0, a1, a2], ['ts[ms]', 'ts[us]', 'ts[ns]'])
    expected = pa.Table.from_arrays([a0, a1, a2],
                                    ['ts[ms]', 'ts[us]', 'ts[ns]'])

    # int64 for all timestamps supported by default
    filename = tempdir / 'int64_timestamps.parquet'
    _write_table(table, filename, version='2.6')
    parquet_schema = pq.ParquetFile(filename).schema
    for i in range(3):
        assert parquet_schema.column(i).physical_type == 'INT64'
    read_table = _read_table(filename)
    assert read_table.equals(expected)

    t0_ns = pa.timestamp('ns')
    data0_ns = np.array(data0 * 1000000, dtype='int64')
    a0_ns = pa.array(data0_ns, type=t0_ns)

    t1_ns = pa.timestamp('ns')
    data1_ns = np.array(data1 * 1000, dtype='int64')
    a1_ns = pa.array(data1_ns, type=t1_ns)

    expected = pa.Table.from_arrays([a0_ns, a1_ns, a2],
                                    ['ts[ms]', 'ts[us]', 'ts[ns]'])

    # int96 nanosecond timestamps produced upon request
    filename = tempdir / 'explicit_int96_timestamps.parquet'
    _write_table(table,
                 filename,
                 version='2.6',
                 use_deprecated_int96_timestamps=True)
    parquet_schema = pq.ParquetFile(filename).schema
    for i in range(3):
        assert parquet_schema.column(i).physical_type == 'INT96'
    read_table = _read_table(filename)
    assert read_table.equals(expected)

    # int96 nanosecond timestamps implied by flavor 'spark'
    filename = tempdir / 'spark_int96_timestamps.parquet'
    _write_table(table, filename, version='2.6', flavor='spark')
    parquet_schema = pq.ParquetFile(filename).schema
    for i in range(3):
        assert parquet_schema.column(i).physical_type == 'INT96'
    read_table = _read_table(filename)
    assert read_table.equals(expected)
Example #41
import vaex.utils

supported_arrow_array_types = (pa.Array, pa.ChunkedArray)
supported_array_types = (np.ndarray, ) + supported_arrow_array_types
string_types = [pa.string(), pa.large_string()]
_type_names_int = [
    "int8", "int16", "int32", "int64", "uint8", "uint16", "uint32", "uint64"
]
_type_names = ["float64", "float32"] + _type_names_int
map_arrow_to_numpy = {
    getattr(pa, name)(): np.dtype(name)
    for name in _type_names
}
map_arrow_to_numpy[pa.bool_()] = np.dtype("?")
for unit in 's ms us ns'.split():
    map_arrow_to_numpy[pa.timestamp(unit)] = np.dtype(f"datetime64[{unit}]")


def full(n, value, dtype):
    from .datatype import DataType
    dtype = DataType(dtype)
    values = np.full(n, value, dtype=dtype.numpy)
    if dtype.is_arrow:
        return pa.array(values)
    else:
        return values


def is_arrow_array(ar):
    return isinstance(ar, supported_arrow_array_types)
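
A quick sketch of the timestamp entries registered in the map above:

assert map_arrow_to_numpy[pa.timestamp('ms')] == np.dtype('datetime64[ms]')
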
def test_generate_from_meta():
    md = Metadata.from_dict({
        "name":
        "test_table",
        "file_format":
        "test-format",
        "columns": [
            {
                "name": "my_int",
                "type": "int64",
                "description": "This is an integer",
                "nullable": False,
            },
            {
                "name": "my_double",
                "type": "float64",
                "nullable": True
            },
            {
                "name": "my_date",
                "type": "date64"
            },
            {
                "name": "my_decimal",
                "type": "decimal128(10,2)"
            },
            {
                "name": "my_timestamp",
                "type": "timestamp(s)",
                "description": "Partition column",
            },
        ],
        "partitions": ["my_timestamp"],
    })

    ac = ArrowConverter()
    assert isinstance(ac.options, BaseConverterOptions)

    schema1 = ac.generate_from_meta(md)
    schema2 = ac.generate_from_meta(md, False)

    assert isinstance(schema1, pa.Schema)
    assert isinstance(schema2, pa.Schema)

    expected_names = ["my_int", "my_double", "my_date", "my_decimal"]
    expected_types = [
        pa.int64(),
        pa.float64(),
        pa.date64(),
        pa.decimal128(10, 2)
    ]
    assert schema1.names == expected_names

    checks1 = [a.equals(e) for a, e in zip(schema1.types, expected_types)]
    assert all(checks1)

    # Do schema2 assertions
    expected_names.append("my_timestamp")
    expected_types.append(pa.timestamp("s"))

    assert schema2.names == expected_names

    checks2 = [a.equals(e) for a, e in zip(schema2.types, expected_types)]
    assert all(checks2)

    # Also check specific type properties
    assert schema2.field("my_decimal").type.precision == 10
    assert schema2.field("my_decimal").type.scale == 2
    assert schema2.field("my_timestamp").type.unit == "s"
 ("int32", pa.int32()),
 ("int64", pa.int64()),
 ("uint8", pa.uint8()),
 ("uint16", pa.uint16()),
 ("uint32", pa.uint32()),
 ("uint64", pa.uint64()),
 ("float16", pa.float16()),
 ("float32", pa.float32()),
 ("float64", pa.float64()),
 ("decimal128(38,1)", pa.decimal128(38, 1)),
 ("decimal128(1,2)", pa.decimal128(1, 2)),
 ("time32(s)", pa.time32("s")),
 ("time32(ms)", pa.time32("ms")),
 ("time64(us)", pa.time64("us")),
 ("time64(ns)", pa.time64("ns")),
 ("timestamp(s)", pa.timestamp("s")),
 ("timestamp(ms)", pa.timestamp("ms")),
 ("timestamp(us)", pa.timestamp("us")),
 ("timestamp(ns)", pa.timestamp("ns")),
 ("date32", pa.date32()),
 ("date64", pa.date64()),
 ("string", pa.string()),
 ("large_string", pa.large_string()),
 ("utf8", pa.utf8()),
 ("large_utf8", pa.large_utf8()),
 ("binary", pa.binary()),
 ("binary(128)", pa.binary(128)),
 ("large_binary", pa.large_binary()),
 ("struct<num:int64>", pa.struct([("num", pa.int64())])),
 ("list<int64>", pa.list_(pa.int64())),
 ("list_<list<int64>>", pa.list_(pa.list_(pa.int64()))),
Example #44
def test_get_eq_func():
    for t in [
            pa.int8(),
            pa.int16(),
            pa.int32(),
            pa.int64(),
            pa.uint8(),
            pa.uint16(),
            pa.uint32(),
            pa.uint64(),
    ]:
        assert not get_eq_func(t)(0, 1)
        assert not get_eq_func(t)(None, 1)
        assert get_eq_func(t)(1, 1)
        assert get_eq_func(t)(None, None)
    t = pa.null()
    assert get_eq_func(t)("0", "1")
    assert get_eq_func(t)(None, "1")
    assert get_eq_func(t)("1", "1")
    assert get_eq_func(t)(None, None)
    t = pa.string()
    assert not get_eq_func(t)("0", "1")
    assert not get_eq_func(t)(None, "1")
    assert get_eq_func(t)("1", "1")
    assert get_eq_func(t)(None, None)
    t = pa.bool_()
    assert not get_eq_func(t)(False, True)
    assert not get_eq_func(t)(None, False)
    assert not get_eq_func(t)(None, True)
    assert get_eq_func(t)(True, True)
    assert get_eq_func(t)(False, False)
    assert get_eq_func(t)(None, None)
    for t in [pa.float16(), pa.float32(), pa.float64()]:
        assert not get_eq_func(t)(0.0, 1.1)
        assert get_eq_func(t)(1.1, 1.1)
        assert get_eq_func(t)(None, float("nan"))
        for n in [None, float("nan"), float("inf"), float("-inf")]:
            assert not get_eq_func(t)(n, 1.1)
            assert get_eq_func(t)(n, n)
    for t in [pa.timestamp("ns")]:
        for n in [None, pd.NaT]:
            assert not get_eq_func(t)(datetime(2020, 1, 1, 0),
                                      datetime(2020, 1, 1, 1))
            assert not get_eq_func(t)(n, datetime(2020, 1, 1, 1))
            assert get_eq_func(t)(datetime(2020, 1, 1, 1),
                                  datetime(2020, 1, 1, 1))
            assert get_eq_func(t)(n, n)
    assert get_eq_func(pa.timestamp("ns"))(None, pd.NaT)
    for t in [pa.date32()]:
        for n in [None, pd.NaT]:
            assert get_eq_func(t)(datetime(2020, 1, 1, 0),
                                  datetime(2020, 1, 1, 1))
            assert not get_eq_func(t)(datetime(2020, 1, 1), datetime(
                2020, 1, 2).date())
            assert not get_eq_func(t)(n, datetime(2020, 1, 1, 1))
            assert get_eq_func(t)(datetime(2020, 1, 1).date(),
                                  datetime(2020, 1, 1, 1))
            assert get_eq_func(t)(n, n)
    t = pa.struct([pa.field("a", pa.int32())])
    assert not get_eq_func(t)(dict(a=0), dict(a=1))
    assert not get_eq_func(t)(None, dict(a=1))
    assert get_eq_func(t)(dict(a=1), dict(a=1))
    assert get_eq_func(t)(None, None)
    t = pa.list_(pa.int32())
    assert not get_eq_func(t)([0], [1])
    assert not get_eq_func(t)(None, [1])
    assert get_eq_func(t)([1], [1])
    assert get_eq_func(t)(None, None)
Example #45
def test_is_datetime():
    assert is_datetime(pyarrow.timestamp("us", tz=None))
    assert not is_datetime(pyarrow.timestamp("ms", tz=None))
    assert not is_datetime(pyarrow.timestamp("us", tz="UTC"))
    assert not is_datetime(pyarrow.string())
Example #48
def test_in_expr_todo():
    import pyarrow.gandiva as gandiva
    # TODO: Implement reasonable support for timestamp, time & date.
    # Current exceptions:
    # pyarrow.lib.ArrowException: ExpressionValidationError:
    # Evaluation expression for IN clause returns XXXX values are of typeXXXX

    # binary
    arr = pa.array([b"ga", b"an", b"nd", b"di", b"iv", b"va"])
    table = pa.Table.from_arrays([arr], ["a"])

    builder = gandiva.TreeExprBuilder()
    node_a = builder.make_field(table.schema.field("a"))
    cond = builder.make_in_expression(node_a, [b'an', b'nd'], pa.binary())
    condition = builder.make_condition(cond)

    filter = gandiva.make_filter(table.schema, condition)
    result = filter.evaluate(table.to_batches()[0], pa.default_memory_pool())
    assert list(result.to_array()) == [1, 2]

    # timestamp
    datetime_1 = datetime.datetime.utcfromtimestamp(1542238951.621877)
    datetime_2 = datetime.datetime.utcfromtimestamp(1542238911.621877)
    datetime_3 = datetime.datetime.utcfromtimestamp(1542238051.621877)

    arr = pa.array([datetime_1, datetime_2, datetime_3])
    table = pa.Table.from_arrays([arr], ["a"])

    builder = gandiva.TreeExprBuilder()
    node_a = builder.make_field(table.schema.field("a"))
    cond = builder.make_in_expression(node_a, [datetime_2], pa.timestamp('ms'))
    condition = builder.make_condition(cond)

    filter = gandiva.make_filter(table.schema, condition)
    result = filter.evaluate(table.to_batches()[0], pa.default_memory_pool())
    assert list(result.to_array()) == [1]

    # time
    time_1 = datetime_1.time()
    time_2 = datetime_2.time()
    time_3 = datetime_3.time()

    arr = pa.array([time_1, time_2, time_3])
    table = pa.Table.from_arrays([arr], ["a"])

    builder = gandiva.TreeExprBuilder()
    node_a = builder.make_field(table.schema.field("a"))
    # time64 only supports 'us'/'ns' units; the array built above is time64[us]
    cond = builder.make_in_expression(node_a, [time_2], pa.time64('us'))
    condition = builder.make_condition(cond)

    filter = gandiva.make_filter(table.schema, condition)
    result = filter.evaluate(table.to_batches()[0], pa.default_memory_pool())
    assert list(result.to_array()) == [1]

    # date
    date_1 = datetime_1.date()
    date_2 = datetime_2.date()
    date_3 = datetime_3.date()

    arr = pa.array([date_1, date_2, date_3])
    table = pa.Table.from_arrays([arr], ["a"])

    builder = gandiva.TreeExprBuilder()
    node_a = builder.make_field(table.schema.field("a"))
    cond = builder.make_in_expression(node_a, [date_2], pa.date32())
    condition = builder.make_condition(cond)

    filter = gandiva.make_filter(table.schema, condition)
    result = filter.evaluate(table.to_batches()[0], pa.default_memory_pool())
    assert list(result.to_array()) == [1]
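The same builder API also drives projections, not just filters. A minimal sketch in the spirit of the test above (my own example, not taken from the source; it assumes the "add" function is registered for float64 in Gandiva, which it is in the stock function registry):

def gandiva_projection_sketch():
    import pyarrow.gandiva as gandiva

    table = pa.Table.from_arrays(
        [pa.array([1.0, 2.0]), pa.array([3.0, 4.0])], ["a", "b"])

    builder = gandiva.TreeExprBuilder()
    node_a = builder.make_field(table.schema.field("a"))
    node_b = builder.make_field(table.schema.field("b"))
    add = builder.make_function("add", [node_a, node_b], pa.float64())
    expr = builder.make_expression(add, pa.field("a_plus_b", pa.float64()))

    projector = gandiva.make_projector(
        table.schema, [expr], pa.default_memory_pool())
    result, = projector.evaluate(table.to_batches()[0])
    assert result.to_pylist() == [4.0, 6.0]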
Example #49
def _convert_data_with_schema(data, schema, date_format=None, field_aliases=None):
    column_data = {}
    array_data = []
    schema_names = []
    for row in data:
        for column in schema.names:
            _col = column_data.get(column, [])
            _col.append(row.get(column))
            column_data[column] = _col
    for column in schema:
        _col = column_data.get(column.name)
        if isinstance(column.type, pa.lib.TimestampType):
            _converted_col = []
            for t in _col:
                try:
                    _converted_col.append(pd.to_datetime(t, format=date_format))
                except pd.errors.OutOfBoundsDatetime:
                    _converted_col.append(pd.Timestamp.max)
            array_data.append(pa.Array.from_pandas(pd.to_datetime(_converted_col), type=pa.timestamp('ns')))
        elif column.type.id == pa.date32().id:
            # list() materializes the map object, which is lazy on Python 3
            _converted_col = list(map(_date_converter, _col))
            array_data.append(pa.array(_converted_col, type=pa.date32()))
        # Float types are ambiguous for conversions, need to specify the exact type
        elif column.type.id == pa.float64().id:
            array_data.append(pa.array(_col, type=pa.float64()))
        elif column.type.id == pa.float32().id:
            # Python doesn't have a native float32 type
            # and PyArrow cannot cast float64 -> float32
            _col = pd.to_numeric(_col, downcast='float')
            array_data.append(pa.Array.from_pandas(_col, type=pa.float32()))
        elif column.type.id == pa.int32().id:
            # PyArrow 0.8.0 can cast int64 -> int32
            _col64 = pa.array(_col, type=pa.int64())
            array_data.append(_col64.cast(pa.int32()))
        elif column.type.id == pa.bool_().id:
            _col = list(map(_boolean_converter, _col))
            array_data.append(pa.array(_col, type=column.type))
        else:
            array_data.append(pa.array(_col, type=column.type))
        if isinstance(field_aliases, dict):
            schema_names.append(field_aliases.get(column.name, column.name))
        else:
            schema_names.append(column.name)
    return pa.RecordBatch.from_arrays(array_data, schema_names)
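A possible invocation, for orientation (illustrative only: the schema, column names, and values are made up, and the date/boolean branches are avoided because _date_converter and _boolean_converter are not shown above):

schema = pa.schema([
    pa.field('created_at', pa.timestamp('ns')),
    pa.field('score', pa.float64()),
    pa.field('label', pa.string()),
])
rows = [
    {'created_at': '2018-01-01 00:00:00', 'score': 1.5, 'label': 'a'},
    {'created_at': '2018-01-02 12:30:00', 'score': 2.5, 'label': 'b'},
]
batch = _convert_data_with_schema(rows, schema)
assert batch.num_rows == 2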
Example #50
def pyarrow_timestamp():
    return pyarrow.timestamp("us", tz="UTC")
Example #51
def test_cell_is_null_timestamp():
    _assert_condition_mask(
        {"A": pa.array([datetime.datetime.now(), None], pa.timestamp("ns"))},
        CELL("is_null", "A"),
        "01",
    )
Example #52
def read_type(doc):
    t = doc[TYPE]

    if PARAM in doc:
        tp = doc[PARAM]
    else:
        tp = None

    if t == 'null':
        return pyarrow.null()

    if t == 'bool':
        return pyarrow.bool_()

    if t == 'int8':
        return pyarrow.int8()

    if t == 'int16':
        return pyarrow.int16()

    if t == 'int32':
        return pyarrow.int32()

    if t == 'int64':
        return pyarrow.int64()

    if t == 'uint8':
        return pyarrow.uint8()

    if t == 'uint16':
        return pyarrow.uint16()

    if t == 'uint32':
        return pyarrow.uint32()

    if t == 'uint64':
        return pyarrow.uint64()

    if t == 'float16':
        return pyarrow.float16()

    if t == 'float32':
        return pyarrow.float32()

    if t == 'float64':
        return pyarrow.float64()

    if t == 'date[d]':
        return pyarrow.date32()

    if t == 'date[ms]':
        return pyarrow.date64()

    if t == 'timestamp[s]':
        return pyarrow.timestamp('s')

    if t == 'timestamp[ms]':
        return pyarrow.timestamp('ms')

    if t == 'timestamp[us]':
        return pyarrow.timestamp('us')

    if t == 'timestamp[ns]':
        return pyarrow.timestamp('ns')

    if t == 'time[s]':
        return pyarrow.time32('s')

    if t == 'time[ms]':
        return pyarrow.time32('ms')

    if t == 'time[us]':
        return pyarrow.time64('us')

    if t == 'time[ns]':
        return pyarrow.time64('ns')

    if t == 'utf8':
        return pyarrow.utf8()

    if t == 'bytes':
        return pyarrow.binary()

    if t == 'factor':
        if tp is None:
            index_type = pyarrow.int32()
            dict_type = pyarrow.utf8()
        else:
            index_type = read_type(tp[INDEX])
            dict_type = read_type(tp[DICT])
        return pyarrow.dictionary(index_type, dict_type, False)

    if t == 'ordered':
        if tp is None:
            index_type = pyarrow.int32()
            dict_type = pyarrow.utf8()
        else:
            index_type = read_type(tp[INDEX])
            dict_type = read_type(tp[DICT])
        return pyarrow.dictionary(index_type, dict_type, True)

    if t == 'opaque':
        return pyarrow.binary(tp)

    if t == 'list':
        return pyarrow.list_(read_type(tp))

    if t == 'struct':
        return pyarrow.struct(
            [pyarrow.field(f[NAME], read_type(f)) for f in tp])

    raise ValueError(f'{t} is not a supported BSON DataFrame type')
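A sketch of how read_type might be driven. The module constants it references are not defined in the snippet; the string values below are guesses, not taken from the source:

# Hypothetical values for the constants referenced above.
TYPE, PARAM, INDEX, DICT, NAME = 'type', 'param', 'index', 'dict', 'name'

assert read_type({'type': 'timestamp[ms]'}) == pyarrow.timestamp('ms')
assert read_type({'type': 'list', 'param': {'type': 'int32'}}) == \
    pyarrow.list_(pyarrow.int32())
assert read_type({'type': 'factor'}) == \
    pyarrow.dictionary(pyarrow.int32(), pyarrow.utf8(), False)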
Example #53
    arr3 = pa.array(np.array([1, np.nan, 2, 3, np.nan, 4], dtype='float32'),
                    type='float32')
    assert arr3.type == 'float32'
    assert arr3.null_count == 0


def test_array_from_numpy_datetimeD():
    arr = np.array([None, datetime.date(2017, 4, 4)], dtype='datetime64[D]')

    result = pa.array(arr)
    expected = pa.array([None, datetime.date(2017, 4, 4)], type=pa.date32())
    assert result.equals(expected)


@pytest.mark.parametrize(('dtype', 'type'),
                         [('datetime64[s]', pa.timestamp('s')),
                          ('datetime64[ms]', pa.timestamp('ms')),
                          ('datetime64[us]', pa.timestamp('us')),
                          ('datetime64[ns]', pa.timestamp('ns'))])
def test_array_from_numpy_datetime(dtype, type):
    data = [
        None,
        datetime.datetime(2017, 4, 4, 12, 11, 10),
        datetime.datetime(2018, 1, 1, 0, 2, 0)
    ]

    # from numpy array
    arr = pa.array(np.array(data, dtype=dtype))
    expected = pa.array(data, type=type)
    assert arr.equals(expected)
Example #54
    def test_complex_as_arrow(self, arrow_cursor):
        table = arrow_cursor.execute("""
            SELECT
              col_boolean
              ,col_tinyint
              ,col_smallint
              ,col_int
              ,col_bigint
              ,col_float
              ,col_double
              ,col_string
              ,col_varchar
              ,col_timestamp
              ,CAST(col_timestamp AS time) AS col_time
              ,col_date
              ,col_binary
              ,col_array
              ,CAST(col_array AS json) AS col_array_json
              ,col_map
              ,CAST(col_map AS json) AS col_map_json
              ,col_struct
              ,col_decimal
            FROM one_row_complex
            """).as_arrow()
        assert table.shape[0] == 1
        assert table.shape[1] == 19
        assert table.schema == pa.schema([
            pa.field("col_boolean", pa.bool_()),
            pa.field("col_tinyint", pa.int8()),
            pa.field("col_smallint", pa.int16()),
            pa.field("col_int", pa.int32()),
            pa.field("col_bigint", pa.int64()),
            pa.field("col_float", pa.float32()),
            pa.field("col_double", pa.float64()),
            pa.field("col_string", pa.string()),
            pa.field("col_varchar", pa.string()),
            pa.field("col_timestamp", pa.timestamp("ms")),
            pa.field("col_time", pa.string()),
            pa.field("col_date", pa.timestamp("ms")),
            pa.field("col_binary", pa.string()),
            pa.field("col_array", pa.string()),
            pa.field("col_array_json", pa.string()),
            pa.field("col_map", pa.string()),
            pa.field("col_map_json", pa.string()),
            pa.field("col_struct", pa.string()),
            pa.field("col_decimal", pa.string()),
        ])
        assert [row for row in zip(*table.to_pydict().values())] == [(
            True,
            127,
            32767,
            2147483647,
            9223372036854775807,
            0.5,
            0.25,
            "a string",
            "varchar",
            datetime(2017, 1, 1, 0, 0, 0),
            "00:00:00.000",
            datetime(2017, 1, 2, 0, 0, 0),
            "31 32 33",
            "[1, 2]",
            "[1,2]",
            "{1=2, 3=4}",
            '{"1":2,"3":4}',
            "{a=1, b=2}",
            "0.1",
        )]
Example #55
def test_date_time_types():
    t1 = pa.date32()
    data1 = np.array([17259, 17260, 17261], dtype='int32')
    a1 = pa.Array.from_pandas(data1, type=t1)

    t2 = pa.date64()
    data2 = data1.astype('int64') * 86400000
    a2 = pa.Array.from_pandas(data2, type=t2)

    t3 = pa.timestamp('us')
    start = pd.Timestamp('2000-01-01').value // 1000  # ns -> us
    data3 = np.array([start, start + 1, start + 2], dtype='int64')
    a3 = pa.Array.from_pandas(data3, type=t3)

    t4 = pa.time32('ms')
    data4 = np.arange(3, dtype='i4')
    a4 = pa.Array.from_pandas(data4, type=t4)

    t5 = pa.time64('us')
    a5 = pa.Array.from_pandas(data4.astype('int64'), type=t5)

    t6 = pa.time32('s')
    a6 = pa.Array.from_pandas(data4, type=t6)

    ex_t6 = pa.time32('ms')
    ex_a6 = pa.Array.from_pandas(data4 * 1000, type=ex_t6)

    t7 = pa.timestamp('ns')
    start = pd.Timestamp('2001-01-01').value
    data7 = np.array([start, start + 1000, start + 2000], dtype='int64')
    a7 = pa.Array.from_pandas(data7, type=t7)

    t7_us = pa.timestamp('us')
    start = pd.Timestamp('2001-01-01').value
    data7_us = np.array([start, start + 1000, start + 2000],
                        dtype='int64') // 1000
    a7_us = pa.Array.from_pandas(data7_us, type=t7_us)

    table = pa.Table.from_arrays([a1, a2, a3, a4, a5, a6, a7], [
        'date32', 'date64', 'timestamp[us]', 'time32[s]', 'time64[us]',
        'time32_from64[s]', 'timestamp[ns]'
    ])

    # date64 as date32
    # time32[s] to time32[ms]
    # 'timestamp[ns]' to 'timestamp[us]'
    expected = pa.Table.from_arrays([a1, a1, a3, a4, a5, ex_a6, a7_us], [
        'date32', 'date64', 'timestamp[us]', 'time32[s]', 'time64[us]',
        'time32_from64[s]', 'timestamp[ns]'
    ])

    _check_roundtrip(table, expected=expected, version='2.0')

    # date64 as date32
    # time32[s] to time32[ms]
    # 'timestamp[ns]' is saved as INT96 timestamp
    expected = pa.Table.from_arrays([a1, a1, a3, a4, a5, ex_a6, a7], [
        'date32', 'date64', 'timestamp[us]', 'time32[s]', 'time64[us]',
        'time32_from64[s]', 'timestamp[ns]'
    ])

    _check_roundtrip(table,
                     expected=expected,
                     version='2.0',
                     use_deprecated_int96_timestamps=True)

    # Check that setting flavor to 'spark' uses int96 timestamps
    _check_roundtrip(table, expected=expected, version='2.0', flavor='spark')

    # Unsupported stuff
    def _assert_unsupported(array):
        table = pa.Table.from_arrays([array], ['unsupported'])
        buf = io.BytesIO()

        with pytest.raises(NotImplementedError):
            _write_table(table, buf, version="2.0")

    t7 = pa.time64('ns')
    a7 = pa.Array.from_pandas(data4.astype('int64'), type=t7)

    _assert_unsupported(a7)
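The INT96 behavior exercised above comes down to a single writer flag. A condensed standalone sketch of the write path (my own version, since the `_check_roundtrip` and `_write_table` helpers are not shown here):

import io

import pyarrow as pa
import pyarrow.parquet as pq

arr = pa.array([978307200000000000, 978307200000001000],  # 2001-01-01 in ns
               type=pa.timestamp('ns'))
table = pa.Table.from_arrays([arr], ['ts'])

buf = io.BytesIO()
# INT96 is a deprecated physical type kept for Spark/Impala compatibility;
# it preserves nanosecond precision, which the plain version='2.0' path in
# the test above otherwise stores as microseconds.
pq.write_table(table, buf, version='2.0',
               use_deprecated_int96_timestamps=True)
result = pq.read_table(io.BytesIO(buf.getvalue()))
assert result.column('ts').type == pa.timestamp('ns')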
Example #56
def pyarrow_datetime():
    return pyarrow.timestamp("us", tz=None)
Example #57
def as_column(arbitrary, nan_as_null=True):
    """Create a Column from an arbitrary object

    Currently supported inputs are:

    * ``Column``
    * ``Buffer``
    * numba device array
    * numpy array
    * pandas.Categorical

    Returns
    -------
    result : subclass of TypedColumnBase
        - CategoricalColumn for pandas.Categorical input.
        - NumericalColumn for all other inputs.
    """
    from . import numerical, categorical, datetime

    if isinstance(arbitrary, Column):
        if not isinstance(arbitrary, TypedColumnBase):
            # interpret as numeric
            data = arbitrary.view(numerical.NumericalColumn,
                                  dtype=arbitrary.dtype)
        else:
            data = arbitrary

    elif isinstance(arbitrary, Buffer):
        data = numerical.NumericalColumn(data=arbitrary, dtype=arbitrary.dtype)

    elif cuda.devicearray.is_cuda_ndarray(arbitrary):
        data = as_column(Buffer(arbitrary))
        if (data.dtype in [np.float16, np.float32, np.float64]
                and arbitrary.size > 0):
            if nan_as_null:
                mask = cudautils.mask_from_devary(arbitrary)
                data = data.set_mask(mask)

    elif isinstance(arbitrary, np.ndarray):
        if arbitrary.dtype.kind == 'M':
            data = datetime.DatetimeColumn.from_numpy(arbitrary)
        else:
            data = as_column(rmm.to_device(arbitrary), nan_as_null=nan_as_null)

    elif isinstance(arbitrary, pa.Array):
        if isinstance(arbitrary, pa.StringArray):
            raise NotImplementedError("Strings are not yet supported")
        elif isinstance(arbitrary, pa.NullArray):
            pamask = Buffer(np.empty(0, dtype='int8'))
            padata = Buffer(np.empty(0,
                                     dtype=arbitrary.type.to_pandas_dtype()))
            data = numerical.NumericalColumn(
                data=padata,
                mask=pamask,
                null_count=0,
                dtype=np.dtype(arbitrary.type.to_pandas_dtype()))
        elif isinstance(arbitrary, pa.DictionaryArray):
            if arbitrary.buffers()[0]:
                pamask = Buffer(np.array(arbitrary.buffers()[0]))
            else:
                pamask = None
            padata = Buffer(
                np.array(arbitrary.buffers()[1]).view(
                    arbitrary.indices.type.to_pandas_dtype()))
            data = categorical.CategoricalColumn(
                data=padata,
                mask=pamask,
                null_count=arbitrary.null_count,
                categories=arbitrary.dictionary.to_pylist(),
                ordered=arbitrary.type.ordered,
            )
        elif isinstance(arbitrary, pa.TimestampArray):
            arbitrary = arbitrary.cast(pa.timestamp('ms'))
            if arbitrary.buffers()[0]:
                pamask = Buffer(np.array(arbitrary.buffers()[0]))
            else:
                pamask = None
            padata = Buffer(
                np.array(arbitrary.buffers()[1]).view(np.dtype('M8[ms]')))
            data = datetime.DatetimeColumn(data=padata,
                                           mask=pamask,
                                           null_count=arbitrary.null_count,
                                           dtype=np.dtype('M8[ms]'))
        elif isinstance(arbitrary, pa.Date64Array):
            if arbitrary.buffers()[0]:
                pamask = Buffer(np.array(arbitrary.buffers()[0]))
            else:
                pamask = None
            padata = Buffer(
                np.array(arbitrary.buffers()[1]).view(np.dtype('M8[ms]')))
            data = datetime.DatetimeColumn(data=padata,
                                           mask=pamask,
                                           null_count=arbitrary.null_count,
                                           dtype=np.dtype('M8[ms]'))
        elif isinstance(arbitrary, pa.Date32Array):
            # No equivalent np dtype and not yet supported
            warnings.warn(
                "Date32 values are not yet supported so this will "
                "be typecast to a Date64 value", UserWarning)
            arbitrary = arbitrary.cast(pa.date64())
            data = as_column(arbitrary)
        elif isinstance(arbitrary, pa.BooleanArray):
            # Arrow uses 1 bit per value while we use int8
            dtype = np.dtype(np.bool_)  # np.bool was removed from NumPy
            arbitrary = arbitrary.cast(pa.int8())
            if arbitrary.buffers()[0]:
                pamask = Buffer(np.array(arbitrary.buffers()[0]))
            else:
                pamask = None
            padata = Buffer(np.array(arbitrary.buffers()[1]).view(dtype))
            data = numerical.NumericalColumn(data=padata,
                                             mask=pamask,
                                             null_count=arbitrary.null_count,
                                             dtype=dtype)
        else:
            if arbitrary.buffers()[0]:
                pamask = Buffer(np.array(arbitrary.buffers()[0]))
            else:
                pamask = None
            padata = Buffer(
                np.array(arbitrary.buffers()[1]).view(
                    np.dtype(arbitrary.type.to_pandas_dtype())))
            data = numerical.NumericalColumn(
                data=padata,
                mask=pamask,
                null_count=arbitrary.null_count,
                dtype=np.dtype(arbitrary.type.to_pandas_dtype()))

    elif isinstance(arbitrary, (pd.Series, pd.Categorical)):
        if pd.api.types.is_categorical_dtype(arbitrary):
            data = as_column(pa.array(arbitrary, from_pandas=True))
        else:
            data = as_column(pa.array(arbitrary, from_pandas=nan_as_null))

    elif np.isscalar(arbitrary) and not isinstance(arbitrary, memoryview):
        if hasattr(arbitrary, 'dtype'):
            data_type = _gdf.np_to_pa_dtype(arbitrary.dtype)
            if data_type in (pa.date64(), pa.date32()):
                # PyArrow can't construct date64 or date32 arrays from np
                # datetime types
                arbitrary = arbitrary.astype('int64')
            data = as_column(pa.array([arbitrary], type=data_type))
        else:
            data = as_column(pa.array([arbitrary]))

    elif isinstance(arbitrary, memoryview):
        data = as_column(np.array(arbitrary))

    else:
        try:
            data = as_column(memoryview(arbitrary))
        except TypeError:
            data = as_column(pa.array(arbitrary))

    return data
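Typical inputs, for orientation (cudf-internal API; these calls assume a CUDA-capable environment and are illustrative only, not from the source):

# NaNs in float input become nulls because nan_as_null defaults to True.
num_col = as_column(np.array([1.0, np.nan, 2.0]))

# pa.TimestampArray input is normalized to millisecond resolution above.
ts_col = as_column(pa.array([pd.Timestamp('2018-01-01'),
                             pd.Timestamp('2018-01-02')]))

# A numpy scalar takes the np.isscalar branch and yields a 1-element column.
one = as_column(np.float32(7.0))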
Example #58
@pytest.mark.parametrize(
    ('type', 'expected'), [
        (pa.int64(), 'int64'),
        (pa.uint8(), 'uint8'),
        (pa.uint16(), 'uint16'),
        (pa.uint32(), 'uint32'),
        (pa.uint64(), 'uint64'),
        (pa.float16(), 'float16'),
        (pa.float32(), 'float32'),
        (pa.float64(), 'float64'),
        (pa.date32(), 'date'),
        (pa.date64(), 'date'),
        (pa.binary(), 'bytes'),
        (pa.binary(length=4), 'bytes'),
        (pa.string(), 'unicode'),
        (pa.list_(pa.list_(pa.int16())), 'list[list[int16]]'),
        (pa.decimal128(18, 3), 'decimal'),
        (pa.timestamp('ms'), 'datetime'),
        (pa.timestamp('us', 'UTC'), 'datetimetz'),
        (pa.time32('s'), 'time'),
        (pa.time64('us'), 'time')
    ]
)
def test_logical_type(type, expected):
    assert get_logical_type(type) == expected


def test_array_uint64_from_py_over_range():
    arr = pa.array([2 ** 63], type=pa.uint64())
    expected = pa.array(np.array([2 ** 63], dtype='u8'))
    assert arr.equals(expected)

Example #59
    def test_complex_unload_as_arrow(self, arrow_cursor):
        # NOT_SUPPORTED: Unsupported Hive type: time
        # NOT_SUPPORTED: Unsupported Hive type: json
        table = arrow_cursor.execute("""
            SELECT
              col_boolean
              ,col_tinyint
              ,col_smallint
              ,col_int
              ,col_bigint
              ,col_float
              ,col_double
              ,col_string
              ,col_varchar
              ,col_timestamp
              ,col_date
              ,col_binary
              ,col_array
              ,col_map
              ,col_struct
              ,col_decimal
            FROM one_row_complex
            """).as_arrow()
        assert table.shape[0] == 1
        assert table.shape[1] == 16
        assert table.schema == pa.schema([
            pa.field("col_boolean", pa.bool_()),
            pa.field("col_tinyint", pa.int32()),
            pa.field("col_smallint", pa.int32()),
            pa.field("col_int", pa.int32()),
            pa.field("col_bigint", pa.int64()),
            pa.field("col_float", pa.float32()),
            pa.field("col_double", pa.float64()),
            pa.field("col_string", pa.string()),
            pa.field("col_varchar", pa.string()),
            pa.field("col_timestamp", pa.timestamp("ns")),
            pa.field("col_date", pa.date32()),
            pa.field("col_binary", pa.binary()),
            pa.field("col_array",
                     pa.list_(pa.field("array_element", pa.int32()))),
            pa.field("col_map",
                     pa.map_(pa.int32(), pa.field("entries", pa.int32()))),
            pa.field(
                "col_struct",
                pa.struct(
                    [pa.field("a", pa.int32()),
                     pa.field("b", pa.int32())]),
            ),
            pa.field("col_decimal", pa.decimal128(10, 1)),
        ])
        assert [row for row in zip(*table.to_pydict().values())] == [(
            True,
            127,
            32767,
            2147483647,
            9223372036854775807,
            0.5,
            0.25,
            "a string",
            "varchar",
            pd.Timestamp(2017, 1, 1, 0, 0, 0),
            datetime(2017, 1, 2).date(),
            b"123",
            [1, 2],
            [(1, 2), (3, 4)],
            {
                "a": 1,
                "b": 2
            },
            Decimal("0.1"),
        )]
Example #60
def test_make_column_timestamp_interpret_local_datetime_as_utc():
    column = make_column("A",
                         [datetime.datetime(2021, 4, 8, 13, 39, 1, 123456)])
    assert column.array.type == pa.timestamp("ns")  # no TZ info
    assert column.array.cast(pa.int64()) == pa.array([1617889141123456000])
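The expected integer can be cross-checked by hand: interpreting the naive datetime as UTC and counting nanoseconds since the epoch gives the same value (an illustrative aside, not part of the test; integer arithmetic avoids float precision loss):

dt = datetime.datetime(2021, 4, 8, 13, 39, 1, 123456,
                       tzinfo=datetime.timezone.utc)
delta = dt - datetime.datetime(1970, 1, 1, tzinfo=datetime.timezone.utc)
nanos = (delta.days * 86400 + delta.seconds) * 10**9 \
    + delta.microseconds * 1000
assert nanos == 1617889141123456000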