Example 1
def test_type_to_pandas_dtype():
    M8_ns = np.dtype('datetime64[ns]')
    cases = [
        (pa.null(), np.float64),
        (pa.bool_(), np.bool_),
        (pa.int8(), np.int8),
        (pa.int16(), np.int16),
        (pa.int32(), np.int32),
        (pa.int64(), np.int64),
        (pa.uint8(), np.uint8),
        (pa.uint16(), np.uint16),
        (pa.uint32(), np.uint32),
        (pa.uint64(), np.uint64),
        (pa.float16(), np.float16),
        (pa.float32(), np.float32),
        (pa.float64(), np.float64),
        (pa.date32(), M8_ns),
        (pa.date64(), M8_ns),
        (pa.timestamp('ms'), M8_ns),
        (pa.binary(), np.object_),
        (pa.binary(12), np.object_),
        (pa.string(), np.object_),
        (pa.list_(pa.int8()), np.object_),
    ]
    for arrow_type, numpy_type in cases:
        assert arrow_type.to_pandas_dtype() == numpy_type
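A minimal sketch of the date mapping this test pins down (assuming the pyarrow/numpy versions the test targets): both Arrow date types advertise numpy's nanosecond datetime dtype to pandas.

import numpy as np
import pyarrow as pa

# Day- and millisecond-resolution Arrow dates both map to datetime64[ns]
assert pa.date32().to_pandas_dtype() == np.dtype('datetime64[ns]')
assert pa.date64().to_pandas_dtype() == np.dtype('datetime64[ns]')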
Example 2
def test_sequence_timestamp_from_int_with_unit():
    data = [1]

    s = pa.timestamp('s')
    ms = pa.timestamp('ms')
    us = pa.timestamp('us')
    ns = pa.timestamp('ns')

    arr_s = pa.array(data, type=s)
    assert len(arr_s) == 1
    assert arr_s.type == s
    assert str(arr_s[0]) == "Timestamp('1970-01-01 00:00:01')"

    arr_ms = pa.array(data, type=ms)
    assert len(arr_ms) == 1
    assert arr_ms.type == ms
    assert str(arr_ms[0]) == "Timestamp('1970-01-01 00:00:00.001000')"

    arr_us = pa.array(data, type=us)
    assert len(arr_us) == 1
    assert arr_us.type == us
    assert str(arr_us[0]) == "Timestamp('1970-01-01 00:00:00.000001')"

    arr_ns = pa.array(data, type=ns)
    assert len(arr_ns) == 1
    assert arr_ns.type == ns
    assert str(arr_ns[0]) == "Timestamp('1970-01-01 00:00:00.000000001')"

    class CustomClass:
        pass

    # Each case must raise on its own; a single raises() block would stop
    # at the first failing call and never exercise the date types.
    for ty in (ns, pa.date32(), pa.date64()):
        with pytest.raises(pa.ArrowException):
            pa.array([1, CustomClass()], type=ty)
Example 3
def test_empty_cast():
    types = [
        pa.null(),
        pa.bool_(),
        pa.int8(),
        pa.int16(),
        pa.int32(),
        pa.int64(),
        pa.uint8(),
        pa.uint16(),
        pa.uint32(),
        pa.uint64(),
        pa.float16(),
        pa.float32(),
        pa.float64(),
        pa.date32(),
        pa.date64(),
        pa.binary(),
        pa.binary(length=4),
        pa.string(),
    ]

    for (t1, t2) in itertools.product(types, types):
        try:
            # ARROW-4766: ensure that casts between supported types don't
            # segfault on empty arrays
            pa.array([], type=t1).cast(t2)
        except pa.lib.ArrowNotImplementedError:
            continue
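For instance, one pair out of that product (a sketch; exact cast support varies by pyarrow version):

import pyarrow as pa

# An empty int32 array casts to date32 without touching any values
empty = pa.array([], type=pa.int32())
assert empty.cast(pa.date32()).type == pa.date32()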
Example 4
def test_type_schema_pickling():
    cases = [
        pa.int8(),
        pa.string(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.string()),
        pa.struct([
            pa.field('a', 'int8'),
            pa.field('b', 'string')
        ]),
        pa.time32('s'),
        pa.time64('us'),
        pa.date32(),
        pa.date64(),
        pa.timestamp('ms'),
        pa.timestamp('ns'),
        pa.decimal(12, 2),
        pa.field('a', 'string', metadata={b'foo': b'bar'})
    ]

    for val in cases:
        roundtripped = pickle.loads(pickle.dumps(val))
        assert val == roundtripped

    fields = []
    for i, f in enumerate(cases):
        if isinstance(f, pa.Field):
            fields.append(f)
        else:
            fields.append(pa.field('_f{}'.format(i), f))

    schema = pa.schema(fields, metadata={b'foo': b'bar'})
    roundtripped = pickle.loads(pickle.dumps(schema))
    assert schema == roundtripped
Example 5
    def test_datetime64_to_date32(self):
        # ARROW-1718
        arr = pa.array([date(2017, 10, 23), None])
        c = pa.Column.from_array("d", arr)
        s = c.to_pandas()

        arr2 = pa.Array.from_pandas(s, type=pa.date32())

        assert arr2.equals(arr.cast('date32'))
Example 6
def test_sequence_date32():
    data = [datetime.date(2000, 1, 1), None]
    arr = pa.array(data, type=pa.date32())

    data2 = [10957, None]
    arr2 = pa.array(data2, type=pa.date32())

    for x in [arr, arr2]:
        assert len(x) == 2
        assert x.type == pa.date32()
        assert x.null_count == 1
        assert x[0].as_py() == datetime.date(2000, 1, 1)
        assert x[1] is pa.NA

    # Overflow
    data3 = [2**32, None]
    with pytest.raises(pa.ArrowException):
        pa.array(data3, type=pa.date32())
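The 10957 in data2 is not arbitrary: date32 stores days since the UNIX epoch, and 2000-01-01 falls exactly 10957 days after 1970-01-01.

import datetime

# date32 values are day counts relative to 1970-01-01
assert (datetime.date(2000, 1, 1) - datetime.date(1970, 1, 1)).days == 10957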
Example 7
def test_sequence_date():
    data = [datetime.date(2000, 1, 1), None, datetime.date(1970, 1, 1),
            datetime.date(2040, 2, 26)]
    arr = pa.array(data)
    assert len(arr) == 4
    assert arr.type == pa.date32()
    assert arr.null_count == 1
    assert arr[0].as_py() == datetime.date(2000, 1, 1)
    assert arr[1].as_py() is None
    assert arr[2].as_py() == datetime.date(1970, 1, 1)
    assert arr[3].as_py() == datetime.date(2040, 2, 26)
Example 8
def test_date_time_types():
    t1 = pa.date32()
    data1 = np.array([17259, 17260, 17261], dtype='int32')
    a1 = pa.Array.from_pandas(data1, type=t1)

    t2 = pa.date64()
    data2 = data1.astype('int64') * 86400000
    a2 = pa.Array.from_pandas(data2, type=t2)

    t3 = pa.timestamp('us')
    start = pd.Timestamp('2000-01-01').value / 1000
    data3 = np.array([start, start + 1, start + 2], dtype='int64')
    a3 = pa.Array.from_pandas(data3, type=t3)

    t4 = pa.time32('ms')
    data4 = np.arange(3, dtype='i4')
    a4 = pa.Array.from_pandas(data4, type=t4)

    t5 = pa.time64('us')
    a5 = pa.Array.from_pandas(data4.astype('int64'), type=t5)

    t6 = pa.time32('s')
    a6 = pa.Array.from_pandas(data4, type=t6)

    ex_t6 = pa.time32('ms')
    ex_a6 = pa.Array.from_pandas(data4 * 1000, type=ex_t6)

    table = pa.Table.from_arrays([a1, a2, a3, a4, a5, a6],
                                 ['date32', 'date64', 'timestamp[us]',
                                  'time32[s]', 'time64[us]',
                                  'time32_from64[s]'])

    # date64 as date32
    # time32[s] to time32[ms]
    expected = pa.Table.from_arrays([a1, a1, a3, a4, a5, ex_a6],
                                    ['date32', 'date64', 'timestamp[us]',
                                     'time32[s]', 'time64[us]',
                                     'time32_from64[s]'])

    _check_roundtrip(table, expected=expected, version='2.0')

    # Unsupported stuff
    def _assert_unsupported(array):
        table = pa.Table.from_arrays([array], ['unsupported'])
        buf = io.BytesIO()

        with pytest.raises(NotImplementedError):
            _write_table(table, buf, version="2.0")

    t7 = pa.time64('ns')
    a7 = pa.Array.from_pandas(data4.astype('int64'), type=t7)

    _assert_unsupported(a7)
Example 9
def cast_series(s, t):
    if type(t) == pa.TimestampType:
        # NOTE: convert to 'us' with astype here; the unit is ignored in
        # `from_pandas`, see ARROW-1680
        return _check_series_convert_timestamps_internal(s.fillna(0))\
            .values.astype('datetime64[us]', copy=False)
    elif t == pa.date32():
        # TODO: this converts the series to Python objects, possibly avoid
        # with Arrow >= 0.8
        return s.dt.date
    elif t is None or s.dtype == t.to_pandas_dtype():
        return s
    else:
        return s.fillna(0).astype(t.to_pandas_dtype(), copy=False)
Example 10
    def test_dates_from_integers(self):
        t1 = pa.date32()
        t2 = pa.date64()

        arr = np.array([17259, 17260, 17261], dtype='int32')
        arr2 = arr.astype('int64') * 86400000

        a1 = pa.array(arr, type=t1)
        a2 = pa.array(arr2, type=t2)

        expected = date(2017, 4, 3)
        assert a1[0].as_py() == expected
        assert a2[0].as_py() == expected
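The constants decode the same way: 17259 is a day count, and 86400000 is the number of milliseconds per day, which is why the date64 input is the date32 input scaled by it.

import datetime

assert datetime.date(1970, 1, 1) + datetime.timedelta(days=17259) == \
    datetime.date(2017, 4, 3)
assert 24 * 60 * 60 * 1000 == 86400000  # ms per day, used for date64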
Example 11
def test_date_infer(self):
    df = pd.DataFrame({
        'date': [date(2000, 1, 1),
                 None,
                 date(1970, 1, 1),
                 date(2040, 2, 26)]})
    table = pa.Table.from_pandas(df, preserve_index=False)
    field = pa.field('date', pa.date32())
    schema = pa.schema([field])
    assert table.schema.equals(schema)
    result = table.to_pandas()
    expected = df.copy()
    expected['date'] = pd.to_datetime(df['date'])
    tm.assert_frame_equal(result, expected)
Example 12
def test_datetime_subclassing():
    class MyDate(datetime.date):
        pass
    data = [
        MyDate(2007, 7, 13),
    ]
    date_type = pa.date32()
    arr_date = pa.array(data, type=date_type)
    assert len(arr_date) == 1
    assert arr_date.type == date_type
    assert arr_date[0].as_py() == datetime.date(2007, 7, 13)

    class MyDatetime(datetime.datetime):
        pass

    data = [
        MyDatetime(2007, 7, 13, 1, 23, 34, 123456),
    ]

    s = pa.timestamp('s')
    ms = pa.timestamp('ms')
    us = pa.timestamp('us')
    ns = pa.timestamp('ns')

    arr_s = pa.array(data, type=s)
    assert len(arr_s) == 1
    assert arr_s.type == s
    assert arr_s[0].as_py() == datetime.datetime(2007, 7, 13, 1,
                                                 23, 34, 0)

    arr_ms = pa.array(data, type=ms)
    assert len(arr_ms) == 1
    assert arr_ms.type == ms
    assert arr_ms[0].as_py() == datetime.datetime(2007, 7, 13, 1,
                                                  23, 34, 123000)

    arr_us = pa.array(data, type=us)
    assert len(arr_us) == 1
    assert arr_us.type == us
    assert arr_us[0].as_py() == datetime.datetime(2007, 7, 13, 1,
                                                  23, 34, 123456)

    arr_ns = pa.array(data, type=ns)
    assert len(arr_ns) == 1
    assert arr_ns.type == ns
    assert arr_ns[0].as_py() == datetime.datetime(2007, 7, 13, 1,
                                                  23, 34, 123456)
Example 13
File: jvm.py Project: rok/arrow
def _from_jvm_date_type(jvm_type):
    """
    Convert a JVM date type to its Python equivalent

    Parameters
    ----------
    jvm_type: org.apache.arrow.vector.types.pojo.ArrowType$Date

    Returns
    -------
    typ: pyarrow.DataType
    """
    day_unit = jvm_type.getUnit().toString()
    if day_unit == 'DAY':
        return pa.date32()
    elif day_unit == 'MILLISECOND':
        return pa.date64()
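A sketch of the mapping in action, using hypothetical stand-ins for the JVM objects (the real arguments come from org.apache.arrow.vector.types.pojo through a JVM bridge):

class FakeUnit:  # hypothetical stub mimicking the JVM DateUnit enum
    def __init__(self, name):
        self.name = name

    def toString(self):
        return self.name


class FakeJvmDateType:  # hypothetical stub mimicking ArrowType$Date
    def __init__(self, unit):
        self._unit = FakeUnit(unit)

    def getUnit(self):
        return self._unit


assert _from_jvm_date_type(FakeJvmDateType('DAY')) == pa.date32()
assert _from_jvm_date_type(FakeJvmDateType('MILLISECOND')) == pa.date64()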
Example 14
def test_bq_to_arrow_data_type_w_struct(module_under_test, bq_type):
    fields = (
        schema.SchemaField("field01", "STRING"),
        schema.SchemaField("field02", "BYTES"),
        schema.SchemaField("field03", "INTEGER"),
        schema.SchemaField("field04", "INT64"),
        schema.SchemaField("field05", "FLOAT"),
        schema.SchemaField("field06", "FLOAT64"),
        schema.SchemaField("field07", "NUMERIC"),
        schema.SchemaField("field08", "BOOLEAN"),
        schema.SchemaField("field09", "BOOL"),
        schema.SchemaField("field10", "TIMESTAMP"),
        schema.SchemaField("field11", "DATE"),
        schema.SchemaField("field12", "TIME"),
        schema.SchemaField("field13", "DATETIME"),
        schema.SchemaField("field14", "GEOGRAPHY"),
    )
    field = schema.SchemaField("ignored_name", bq_type, mode="NULLABLE", fields=fields)
    actual = module_under_test.bq_to_arrow_data_type(field)
    expected = pyarrow.struct(
        (
            pyarrow.field("field01", pyarrow.string()),
            pyarrow.field("field02", pyarrow.binary()),
            pyarrow.field("field03", pyarrow.int64()),
            pyarrow.field("field04", pyarrow.int64()),
            pyarrow.field("field05", pyarrow.float64()),
            pyarrow.field("field06", pyarrow.float64()),
            pyarrow.field("field07", module_under_test.pyarrow_numeric()),
            pyarrow.field("field08", pyarrow.bool_()),
            pyarrow.field("field09", pyarrow.bool_()),
            pyarrow.field("field10", module_under_test.pyarrow_timestamp()),
            pyarrow.field("field11", pyarrow.date32()),
            pyarrow.field("field12", module_under_test.pyarrow_time()),
            pyarrow.field("field13", module_under_test.pyarrow_datetime()),
            pyarrow.field("field14", pyarrow.string()),
        )
    )
    assert pyarrow.types.is_struct(actual)
    assert actual.num_children == len(fields)
    assert actual.equals(expected)
Example 15
def test_types_hashable():
    types = [
        pa.null(),
        pa.int32(),
        pa.time32('s'),
        pa.time64('us'),
        pa.date32(),
        pa.timestamp('us'),
        pa.string(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.int32()),
        pa.struct([pa.field('a', pa.int32()),
                   pa.field('b', pa.int8()),
                   pa.field('c', pa.string())])
    ]

    in_dict = {}
    for i, type_ in enumerate(types):
        assert hash(type_) == hash(type_)
        in_dict[type_] = i
        assert in_dict[type_] == i
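Hashability is what makes type-dispatch tables work; a tiny sketch:

import pyarrow as pa

# Equal types hash equally, so a DataType can key a lookup table
handlers = {pa.date32(): 'day-resolution date'}
assert handlers[pa.date32()] == 'day-resolution date'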
Example 16
def test_sql_types(redshift_table):
    table = redshift_table
    df = get_df()
    df.drop(["binary"], axis=1, inplace=True)
    con = wr.redshift.connect("aws-data-wrangler-redshift")
    wr.redshift.to_sql(
        df=df,
        con=con,
        table=table,
        schema="public",
        mode="overwrite",
        index=True,
        dtype={"iint32": "INTEGER"},
    )
    df = wr.redshift.read_sql_query(f"SELECT * FROM public.{table}", con)
    ensure_data_types(df, has_list=False)
    dfs = wr.redshift.read_sql_query(
        sql=f"SELECT * FROM public.{table}",
        con=con,
        chunksize=1,
        dtype={
            "iint8": pa.int8(),
            "iint16": pa.int16(),
            "iint32": pa.int32(),
            "iint64": pa.int64(),
            "float": pa.float32(),
            "ddouble": pa.float64(),
            "decimal": pa.decimal128(3, 2),
            "string_object": pa.string(),
            "string": pa.string(),
            "date": pa.date32(),
            "timestamp": pa.timestamp(unit="ns"),
            "binary": pa.binary(),
            "category": pa.float64(),
        },
    )
    for df in dfs:
        ensure_data_types(df, has_list=False)
    con.close()
Example 17
    def test_date_objects_typed(self):
        arr = np.array([
            datetime.date(2017, 4, 3), None,
            datetime.date(2017, 4, 4),
            datetime.date(2017, 4, 5)
        ],
                       dtype=object)

        arr_i4 = np.array([17259, -1, 17260, 17261], dtype='int32')
        arr_i8 = arr_i4.astype('int64') * 86400000
        mask = np.array([False, True, False, False])

        t32 = pa.date32()
        t64 = pa.date64()

        a32 = pa.Array.from_pandas(arr, type=t32)
        a64 = pa.Array.from_pandas(arr, type=t64)

        a32_expected = pa.Array.from_pandas(arr_i4, mask=mask, type=t32)
        a64_expected = pa.Array.from_pandas(arr_i8, mask=mask, type=t64)

        assert a32.equals(a32_expected)
        assert a64.equals(a64_expected)

        # Test converting back to pandas
        colnames = ['date32', 'date64']
        table = pa.Table.from_arrays([a32, a64], colnames)
        table_pandas = table.to_pandas()

        ex_values = (np.array(
            ['2017-04-03', '2017-04-04', '2017-04-04', '2017-04-05'],
            dtype='datetime64[D]').astype('datetime64[ns]'))
        ex_values[1] = pd.NaT.value
        expected_pandas = pd.DataFrame(
            {
                'date32': ex_values,
                'date64': ex_values
            }, columns=colnames)
        tm.assert_frame_equal(table_pandas, expected_pandas)
Example 18
def test_type_for_alias():
    cases = [
        ('i1', pa.int8()),
        ('int8', pa.int8()),
        ('i2', pa.int16()),
        ('int16', pa.int16()),
        ('i4', pa.int32()),
        ('int32', pa.int32()),
        ('i8', pa.int64()),
        ('int64', pa.int64()),
        ('u1', pa.uint8()),
        ('uint8', pa.uint8()),
        ('u2', pa.uint16()),
        ('uint16', pa.uint16()),
        ('u4', pa.uint32()),
        ('uint32', pa.uint32()),
        ('u8', pa.uint64()),
        ('uint64', pa.uint64()),
        ('f4', pa.float32()),
        ('float32', pa.float32()),
        ('f8', pa.float64()),
        ('float64', pa.float64()),
        ('date32', pa.date32()),
        ('date64', pa.date64()),
        ('string', pa.string()),
        ('str', pa.string()),
        ('binary', pa.binary()),
        ('time32[s]', pa.time32('s')),
        ('time32[ms]', pa.time32('ms')),
        ('time64[us]', pa.time64('us')),
        ('time64[ns]', pa.time64('ns')),
        ('timestamp[s]', pa.timestamp('s')),
        ('timestamp[ms]', pa.timestamp('ms')),
        ('timestamp[us]', pa.timestamp('us')),
        ('timestamp[ns]', pa.timestamp('ns')),
    ]

    for val, expected in cases:
        assert pa.type_for_alias(val) == expected
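The same aliases are accepted anywhere pyarrow coerces a string to a type, for example when building fields (a small sketch):

import pyarrow as pa

assert pa.type_for_alias('date32') == pa.date32()
assert pa.field('d', 'date32').type == pa.date32()  # alias accepted by pa.field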
Example 20
def test_sql_types(sqlserver_table, sqlserver_con):
    table = sqlserver_table
    df = get_df()
    df.drop(["binary"], axis=1, inplace=True)
    wr.sqlserver.to_sql(
        df=df,
        con=sqlserver_con,
        table=table,
        schema="dbo",
        mode="overwrite",
        index=True,
        dtype={"iint32": "INTEGER"},
    )
    df = wr.sqlserver.read_sql_query(f"SELECT * FROM dbo.{table}",
                                     sqlserver_con)
    ensure_data_types(df, has_list=False)
    dfs = wr.sqlserver.read_sql_query(
        sql=f"SELECT * FROM dbo.{table}",
        con=sqlserver_con,
        chunksize=1,
        dtype={
            "iint8": pa.int8(),
            "iint16": pa.int16(),
            "iint32": pa.int32(),
            "iint64": pa.int64(),
            "float": pa.float32(),
            "ddouble": pa.float64(),
            "decimal": pa.decimal128(3, 2),
            "string_object": pa.string(),
            "string": pa.string(),
            "date": pa.date32(),
            "timestamp": pa.timestamp(unit="ns"),
            "binary": pa.binary(),
            "category": pa.float64(),
        },
    )
    for df in dfs:
        ensure_data_types(df, has_list=False)
Example 21
def _convert_to_parquet(self):
    import pyarrow as pa
    # from .utils import PandasPyArrowCursor
    pq_schema = []
    GLUE_HIVE_TO_PQ_MAPPING = {
        'bigint': pa.int64(),
        'int': pa.int32(),
        'integer': pa.int32(),
        'smallint': pa.int32(),
        'string': pa.utf8(),
        'boolean': pa.bool_(),
        'float': pa.float32(),
        'double': pa.float64()
    }
    for e in self._metadata['StorageDescriptor']['Columns']:
        comment = json.loads(e.get('Comment', '{}'))
        writer_type = e['Type']
        nullable = comment.get('nullable',
                               True) and (not comment.get('is_pk', False))
        if writer_type in GLUE_HIVE_TO_PQ_MAPPING:
            final_type = GLUE_HIVE_TO_PQ_MAPPING[writer_type]
        elif 'decimal' in writer_type:
            precision, scale = re.match(r'decimal\((\d+),(\d+)\)',
                                        writer_type).groups()
            final_type = pa.decimal128(int(precision), int(scale))
        elif 'timestamp' in writer_type:
            unit = comment.get('unit', 'us')
            final_type = pa.timestamp(unit)
        elif 'date' in writer_type:
            final_type = pa.date32()
        else:
            raise NotImplementedError(
                '{} is not supported'.format(writer_type))
        pq_schema.append(pa.field(e['Name'], final_type,
                                  nullable=nullable))
    final_schema = pa.schema(pq_schema)
    # final_schema = PandasPyArrowCursor.make_pandas_meta(final_schema)
    return final_schema
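The decimal branch leans on a regex to pull precision and scale out of the Hive type string; a standalone sketch of that step:

import re

import pyarrow as pa

precision, scale = re.match(r'decimal\((\d+),(\d+)\)', 'decimal(12,2)').groups()
assert pa.decimal128(int(precision), int(scale)) == pa.decimal128(12, 2)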
Example 22
def test_sequence_timestamp_from_int_with_unit():
    data = [1]

    s = pa.timestamp('s')
    ms = pa.timestamp('ms')
    us = pa.timestamp('us')
    ns = pa.timestamp('ns')

    arr_s = pa.array(data, type=s)
    assert len(arr_s) == 1
    assert arr_s.type == s
    assert repr(arr_s[0]) == "Timestamp('1970-01-01 00:00:01')"
    assert str(arr_s[0]) == "1970-01-01 00:00:01"

    arr_ms = pa.array(data, type=ms)
    assert len(arr_ms) == 1
    assert arr_ms.type == ms
    assert repr(arr_ms[0]) == "Timestamp('1970-01-01 00:00:00.001000')"
    assert str(arr_ms[0]) == "1970-01-01 00:00:00.001000"

    arr_us = pa.array(data, type=us)
    assert len(arr_us) == 1
    assert arr_us.type == us
    assert repr(arr_us[0]) == "Timestamp('1970-01-01 00:00:00.000001')"
    assert str(arr_us[0]) == "1970-01-01 00:00:00.000001"

    arr_ns = pa.array(data, type=ns)
    assert len(arr_ns) == 1
    assert arr_ns.type == ns
    assert repr(arr_ns[0]) == "Timestamp('1970-01-01 00:00:00.000000001')"
    assert str(arr_ns[0]) == "1970-01-01 00:00:00.000000001"

    class CustomClass:
        pass

    # Each case must raise on its own; a single raises() block would stop
    # at the first failing call and never exercise the date types.
    for ty in (ns, pa.date32(), pa.date64()):
        with pytest.raises(pa.ArrowException):
            pa.array([1, CustomClass()], type=ty)
Example 23
def test_type_schema_pickling():
    cases = [
        pa.int8(),
        pa.string(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.string()),
        pa.map_(pa.string(), pa.int8()),
        pa.struct([pa.field('a', 'int8'),
                   pa.field('b', 'string')]),
        pa.union([pa.field('a', pa.int8()),
                  pa.field('b', pa.int16())], pa.lib.UnionMode_SPARSE),
        pa.union([pa.field('a', pa.int8()),
                  pa.field('b', pa.int16())], pa.lib.UnionMode_DENSE),
        pa.time32('s'),
        pa.time64('us'),
        pa.date32(),
        pa.date64(),
        pa.timestamp('ms'),
        pa.timestamp('ns'),
        pa.decimal128(12, 2),
        pa.field('a', 'string', metadata={b'foo': b'bar'})
    ]

    for val in cases:
        roundtripped = pickle.loads(pickle.dumps(val))
        assert val == roundtripped

    fields = []
    for i, f in enumerate(cases):
        if isinstance(f, pa.Field):
            fields.append(f)
        else:
            fields.append(pa.field('_f{}'.format(i), f))

    schema = pa.schema(fields, metadata={b'foo': b'bar'})
    roundtripped = pickle.loads(pickle.dumps(schema))
    assert schema == roundtripped
Example 24
def _build_date_builder(name: str, part: str):
    if part == "date":
        unit = "day"

        def fn(st: time.struct_time) -> datetime.date:
            return datetime.date(st.tm_year, st.tm_mon, st.tm_mday)

    elif part == "dateweek":
        unit = "week"

        def fn(st: time.struct_time) -> datetime.date:
            return datetime.date.fromordinal(
                datetime.date(st.tm_year, st.tm_mon, st.tm_mday).toordinal()
                - st.tm_wday
            )

    elif part == "datemonth":
        unit = "month"

        def fn(st: time.struct_time) -> datetime.date:
            return datetime.date(st.tm_year, st.tm_mon, 1)

    elif part == "datequarter":
        unit = "quarter"

        def fn(st: time.struct_time) -> datetime.date:
            return datetime.date(
                st.tm_year, [0, 1, 1, 1, 4, 4, 4, 7, 7, 7, 10, 10, 10][st.tm_mon], 1
            )

    else:
        unit = "year"

        def fn(st: time.struct_time) -> datetime.date:
            return datetime.date(st.tm_year, 1, 1)

    return pa.field(name, pa.date32(), metadata={"unit": unit}), fn
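A usage sketch for the builder (field name and date are made up for illustration): the 'datequarter' part truncates any day in Q3 to July 1 and tags the resulting date32 field with its unit.

import time

field, fn = _build_date_builder('quarter_start', 'datequarter')
st = time.strptime('2023-08-15', '%Y-%m-%d')
assert fn(st) == datetime.date(2023, 7, 1)    # month 8 falls in Q3
assert field.type == pa.date32()
assert field.metadata[b'unit'] == b'quarter'  # pyarrow stores metadata as bytes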
Example 25
    def test_date_objects_typed(self):
        arr = np.array([
            date(2017, 4, 3),
            None,
            date(2017, 4, 4),
            date(2017, 4, 5)], dtype=object)

        arr_i4 = np.array([17259, -1, 17260, 17261], dtype='int32')
        arr_i8 = arr_i4.astype('int64') * 86400000
        mask = np.array([False, True, False, False])

        t32 = pa.date32()
        t64 = pa.date64()

        a32 = pa.array(arr, type=t32)
        a64 = pa.array(arr, type=t64)

        a32_expected = pa.array(arr_i4, mask=mask, type=t32)
        a64_expected = pa.array(arr_i8, mask=mask, type=t64)

        assert a32.equals(a32_expected)
        assert a64.equals(a64_expected)

        # Test converting back to pandas
        colnames = ['date32', 'date64']
        table = pa.Table.from_arrays([a32, a64], colnames)
        table_pandas = table.to_pandas()

        ex_values = (np.array(['2017-04-03', '2017-04-04', '2017-04-04',
                               '2017-04-05'],
                              dtype='datetime64[D]')
                     .astype('datetime64[ns]'))
        ex_values[1] = pd.NaT.value
        expected_pandas = pd.DataFrame({'date32': ex_values,
                                        'date64': ex_values},
                                       columns=colnames)
        tm.assert_frame_equal(table_pandas, expected_pandas)
Example 26
def get_many_types():
    # Returning these from a function is required because the pa.dictionary
    # type holds a pyarrow array, and test_array.py::test_total_bytes_allocated
    # checks that the default memory pool has zero allocated bytes
    return (
        pa.null(),
        pa.bool_(),
        pa.int32(),
        pa.time32('s'),
        pa.time64('us'),
        pa.date32(),
        pa.timestamp('us'),
        pa.timestamp('us', tz='UTC'),
        pa.timestamp('us', tz='Europe/Paris'),
        pa.float16(),
        pa.float32(),
        pa.float64(),
        pa.decimal128(19, 4),
        pa.string(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.int32()),
        pa.struct([pa.field('a', pa.int32()),
                   pa.field('b', pa.int8()),
                   pa.field('c', pa.string())]),
        pa.struct([pa.field('a', pa.int32(), nullable=False),
                   pa.field('b', pa.int8(), nullable=False),
                   pa.field('c', pa.string())]),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())], mode=pa.lib.UnionMode_DENSE),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())], mode=pa.lib.UnionMode_SPARSE),
        pa.union([pa.field('a', pa.binary(10), nullable=False),
                  pa.field('b', pa.string())], mode=pa.lib.UnionMode_SPARSE),
        pa.dictionary(pa.int32(), pa.string())
    )
Example 27
def test_multiple_factories(multisourcefs):
    src1 = ds.factory('/plain', filesystem=multisourcefs, format='parquet')
    src2 = ds.factory('/schema',
                      filesystem=multisourcefs,
                      format='parquet',
                      partitioning=['week', 'color'])
    src3 = ds.factory('/hive',
                      filesystem=multisourcefs,
                      format='parquet',
                      partitioning='hive')

    assembled = ds.dataset([src1, src2, src3])
    assert isinstance(assembled, ds.Dataset)

    expected_schema = pa.schema([
        ('date', pa.date32()),
        ('index', pa.int64()),
        ('value', pa.float64()),
        ('color', pa.string()),
        ('week', pa.int32()),
        ('month', pa.int32()),
        ('year', pa.int32()),
    ])
    assert assembled.schema.equals(expected_schema)
Example 28
def athena2pyarrow(dtype: str) -> pa.DataType:  # pylint: disable=too-many-return-statements
    """Athena to PyArrow data types conversion."""
    dtype = dtype.lower().replace(" ", "")
    if dtype == "tinyint":
        return pa.int8()
    if dtype == "smallint":
        return pa.int16()
    if dtype in ("int", "integer"):
        return pa.int32()
    if dtype == "bigint":
        return pa.int64()
    if dtype in ("float", "real"):
        return pa.float32()
    if dtype == "double":
        return pa.float64()
    if dtype == "boolean":
        return pa.bool_()
    if (dtype == "string") or dtype.startswith("char") or dtype.startswith("varchar"):
        return pa.string()
    if dtype == "timestamp":
        return pa.timestamp(unit="ns")
    if dtype == "date":
        return pa.date32()
    if dtype in ("binary" or "varbinary"):
        return pa.binary()
    if dtype.startswith("decimal") is True:
        precision, scale = dtype.replace("decimal(", "").replace(")", "").split(sep=",")
        return pa.decimal128(precision=int(precision), scale=int(scale))
    if dtype.startswith("array") is True:
        return pa.list_(value_type=athena2pyarrow(dtype=dtype[6:-1]), list_size=-1)
    if dtype.startswith("struct") is True:
        return pa.struct([(f.split(":", 1)[0], athena2pyarrow(f.split(":", 1)[1])) for f in _split_struct(dtype[7:-1])])
    if dtype.startswith("map") is True:
        parts: List[str] = _split_map(s=dtype[4:-1])
        return pa.map_(athena2pyarrow(parts[0]), athena2pyarrow(parts[1]))
    raise exceptions.UnsupportedType(f"Unsupported Athena type: {dtype}")
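A few sample conversions through the branches above (a sketch):

assert athena2pyarrow('date') == pa.date32()
assert athena2pyarrow('DECIMAL(3, 2)') == pa.decimal128(3, 2)
assert athena2pyarrow('array<date>') == pa.list_(pa.date32())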
Example 29
def test_is_temporal_date_time_timestamp():
    date_types = [pa.date32(), pa.date64()]
    time_types = [pa.time32('s'), pa.time64('ns')]
    timestamp_types = [pa.timestamp('ms')]

    for case in date_types + time_types + timestamp_types:
        assert types.is_temporal(case)

    for case in date_types:
        assert types.is_date(case)
        assert not types.is_time(case)
        assert not types.is_timestamp(case)

    for case in time_types:
        assert types.is_time(case)
        assert not types.is_date(case)
        assert not types.is_timestamp(case)

    for case in timestamp_types:
        assert types.is_timestamp(case)
        assert not types.is_date(case)
        assert not types.is_time(case)

    assert not types.is_temporal(pa.int32())
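pyarrow.types also exposes exact predicates alongside the categorical ones used here; a one-line sketch:

from pyarrow import types

assert types.is_date32(pa.date32()) and not types.is_date32(pa.date64())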
Example 30
def infer_schema(chunk):
    fields = []

    columns = chunk.columns
    dtypes = chunk.dtypes

    i = 0
    for column in columns:

        dtype = dtypes[i]

        if dtype.name == 'object':
            if column.endswith('date'):
                fields.append(pa.field(column, pa.date32()))
            else:
                fields.append(pa.field(column, pa.string()))
        else:
            fields.append(pa.field(column, pa.type_for_alias(dtype.name)))

        i += 1

    schema = pa.schema(fields)

    return schema
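A sketch of the heuristic in action (hypothetical column names; Schema.field name lookup assumes a reasonably recent pyarrow): object columns whose name ends in 'date' become date32, other object columns become string, and the rest go through type_for_alias.

import datetime

import numpy as np
import pandas as pd
import pyarrow as pa

chunk = pd.DataFrame({
    'trade_date': [datetime.date(2020, 1, 1)],  # object dtype, ends in 'date'
    'symbol': ['ABC'],                          # object dtype
    'qty': np.array([7], dtype='int64'),
})
schema = infer_schema(chunk)
assert schema.field('trade_date').type == pa.date32()
assert schema.field('symbol').type == pa.string()
assert schema.field('qty').type == pa.int64()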
Example 31
def test_index_as_flat_series_date():
    index1 = ExplicitSecondaryIndex(
        column="col",
        index_dct={
            datetime.date(2017, 1, 2): ["part_1", "part_2"],
            datetime.date(2018, 2, 3): ["part_1"],
        },
        dtype=pa.date32(),
    )
    ser = index1.as_flat_series()
    ser = ser.sort_index()
    expected = pd.Series(
        ["part_1", "part_2", "part_1"],
        index=pd.Index(
            [
                datetime.date(2017, 1, 2),
                datetime.date(2017, 1, 2),
                datetime.date(2018, 2, 3),
            ],
            name="col",
        ),
        name="partition",
    )
    assert_series_equal(ser, expected)
Example 33
def get_many_types():
    # Returning these from a function is required because the pa.dictionary
    # type holds a pyarrow array, and test_array.py::test_total_bytes_allocated
    # checks that the default memory pool has zero allocated bytes
    return (
        pa.null(),
        pa.bool_(),
        pa.int32(),
        pa.time32('s'),
        pa.time64('us'),
        pa.date32(),
        pa.timestamp('us'),
        pa.timestamp('us', tz='UTC'),
        pa.timestamp('us', tz='Europe/Paris'),
        pa.float16(),
        pa.float32(),
        pa.float64(),
        pa.decimal128(19, 4),
        pa.string(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.int32()),
        pa.struct([pa.field('a', pa.int32()),
                   pa.field('b', pa.int8()),
                   pa.field('c', pa.string())]),
        pa.struct([pa.field('a', pa.int32(), nullable=False),
                   pa.field('b', pa.int8(), nullable=False),
                   pa.field('c', pa.string())]),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())], mode=pa.lib.UnionMode_DENSE),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())], mode=pa.lib.UnionMode_SPARSE),
        pa.union([pa.field('a', pa.binary(10), nullable=False),
                  pa.field('b', pa.string())], mode=pa.lib.UnionMode_SPARSE),
        pa.dictionary(pa.int32(), pa.array(['a', 'b', 'c']))
    )
Example 34
def test_types_hashable():
    types = [
        pa.null(),
        pa.int32(),
        pa.time32('s'),
        pa.time64('us'),
        pa.date32(),
        pa.timestamp('us'),
        pa.string(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.int32()),
        pa.struct([
            pa.field('a', pa.int32()),
            pa.field('b', pa.int8()),
            pa.field('c', pa.string())
        ])
    ]

    in_dict = {}
    for i, type_ in enumerate(types):
        assert hash(type_) == hash(type_)
        in_dict[type_] = i
        assert in_dict[type_] == i
Example 35
integer_types = st.one_of(signed_integer_types, unsigned_integer_types)

floating_types = st.sampled_from([
    pa.float16(),
    pa.float32(),
    pa.float64()
])
decimal_type = st.builds(
    pa.decimal128,
    precision=st.integers(min_value=1, max_value=38),
    scale=st.integers(min_value=1, max_value=38)
)
numeric_types = st.one_of(integer_types, floating_types, decimal_type)

date_types = st.sampled_from([
    pa.date32(),
    pa.date64()
])
time_types = st.sampled_from([
    pa.time32('s'),
    pa.time32('ms'),
    pa.time64('us'),
    pa.time64('ns')
])
timestamp_types = st.builds(
    pa.timestamp,
    unit=st.sampled_from(['s', 'ms', 'us', 'ns']),
    tz=tzst.timezones()
)
temporal_types = st.one_of(date_types, time_types, timestamp_types)
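The strategies compose, and drawing a single example shows what they produce (a sketch; .example() is meant for interactive exploration only):

# Every draw from date_types is one of the two Arrow date types
assert date_types.example() in (pa.date32(), pa.date64())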
Example 36
    for i, field in enumerate(fields):
        in_dict[field] = i
    assert len(in_dict) == len(fields)
    for i, field in enumerate(fields):
        assert in_dict[field] == i


def test_fields_weakrefable():
    field = pa.field('a', pa.int32())
    wr = weakref.ref(field)
    assert wr() is not None
    del field
    assert wr() is None


@pytest.mark.parametrize('t,check_func', [(pa.date32(), types.is_date32),
                                          (pa.date64(), types.is_date64),
                                          (pa.time32('s'), types.is_time32),
                                          (pa.time64('ns'), types.is_time64),
                                          (pa.int8(), types.is_int8),
                                          (pa.int16(), types.is_int16),
                                          (pa.int32(), types.is_int32),
                                          (pa.int64(), types.is_int64),
                                          (pa.uint8(), types.is_uint8),
                                          (pa.uint16(), types.is_uint16),
                                          (pa.uint32(), types.is_uint32),
                                          (pa.uint64(), types.is_uint64),
                                          (pa.float16(), types.is_float16),
                                          (pa.float32(), types.is_float32),
                                          (pa.float64(), types.is_float64)])
def test_exact_primitive_types(t, check_func):
    assert check_func(t)
Example 37
def test_array_from_numpy_datetimeD():
    arr = np.array([None, datetime.date(2017, 4, 4)], dtype='datetime64[D]')

    result = pa.array(arr)
    expected = pa.array([None, datetime.date(2017, 4, 4)], type=pa.date32())
    assert result.equals(expected)
Example 38
@pytest.mark.parametrize(
    ('type', 'expected'),
    [
        (pa.null(), 'empty'),
        (pa.bool_(), 'bool'),
        (pa.int8(), 'int8'),
        (pa.int16(), 'int16'),
        (pa.int32(), 'int32'),
        (pa.int64(), 'int64'),
        (pa.uint8(), 'uint8'),
        (pa.uint16(), 'uint16'),
        (pa.uint32(), 'uint32'),
        (pa.uint64(), 'uint64'),
        (pa.float16(), 'float16'),
        (pa.float32(), 'float32'),
        (pa.float64(), 'float64'),
        (pa.date32(), 'date'),
        (pa.date64(), 'date'),
        (pa.binary(), 'bytes'),
        (pa.binary(length=4), 'bytes'),
        (pa.string(), 'unicode'),
        (pa.list_(pa.list_(pa.int16())), 'list[list[int16]]'),
        (pa.decimal128(18, 3), 'decimal'),
        (pa.timestamp('ms'), 'datetime'),
        (pa.timestamp('us', 'UTC'), 'datetimetz'),
        (pa.time32('s'), 'time'),
        (pa.time64('us'), 'time')
    ]
)
def test_logical_type(type, expected):
    assert get_logical_type(type) == expected
Example 39
def test_date32_overflow():
    # Overflow
    data3 = [2**32, None]
    with pytest.raises((OverflowError, pa.ArrowException)):
        pa.array(data3, type=pa.date32())
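The overflow is expected because date32 is backed by a signed 32-bit day count, so 2**32 is out of range:

import numpy as np

assert np.iinfo(np.int32).max == 2**31 - 1  # largest valid date32 value
assert 2**32 > np.iinfo(np.int32).max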
Example 40
signed_integer_types = st.sampled_from(
    [pa.int8(), pa.int16(), pa.int32(),
     pa.int64()])
unsigned_integer_types = st.sampled_from(
    [pa.uint8(), pa.uint16(),
     pa.uint32(), pa.uint64()])
integer_types = st.one_of(signed_integer_types, unsigned_integer_types)

floating_types = st.sampled_from([pa.float16(), pa.float32(), pa.float64()])
decimal_type = st.builds(pa.decimal128,
                         precision=st.integers(min_value=0, max_value=38),
                         scale=st.integers(min_value=0, max_value=38))
numeric_types = st.one_of(integer_types, floating_types, decimal_type)

date_types = st.sampled_from([pa.date32(), pa.date64()])
time_types = st.sampled_from(
    [pa.time32('s'),
     pa.time32('ms'),
     pa.time64('us'),
     pa.time64('ns')])
timestamp_types = st.builds(pa.timestamp,
                            unit=st.sampled_from(['s', 'ms', 'us', 'ns']),
                            tz=tzst.timezones())
temporal_types = st.one_of(date_types, time_types, timestamp_types)

primitive_types = st.one_of(null_type, bool_type, binary_type, string_type,
                            numeric_types, temporal_types)

metadata = st.dictionaries(st.text(), st.text())
Example 41
def test_date_time_types():
    t1 = pa.date32()
    data1 = np.array([17259, 17260, 17261], dtype='int32')
    a1 = pa.array(data1, type=t1)

    t2 = pa.date64()
    data2 = data1.astype('int64') * 86400000
    a2 = pa.array(data2, type=t2)

    t3 = pa.timestamp('us')
    start = pd.Timestamp('2000-01-01').value / 1000
    data3 = np.array([start, start + 1, start + 2], dtype='int64')
    a3 = pa.array(data3, type=t3)

    t4 = pa.time32('ms')
    data4 = np.arange(3, dtype='i4')
    a4 = pa.array(data4, type=t4)

    t5 = pa.time64('us')
    a5 = pa.array(data4.astype('int64'), type=t5)

    t6 = pa.time32('s')
    a6 = pa.array(data4, type=t6)

    ex_t6 = pa.time32('ms')
    ex_a6 = pa.array(data4 * 1000, type=ex_t6)

    t7 = pa.timestamp('ns')
    start = pd.Timestamp('2001-01-01').value
    data7 = np.array([start, start + 1000, start + 2000],
                     dtype='int64')
    a7 = pa.array(data7, type=t7)

    t7_us = pa.timestamp('us')
    start = pd.Timestamp('2001-01-01').value
    data7_us = np.array([start, start + 1000, start + 2000],
                        dtype='int64') // 1000
    a7_us = pa.array(data7_us, type=t7_us)

    table = pa.Table.from_arrays([a1, a2, a3, a4, a5, a6, a7],
                                 ['date32', 'date64', 'timestamp[us]',
                                  'time32[s]', 'time64[us]',
                                  'time32_from64[s]',
                                  'timestamp[ns]'])

    # date64 as date32
    # time32[s] to time32[ms]
    # 'timestamp[ns]' to 'timestamp[us]'
    expected = pa.Table.from_arrays([a1, a1, a3, a4, a5, ex_a6, a7_us],
                                    ['date32', 'date64', 'timestamp[us]',
                                     'time32[s]', 'time64[us]',
                                     'time32_from64[s]',
                                     'timestamp[ns]'])

    _check_roundtrip(table, expected=expected, version='2.0')

    # date64 as date32
    # time32[s] to time32[ms]
    # 'timestamp[ns]' is saved as INT96 timestamp
    expected = pa.Table.from_arrays([a1, a1, a3, a4, a5, ex_a6, a7],
                                    ['date32', 'date64', 'timestamp[us]',
                                     'time32[s]', 'time64[us]',
                                     'time32_from64[s]',
                                     'timestamp[ns]'])

    _check_roundtrip(table, expected=expected, version='2.0',
                     use_deprecated_int96_timestamps=True)

    # Check that setting flavor to 'spark' uses int96 timestamps
    _check_roundtrip(table, expected=expected, version='2.0',
                     flavor='spark')

    # Unsupported stuff
    def _assert_unsupported(array):
        table = pa.Table.from_arrays([array], ['unsupported'])
        buf = io.BytesIO()

        with pytest.raises(NotImplementedError):
            _write_table(table, buf, version="2.0")

    t7 = pa.time64('ns')
    a7 = pa.array(data4.astype('int64'), type=t7)

    _assert_unsupported(a7)
Example 43
def test_sequence_date():
    data = [datetime.date(2000, 1, 1), None, datetime.date(1970, 1, 1),
            datetime.date(2040, 2, 26)]
    arr = pa.array(data)
    assert len(arr) == 4
    assert arr.type == pa.date32()
    assert arr.null_count == 1
    assert arr[0].as_py() == datetime.date(2000, 1, 1)
    assert arr[1].as_py() is None
    assert arr[2].as_py() == datetime.date(1970, 1, 1)
    assert arr[3].as_py() == datetime.date(2040, 2, 26)


@pytest.mark.parametrize('input',
                         [(pa.date32(), [10957, None]),
                          (pa.date64(), [10957 * 86400000, None])])
def test_sequence_explicit_types(input):
    t, ex_values = input
    data = [datetime.date(2000, 1, 1), None]
    arr = pa.array(data, type=t)
    arr2 = pa.array(ex_values, type=t)

    for x in [arr, arr2]:
        assert len(x) == 2
        assert x.type == t
        assert x.null_count == 1
        assert x[0].as_py() == datetime.date(2000, 1, 1)
        assert x[1] is pa.NA
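The two parametrized payloads encode the same date: the date64 value is just the date32 day count scaled to milliseconds.

# 10957 days since the epoch, expressed in milliseconds for date64
assert 10957 * 86400000 == 946684800000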

Example 44
def test_date_time_types():
    t1 = pa.date32()
    data1 = np.array([17259, 17260, 17261], dtype='int32')
    a1 = pa.array(data1, type=t1)

    t2 = pa.date64()
    data2 = data1.astype('int64') * 86400000
    a2 = pa.array(data2, type=t2)

    t3 = pa.timestamp('us')
    start = pd.Timestamp('2000-01-01').value / 1000
    data3 = np.array([start, start + 1, start + 2], dtype='int64')
    a3 = pa.array(data3, type=t3)

    t4 = pa.time32('ms')
    data4 = np.arange(3, dtype='i4')
    a4 = pa.array(data4, type=t4)

    t5 = pa.time64('us')
    a5 = pa.array(data4.astype('int64'), type=t5)

    t6 = pa.time32('s')
    a6 = pa.array(data4, type=t6)

    ex_t6 = pa.time32('ms')
    ex_a6 = pa.array(data4 * 1000, type=ex_t6)

    t7 = pa.timestamp('ns')
    start = pd.Timestamp('2001-01-01').value
    data7 = np.array([start, start + 1000, start + 2000], dtype='int64')
    a7 = pa.array(data7, type=t7)

    t7_us = pa.timestamp('us')
    start = pd.Timestamp('2001-01-01').value
    data7_us = np.array([start, start + 1000, start + 2000],
                        dtype='int64') // 1000
    a7_us = pa.array(data7_us, type=t7_us)

    table = pa.Table.from_arrays([a1, a2, a3, a4, a5, a6, a7], [
        'date32', 'date64', 'timestamp[us]', 'time32[s]', 'time64[us]',
        'time32_from64[s]', 'timestamp[ns]'
    ])

    # date64 as date32
    # time32[s] to time32[ms]
    # 'timestamp[ns]' to 'timestamp[us]'
    expected = pa.Table.from_arrays([a1, a1, a3, a4, a5, ex_a6, a7_us], [
        'date32', 'date64', 'timestamp[us]', 'time32[s]', 'time64[us]',
        'time32_from64[s]', 'timestamp[ns]'
    ])

    _check_roundtrip(table, expected=expected, version='2.0')

    # date64 as date32
    # time32[s] to time32[ms]
    # 'timestamp[ns]' is saved as INT96 timestamp
    expected = pa.Table.from_arrays([a1, a1, a3, a4, a5, ex_a6, a7], [
        'date32', 'date64', 'timestamp[us]', 'time32[s]', 'time64[us]',
        'time32_from64[s]', 'timestamp[ns]'
    ])

    _check_roundtrip(table,
                     expected=expected,
                     version='2.0',
                     use_deprecated_int96_timestamps=True)

    # Check that setting flavor to 'spark' uses int96 timestamps
    _check_roundtrip(table, expected=expected, version='2.0', flavor='spark')

    # Unsupported stuff
    def _assert_unsupported(array):
        table = pa.Table.from_arrays([array], ['unsupported'])
        buf = io.BytesIO()

        with pytest.raises(NotImplementedError):
            _write_table(table, buf, version="2.0")

    t7 = pa.time64('ns')
    a7 = pa.array(data4.astype('int64'), type=t7)

    _assert_unsupported(a7)
Example 45
def test_datetime_subclassing():
    data = [
        MyDate(2007, 7, 13),
    ]
    date_type = pa.date32()
    arr_date = pa.array(data, type=date_type)
    assert len(arr_date) == 1
    assert arr_date.type == date_type
    assert arr_date[0].as_py() == datetime.date(2007, 7, 13)

    data = [
        MyDatetime(2007, 7, 13, 1, 23, 34, 123456),
    ]

    s = pa.timestamp('s')
    ms = pa.timestamp('ms')
    us = pa.timestamp('us')

    arr_s = pa.array(data, type=s)
    assert len(arr_s) == 1
    assert arr_s.type == s
    assert arr_s[0].as_py() == datetime.datetime(2007, 7, 13, 1,
                                                 23, 34, 0)

    arr_ms = pa.array(data, type=ms)
    assert len(arr_ms) == 1
    assert arr_ms.type == ms
    assert arr_ms[0].as_py() == datetime.datetime(2007, 7, 13, 1,
                                                  23, 34, 123000)

    arr_us = pa.array(data, type=us)
    assert len(arr_us) == 1
    assert arr_us.type == us
    assert arr_us[0].as_py() == datetime.datetime(2007, 7, 13, 1,
                                                  23, 34, 123456)

    data = [
        MyTimedelta(123, 456, 1002),
    ]

    s = pa.duration('s')
    ms = pa.duration('ms')
    us = pa.duration('us')

    arr_s = pa.array(data)
    assert len(arr_s) == 1
    assert arr_s.type == us
    assert arr_s[0].as_py() == datetime.timedelta(123, 456, 1002)

    arr_s = pa.array(data, type=s)
    assert len(arr_s) == 1
    assert arr_s.type == s
    assert arr_s[0].as_py() == datetime.timedelta(123, 456)

    arr_ms = pa.array(data, type=ms)
    assert len(arr_ms) == 1
    assert arr_ms.type == ms
    assert arr_ms[0].as_py() == datetime.timedelta(123, 456, 1000)

    arr_us = pa.array(data, type=us)
    assert len(arr_us) == 1
    assert arr_us.type == us
    assert arr_us[0].as_py() == datetime.timedelta(123, 456, 1002)
Example 46
'''Asset definitions for the simple_lakehouse example.'''
import pandas as pd
from lakehouse import Column, computed_table, source_table
from pyarrow import date32, float64, string

sfo_q2_weather_sample_table = source_table(
    storage_key='filesystem',
    path=('data', ),
    columns=[Column('tmpf', float64()),
             Column('valid_date', string())],
)


@computed_table(
    storage_key='filesystem',
    input_assets=[sfo_q2_weather_sample_table],
    columns=[Column('valid_date', date32()),
             Column('max_tmpf', float64())],
)
def daily_temperature_highs_table(
        sfo_q2_weather_sample: pd.DataFrame) -> pd.DataFrame:
    '''Computes the temperature high for each day'''
    sfo_q2_weather_sample['valid_date'] = pd.to_datetime(
        sfo_q2_weather_sample['valid'])
    return sfo_q2_weather_sample.groupby('valid_date').max().rename(
        columns={'tmpf': 'max_tmpf'})
Example 47
    arr = pa.array(narr)
    assert narr.dtype == arr.to_numpy().dtype
    np.testing.assert_array_equal(narr, arr.to_numpy())
    np.testing.assert_array_equal(narr[:6], arr[:6].to_numpy())
    np.testing.assert_array_equal(narr[2:], arr[2:].to_numpy())
    np.testing.assert_array_equal(narr[2:6], arr[2:6].to_numpy())


@pytest.mark.parametrize(
    ('type', 'expected'),
    [(pa.null(), 'empty'), (pa.bool_(), 'bool'), (pa.int8(), 'int8'),
     (pa.int16(), 'int16'), (pa.int32(), 'int32'), (pa.int64(), 'int64'),
     (pa.uint8(), 'uint8'), (pa.uint16(), 'uint16'), (pa.uint32(), 'uint32'),
     (pa.uint64(), 'uint64'), (pa.float16(), 'float16'),
     (pa.float32(), 'float32'), (pa.float64(), 'float64'),
     (pa.date32(), 'date'), (pa.date64(), 'date'), (pa.binary(), 'bytes'),
     (pa.binary(length=4), 'bytes'), (pa.string(), 'unicode'),
     (pa.list_(pa.list_(pa.int16())), 'list[list[int16]]'),
     (pa.decimal128(18, 3), 'decimal'), (pa.timestamp('ms'), 'datetime'),
     (pa.timestamp('us', 'UTC'), 'datetimetz'), (pa.time32('s'), 'time'),
     (pa.time64('us'), 'time')])
def test_logical_type(type, expected):
    assert get_logical_type(type) == expected


def test_array_uint64_from_py_over_range():
    arr = pa.array([2**63], type=pa.uint64())
    expected = pa.array(np.array([2**63], dtype='u8'))
    assert arr.equals(expected)

Example 48
def test_sql(redshift_table, postgresql_table, mysql_table,
             databases_parameters, db_type):
    if db_type == "postgresql":
        table = postgresql_table
    elif db_type == "mysql":
        table = mysql_table
    else:
        table = redshift_table
    df = get_df()
    if db_type == "redshift":
        df.drop(["binary"], axis=1, inplace=True)
    engine = wr.catalog.get_engine(connection=f"aws-data-wrangler-{db_type}",
                                   echo=False)
    index = engine.name == "redshift"
    wr.db.to_sql(
        df=df,
        con=engine,
        name=table,
        schema=databases_parameters[db_type]["schema"],
        if_exists="replace",
        index=index,
        index_label=None,
        chunksize=None,
        method=None,
        dtype={"iint32": sqlalchemy.types.Integer},
    )
    df = wr.db.read_sql_query(
        sql=f"SELECT * FROM {databases_parameters[db_type]['schema']}.{table}",
        con=engine)
    ensure_data_types(df, has_list=False)
    engine = wr.db.get_engine(
        db_type=db_type,
        host=databases_parameters[db_type]["host"],
        port=databases_parameters[db_type]["port"],
        database=databases_parameters[db_type]["database"],
        user=databases_parameters["user"],
        password=databases_parameters["password"],
        echo=False,
    )
    dfs = wr.db.read_sql_query(
        sql=f"SELECT * FROM {databases_parameters[db_type]['schema']}.{table}",
        con=engine,
        chunksize=1,
        dtype={
            "iint8": pa.int8(),
            "iint16": pa.int16(),
            "iint32": pa.int32(),
            "iint64": pa.int64(),
            "float": pa.float32(),
            "double": pa.float64(),
            "decimal": pa.decimal128(3, 2),
            "string_object": pa.string(),
            "string": pa.string(),
            "date": pa.date32(),
            "timestamp": pa.timestamp(unit="ns"),
            "binary": pa.binary(),
            "category": pa.float64(),
        },
    )
    for df in dfs:
        ensure_data_types(df, has_list=False)
    if db_type != "redshift":
        account_id = boto3.client("sts").get_caller_identity().get("Account")
        engine = wr.catalog.get_engine(
            connection=f"aws-data-wrangler-{db_type}", catalog_id=account_id)
        wr.db.to_sql(
            df=pd.DataFrame({"col0": [1, 2, 3]}, dtype="Int32"),
            con=engine,
            name=table,
            schema=databases_parameters[db_type]["schema"],
            if_exists="replace",
            index=True,
            index_label="index",
        )
        schema = None
        if db_type == "postgresql":
            schema = databases_parameters[db_type]["schema"]
        df = wr.db.read_sql_table(con=engine,
                                  table=table,
                                  schema=schema,
                                  index_col="index")
        assert df.shape == (3, 1)
Example 49
def as_column(arbitrary, nan_as_null=True, dtype=None):
    """Create a Column from an arbitrary object

    Currently support inputs are:

    * ``Column``
    * ``Buffer``
    * ``Series``
    * ``Index``
    * numba device array
    * cuda array interface
    * numpy array
    * pyarrow array
    * pandas.Categorical

    Returns
    -------
    result : subclass of TypedColumnBase
        - CategoricalColumn for pandas.Categorical input.
        - DatetimeColumn for datetime input
        - NumericalColumn for all other inputs.
    """
    from cudf.dataframe import numerical, categorical, datetime, string
    from cudf.dataframe.series import Series
    from cudf.dataframe.index import Index

    if isinstance(arbitrary, Column):
        categories = None
        if hasattr(arbitrary, "categories"):
            categories = arbitrary.categories
        data = build_column(arbitrary.data,
                            arbitrary.dtype,
                            mask=arbitrary.mask,
                            categories=categories)

    elif isinstance(arbitrary, Series):
        data = arbitrary._column

    elif isinstance(arbitrary, Index):
        data = arbitrary._values

    elif isinstance(arbitrary, Buffer):
        data = numerical.NumericalColumn(data=arbitrary, dtype=arbitrary.dtype)

    elif isinstance(arbitrary, nvstrings.nvstrings):
        data = string.StringColumn(data=arbitrary)

    elif cuda.devicearray.is_cuda_ndarray(arbitrary):
        data = as_column(Buffer(arbitrary))
        if (data.dtype in [np.float16, np.float32, np.float64]
                and arbitrary.size > 0):
            if nan_as_null:
                mask = cudautils.mask_from_devary(arbitrary)
                data = data.set_mask(mask)

    elif cuda.is_cuda_array(arbitrary):
        # Use the cuda array interface to create a numba device array by
        # reference
        new_dev_array = cuda.as_cuda_array(arbitrary)

        # Allocate new output array using rmm and copy the numba device array
        # to an rmm owned device array
        out_dev_array = rmm.device_array_like(new_dev_array)
        out_dev_array.copy_to_device(new_dev_array)

        data = as_column(out_dev_array)

    elif isinstance(arbitrary, np.ndarray):
        # CUDF assumes values are always contiguous
        if not arbitrary.flags['C_CONTIGUOUS']:
            arbitrary = np.ascontiguousarray(arbitrary)
        if arbitrary.dtype.kind == 'M':
            data = datetime.DatetimeColumn.from_numpy(arbitrary)
        elif arbitrary.dtype.kind in ('O', 'U'):
            data = as_column(pa.Array.from_pandas(arbitrary))
        else:
            data = as_column(rmm.to_device(arbitrary), nan_as_null=nan_as_null)

    elif isinstance(arbitrary, pa.Array):
        if isinstance(arbitrary, pa.StringArray):
            count = len(arbitrary)
            null_count = arbitrary.null_count

            buffers = arbitrary.buffers()
            # Buffer of actual strings values
            if buffers[2] is not None:
                sbuf = np.frombuffer(buffers[2], dtype='int8')
            else:
                sbuf = np.empty(0, dtype='int8')
            # Buffer of offsets values
            obuf = np.frombuffer(buffers[1], dtype='int32')
            # Buffer of null bitmask
            nbuf = None
            if null_count > 0:
                nbuf = np.frombuffer(buffers[0], dtype='int8')

            data = as_column(
                nvstrings.from_offsets(sbuf,
                                       obuf,
                                       count,
                                       nbuf=nbuf,
                                       ncount=null_count))
        elif isinstance(arbitrary, pa.NullArray):
            new_dtype = dtype
            if (type(dtype) == str and dtype == 'empty') or dtype is None:
                new_dtype = np.dtype(arbitrary.type.to_pandas_dtype())

            if pd.api.types.is_categorical_dtype(new_dtype):
                arbitrary = arbitrary.dictionary_encode()
            else:
                if nan_as_null:
                    arbitrary = arbitrary.cast(_gdf.np_to_pa_dtype(new_dtype))
                else:
                    # casting a null array doesn't make nans valid
                    # so we create one with valid nans from scratch:
                    if new_dtype == np.dtype("object"):
                        arbitrary = utils.scalar_broadcast_to(
                            None, (len(arbitrary), ), dtype=new_dtype)
                    else:
                        arbitrary = utils.scalar_broadcast_to(
                            np.nan, (len(arbitrary), ), dtype=new_dtype)
            data = as_column(arbitrary, nan_as_null=nan_as_null)
        elif isinstance(arbitrary, pa.DictionaryArray):
            pamask, padata = buffers_from_pyarrow(arbitrary)
            data = categorical.CategoricalColumn(
                data=padata,
                mask=pamask,
                null_count=arbitrary.null_count,
                categories=arbitrary.dictionary.to_pylist(),
                ordered=arbitrary.type.ordered,
            )
        elif isinstance(arbitrary, pa.TimestampArray):
            arbitrary = arbitrary.cast(pa.timestamp('ms'))
            pamask, padata = buffers_from_pyarrow(arbitrary, dtype='M8[ms]')
            data = datetime.DatetimeColumn(data=padata,
                                           mask=pamask,
                                           null_count=arbitrary.null_count,
                                           dtype=np.dtype('M8[ms]'))
        elif isinstance(arbitrary, pa.Date64Array):
            pamask, padata = buffers_from_pyarrow(arbitrary, dtype='M8[ms]')
            data = datetime.DatetimeColumn(data=padata,
                                           mask=pamask,
                                           null_count=arbitrary.null_count,
                                           dtype=np.dtype('M8[ms]'))
        elif isinstance(arbitrary, pa.Date32Array):
            # No equivalent np dtype and not yet supported
            warnings.warn(
                "Date32 values are not yet supported so this will "
                "be typecast to a Date64 value", UserWarning)
            arbitrary = arbitrary.cast(pa.date64())
            data = as_column(arbitrary)
        elif isinstance(arbitrary, pa.BooleanArray):
            # Arrow uses 1 bit per value while we use int8
            dtype = np.dtype(np.bool_)
            # Needed because of bug in PyArrow
            # https://issues.apache.org/jira/browse/ARROW-4766
            if len(arbitrary) > 0:
                arbitrary = arbitrary.cast(pa.int8())
            else:
                arbitrary = pa.array([], type=pa.int8())
            pamask, padata = buffers_from_pyarrow(arbitrary, dtype=dtype)
            data = numerical.NumericalColumn(data=padata,
                                             mask=pamask,
                                             null_count=arbitrary.null_count,
                                             dtype=dtype)
        else:
            pamask, padata = buffers_from_pyarrow(arbitrary)
            data = numerical.NumericalColumn(
                data=padata,
                mask=pamask,
                null_count=arbitrary.null_count,
                dtype=np.dtype(arbitrary.type.to_pandas_dtype()))

    elif isinstance(arbitrary, pa.ChunkedArray):
        gpu_cols = [
            as_column(chunk, dtype=dtype) for chunk in arbitrary.chunks
        ]

        if dtype and dtype != 'empty':
            new_dtype = dtype
        else:
            pa_type = arbitrary.type
            if pa.types.is_dictionary(pa_type):
                new_dtype = 'category'
            else:
                new_dtype = np.dtype(pa_type.to_pandas_dtype())

        data = Column._concat(gpu_cols, dtype=new_dtype)

    elif isinstance(arbitrary, (pd.Series, pd.Categorical)):
        if pd.api.types.is_categorical_dtype(arbitrary):
            data = as_column(pa.array(arbitrary, from_pandas=True))
        elif arbitrary.dtype == np.bool_:
            # Bug in PyArrow or HDF that requires us to do this
            data = as_column(pa.array(np.array(arbitrary), from_pandas=True))
        else:
            data = as_column(pa.array(arbitrary, from_pandas=nan_as_null))

    elif isinstance(arbitrary, pd.Timestamp):
        # NaT is always treated as a null here since, unlike NaN, it is not
        # a discrete value
        data = as_column(pa.array(pd.Series([arbitrary]), from_pandas=True))

    elif np.isscalar(arbitrary) and not isinstance(arbitrary, memoryview):
        if hasattr(arbitrary, 'dtype'):
            data_type = _gdf.np_to_pa_dtype(arbitrary.dtype)
            if data_type in (pa.date64(), pa.date32()):
                # PyArrow can't construct date64 or date32 arrays from np
                # datetime types
                arbitrary = arbitrary.astype('int64')
            data = as_column(pa.array([arbitrary], type=data_type))
        else:
            data = as_column(pa.array([arbitrary]), nan_as_null=nan_as_null)

    elif isinstance(arbitrary, memoryview):
        data = as_column(np.array(arbitrary),
                         dtype=dtype,
                         nan_as_null=nan_as_null)

    else:
        try:
            data = as_column(memoryview(arbitrary))
        except TypeError:
            try:
                pa_type = None
                if dtype is not None:
                    if pd.api.types.is_categorical_dtype(dtype):
                        raise TypeError
                    else:
                        np_type = np.dtype(dtype).type
                        if np_type == np.bool_:
                            pa_type = pa.bool_()
                        else:
                            pa_type = _gdf.np_to_pa_dtype(np_type)
                data = as_column(pa.array(arbitrary,
                                          type=pa_type,
                                          from_pandas=nan_as_null),
                                 nan_as_null=nan_as_null)
            except (pa.ArrowInvalid, pa.ArrowTypeError, TypeError):
                np_type = None
                if pd.api.types.is_categorical_dtype(dtype):
                    data = as_column(pd.Series(arbitrary, dtype='category'),
                                     nan_as_null=nan_as_null)
                else:
                    if dtype is None:
                        np_type = None
                    else:
                        np_type = np.dtype(dtype)
                    data = as_column(np.array(arbitrary, dtype=np_type),
                                     nan_as_null=nan_as_null)

    return data
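
A minimal usage sketch for as_column, assuming a working cuDF/CUDA
environment; the expected column subclasses follow the branches above:

import numpy as np
import pyarrow as pa

# NumericalColumn; with nan_as_null=True (the default) the NaN becomes a null
col_num = as_column(np.array([1.0, 2.0, np.nan]))
# StringColumn, built from the pyarrow StringArray buffers via nvstrings
col_str = as_column(pa.array(['a', 'b', None]))
# DatetimeColumn; TimestampArray input is normalized to millisecond resolution
col_ts = as_column(pa.array([1], type=pa.timestamp('us')))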
Example no. 50
def test_schema_pyarrow_types():
    field_name = "column1"
    metadata = {b"metadata_k": b"metadata_v"}
    pyarrow_field = pyarrow_field_from_dict(
        {
            "name": field_name,
            "nullable": False,
            "metadata": metadata,
            "type": {"name": "int", "bitWidth": 8, "isSigned": True},
        }
    )
    assert pyarrow_field.name == field_name
    assert pyarrow_field.type == pyarrow.int8()
    assert dict(pyarrow_field.metadata) == metadata
    assert pyarrow_field.nullable is False

    field_name = "column_timestamp_no_unit"
    metadata = {b"metadata_k": b"metadata_v"}
    pyarrow_field = pyarrow_field_from_dict(
        {
            "name": field_name,
            "nullable": False,
            "metadata": metadata,
            "type": {"name": "timestamp"},
        }
    )
    assert pyarrow_field.name == field_name
    assert pyarrow_field.type == pyarrow.timestamp("ns")
    assert dict(pyarrow_field.metadata) == metadata
    assert pyarrow_field.nullable is False

    field_name = "column_timestamp_with_unit"
    metadata = {b"metadata_k": b"metadata_v"}
    pyarrow_field = pyarrow_field_from_dict(
        {
            "name": field_name,
            "nullable": False,
            "metadata": metadata,
            "type": {"name": "timestamp", "unit": "MICROSECOND"},
        }
    )
    assert pyarrow_field.name == field_name
    assert pyarrow_field.type == pyarrow.timestamp("us")
    assert dict(pyarrow_field.metadata) == metadata
    assert pyarrow_field.nullable is False

    field_name = "date_with_day_unit"
    metadata = {b"metadata_k": b"metadata_v"}
    pyarrow_field = pyarrow_field_from_dict(
        {
            "name": field_name,
            "nullable": False,
            "metadata": metadata,
            "type": {"name": "date", "unit": "DAY"},
        }
    )
    assert pyarrow_field.name == field_name
    assert pyarrow_field.type == pyarrow.date32()
    assert dict(pyarrow_field.metadata) == metadata
    assert pyarrow_field.nullable is False

    field_name = "simple_list"
    pyarrow_field = pyarrow_field_from_dict(
        {
            "name": field_name,
            "nullable": False,
            "metadata": metadata,
            "type": {"name": "list"},
            "children": [{"type": {"name": "int", "bitWidth": 32, "isSigned": True}}],
        }
    )
    assert pyarrow_field.name == field_name
    assert pyarrow_field.type == pyarrow.list_(
        pyarrow.field("element", pyarrow.int32())
    )
    assert pyarrow_field.metadata == metadata
    assert pyarrow_field.nullable is False

    field_name = "dictionary"
    pyarrow_field = pyarrow_field_from_dict(
        {
            "name": field_name,
            "nullable": False,
            "metadata": metadata,
            "type": {"name": "int", "bitWidth": 32, "isSigned": True},
            "children": [],
            "dictionary": {
                "id": 0,
                "indexType": {"name": "int", "bitWidth": 16, "isSigned": True},
            },
        }
    )
    assert pyarrow_field.name == field_name
    assert pyarrow_field.type == pyarrow.map_(pyarrow.int16(), pyarrow.int32())
    assert pyarrow_field.metadata == metadata
    assert pyarrow_field.nullable is False

    field_name = "struct_array"
    pyarrow_field = pyarrow_field_from_dict(
        {
            "name": field_name,
            "nullable": False,
            "metadata": metadata,
            "type": {"name": "list"},
            "children": [],
            "dictionary": {
                "id": 0,
                "indexType": {"name": "int", "bitWidth": 32, "isSigned": True},
            },
        }
    )
    assert pyarrow_field.name == field_name
    assert pyarrow_field.type == pyarrow.map_(
        pyarrow.int32(),
        pyarrow.list_(
            pyarrow.field(
                "element",
                pyarrow.struct(
                    [pyarrow.field("val", pyarrow.int32(), False, metadata)]
                ),
            )
        ),
    )
    assert pyarrow_field.metadata == metadata
    assert pyarrow_field.nullable is False

    field_name = "simple_dictionary"
    pyarrow_field = pyarrow_field_from_dict(
        {
            "name": field_name,
            "metadata": {"metadata_k": "metadata_v"},
            "nullable": False,
            "type": {"name": "dictionary"},
            "dictionary": {"indexType": {"type": {"name": "int", "bitWidth": 8}}},
            "children": [{"type": {"name": "int", "bitWidth": 32}}],
        }
    )
    assert pyarrow_field.name == field_name
    assert pyarrow_field.type == pyarrow.map_(pyarrow.int8(), pyarrow.int32())
    assert pyarrow_field.metadata == metadata
    assert pyarrow_field.nullable is False

    pyarrow_field = pyarrow_field_from_dict(
        {
            "name": field_name,
            "type": {"name": "struct"},
            "children": [
                {
                    "name": "x",
                    "type": {"name": "int", "bitWidth": 64},
                    "nullable": True,
                    "metadata": {},
                }
            ],
            "metadata": {"metadata_k": "metadata_v"},
            "nullable": False,
        }
    )
    assert pyarrow_field.name == field_name
    assert pyarrow_field.type == pyarrow.struct(
        [pyarrow.field("x", pyarrow.int64(), True, {})]
    )
    assert pyarrow_field.metadata == metadata
    assert pyarrow_field.nullable is False
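
The dict layout above mirrors Arrow's JSON schema representation of a field.
For comparison, a sketch of the first case built directly with the pyarrow
API (pyarrow_field_from_dict itself belongs to the library under test):

import pyarrow

# Direct pyarrow equivalent of the first field description above
field = pyarrow.field(
    "column1",
    pyarrow.int8(),
    nullable=False,
    metadata={b"metadata_k": b"metadata_v"},
)
assert field.type == pyarrow.int8()
assert field.nullable is False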
Example no. 51
        "INT64",
        pyarrow.uint64().id:
        "INT64",
        pyarrow.float16().id:
        "FLOAT64",
        pyarrow.float32().id:
        "FLOAT64",
        pyarrow.float64().id:
        "FLOAT64",
        pyarrow.time32("ms").id:
        "TIME",
        pyarrow.time64("ns").id:
        "TIME",
        pyarrow.timestamp("ns").id:
        "TIMESTAMP",
        pyarrow.date32().id:
        "DATE",
        pyarrow.date64().id:
        "DATETIME",  # because millisecond resolution
        pyarrow.binary().id:
        "BYTES",
        pyarrow.string().id:
        "STRING",  # also alias for pyarrow.utf8()
        # The exact scale and precision don't matter, see below.
        pyarrow.decimal128(38, scale=9).id:
        "NUMERIC",
    }

    if version.parse(pyarrow.__version__) >= version.parse("3.0.0"):
        BQ_TO_ARROW_SCALARS["BIGNUMERIC"] = pyarrow_bignumeric
        # The exact decimal's scale and precision are not important, as only
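
The mapping above is keyed on DataType.id, which identifies the type class
rather than a concrete instance, so every timestamp unit (and every decimal
precision) resolves to the same entry. A lookup sketch under that assumption,
with ARROW_ID_TO_BQ as a hypothetical stand-in for the full dict:

import pyarrow

ARROW_ID_TO_BQ = {
    pyarrow.float64().id: "FLOAT64",
    pyarrow.timestamp("ns").id: "TIMESTAMP",
    pyarrow.date32().id: "DATE",
}

def bq_type_for(arrow_type):
    # .id is shared by all instances of a type class, so timestamp("us")
    # resolves through the timestamp("ns") key
    return ARROW_ID_TO_BQ.get(arrow_type.id)

assert bq_type_for(pyarrow.timestamp("us")) == "TIMESTAMP"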
Example no. 52
import pandas as pd  # used below via pd.to_datetime
from pandas import DataFrame as PandasDF
from pyarrow import date32, float64, string
from pyspark.sql import DataFrame as SparkDF
from pyspark.sql import Window
from pyspark.sql import functions as f

# Column, source_table, and computed_table are assumed to be dagster
# lakehouse helpers; their import is not shown in the original snippet.

sfo_q2_weather_sample_table = source_table(
    path=("sfo_q2_weather_sample", ),
    columns=[Column("tmpf", float64()),
             Column("valid_date", string())],
)


@computed_table(
    input_assets=[sfo_q2_weather_sample_table],
    columns=[Column("valid_date", date32()),
             Column("max_tmpf", float64())],
)
def daily_temperature_highs_table(sfo_q2_weather_sample: PandasDF) -> PandasDF:
    """Computes the temperature high for each day"""
    sfo_q2_weather_sample["valid_date"] = pd.to_datetime(
        sfo_q2_weather_sample["valid"])
    return sfo_q2_weather_sample.groupby("valid_date").max().rename(
        columns={"tmpf": "max_tmpf"})


@computed_table(
    input_assets=[daily_temperature_highs_table],
    columns=[Column("valid_date", date32()),
             Column("max_tmpf", float64())],
)
Example no. 53
def test_sequence_date():
    data = [datetime.date(2000, 1, 1), None, datetime.date(1970, 1, 1),
            datetime.date(2040, 2, 26)]
    arr = pa.array(data)
    assert len(arr) == 4
    assert arr.type == pa.date32()
    assert arr.null_count == 1
    assert arr[0].as_py() == datetime.date(2000, 1, 1)
    assert arr[1].as_py() is None
    assert arr[2].as_py() == datetime.date(1970, 1, 1)
    assert arr[3].as_py() == datetime.date(2040, 2, 26)


@pytest.mark.parametrize('input',
                         [(pa.date32(), [10957, None]),
                          (pa.date64(), [10957 * 86400000, None])])
def test_sequence_explicit_types(input):
    t, ex_values = input
    data = [datetime.date(2000, 1, 1), None]
    arr = pa.array(data, type=t)
    arr2 = pa.array(ex_values, type=t)

    for x in [arr, arr2]:
        assert len(x) == 2
        assert x.type == t
        assert x.null_count == 1
        assert x[0].as_py() == datetime.date(2000, 1, 1)
        assert x[1] is pa.NA
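
The parametrized values above encode 2000-01-01 in each unit: date32 stores
days since the Unix epoch, date64 milliseconds. A quick arithmetic check:

import datetime

days = (datetime.date(2000, 1, 1) - datetime.date(1970, 1, 1)).days
assert days == 10957                                   # date32 unit: days
assert days * 24 * 60 * 60 * 1000 == 10957 * 86400000  # date64 unit: ms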

Example no. 54
 (pa.time32('ms'), '{"name":"time","unit":"MILLISECOND","bitWidth":32}'),
 (pa.time64('us'), '{"name":"time","unit":"MICROSECOND","bitWidth":64}'),
 (pa.time64('ns'), '{"name":"time","unit":"NANOSECOND","bitWidth":64}'),
 (pa.timestamp('s'), '{"name":"timestamp","unit":"SECOND",'
     '"timezone":null}'),
 (pa.timestamp('ms'), '{"name":"timestamp","unit":"MILLISECOND",'
     '"timezone":null}'),
 (pa.timestamp('us'), '{"name":"timestamp","unit":"MICROSECOND",'
     '"timezone":null}'),
 (pa.timestamp('ns'), '{"name":"timestamp","unit":"NANOSECOND",'
     '"timezone":null}'),
 (pa.timestamp('ns', tz='UTC'), '{"name":"timestamp","unit":"NANOSECOND"'
     ',"timezone":"UTC"}'),
 (pa.timestamp('ns', tz='Europe/Paris'), '{"name":"timestamp",'
     '"unit":"NANOSECOND","timezone":"Europe/Paris"}'),
 (pa.date32(), '{"name":"date","unit":"DAY"}'),
 (pa.date64(), '{"name":"date","unit":"MILLISECOND"}'),
 (pa.decimal128(19, 4), '{"name":"decimal","precision":19,"scale":4}'),
 (pa.string(), '{"name":"utf8"}'),
 (pa.binary(), '{"name":"binary"}'),
 (pa.binary(10), '{"name":"fixedsizebinary","byteWidth":10}'),
 # TODO(ARROW-2609): complex types that have children
 # pa.list_(pa.int32()),
 # pa.struct([pa.field('a', pa.int32()),
 #            pa.field('b', pa.int8()),
 #            pa.field('c', pa.string())]),
 # pa.union([pa.field('a', pa.binary(10)),
 #           pa.field('b', pa.string())], mode=pa.lib.UnionMode_DENSE),
 # pa.union([pa.field('a', pa.binary(10)),
 #           pa.field('b', pa.string())], mode=pa.lib.UnionMode_SPARSE),
 # TODO: DictionaryType requires a vector in the type
Example no. 55
def dataframe_with_lists(include_index=False, parquet_compatible=False):
    """
    Dataframe with list columns of every possible primitive type.

    Parameters
    ----------
    include_index : bool
        Append an explicit '__index_level_0__' field to the schema.
    parquet_compatible : bool
        Exclude types not supported by parquet.

    Returns
    -------
    df : pandas.DataFrame
    schema : pyarrow.Schema
        Arrow schema definition that is in line with the constructed df.
    """
    arrays = OrderedDict()
    fields = []

    fields.append(pa.field('int64', pa.list_(pa.int64())))
    arrays['int64'] = [
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
        [0, 1, 2, 3, 4],
        None,
        [],
        np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9] * 2,
                 dtype=np.int64)[::2]
    ]
    fields.append(pa.field('double', pa.list_(pa.float64())))
    arrays['double'] = [
        [0., 1., 2., 3., 4., 5., 6., 7., 8., 9.],
        [0., 1., 2., 3., 4.],
        None,
        [],
        np.array([0., 1., 2., 3., 4., 5., 6., 7., 8., 9.] * 2)[::2],
    ]
    fields.append(pa.field('bytes_list', pa.list_(pa.binary())))
    arrays['bytes_list'] = [
        [b"1", b"f"],
        None,
        [b"1"],
        [b"1", b"2", b"3"],
        [],
    ]
    fields.append(pa.field('str_list', pa.list_(pa.string())))
    arrays['str_list'] = [
        [u"1", u"ä"],
        None,
        [u"1"],
        [u"1", u"2", u"3"],
        [],
    ]

    date_data = [
        [],
        [date(2018, 1, 1), date(2032, 12, 30)],
        [date(2000, 6, 7)],
        None,
        [date(1969, 6, 9), date(1972, 7, 3)]
    ]
    time_data = [
        [time(23, 11, 11), time(1, 2, 3), time(23, 59, 59)],
        [],
        [time(22, 5, 59)],
        None,
        [time(0, 0, 0), time(18, 0, 2), time(12, 7, 3)]
    ]

    temporal_pairs = [
        (pa.date32(), date_data),
        (pa.date64(), date_data),
        (pa.time32('s'), time_data),
        (pa.time32('ms'), time_data),
        (pa.time64('us'), time_data)
    ]
    if not parquet_compatible:
        temporal_pairs += [
            (pa.time64('ns'), time_data),
        ]

    for value_type, data in temporal_pairs:
        field_name = '{}_list'.format(value_type)
        field_type = pa.list_(value_type)
        field = pa.field(field_name, field_type)
        fields.append(field)
        arrays[field_name] = data

    if include_index:
        fields.append(pa.field('__index_level_0__', pa.int64()))

    df = pd.DataFrame(arrays)
    schema = pa.schema(fields)

    return df, schema
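
A usage sketch, assuming pyarrow with pandas support installed; the generated
schema can be passed straight to Table.from_pandas:

df, schema = dataframe_with_lists()
table = pa.Table.from_pandas(df, schema=schema, preserve_index=False)
assert table.num_rows == len(df)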
Example no. 56
def test_date32_overflow():
    # Overflow
    data3 = [2**32, None]
    with pytest.raises(pa.ArrowException):
        pa.array(data3, type=pa.date32())
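
date32 stores day offsets as a signed 32-bit integer, so 2**32 necessarily
falls outside the representable range; a quick check:

import numpy as np

assert 2**32 > np.iinfo(np.int32).max  # hence the expected ArrowException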
Example no. 57
    },
    "timestamp": {
        "type": "long",
        "logicalType": "timestamp-micros"
    },
}
# This dictionary is duplicated in bigquery/google/cloud/bigquery/_pandas_helpers.py
# When modifying it be sure to update it there as well.
BQ_TO_ARROW_TYPES = {
    "int64": pyarrow.int64(),
    "float64": pyarrow.float64(),
    "bool": pyarrow.bool_(),
    "numeric": pyarrow.decimal128(38, 9),
    "string": pyarrow.utf8(),
    "bytes": pyarrow.binary(),
    "date": pyarrow.date32(),  # int32 days since epoch
    "datetime": pyarrow.timestamp("us"),
    "time": pyarrow.time64("us"),
    "timestamp": pyarrow.timestamp("us", tz="UTC"),
}
SCALAR_COLUMNS = [
    {
        "name": "int_col",
        "type": "int64"
    },
    {
        "name": "float_col",
        "type": "float64"
    },
    {
        "name": "num_col",
Example no. 58
        pa.binary(10),
        pa.list_(pa.int32()),
        pa.struct([pa.field('a', pa.int32()),
                   pa.field('b', pa.int8()),
                   pa.field('c', pa.string())])
    ]

    in_dict = {}
    for i, type_ in enumerate(types):
        assert hash(type_) == hash(type_)
        in_dict[type_] = i
        assert in_dict[type_] == i


@pytest.mark.parametrize('t,check_func', [
    (pa.date32(), types.is_date32),
    (pa.date64(), types.is_date64),
    (pa.time32('s'), types.is_time32),
    (pa.time64('ns'), types.is_time64),
    (pa.int8(), types.is_int8),
    (pa.int16(), types.is_int16),
    (pa.int32(), types.is_int32),
    (pa.int64(), types.is_int64),
    (pa.uint8(), types.is_uint8),
    (pa.uint16(), types.is_uint16),
    (pa.uint32(), types.is_uint32),
    (pa.uint64(), types.is_uint64),
    (pa.float16(), types.is_float16),
    (pa.float32(), types.is_float32),
    (pa.float64(), types.is_float64)
])
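
Each checker in pyarrow.types keys on the type class, complementing the
hash/equality behavior exercised above; a quick sketch:

from pyarrow import types

assert types.is_date32(pa.date32())
assert not types.is_date32(pa.date64())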