def test_type_to_pandas_dtype(): M8_ns = np.dtype('datetime64[ns]') cases = [ (pa.null(), np.float64), (pa.bool_(), np.bool_), (pa.int8(), np.int8), (pa.int16(), np.int16), (pa.int32(), np.int32), (pa.int64(), np.int64), (pa.uint8(), np.uint8), (pa.uint16(), np.uint16), (pa.uint32(), np.uint32), (pa.uint64(), np.uint64), (pa.float16(), np.float16), (pa.float32(), np.float32), (pa.float64(), np.float64), (pa.date32(), M8_ns), (pa.date64(), M8_ns), (pa.timestamp('ms'), M8_ns), (pa.binary(), np.object_), (pa.binary(12), np.object_), (pa.string(), np.object_), (pa.list_(pa.int8()), np.object_), ] for arrow_type, numpy_type in cases: assert arrow_type.to_pandas_dtype() == numpy_type
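# Companion sketch (not part of the original suite): Arrow's date types have
# no exact NumPy dtype, so to_pandas_dtype() falls back to datetime64[ns],
# while fixed-width numeric types map one-to-one.
import numpy as np
import pyarrow as pa

assert pa.date32().to_pandas_dtype() == np.dtype('datetime64[ns]')
assert pa.int16().to_pandas_dtype() == np.int16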
def test_empty_cast():
    types = [
        pa.null(),
        pa.bool_(),
        pa.int8(),
        pa.int16(),
        pa.int32(),
        pa.int64(),
        pa.uint8(),
        pa.uint16(),
        pa.uint32(),
        pa.uint64(),
        pa.float16(),
        pa.float32(),
        pa.float64(),
        pa.date32(),
        pa.date64(),
        pa.binary(),
        pa.binary(length=4),
        pa.string(),
    ]
    for (t1, t2) in itertools.product(types, types):
        try:
            # ARROW-4766: ensure that conversions between supported types
            # don't segfault on empty arrays of common types
            pa.array([], type=t1).cast(t2)
        except pa.lib.ArrowNotImplementedError:
            continue
def test_type_schema_pickling(): cases = [ pa.int8(), pa.string(), pa.binary(), pa.binary(10), pa.list_(pa.string()), pa.struct([ pa.field('a', 'int8'), pa.field('b', 'string') ]), pa.time32('s'), pa.time64('us'), pa.date32(), pa.date64(), pa.timestamp('ms'), pa.timestamp('ns'), pa.decimal(12, 2), pa.field('a', 'string', metadata={b'foo': b'bar'}) ] for val in cases: roundtripped = pickle.loads(pickle.dumps(val)) assert val == roundtripped fields = [] for i, f in enumerate(cases): if isinstance(f, pa.Field): fields.append(f) else: fields.append(pa.field('_f{}'.format(i), f)) schema = pa.schema(fields, metadata={b'foo': b'bar'}) roundtripped = pickle.loads(pickle.dumps(schema)) assert schema == roundtripped
def test_datetime64_to_date32(self): # ARROW-1718 arr = pa.array([date(2017, 10, 23), None]) c = pa.Column.from_array("d", arr) s = c.to_pandas() arr2 = pa.Array.from_pandas(s, type=pa.date32()) assert arr2.equals(arr.cast('date32'))
def test_sequence_date32(): data = [datetime.date(2000, 1, 1), None] arr = pa.array(data, type=pa.date32()) data2 = [10957, None] arr2 = pa.array(data2, type=pa.date32()) for x in [arr, arr2]: assert len(x) == 2 assert x.type == pa.date32() assert x.null_count == 1 assert x[0].as_py() == datetime.date(2000, 1, 1) assert x[1] is pa.NA # Overflow data3 = [2**32, None] with pytest.raises(pa.ArrowException): pa.array(data3, type=pa.date32())
def test_date_time_types(): t1 = pa.date32() data1 = np.array([17259, 17260, 17261], dtype='int32') a1 = pa.Array.from_pandas(data1, type=t1) t2 = pa.date64() data2 = data1.astype('int64') * 86400000 a2 = pa.Array.from_pandas(data2, type=t2) t3 = pa.timestamp('us') start = pd.Timestamp('2000-01-01').value / 1000 data3 = np.array([start, start + 1, start + 2], dtype='int64') a3 = pa.Array.from_pandas(data3, type=t3) t4 = pa.time32('ms') data4 = np.arange(3, dtype='i4') a4 = pa.Array.from_pandas(data4, type=t4) t5 = pa.time64('us') a5 = pa.Array.from_pandas(data4.astype('int64'), type=t5) t6 = pa.time32('s') a6 = pa.Array.from_pandas(data4, type=t6) ex_t6 = pa.time32('ms') ex_a6 = pa.Array.from_pandas(data4 * 1000, type=ex_t6) table = pa.Table.from_arrays([a1, a2, a3, a4, a5, a6], ['date32', 'date64', 'timestamp[us]', 'time32[s]', 'time64[us]', 'time32_from64[s]']) # date64 as date32 # time32[s] to time32[ms] expected = pa.Table.from_arrays([a1, a1, a3, a4, a5, ex_a6], ['date32', 'date64', 'timestamp[us]', 'time32[s]', 'time64[us]', 'time32_from64[s]']) _check_roundtrip(table, expected=expected, version='2.0') # Unsupported stuff def _assert_unsupported(array): table = pa.Table.from_arrays([array], ['unsupported']) buf = io.BytesIO() with pytest.raises(NotImplementedError): _write_table(table, buf, version="2.0") t7 = pa.time64('ns') a7 = pa.Array.from_pandas(data4.astype('int64'), type=t7) _assert_unsupported(a7)
def cast_series(s, t): if type(t) == pa.TimestampType: # NOTE: convert to 'us' with astype here, unit ignored in `from_pandas` see ARROW-1680 return _check_series_convert_timestamps_internal(s.fillna(0))\ .values.astype('datetime64[us]', copy=False) elif t == pa.date32(): # TODO: this converts the series to Python objects, possibly avoid with Arrow >= 0.8 return s.dt.date elif t is None or s.dtype == t.to_pandas_dtype(): return s else: return s.fillna(0).astype(t.to_pandas_dtype(), copy=False)
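# Hedged usage sketch for cast_series above, assuming it is importable here:
# a pa.date32() target takes the s.dt.date branch and yields Python date
# objects.
import datetime

import pandas as pd
import pyarrow as pa

s = pd.Series(pd.to_datetime(['2017-04-03', '2017-04-04']))
out = cast_series(s, pa.date32())
assert list(out) == [datetime.date(2017, 4, 3), datetime.date(2017, 4, 4)]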
def test_dates_from_integers(self): t1 = pa.date32() t2 = pa.date64() arr = np.array([17259, 17260, 17261], dtype='int32') arr2 = arr.astype('int64') * 86400000 a1 = pa.array(arr, type=t1) a2 = pa.array(arr2, type=t2) expected = date(2017, 4, 3) assert a1[0].as_py() == expected assert a2[0].as_py() == expected
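# Illustration of the units exercised above: date32 stores days since the
# UNIX epoch and date64 stores milliseconds, so the same calendar day is
# encoded as values differing by a factor of 86400000.
import datetime

import pyarrow as pa

days = (datetime.date(2017, 4, 3) - datetime.date(1970, 1, 1)).days
assert days == 17259
assert pa.array([days], type=pa.date32())[0].as_py() == datetime.date(2017, 4, 3)
assert (pa.array([days * 86400000], type=pa.date64())[0].as_py()
        == datetime.date(2017, 4, 3))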
def test_date_infer(self): df = pd.DataFrame({ 'date': [date(2000, 1, 1), None, date(1970, 1, 1), date(2040, 2, 26)]}) table = pa.Table.from_pandas(df, preserve_index=False) field = pa.field('date', pa.date32()) schema = pa.schema([field]) assert table.schema.equals(schema) result = table.to_pandas() expected = df.copy() expected['date'] = pd.to_datetime(df['date']) tm.assert_frame_equal(result, expected)
def test_datetime_subclassing(): class MyDate(datetime.date): pass data = [ MyDate(2007, 7, 13), ] date_type = pa.date32() arr_date = pa.array(data, type=date_type) assert len(arr_date) == 1 assert arr_date.type == date_type assert arr_date[0].as_py() == datetime.date(2007, 7, 13) class MyDatetime(datetime.datetime): pass data = [ MyDatetime(2007, 7, 13, 1, 23, 34, 123456), ] s = pa.timestamp('s') ms = pa.timestamp('ms') us = pa.timestamp('us') ns = pa.timestamp('ns') arr_s = pa.array(data, type=s) assert len(arr_s) == 1 assert arr_s.type == s assert arr_s[0].as_py() == datetime.datetime(2007, 7, 13, 1, 23, 34, 0) arr_ms = pa.array(data, type=ms) assert len(arr_ms) == 1 assert arr_ms.type == ms assert arr_ms[0].as_py() == datetime.datetime(2007, 7, 13, 1, 23, 34, 123000) arr_us = pa.array(data, type=us) assert len(arr_us) == 1 assert arr_us.type == us assert arr_us[0].as_py() == datetime.datetime(2007, 7, 13, 1, 23, 34, 123456) arr_ns = pa.array(data, type=ns) assert len(arr_ns) == 1 assert arr_ns.type == ns assert arr_ns[0].as_py() == datetime.datetime(2007, 7, 13, 1, 23, 34, 123456)
def _from_jvm_date_type(jvm_type): """ Convert a JVM date type to its Python equivalent Parameters ---------- jvm_type: org.apache.arrow.vector.types.pojo.ArrowType$Date Returns ------- typ: pyarrow.DataType """ day_unit = jvm_type.getUnit().toString() if day_unit == 'DAY': return pa.date32() elif day_unit == 'MILLISECOND': return pa.date64()
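# Hedged sketch of the unit dispatch above, using hypothetical stand-ins for
# the JVM object (the real argument is an
# org.apache.arrow.vector.types.pojo.ArrowType$Date); only getUnit().toString()
# is assumed.
import pyarrow as pa


class _FakeUnit:
    def __init__(self, name):
        self._name = name

    def toString(self):
        return self._name


class _FakeJvmDateType:
    def __init__(self, name):
        self._unit = _FakeUnit(name)

    def getUnit(self):
        return self._unit


assert _from_jvm_date_type(_FakeJvmDateType('DAY')) == pa.date32()
assert _from_jvm_date_type(_FakeJvmDateType('MILLISECOND')) == pa.date64()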
def test_bq_to_arrow_data_type_w_struct(module_under_test, bq_type): fields = ( schema.SchemaField("field01", "STRING"), schema.SchemaField("field02", "BYTES"), schema.SchemaField("field03", "INTEGER"), schema.SchemaField("field04", "INT64"), schema.SchemaField("field05", "FLOAT"), schema.SchemaField("field06", "FLOAT64"), schema.SchemaField("field07", "NUMERIC"), schema.SchemaField("field08", "BOOLEAN"), schema.SchemaField("field09", "BOOL"), schema.SchemaField("field10", "TIMESTAMP"), schema.SchemaField("field11", "DATE"), schema.SchemaField("field12", "TIME"), schema.SchemaField("field13", "DATETIME"), schema.SchemaField("field14", "GEOGRAPHY"), ) field = schema.SchemaField("ignored_name", bq_type, mode="NULLABLE", fields=fields) actual = module_under_test.bq_to_arrow_data_type(field) expected = pyarrow.struct( ( pyarrow.field("field01", pyarrow.string()), pyarrow.field("field02", pyarrow.binary()), pyarrow.field("field03", pyarrow.int64()), pyarrow.field("field04", pyarrow.int64()), pyarrow.field("field05", pyarrow.float64()), pyarrow.field("field06", pyarrow.float64()), pyarrow.field("field07", module_under_test.pyarrow_numeric()), pyarrow.field("field08", pyarrow.bool_()), pyarrow.field("field09", pyarrow.bool_()), pyarrow.field("field10", module_under_test.pyarrow_timestamp()), pyarrow.field("field11", pyarrow.date32()), pyarrow.field("field12", module_under_test.pyarrow_time()), pyarrow.field("field13", module_under_test.pyarrow_datetime()), pyarrow.field("field14", pyarrow.string()), ) ) assert pyarrow.types.is_struct(actual) assert actual.num_children == len(fields) assert actual.equals(expected)
def test_types_hashable(): types = [ pa.null(), pa.int32(), pa.time32('s'), pa.time64('us'), pa.date32(), pa.timestamp('us'), pa.string(), pa.binary(), pa.binary(10), pa.list_(pa.int32()), pa.struct([pa.field('a', pa.int32()), pa.field('b', pa.int8()), pa.field('c', pa.string())]) ] in_dict = {} for i, type_ in enumerate(types): assert hash(type_) == hash(type_) in_dict[type_] = i assert in_dict[type_] == i
def test_sql_types(redshift_table): table = redshift_table df = get_df() df.drop(["binary"], axis=1, inplace=True) con = wr.redshift.connect("aws-data-wrangler-redshift") wr.redshift.to_sql( df=df, con=con, table=table, schema="public", mode="overwrite", index=True, dtype={"iint32": "INTEGER"}, ) df = wr.redshift.read_sql_query(f"SELECT * FROM public.{table}", con) ensure_data_types(df, has_list=False) dfs = wr.redshift.read_sql_query( sql=f"SELECT * FROM public.{table}", con=con, chunksize=1, dtype={ "iint8": pa.int8(), "iint16": pa.int16(), "iint32": pa.int32(), "iint64": pa.int64(), "float": pa.float32(), "ddouble": pa.float64(), "decimal": pa.decimal128(3, 2), "string_object": pa.string(), "string": pa.string(), "date": pa.date32(), "timestamp": pa.timestamp(unit="ns"), "binary": pa.binary(), "category": pa.float64(), }, ) for df in dfs: ensure_data_types(df, has_list=False) con.close()
def test_date_objects_typed(self): arr = np.array([ datetime.date(2017, 4, 3), None, datetime.date(2017, 4, 4), datetime.date(2017, 4, 5) ], dtype=object) arr_i4 = np.array([17259, -1, 17260, 17261], dtype='int32') arr_i8 = arr_i4.astype('int64') * 86400000 mask = np.array([False, True, False, False]) t32 = pa.date32() t64 = pa.date64() a32 = pa.Array.from_pandas(arr, type=t32) a64 = pa.Array.from_pandas(arr, type=t64) a32_expected = pa.Array.from_pandas(arr_i4, mask=mask, type=t32) a64_expected = pa.Array.from_pandas(arr_i8, mask=mask, type=t64) assert a32.equals(a32_expected) assert a64.equals(a64_expected) # Test converting back to pandas colnames = ['date32', 'date64'] table = pa.Table.from_arrays([a32, a64], colnames) table_pandas = table.to_pandas() ex_values = (np.array( ['2017-04-03', '2017-04-04', '2017-04-04', '2017-04-05'], dtype='datetime64[D]').astype('datetime64[ns]')) ex_values[1] = pd.NaT.value expected_pandas = pd.DataFrame( { 'date32': ex_values, 'date64': ex_values }, columns=colnames) tm.assert_frame_equal(table_pandas, expected_pandas)
def test_type_for_alias(): cases = [ ('i1', pa.int8()), ('int8', pa.int8()), ('i2', pa.int16()), ('int16', pa.int16()), ('i4', pa.int32()), ('int32', pa.int32()), ('i8', pa.int64()), ('int64', pa.int64()), ('u1', pa.uint8()), ('uint8', pa.uint8()), ('u2', pa.uint16()), ('uint16', pa.uint16()), ('u4', pa.uint32()), ('uint32', pa.uint32()), ('u8', pa.uint64()), ('uint64', pa.uint64()), ('f4', pa.float32()), ('float32', pa.float32()), ('f8', pa.float64()), ('float64', pa.float64()), ('date32', pa.date32()), ('date64', pa.date64()), ('string', pa.string()), ('str', pa.string()), ('binary', pa.binary()), ('time32[s]', pa.time32('s')), ('time32[ms]', pa.time32('ms')), ('time64[us]', pa.time64('us')), ('time64[ns]', pa.time64('ns')), ('timestamp[s]', pa.timestamp('s')), ('timestamp[ms]', pa.timestamp('ms')), ('timestamp[us]', pa.timestamp('us')), ('timestamp[ns]', pa.timestamp('ns')), ] for val, expected in cases: assert pa.type_for_alias(val) == expected
def test_sql_types(sqlserver_table, sqlserver_con): table = sqlserver_table df = get_df() df.drop(["binary"], axis=1, inplace=True) wr.sqlserver.to_sql( df=df, con=sqlserver_con, table=table, schema="dbo", mode="overwrite", index=True, dtype={"iint32": "INTEGER"}, ) df = wr.sqlserver.read_sql_query(f"SELECT * FROM dbo.{table}", sqlserver_con) ensure_data_types(df, has_list=False) dfs = wr.sqlserver.read_sql_query( sql=f"SELECT * FROM dbo.{table}", con=sqlserver_con, chunksize=1, dtype={ "iint8": pa.int8(), "iint16": pa.int16(), "iint32": pa.int32(), "iint64": pa.int64(), "float": pa.float32(), "ddouble": pa.float64(), "decimal": pa.decimal128(3, 2), "string_object": pa.string(), "string": pa.string(), "date": pa.date32(), "timestamp": pa.timestamp(unit="ns"), "binary": pa.binary(), "category": pa.float64(), }, ) for df in dfs: ensure_data_types(df, has_list=False)
def _convert_to_parquet(self): import pyarrow as pa # from .utils import PandasPyArrowCursor pq_schema = [] GLUE_HIVE_TO_PQ_MAPPING = { 'bigint': pa.int64(), 'int': pa.int32(), 'integer': pa.int32(), 'smallint': pa.int32(), 'string': pa.utf8(), 'boolean': pa.bool_(), 'float': pa.float32(), 'double': pa.float64() } for e in self._metadata['StorageDescriptor']['Columns']: comment = json.loads(e.get('Comment', '{}')) writer_type = e['Type'] nullable = comment.get('nullable', True) and (not comment.get('is_pk', False)) if writer_type in GLUE_HIVE_TO_PQ_MAPPING: final_type = GLUE_HIVE_TO_PQ_MAPPING[writer_type] elif 'decimal' in writer_type: precision, scale = re.match(r'decimal\((\d+),(\d+)\)', writer_type).groups() final_type = pa.decimal128(int(precision), int(scale)) elif 'timestamp' in writer_type: unit = comment.get('unit', 'us') final_type = pa.timestamp(unit) elif 'date' in writer_type: final_type = pa.date32() else: raise NotImplementedError( '{} is not supported'.format(writer_type)) pq_schema.append(pa.field(e['Name'], final_type, nullable=nullable)) final_schema = pa.schema(pq_schema) # final_schema = PandasPyArrowCursor.make_pandas_meta(final_schema) return final_schema
def test_sequence_timestamp_from_int_with_unit(): data = [1] s = pa.timestamp('s') ms = pa.timestamp('ms') us = pa.timestamp('us') ns = pa.timestamp('ns') arr_s = pa.array(data, type=s) assert len(arr_s) == 1 assert arr_s.type == s assert repr(arr_s[0]) == "Timestamp('1970-01-01 00:00:01')" assert str(arr_s[0]) == "1970-01-01 00:00:01" arr_ms = pa.array(data, type=ms) assert len(arr_ms) == 1 assert arr_ms.type == ms assert repr(arr_ms[0]) == "Timestamp('1970-01-01 00:00:00.001000')" assert str(arr_ms[0]) == "1970-01-01 00:00:00.001000" arr_us = pa.array(data, type=us) assert len(arr_us) == 1 assert arr_us.type == us assert repr(arr_us[0]) == "Timestamp('1970-01-01 00:00:00.000001')" assert str(arr_us[0]) == "1970-01-01 00:00:00.000001" arr_ns = pa.array(data, type=ns) assert len(arr_ns) == 1 assert arr_ns.type == ns assert repr(arr_ns[0]) == "Timestamp('1970-01-01 00:00:00.000000001')" assert str(arr_ns[0]) == "1970-01-01 00:00:00.000000001" with pytest.raises(pa.ArrowException): class CustomClass(): pass pa.array([1, CustomClass()], type=ns) pa.array([1, CustomClass()], type=pa.date32()) pa.array([1, CustomClass()], type=pa.date64())
def test_type_schema_pickling(): cases = [ pa.int8(), pa.string(), pa.binary(), pa.binary(10), pa.list_(pa.string()), pa.map_(pa.string(), pa.int8()), pa.struct([pa.field('a', 'int8'), pa.field('b', 'string')]), pa.union([pa.field('a', pa.int8()), pa.field('b', pa.int16())], pa.lib.UnionMode_SPARSE), pa.union([pa.field('a', pa.int8()), pa.field('b', pa.int16())], pa.lib.UnionMode_DENSE), pa.time32('s'), pa.time64('us'), pa.date32(), pa.date64(), pa.timestamp('ms'), pa.timestamp('ns'), pa.decimal128(12, 2), pa.field('a', 'string', metadata={b'foo': b'bar'}) ] for val in cases: roundtripped = pickle.loads(pickle.dumps(val)) assert val == roundtripped fields = [] for i, f in enumerate(cases): if isinstance(f, pa.Field): fields.append(f) else: fields.append(pa.field('_f{}'.format(i), f)) schema = pa.schema(fields, metadata={b'foo': b'bar'}) roundtripped = pickle.loads(pickle.dumps(schema)) assert schema == roundtripped
def _build_date_builder(name: str, part: str): if part == "date": unit = "day" def fn(st: time.struct_time) -> datetime.date: return datetime.date(st.tm_year, st.tm_mon, st.tm_mday) elif part == "dateweek": unit = "week" def fn(st: time.struct_time) -> datetime.date: return datetime.date.fromordinal( datetime.date(st.tm_year, st.tm_mon, st.tm_mday).toordinal() - st.tm_wday ) elif part == "datemonth": unit = "month" def fn(st: time.struct_time) -> datetime.date: return datetime.date(st.tm_year, st.tm_mon, 1) elif part == "datequarter": unit = "quarter" def fn(st: time.struct_time) -> datetime.date: return datetime.date( st.tm_year, [0, 1, 1, 1, 4, 4, 4, 7, 7, 7, 10, 10, 10][st.tm_mon], 1 ) else: unit = "year" def fn(st: time.struct_time) -> datetime.date: return datetime.date(st.tm_year, 1, 1) return pa.field(name, pa.date32(), metadata={"unit": unit}), fn
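# Usage sketch for _build_date_builder above, assuming it is importable from
# this module: the "datequarter" variant truncates a struct_time to the first
# day of its quarter and tags the date32 field with the unit.
import datetime
import time

import pyarrow as pa

field, fn = _build_date_builder("quarter_start", "datequarter")
st = time.strptime("2017-08-15", "%Y-%m-%d")
assert fn(st) == datetime.date(2017, 7, 1)
assert field.type == pa.date32()
assert field.metadata == {b"unit": b"quarter"}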
def test_date_objects_typed(self): arr = np.array([ date(2017, 4, 3), None, date(2017, 4, 4), date(2017, 4, 5)], dtype=object) arr_i4 = np.array([17259, -1, 17260, 17261], dtype='int32') arr_i8 = arr_i4.astype('int64') * 86400000 mask = np.array([False, True, False, False]) t32 = pa.date32() t64 = pa.date64() a32 = pa.array(arr, type=t32) a64 = pa.array(arr, type=t64) a32_expected = pa.array(arr_i4, mask=mask, type=t32) a64_expected = pa.array(arr_i8, mask=mask, type=t64) assert a32.equals(a32_expected) assert a64.equals(a64_expected) # Test converting back to pandas colnames = ['date32', 'date64'] table = pa.Table.from_arrays([a32, a64], colnames) table_pandas = table.to_pandas() ex_values = (np.array(['2017-04-03', '2017-04-04', '2017-04-04', '2017-04-05'], dtype='datetime64[D]') .astype('datetime64[ns]')) ex_values[1] = pd.NaT.value expected_pandas = pd.DataFrame({'date32': ex_values, 'date64': ex_values}, columns=colnames) tm.assert_frame_equal(table_pandas, expected_pandas)
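# Side sketch of the mask semantics above: the -1 in arr_i4 is an arbitrary
# placeholder, since entries where mask is True are treated as null
# regardless of the stored value.
import numpy as np
import pyarrow as pa

masked = pa.array(np.array([123, -1], dtype='int32'),
                  mask=np.array([False, True]), type=pa.date32())
assert masked[1].as_py() is None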
def get_many_types():
    # returning them from a function is required because the pa.dictionary
    # type holds a pyarrow array and test_array.py::test_total_bytes_allocated
    # checks that the default memory pool has zero allocated bytes
    return (
        pa.null(),
        pa.bool_(),
        pa.int32(),
        pa.time32('s'),
        pa.time64('us'),
        pa.date32(),
        pa.timestamp('us'),
        pa.timestamp('us', tz='UTC'),
        pa.timestamp('us', tz='Europe/Paris'),
        pa.float16(),
        pa.float32(),
        pa.float64(),
        pa.decimal128(19, 4),
        pa.string(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.int32()),
        pa.struct([pa.field('a', pa.int32()),
                   pa.field('b', pa.int8()),
                   pa.field('c', pa.string())]),
        pa.struct([pa.field('a', pa.int32(), nullable=False),
                   pa.field('b', pa.int8(), nullable=False),
                   pa.field('c', pa.string())]),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())], mode=pa.lib.UnionMode_DENSE),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())], mode=pa.lib.UnionMode_SPARSE),
        pa.union([pa.field('a', pa.binary(10), nullable=False),
                  pa.field('b', pa.string())], mode=pa.lib.UnionMode_SPARSE),
        pa.dictionary(pa.int32(), pa.string())
    )
def test_multiple_factories(multisourcefs): src1 = ds.factory('/plain', filesystem=multisourcefs, format='parquet') src2 = ds.factory('/schema', filesystem=multisourcefs, format='parquet', partitioning=['week', 'color']) src3 = ds.factory('/hive', filesystem=multisourcefs, format='parquet', partitioning='hive') assembled = ds.dataset([src1, src2, src3]) assert isinstance(assembled, ds.Dataset) expected_schema = pa.schema([ ('date', pa.date32()), ('index', pa.int64()), ('value', pa.float64()), ('color', pa.string()), ('week', pa.int32()), ('month', pa.int32()), ('year', pa.int32()), ]) assert assembled.schema.equals(expected_schema)
def athena2pyarrow(dtype: str) -> pa.DataType:  # pylint: disable=too-many-return-statements
    """Athena to PyArrow data types conversion."""
    dtype = dtype.lower().replace(" ", "")
    if dtype == "tinyint":
        return pa.int8()
    if dtype == "smallint":
        return pa.int16()
    if dtype in ("int", "integer"):
        return pa.int32()
    if dtype == "bigint":
        return pa.int64()
    if dtype in ("float", "real"):
        return pa.float32()
    if dtype == "double":
        return pa.float64()
    if dtype == "boolean":
        return pa.bool_()
    if (dtype == "string") or dtype.startswith("char") or dtype.startswith("varchar"):
        return pa.string()
    if dtype == "timestamp":
        return pa.timestamp(unit="ns")
    if dtype == "date":
        return pa.date32()
    if dtype in ("binary", "varbinary"):
        return pa.binary()
    if dtype.startswith("decimal"):
        precision, scale = dtype.replace("decimal(", "").replace(")", "").split(sep=",")
        return pa.decimal128(precision=int(precision), scale=int(scale))
    if dtype.startswith("array"):
        return pa.list_(value_type=athena2pyarrow(dtype=dtype[6:-1]), list_size=-1)
    if dtype.startswith("struct"):
        return pa.struct([(f.split(":", 1)[0], athena2pyarrow(f.split(":", 1)[1]))
                          for f in _split_struct(dtype[7:-1])])
    if dtype.startswith("map"):
        parts: List[str] = _split_map(s=dtype[4:-1])
        return pa.map_(athena2pyarrow(parts[0]), athena2pyarrow(parts[1]))
    raise exceptions.UnsupportedType(f"Unsupported Athena type: {dtype}")
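# Hedged usage sketch for athena2pyarrow above, assuming it is importable:
# nested Athena types recurse, so array<date> becomes list<date32>, and
# decimal keeps its precision and scale.
import pyarrow as pa

assert athena2pyarrow("date") == pa.date32()
assert athena2pyarrow("array<date>") == pa.list_(pa.date32())
assert athena2pyarrow("decimal(3,2)") == pa.decimal128(3, 2)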
def test_is_temporal_date_time_timestamp(): date_types = [pa.date32(), pa.date64()] time_types = [pa.time32('s'), pa.time64('ns')] timestamp_types = [pa.timestamp('ms')] for case in date_types + time_types + timestamp_types: assert types.is_temporal(case) for case in date_types: assert types.is_date(case) assert not types.is_time(case) assert not types.is_timestamp(case) for case in time_types: assert types.is_time(case) assert not types.is_date(case) assert not types.is_timestamp(case) for case in timestamp_types: assert types.is_timestamp(case) assert not types.is_date(case) assert not types.is_time(case) assert not types.is_temporal(pa.int32())
def infer_schema(chunk):
    fields = []
    for column, dtype in zip(chunk.columns, chunk.dtypes):
        if dtype.name == 'object':
            if column.endswith('date'):
                fields.append(pa.field(column, pa.date32()))
            else:
                fields.append(pa.field(column, pa.string()))
        else:
            fields.append(pa.field(column, pa.type_for_alias(dtype.name)))
    return pa.schema(fields)
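# Hedged usage sketch for infer_schema above: object-dtype columns whose
# names end in 'date' map to date32, other object columns map to string, and
# numeric dtypes go through pa.type_for_alias.
import datetime

import numpy as np
import pandas as pd
import pyarrow as pa

chunk = pd.DataFrame({
    'trade_date': [datetime.date(2020, 1, 1)],
    'symbol': ['AAPL'],
    'qty': np.array([10], dtype='int64'),
})
schema = infer_schema(chunk)
assert dict(zip(schema.names, schema.types)) == {
    'trade_date': pa.date32(),
    'symbol': pa.string(),
    'qty': pa.int64(),
}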
def test_index_as_flat_series_date(): index1 = ExplicitSecondaryIndex( column="col", index_dct={ datetime.date(2017, 1, 2): ["part_1", "part_2"], datetime.date(2018, 2, 3): ["part_1"], }, dtype=pa.date32(), ) ser = index1.as_flat_series() ser = ser.sort_index() expected = pd.Series( ["part_1", "part_2", "part_1"], index=pd.Index( [ datetime.date(2017, 1, 2), datetime.date(2017, 1, 2), datetime.date(2018, 2, 3), ], name="col", ), name="partition", ) assert_series_equal(ser, expected)
def get_many_types():
    # returning them from a function is required because the pa.dictionary
    # type holds a pyarrow array and test_array.py::test_total_bytes_allocated
    # checks that the default memory pool has zero allocated bytes
    return (
        pa.null(),
        pa.bool_(),
        pa.int32(),
        pa.time32('s'),
        pa.time64('us'),
        pa.date32(),
        pa.timestamp('us'),
        pa.timestamp('us', tz='UTC'),
        pa.timestamp('us', tz='Europe/Paris'),
        pa.float16(),
        pa.float32(),
        pa.float64(),
        pa.decimal128(19, 4),
        pa.string(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.int32()),
        pa.struct([pa.field('a', pa.int32()),
                   pa.field('b', pa.int8()),
                   pa.field('c', pa.string())]),
        pa.struct([pa.field('a', pa.int32(), nullable=False),
                   pa.field('b', pa.int8(), nullable=False),
                   pa.field('c', pa.string())]),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())], mode=pa.lib.UnionMode_DENSE),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())], mode=pa.lib.UnionMode_SPARSE),
        pa.union([pa.field('a', pa.binary(10), nullable=False),
                  pa.field('b', pa.string())], mode=pa.lib.UnionMode_SPARSE),
        pa.dictionary(pa.int32(), pa.array(['a', 'b', 'c']))
    )
integer_types = st.one_of(signed_integer_types, unsigned_integer_types) floating_types = st.sampled_from([ pa.float16(), pa.float32(), pa.float64() ]) decimal_type = st.builds( pa.decimal128, precision=st.integers(min_value=1, max_value=38), scale=st.integers(min_value=1, max_value=38) ) numeric_types = st.one_of(integer_types, floating_types, decimal_type) date_types = st.sampled_from([ pa.date32(), pa.date64() ]) time_types = st.sampled_from([ pa.time32('s'), pa.time32('ms'), pa.time64('us'), pa.time64('ns') ]) timestamp_types = st.builds( pa.timestamp, unit=st.sampled_from(['s', 'ms', 'us', 'ns']), tz=tzst.timezones() ) temporal_types = st.one_of(date_types, time_types, timestamp_types)
for i, field in enumerate(fields): in_dict[field] = i assert len(in_dict) == len(fields) for i, field in enumerate(fields): assert in_dict[field] == i def test_fields_weakrefable(): field = pa.field('a', pa.int32()) wr = weakref.ref(field) assert wr() is not None del field assert wr() is None @pytest.mark.parametrize('t,check_func', [(pa.date32(), types.is_date32), (pa.date64(), types.is_date64), (pa.time32('s'), types.is_time32), (pa.time64('ns'), types.is_time64), (pa.int8(), types.is_int8), (pa.int16(), types.is_int16), (pa.int32(), types.is_int32), (pa.int64(), types.is_int64), (pa.uint8(), types.is_uint8), (pa.uint16(), types.is_uint16), (pa.uint32(), types.is_uint32), (pa.uint64(), types.is_uint64), (pa.float16(), types.is_float16), (pa.float32(), types.is_float32), (pa.float64(), types.is_float64)]) def test_exact_primitive_types(t, check_func):
def test_array_from_numpy_datetimeD(): arr = np.array([None, datetime.date(2017, 4, 4)], dtype='datetime64[D]') result = pa.array(arr) expected = pa.array([None, datetime.date(2017, 4, 4)], type=pa.date32()) assert result.equals(expected)
def test_date32_overflow(): # Overflow data3 = [2**32, None] with pytest.raises((OverflowError, pa.ArrowException)): pa.array(data3, type=pa.date32())
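# Companion sketch grounded in the sequence tests above: 10957 days after
# the epoch is 2000-01-01, well within date32's signed 32-bit day range, so
# no overflow is raised.
import datetime

import pyarrow as pa

arr = pa.array([10957], type=pa.date32())
assert arr[0].as_py() == datetime.date(2000, 1, 1)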
signed_integer_types = st.sampled_from( [pa.int8(), pa.int16(), pa.int32(), pa.int64()]) unsigned_integer_types = st.sampled_from( [pa.uint8(), pa.uint16(), pa.uint32(), pa.uint64()]) integer_types = st.one_of(signed_integer_types, unsigned_integer_types) floating_types = st.sampled_from([pa.float16(), pa.float32(), pa.float64()]) decimal_type = st.builds(pa.decimal128, precision=st.integers(min_value=0, max_value=38), scale=st.integers(min_value=0, max_value=38)) numeric_types = st.one_of(integer_types, floating_types, decimal_type) date_types = st.sampled_from([pa.date32(), pa.date64()]) time_types = st.sampled_from( [pa.time32('s'), pa.time32('ms'), pa.time64('us'), pa.time64('ns')]) timestamp_types = st.builds(pa.timestamp, unit=st.sampled_from(['s', 'ms', 'us', 'ns']), tz=tzst.timezones()) temporal_types = st.one_of(date_types, time_types, timestamp_types) primitive_types = st.one_of(null_type, bool_type, binary_type, string_type, numeric_types, temporal_types) metadata = st.dictionaries(st.text(), st.text())
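# Hedged sketch of how the strategies above might be consumed in a
# property-based test: hypothesis draws an arbitrary temporal type per
# example, and each drawn type should register as temporal.
from hypothesis import given

import pyarrow.types as patypes


@given(temporal_types)
def test_drawn_temporal_types_are_temporal(ty):
    assert patypes.is_temporal(ty)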
def test_date_time_types(): t1 = pa.date32() data1 = np.array([17259, 17260, 17261], dtype='int32') a1 = pa.array(data1, type=t1) t2 = pa.date64() data2 = data1.astype('int64') * 86400000 a2 = pa.array(data2, type=t2) t3 = pa.timestamp('us') start = pd.Timestamp('2000-01-01').value / 1000 data3 = np.array([start, start + 1, start + 2], dtype='int64') a3 = pa.array(data3, type=t3) t4 = pa.time32('ms') data4 = np.arange(3, dtype='i4') a4 = pa.array(data4, type=t4) t5 = pa.time64('us') a5 = pa.array(data4.astype('int64'), type=t5) t6 = pa.time32('s') a6 = pa.array(data4, type=t6) ex_t6 = pa.time32('ms') ex_a6 = pa.array(data4 * 1000, type=ex_t6) t7 = pa.timestamp('ns') start = pd.Timestamp('2001-01-01').value data7 = np.array([start, start + 1000, start + 2000], dtype='int64') a7 = pa.array(data7, type=t7) t7_us = pa.timestamp('us') start = pd.Timestamp('2001-01-01').value data7_us = np.array([start, start + 1000, start + 2000], dtype='int64') // 1000 a7_us = pa.array(data7_us, type=t7_us) table = pa.Table.from_arrays([a1, a2, a3, a4, a5, a6, a7], ['date32', 'date64', 'timestamp[us]', 'time32[s]', 'time64[us]', 'time32_from64[s]', 'timestamp[ns]']) # date64 as date32 # time32[s] to time32[ms] # 'timestamp[ns]' to 'timestamp[us]' expected = pa.Table.from_arrays([a1, a1, a3, a4, a5, ex_a6, a7_us], ['date32', 'date64', 'timestamp[us]', 'time32[s]', 'time64[us]', 'time32_from64[s]', 'timestamp[ns]']) _check_roundtrip(table, expected=expected, version='2.0') # date64 as date32 # time32[s] to time32[ms] # 'timestamp[ns]' is saved as INT96 timestamp expected = pa.Table.from_arrays([a1, a1, a3, a4, a5, ex_a6, a7], ['date32', 'date64', 'timestamp[us]', 'time32[s]', 'time64[us]', 'time32_from64[s]', 'timestamp[ns]']) _check_roundtrip(table, expected=expected, version='2.0', use_deprecated_int96_timestamps=True) # Check that setting flavor to 'spark' uses int96 timestamps _check_roundtrip(table, expected=expected, version='2.0', flavor='spark') # Unsupported stuff def _assert_unsupported(array): table = pa.Table.from_arrays([array], ['unsupported']) buf = io.BytesIO() with pytest.raises(NotImplementedError): _write_table(table, buf, version="2.0") t7 = pa.time64('ns') a7 = pa.array(data4.astype('int64'), type=t7) _assert_unsupported(a7)
def test_sequence_date(): data = [datetime.date(2000, 1, 1), None, datetime.date(1970, 1, 1), datetime.date(2040, 2, 26)] arr = pa.array(data) assert len(arr) == 4 assert arr.type == pa.date32() assert arr.null_count == 1 assert arr[0].as_py() == datetime.date(2000, 1, 1) assert arr[1].as_py() is None assert arr[2].as_py() == datetime.date(1970, 1, 1) assert arr[3].as_py() == datetime.date(2040, 2, 26) @pytest.mark.parametrize('input', [(pa.date32(), [10957, None]), (pa.date64(), [10957 * 86400000, None])]) def test_sequence_explicit_types(input): t, ex_values = input data = [datetime.date(2000, 1, 1), None] arr = pa.array(data, type=t) arr2 = pa.array(ex_values, type=t) for x in [arr, arr2]: assert len(x) == 2 assert x.type == t assert x.null_count == 1 assert x[0].as_py() == datetime.date(2000, 1, 1) assert x[1] is pa.NA
# Module-level subclasses assumed by this test; MyDate and MyDatetime are
# mirrored from the sibling snippet above, MyTimedelta is defined by analogy.
class MyDate(datetime.date):
    pass


class MyDatetime(datetime.datetime):
    pass


class MyTimedelta(datetime.timedelta):
    pass


def test_datetime_subclassing():
    data = [
        MyDate(2007, 7, 13),
    ]
    date_type = pa.date32()
    arr_date = pa.array(data, type=date_type)
    assert len(arr_date) == 1
    assert arr_date.type == date_type
    assert arr_date[0].as_py() == datetime.date(2007, 7, 13)

    data = [
        MyDatetime(2007, 7, 13, 1, 23, 34, 123456),
    ]
    s = pa.timestamp('s')
    ms = pa.timestamp('ms')
    us = pa.timestamp('us')

    arr_s = pa.array(data, type=s)
    assert len(arr_s) == 1
    assert arr_s.type == s
    assert arr_s[0].as_py() == datetime.datetime(2007, 7, 13, 1, 23, 34, 0)

    arr_ms = pa.array(data, type=ms)
    assert len(arr_ms) == 1
    assert arr_ms.type == ms
    assert arr_ms[0].as_py() == datetime.datetime(2007, 7, 13, 1, 23, 34, 123000)

    arr_us = pa.array(data, type=us)
    assert len(arr_us) == 1
    assert arr_us.type == us
    assert arr_us[0].as_py() == datetime.datetime(2007, 7, 13, 1, 23, 34, 123456)

    data = [
        MyTimedelta(123, 456, 1002),
    ]
    s = pa.duration('s')
    ms = pa.duration('ms')
    us = pa.duration('us')

    arr_s = pa.array(data)
    assert len(arr_s) == 1
    assert arr_s.type == us
    assert arr_s[0].as_py() == datetime.timedelta(123, 456, 1002)

    arr_s = pa.array(data, type=s)
    assert len(arr_s) == 1
    assert arr_s.type == s
    assert arr_s[0].as_py() == datetime.timedelta(123, 456)

    arr_ms = pa.array(data, type=ms)
    assert len(arr_ms) == 1
    assert arr_ms.type == ms
    assert arr_ms[0].as_py() == datetime.timedelta(123, 456, 1000)

    arr_us = pa.array(data, type=us)
    assert len(arr_us) == 1
    assert arr_us.type == us
    assert arr_us[0].as_py() == datetime.timedelta(123, 456, 1002)
'''Asset definitions for the simple_lakehouse example.''' import pandas as pd from lakehouse import Column, computed_table, source_table from pyarrow import date32, float64, string sfo_q2_weather_sample_table = source_table( storage_key='filesystem', path=('data', ), columns=[Column('tmpf', float64()), Column('valid_date', string())], ) @computed_table( storage_key='filesystem', input_assets=[sfo_q2_weather_sample_table], columns=[Column('valid_date', date32()), Column('max_tmpf', float64())], ) def daily_temperature_highs_table( sfo_q2_weather_sample: pd.DataFrame) -> pd.DataFrame: '''Computes the temperature high for each day''' sfo_q2_weather_sample['valid_date'] = pd.to_datetime( sfo_q2_weather_sample['valid']) return sfo_q2_weather_sample.groupby('valid_date').max().rename( columns={'tmpf': 'max_tmpf'})
arr = pa.array(narr) assert narr.dtype == arr.to_numpy().dtype np.testing.assert_array_equal(narr, arr.to_numpy()) np.testing.assert_array_equal(narr[:6], arr[:6].to_numpy()) np.testing.assert_array_equal(narr[2:], arr[2:].to_numpy()) np.testing.assert_array_equal(narr[2:6], arr[2:6].to_numpy()) @pytest.mark.parametrize( ('type', 'expected'), [(pa.null(), 'empty'), (pa.bool_(), 'bool'), (pa.int8(), 'int8'), (pa.int16(), 'int16'), (pa.int32(), 'int32'), (pa.int64(), 'int64'), (pa.uint8(), 'uint8'), (pa.uint16(), 'uint16'), (pa.uint32(), 'uint32'), (pa.uint64(), 'uint64'), (pa.float16(), 'float16'), (pa.float32(), 'float32'), (pa.float64(), 'float64'), (pa.date32(), 'date'), (pa.date64(), 'date'), (pa.binary(), 'bytes'), (pa.binary(length=4), 'bytes'), (pa.string(), 'unicode'), (pa.list_(pa.list_(pa.int16())), 'list[list[int16]]'), (pa.decimal128(18, 3), 'decimal'), (pa.timestamp('ms'), 'datetime'), (pa.timestamp('us', 'UTC'), 'datetimetz'), (pa.time32('s'), 'time'), (pa.time64('us'), 'time')]) def test_logical_type(type, expected): assert get_logical_type(type) == expected def test_array_uint64_from_py_over_range(): arr = pa.array([2**63], type=pa.uint64()) expected = pa.array(np.array([2**63], dtype='u8')) assert arr.equals(expected)
def test_sql(redshift_table, postgresql_table, mysql_table, databases_parameters, db_type): if db_type == "postgresql": table = postgresql_table elif db_type == "mysql": table = mysql_table else: table = redshift_table df = get_df() if db_type == "redshift": df.drop(["binary"], axis=1, inplace=True) engine = wr.catalog.get_engine(connection=f"aws-data-wrangler-{db_type}", echo=False) index = True if engine.name == "redshift" else False wr.db.to_sql( df=df, con=engine, name=table, schema=databases_parameters[db_type]["schema"], if_exists="replace", index=index, index_label=None, chunksize=None, method=None, dtype={"iint32": sqlalchemy.types.Integer}, ) df = wr.db.read_sql_query( sql=f"SELECT * FROM {databases_parameters[db_type]['schema']}.{table}", con=engine) ensure_data_types(df, has_list=False) engine = wr.db.get_engine( db_type=db_type, host=databases_parameters[db_type]["host"], port=databases_parameters[db_type]["port"], database=databases_parameters[db_type]["database"], user=databases_parameters["user"], password=databases_parameters["password"], echo=False, ) dfs = wr.db.read_sql_query( sql=f"SELECT * FROM {databases_parameters[db_type]['schema']}.{table}", con=engine, chunksize=1, dtype={ "iint8": pa.int8(), "iint16": pa.int16(), "iint32": pa.int32(), "iint64": pa.int64(), "float": pa.float32(), "double": pa.float64(), "decimal": pa.decimal128(3, 2), "string_object": pa.string(), "string": pa.string(), "date": pa.date32(), "timestamp": pa.timestamp(unit="ns"), "binary": pa.binary(), "category": pa.float64(), }, ) for df in dfs: ensure_data_types(df, has_list=False) if db_type != "redshift": account_id = boto3.client("sts").get_caller_identity().get("Account") engine = wr.catalog.get_engine( connection=f"aws-data-wrangler-{db_type}", catalog_id=account_id) wr.db.to_sql( df=pd.DataFrame({"col0": [1, 2, 3]}, dtype="Int32"), con=engine, name=table, schema=databases_parameters[db_type]["schema"], if_exists="replace", index=True, index_label="index", ) schema = None if db_type == "postgresql": schema = databases_parameters[db_type]["schema"] df = wr.db.read_sql_table(con=engine, table=table, schema=schema, index_col="index") assert df.shape == (3, 1)
def as_column(arbitrary, nan_as_null=True, dtype=None): """Create a Column from an arbitrary object Currently support inputs are: * ``Column`` * ``Buffer`` * ``Series`` * ``Index`` * numba device array * cuda array interface * numpy array * pyarrow array * pandas.Categorical Returns ------- result : subclass of TypedColumnBase - CategoricalColumn for pandas.Categorical input. - DatetimeColumn for datetime input - NumericalColumn for all other inputs. """ from cudf.dataframe import numerical, categorical, datetime, string from cudf.dataframe.series import Series from cudf.dataframe.index import Index if isinstance(arbitrary, Column): categories = None if hasattr(arbitrary, "categories"): categories = arbitrary.categories data = build_column(arbitrary.data, arbitrary.dtype, mask=arbitrary.mask, categories=categories) elif isinstance(arbitrary, Series): data = arbitrary._column elif isinstance(arbitrary, Index): data = arbitrary._values elif isinstance(arbitrary, Buffer): data = numerical.NumericalColumn(data=arbitrary, dtype=arbitrary.dtype) elif isinstance(arbitrary, nvstrings.nvstrings): data = string.StringColumn(data=arbitrary) elif cuda.devicearray.is_cuda_ndarray(arbitrary): data = as_column(Buffer(arbitrary)) if (data.dtype in [np.float16, np.float32, np.float64] and arbitrary.size > 0): if nan_as_null: mask = cudautils.mask_from_devary(arbitrary) data = data.set_mask(mask) elif cuda.is_cuda_array(arbitrary): # Use cuda array interface to do create a numba device array by # reference new_dev_array = cuda.as_cuda_array(arbitrary) # Allocate new output array using rmm and copy the numba device array # to an rmm owned device array out_dev_array = rmm.device_array_like(new_dev_array) out_dev_array.copy_to_device(new_dev_array) data = as_column(out_dev_array) elif isinstance(arbitrary, np.ndarray): # CUDF assumes values are always contiguous if not arbitrary.flags['C_CONTIGUOUS']: arbitrary = np.ascontiguousarray(arbitrary) if arbitrary.dtype.kind == 'M': data = datetime.DatetimeColumn.from_numpy(arbitrary) elif arbitrary.dtype.kind in ('O', 'U'): data = as_column(pa.Array.from_pandas(arbitrary)) else: data = as_column(rmm.to_device(arbitrary), nan_as_null=nan_as_null) elif isinstance(arbitrary, pa.Array): if isinstance(arbitrary, pa.StringArray): count = len(arbitrary) null_count = arbitrary.null_count buffers = arbitrary.buffers() # Buffer of actual strings values if buffers[2] is not None: sbuf = np.frombuffer(buffers[2], dtype='int8') else: sbuf = np.empty(0, dtype='int8') # Buffer of offsets values obuf = np.frombuffer(buffers[1], dtype='int32') # Buffer of null bitmask nbuf = None if null_count > 0: nbuf = np.frombuffer(buffers[0], dtype='int8') data = as_column( nvstrings.from_offsets(sbuf, obuf, count, nbuf=nbuf, ncount=null_count)) elif isinstance(arbitrary, pa.NullArray): new_dtype = dtype if (type(dtype) == str and dtype == 'empty') or dtype is None: new_dtype = np.dtype(arbitrary.type.to_pandas_dtype()) if pd.api.types.is_categorical_dtype(new_dtype): arbitrary = arbitrary.dictionary_encode() else: if nan_as_null: arbitrary = arbitrary.cast(_gdf.np_to_pa_dtype(new_dtype)) else: # casting a null array doesn't make nans valid # so we create one with valid nans from scratch: if new_dtype == np.dtype("object"): arbitrary = utils.scalar_broadcast_to( None, (len(arbitrary), ), dtype=new_dtype) else: arbitrary = utils.scalar_broadcast_to( np.nan, (len(arbitrary), ), dtype=new_dtype) data = as_column(arbitrary, nan_as_null=nan_as_null) elif isinstance(arbitrary, pa.DictionaryArray): 
pamask, padata = buffers_from_pyarrow(arbitrary) data = categorical.CategoricalColumn( data=padata, mask=pamask, null_count=arbitrary.null_count, categories=arbitrary.dictionary.to_pylist(), ordered=arbitrary.type.ordered, ) elif isinstance(arbitrary, pa.TimestampArray): arbitrary = arbitrary.cast(pa.timestamp('ms')) pamask, padata = buffers_from_pyarrow(arbitrary, dtype='M8[ms]') data = datetime.DatetimeColumn(data=padata, mask=pamask, null_count=arbitrary.null_count, dtype=np.dtype('M8[ms]')) elif isinstance(arbitrary, pa.Date64Array): pamask, padata = buffers_from_pyarrow(arbitrary, dtype='M8[ms]') data = datetime.DatetimeColumn(data=padata, mask=pamask, null_count=arbitrary.null_count, dtype=np.dtype('M8[ms]')) elif isinstance(arbitrary, pa.Date32Array): # No equivalent np dtype and not yet supported warnings.warn( "Date32 values are not yet supported so this will " "be typecast to a Date64 value", UserWarning) arbitrary = arbitrary.cast(pa.date64()) data = as_column(arbitrary) elif isinstance(arbitrary, pa.BooleanArray): # Arrow uses 1 bit per value while we use int8 dtype = np.dtype(np.bool) # Needed because of bug in PyArrow # https://issues.apache.org/jira/browse/ARROW-4766 if len(arbitrary) > 0: arbitrary = arbitrary.cast(pa.int8()) else: arbitrary = pa.array([], type=pa.int8()) pamask, padata = buffers_from_pyarrow(arbitrary, dtype=dtype) data = numerical.NumericalColumn(data=padata, mask=pamask, null_count=arbitrary.null_count, dtype=dtype) else: pamask, padata = buffers_from_pyarrow(arbitrary) data = numerical.NumericalColumn( data=padata, mask=pamask, null_count=arbitrary.null_count, dtype=np.dtype(arbitrary.type.to_pandas_dtype())) elif isinstance(arbitrary, pa.ChunkedArray): gpu_cols = [ as_column(chunk, dtype=dtype) for chunk in arbitrary.chunks ] if dtype and dtype != 'empty': new_dtype = dtype else: pa_type = arbitrary.type if pa.types.is_dictionary(pa_type): new_dtype = 'category' else: new_dtype = np.dtype(pa_type.to_pandas_dtype()) data = Column._concat(gpu_cols, dtype=new_dtype) elif isinstance(arbitrary, (pd.Series, pd.Categorical)): if pd.api.types.is_categorical_dtype(arbitrary): data = as_column(pa.array(arbitrary, from_pandas=True)) elif arbitrary.dtype == np.bool: # Bug in PyArrow or HDF that requires us to do this data = as_column(pa.array(np.array(arbitrary), from_pandas=True)) else: data = as_column(pa.array(arbitrary, from_pandas=nan_as_null)) elif isinstance(arbitrary, pd.Timestamp): # This will always treat NaTs as nulls since it's not technically a # discrete value like NaN data = as_column(pa.array(pd.Series([arbitrary]), from_pandas=True)) elif np.isscalar(arbitrary) and not isinstance(arbitrary, memoryview): if hasattr(arbitrary, 'dtype'): data_type = _gdf.np_to_pa_dtype(arbitrary.dtype) if data_type in (pa.date64(), pa.date32()): # PyArrow can't construct date64 or date32 arrays from np # datetime types arbitrary = arbitrary.astype('int64') data = as_column(pa.array([arbitrary], type=data_type)) else: data = as_column(pa.array([arbitrary]), nan_as_null=nan_as_null) elif isinstance(arbitrary, memoryview): data = as_column(np.array(arbitrary), dtype=dtype, nan_as_null=nan_as_null) else: try: data = as_column(memoryview(arbitrary)) except TypeError: try: pa_type = None if dtype is not None: if pd.api.types.is_categorical_dtype(dtype): raise TypeError else: np_type = np.dtype(dtype).type if np_type == np.bool_: pa_type = pa.bool_() else: pa_type = _gdf.np_to_pa_dtype(np.dtype(dtype).type) data = as_column(pa.array(arbitrary, type=pa_type, 
from_pandas=nan_as_null), nan_as_null=nan_as_null) except (pa.ArrowInvalid, pa.ArrowTypeError, TypeError): np_type = None if pd.api.types.is_categorical_dtype(dtype): data = as_column(pd.Series(arbitrary, dtype='category'), nan_as_null=nan_as_null) else: if dtype is None: np_type = None else: np_type = np.dtype(dtype) data = as_column(np.array(arbitrary, dtype=np_type), nan_as_null=nan_as_null) return data
def test_schema_pyarrow_types(): field_name = "column1" metadata = {b"metadata_k": b"metadata_v"} pyarrow_field = pyarrow_field_from_dict( { "name": field_name, "nullable": False, "metadata": metadata, "type": {"name": "int", "bitWidth": 8, "isSigned": True}, } ) assert pyarrow_field.name == field_name assert pyarrow_field.type == pyarrow.int8() assert dict(pyarrow_field.metadata) == metadata assert pyarrow_field.nullable is False field_name = "column_timestamp_no_unit" metadata = {b"metadata_k": b"metadata_v"} pyarrow_field = pyarrow_field_from_dict( { "name": field_name, "nullable": False, "metadata": metadata, "type": {"name": "timestamp"}, } ) assert pyarrow_field.name == field_name assert pyarrow_field.type == pyarrow.timestamp("ns") assert dict(pyarrow_field.metadata) == metadata assert pyarrow_field.nullable is False field_name = "column_timestamp_with_unit" metadata = {b"metadata_k": b"metadata_v"} pyarrow_field = pyarrow_field_from_dict( { "name": field_name, "nullable": False, "metadata": metadata, "type": {"name": "timestamp", "unit": "MICROSECOND"}, } ) assert pyarrow_field.name == field_name assert pyarrow_field.type == pyarrow.timestamp("us") assert dict(pyarrow_field.metadata) == metadata assert pyarrow_field.nullable is False field_name = "date_with_day_unit" metadata = {b"metadata_k": b"metadata_v"} pyarrow_field = pyarrow_field_from_dict( { "name": field_name, "nullable": False, "metadata": metadata, "type": {"name": "date", "unit": "DAY"}, } ) assert pyarrow_field.name == field_name assert pyarrow_field.type == pyarrow.date32() assert dict(pyarrow_field.metadata) == metadata assert pyarrow_field.nullable is False field_name = "simple_list" pyarrow_field = pyarrow_field_from_dict( { "name": field_name, "nullable": False, "metadata": metadata, "type": {"name": "list"}, "children": [{"type": {"name": "int", "bitWidth": 32, "isSigned": True}}], } ) assert pyarrow_field.name == field_name assert pyarrow_field.type == pyarrow.list_( pyarrow.field("element", pyarrow.int32()) ) assert pyarrow_field.metadata == metadata assert pyarrow_field.nullable is False field_name = "dictionary" pyarrow_field = pyarrow_field_from_dict( { "name": field_name, "nullable": False, "metadata": metadata, "type": {"name": "int", "bitWidth": 32, "isSigned": True}, "children": [], "dictionary": { "id": 0, "indexType": {"name": "int", "bitWidth": 16, "isSigned": True}, }, } ) assert pyarrow_field.name == field_name assert pyarrow_field.type == pyarrow.map_(pyarrow.int16(), pyarrow.int32()) assert pyarrow_field.metadata == metadata assert pyarrow_field.nullable is False field_name = "struct_array" pyarrow_field = pyarrow_field_from_dict( { "name": field_name, "nullable": False, "metadata": metadata, "type": {"name": "list"}, "children": [], "dictionary": { "id": 0, "indexType": {"name": "int", "bitWidth": 32, "isSigned": True}, }, } ) assert pyarrow_field.name == field_name assert pyarrow_field.type == pyarrow.map_( pyarrow.int32(), pyarrow.list_( pyarrow.field( "element", pyarrow.struct( [pyarrow.field("val", pyarrow.int32(), False, metadata)] ), ) ), ) assert pyarrow_field.metadata == metadata assert pyarrow_field.nullable is False field_name = "simple_dictionary" pyarrow_field = pyarrow_field_from_dict( { "name": field_name, "metadata": {"metadata_k": "metadata_v"}, "nullable": False, "type": {"name": "dictionary"}, "dictionary": {"indexType": {"type": {"name": "int", "bitWidth": 8}}}, "children": [{"type": {"name": "int", "bitWidth": 32}}], } ) assert pyarrow_field.name == field_name assert 
pyarrow_field.type == pyarrow.map_(pyarrow.int8(), pyarrow.int32()) assert pyarrow_field.metadata == metadata assert pyarrow_field.nullable is False pyarrow_field = pyarrow_field_from_dict( { "name": field_name, "type": {"name": "struct"}, "children": [ { "name": "x", "type": {"name": "int", "bitWidth": 64}, "nullable": True, "metadata": {}, } ], "metadata": {"metadata_k": "metadata_v"}, "nullable": False, } ) assert pyarrow_field.name == field_name assert pyarrow_field.type == pyarrow.struct( [pyarrow.field("x", pyarrow.int64(), True, {})] ) assert pyarrow_field.metadata == metadata assert pyarrow_field.nullable is False
"INT64", pyarrow.uint64().id: "INT64", pyarrow.float16().id: "FLOAT64", pyarrow.float32().id: "FLOAT64", pyarrow.float64().id: "FLOAT64", pyarrow.time32("ms").id: "TIME", pyarrow.time64("ns").id: "TIME", pyarrow.timestamp("ns").id: "TIMESTAMP", pyarrow.date32().id: "DATE", pyarrow.date64().id: "DATETIME", # because millisecond resolution pyarrow.binary().id: "BYTES", pyarrow.string().id: "STRING", # also alias for pyarrow.utf8() # The exact scale and precision don't matter, see below. pyarrow.decimal128(38, scale=9).id: "NUMERIC", } if version.parse(pyarrow.__version__) >= version.parse("3.0.0"): BQ_TO_ARROW_SCALARS["BIGNUMERIC"] = pyarrow_bignumeric # The exact decimal's scale and precision are not important, as only
from pandas import DataFrame as PandasDF from pyarrow import date32, float64, string from pyspark.sql import DataFrame as SparkDF from pyspark.sql import Window from pyspark.sql import functions as f sfo_q2_weather_sample_table = source_table( path=("sfo_q2_weather_sample", ), columns=[Column("tmpf", float64()), Column("valid_date", string())], ) @computed_table( input_assets=[sfo_q2_weather_sample_table], columns=[Column("valid_date", date32()), Column("max_tmpf", float64())], ) def daily_temperature_highs_table(sfo_q2_weather_sample: PandasDF) -> PandasDF: """Computes the temperature high for each day""" sfo_q2_weather_sample["valid_date"] = pd.to_datetime( sfo_q2_weather_sample["valid"]) return sfo_q2_weather_sample.groupby("valid_date").max().rename( columns={"tmpf": "max_tmpf"}) @computed_table( input_assets=[daily_temperature_highs_table], columns=[Column("valid_date", date32()), Column("max_tmpf", float64())], )
(pa.time32('ms'), '{"name":"time","unit":"MILLISECOND","bitWidth":32}'), (pa.time64('us'), '{"name":"time","unit":"MICROSECOND","bitWidth":64}'), (pa.time64('ns'), '{"name":"time","unit":"NANOSECOND","bitWidth":64}'), (pa.timestamp('s'), '{"name":"timestamp","unit":"SECOND",' '"timezone":null}'), (pa.timestamp('ms'), '{"name":"timestamp","unit":"MILLISECOND",' '"timezone":null}'), (pa.timestamp('us'), '{"name":"timestamp","unit":"MICROSECOND",' '"timezone":null}'), (pa.timestamp('ns'), '{"name":"timestamp","unit":"NANOSECOND",' '"timezone":null}'), (pa.timestamp('ns', tz='UTC'), '{"name":"timestamp","unit":"NANOSECOND"' ',"timezone":"UTC"}'), (pa.timestamp('ns', tz='Europe/Paris'), '{"name":"timestamp",' '"unit":"NANOSECOND","timezone":"Europe/Paris"}'), (pa.date32(), '{"name":"date","unit":"DAY"}'), (pa.date64(), '{"name":"date","unit":"MILLISECOND"}'), (pa.decimal128(19, 4), '{"name":"decimal","precision":19,"scale":4}'), (pa.string(), '{"name":"utf8"}'), (pa.binary(), '{"name":"binary"}'), (pa.binary(10), '{"name":"fixedsizebinary","byteWidth":10}'), # TODO(ARROW-2609): complex types that have children # pa.list_(pa.int32()), # pa.struct([pa.field('a', pa.int32()), # pa.field('b', pa.int8()), # pa.field('c', pa.string())]), # pa.union([pa.field('a', pa.binary(10)), # pa.field('b', pa.string())], mode=pa.lib.UnionMode_DENSE), # pa.union([pa.field('a', pa.binary(10)), # pa.field('b', pa.string())], mode=pa.lib.UnionMode_SPARSE), # TODO: DictionaryType requires a vector in the type
def dataframe_with_lists(include_index=False, parquet_compatible=False):
    """
    Dataframe with list columns of every possible primitive type.

    Parameters
    ----------
    include_index: bool
        Include an index column in the dataframe.
    parquet_compatible: bool
        Exclude types not supported by parquet.

    Returns
    -------
    df: pandas.DataFrame
    schema: pyarrow.Schema
        Arrow schema definition that is in line with the constructed df.
    """
    arrays = OrderedDict()
    fields = []

    fields.append(pa.field('int64', pa.list_(pa.int64())))
    arrays['int64'] = [
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
        [0, 1, 2, 3, 4],
        None,
        [],
        np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9] * 2, dtype=np.int64)[::2]
    ]
    fields.append(pa.field('double', pa.list_(pa.float64())))
    arrays['double'] = [
        [0., 1., 2., 3., 4., 5., 6., 7., 8., 9.],
        [0., 1., 2., 3., 4.],
        None,
        [],
        np.array([0., 1., 2., 3., 4., 5., 6., 7., 8., 9.] * 2)[::2],
    ]
    fields.append(pa.field('bytes_list', pa.list_(pa.binary())))
    arrays['bytes_list'] = [
        [b"1", b"f"],
        None,
        [b"1"],
        [b"1", b"2", b"3"],
        [],
    ]
    fields.append(pa.field('str_list', pa.list_(pa.string())))
    arrays['str_list'] = [
        [u"1", u"ä"],
        None,
        [u"1"],
        [u"1", u"2", u"3"],
        [],
    ]
    date_data = [
        [],
        [date(2018, 1, 1), date(2032, 12, 30)],
        [date(2000, 6, 7)],
        None,
        [date(1969, 6, 9), date(1972, 7, 3)]
    ]
    time_data = [
        [time(23, 11, 11), time(1, 2, 3), time(23, 59, 59)],
        [],
        [time(22, 5, 59)],
        None,
        [time(0, 0, 0), time(18, 0, 2), time(12, 7, 3)]
    ]
    temporal_pairs = [
        (pa.date32(), date_data),
        (pa.date64(), date_data),
        (pa.time32('s'), time_data),
        (pa.time32('ms'), time_data),
        (pa.time64('us'), time_data)
    ]
    if not parquet_compatible:
        temporal_pairs += [
            (pa.time64('ns'), time_data),
        ]
    for value_type, data in temporal_pairs:
        field_name = '{}_list'.format(value_type)
        field_type = pa.list_(value_type)
        field = pa.field(field_name, field_type)
        fields.append(field)
        arrays[field_name] = data

    if include_index:
        fields.append(pa.field('__index_level_0__', pa.int64()))

    df = pd.DataFrame(arrays)
    schema = pa.schema(fields)
    return df, schema
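# Usage sketch, assuming dataframe_with_lists above is importable: each list
# column keeps its declared value type when the frame is converted to a
# table against the returned schema.
import pyarrow as pa

df, schema = dataframe_with_lists(parquet_compatible=True)
table = pa.Table.from_pandas(df, schema=schema, preserve_index=False)
assert table.column('date32_list').type == pa.list_(pa.date32())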
}, "timestamp": { "type": "long", "logicalType": "timestamp-micros" }, } # This dictionary is duplicated in bigquery/google/cloud/bigquery/_pandas_helpers.py # When modifying it be sure to update it there as well. BQ_TO_ARROW_TYPES = { "int64": pyarrow.int64(), "float64": pyarrow.float64(), "bool": pyarrow.bool_(), "numeric": pyarrow.decimal128(38, 9), "string": pyarrow.utf8(), "bytes": pyarrow.binary(), "date": pyarrow.date32(), # int32 days since epoch "datetime": pyarrow.timestamp("us"), "time": pyarrow.time64("us"), "timestamp": pyarrow.timestamp("us", tz="UTC"), } SCALAR_COLUMNS = [ { "name": "int_col", "type": "int64" }, { "name": "float_col", "type": "float64" }, { "name": "num_col",