def test_parquet_roundtrip(self, duckdb_cursor):
    """Round-trip userdata1.parquet through Arrow and DuckDB and check the results agree.

    Reads the bundled parquet file, projects the same column list through both
    the Arrow relation path and the DuckDB parquet reader path, and asserts the
    two produce equal Arrow tables — also when the input is rebuilt from record
    batches of assorted sizes.
    """
    if not can_run:
        return
    parquet_filename = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), 'data', 'userdata1.parquet')
    cols = 'id, first_name, last_name, email, gender, ip_address, cc, country, birthdate, salary, title, comments'

    # TODO timestamp
    userdata_parquet_table = pyarrow.parquet.read_table(parquet_filename)
    userdata_parquet_table.validate(full=True)

    rel_from_arrow = duckdb.arrow(userdata_parquet_table).project(cols).arrow()
    rel_from_arrow.validate(full=True)

    rel_from_duckdb = duckdb.from_parquet(parquet_filename).project(cols).arrow()
    rel_from_duckdb.validate(full=True)

    # batched version, lets use various values for batch size
    for batch_size in [7, 51, 99, 100, 101, 500, 1000, 2000]:
        userdata_parquet_table2 = pyarrow.Table.from_batches(
            userdata_parquet_table.to_batches(batch_size))
        assert userdata_parquet_table.equals(userdata_parquet_table2, check_metadata=True)

        rel_from_arrow2 = duckdb.arrow(userdata_parquet_table2).project(cols).arrow()
        rel_from_arrow2.validate(full=True)

        assert rel_from_arrow.equals(rel_from_arrow2, check_metadata=True)
        assert rel_from_arrow.equals(rel_from_duckdb, check_metadata=True)
def test_arrow(self, duckdb_cursor):
    """Download userdata1.parquet and verify Arrow/DuckDB round-trips.

    First compares the Arrow-relation and parquet-reader projection paths
    (including inputs rebuilt from various batch sizes), then round-trips a
    DuckDB-generated table covering NULLs and the primitive types through
    ``from_arrow_table``.

    Fix: the downloaded file was never removed, leaking into the working
    directory; it is now cleaned up in a ``finally`` so reruns start fresh.
    """
    if not can_run:
        return
    parquet_filename = 'userdata1.parquet'
    urllib.request.urlretrieve(
        'https://github.com/cwida/duckdb-data/releases/download/v1.0/userdata1.parquet',
        parquet_filename)
    try:
        cols = 'id, first_name, last_name, email, gender, ip_address, cc, country, birthdate, salary, title, comments'

        # TODO timestamp
        userdata_parquet_table = pyarrow.parquet.read_table(parquet_filename)
        userdata_parquet_table.validate(full=True)

        rel_from_arrow = duckdb.arrow(userdata_parquet_table).project(cols).arrow()
        rel_from_arrow.validate(full=True)

        rel_from_duckdb = duckdb.from_parquet(parquet_filename).project(cols).arrow()
        rel_from_duckdb.validate(full=True)

        # batched version, lets use various values for batch size
        for i in [7, 51, 99, 100, 101, 500, 1000, 2000]:
            userdata_parquet_table2 = pyarrow.Table.from_batches(
                userdata_parquet_table.to_batches(i))
            assert userdata_parquet_table.equals(userdata_parquet_table2, check_metadata=True)

            rel_from_arrow2 = duckdb.arrow(userdata_parquet_table2).project(cols).arrow()
            rel_from_arrow2.validate(full=True)

            assert rel_from_arrow.equals(rel_from_arrow2, check_metadata=True)
            assert rel_from_arrow.equals(rel_from_duckdb, check_metadata=True)
    finally:
        # The download lands in the working directory; remove it even on failure.
        os.remove(parquet_filename)

    # Round-trip a generated table: NULL column plus every basic scalar type,
    # with every other row NULL via the case expression.
    con = duckdb.connect()
    con.execute(
        "select NULL c_null, (c % 4 = 0)::bool c_bool, (c%128)::tinyint c_tinyint, c::smallint*1000 c_smallint, c::integer*100000 c_integer, c::bigint*1000000000000 c_bigint, c::float c_float, c::double c_double, 'c_' || c::string c_string from (select case when range % 2 == 0 then range else null end as c from range(-10000, 10000)) sq"
    )
    arrow_result = con.fetch_arrow_table()
    arrow_result.validate(full=True)
    arrow_result.combine_chunks()
    arrow_result.validate(full=True)

    round_tripping = duckdb.from_arrow_table(arrow_result).to_arrow_table()
    round_tripping.validate(full=True)
    assert round_tripping.equals(arrow_result, check_metadata=True)
def test_filter_pushdown_2145(self, duckdb_cursor):
    """Regression test for issue #2145: filter pushdown over a multi-file Arrow dataset.

    Writes two year-long business-day frames to parquet, reads them back as one
    Arrow dataset, and checks that filtering the Arrow relation matches
    filtering DuckDB's own multi-file parquet scan.

    Fix: the temporary parquet files were only removed on the success path and
    leaked when the frame comparison raised; cleanup now runs in ``finally``.
    """
    if not can_run:
        return
    date1 = pd.date_range("2018-01-01", "2018-12-31", freq="B")
    df1 = pd.DataFrame(np.random.randn(date1.shape[0], 5), columns=list("ABCDE"))
    df1["date"] = date1
    date2 = pd.date_range("2019-01-01", "2019-12-31", freq="B")
    df2 = pd.DataFrame(np.random.randn(date2.shape[0], 5), columns=list("ABCDE"))
    df2["date"] = date2

    pq.write_table(pa.table(df1), "data1.parquet")
    pq.write_table(pa.table(df2), "data2.parquet")
    try:
        table = pq.ParquetDataset(["data1.parquet", "data2.parquet"]).read()
        con = duckdb.connect()
        con.register_arrow("testarrow", table)

        # Same predicate through the Arrow relation and the native parquet scan
        # must yield identical frames.
        output_df = duckdb.arrow(table).filter("date > '2019-01-01'").df()
        expected_df = duckdb.from_parquet("data*.parquet").filter(
            "date > '2019-01-01'").df()
        pd.testing.assert_frame_equal(expected_df, output_df)
    finally:
        os.remove("data1.parquet")
        os.remove("data2.parquet")
def test_unsigned_roundtrip(self, duckdb_cursor):
    """Project unsigned parquet columns via Arrow and via DuckDB and compare,
    then round-trip a DuckDB-generated table of basic types through Arrow."""
    if not can_run:
        return
    parquet_filename = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), 'data', 'unsigned.parquet')
    cols = 'a, b, c, d'

    unsigned_parquet_table = pyarrow.parquet.read_table(parquet_filename)
    unsigned_parquet_table.validate(full=True)

    rel_from_arrow = duckdb.arrow(unsigned_parquet_table).project(cols).arrow()
    rel_from_arrow.validate(full=True)

    rel_from_duckdb = duckdb.from_parquet(parquet_filename).project(cols).arrow()
    rel_from_duckdb.validate(full=True)

    assert rel_from_arrow.equals(rel_from_duckdb, check_metadata=True)

    # Generate a table with a NULL column and each scalar type (every other row
    # NULL), pull it out as Arrow, and feed it straight back into DuckDB.
    con = duckdb.connect()
    con.execute(
        "select NULL c_null, (c % 4 = 0)::bool c_bool, (c%128)::tinyint c_tinyint, c::smallint*1000 c_smallint, c::integer*100000 c_integer, c::bigint*1000000000000 c_bigint, c::float c_float, c::double c_double, 'c_' || c::string c_string from (select case when range % 2 == 0 then range else null end as c from range(-10000, 10000)) sq"
    )
    arrow_result = con.fetch_arrow_table()
    arrow_result.validate(full=True)
    arrow_result.combine_chunks()
    arrow_result.validate(full=True)

    round_tripping = duckdb.from_arrow(arrow_result).to_arrow_table()
    round_tripping.validate(full=True)
    assert round_tripping.equals(arrow_result, check_metadata=True)
def test_unsigned_roundtrip(self, duckdb_cursor):
    """Write a parquet file of max-valued unsigned columns, compare the Arrow
    and DuckDB projection paths, then round-trip a generated typed table."""
    if not can_run:
        return
    parquet_filename = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), 'data', 'unsigned.parquet')

    # One column per unsigned width; each ends at the type's maximum value.
    data = (
        pyarrow.array([1, 2, 3, 4, 5, 255], type=pyarrow.uint8()),
        pyarrow.array([1, 2, 3, 4, 5, 65535], type=pyarrow.uint16()),
        pyarrow.array([1, 2, 3, 4, 5, 4294967295], type=pyarrow.uint32()),
        pyarrow.array([1, 2, 3, 4, 5, 18446744073709551615], type=pyarrow.uint64()),
    )
    tbl = pyarrow.Table.from_arrays(list(data), ['a', 'b', 'c', 'd'])
    pyarrow.parquet.write_table(tbl, parquet_filename)

    cols = 'a, b, c, d'
    unsigned_parquet_table = pyarrow.parquet.read_table(parquet_filename)
    unsigned_parquet_table.validate(full=True)

    rel_from_arrow = duckdb.arrow(unsigned_parquet_table).project(cols).arrow()
    rel_from_arrow.validate(full=True)

    rel_from_duckdb = duckdb.from_parquet(parquet_filename).project(cols).arrow()
    rel_from_duckdb.validate(full=True)

    assert rel_from_arrow.equals(rel_from_duckdb, check_metadata=True)

    # Round-trip a generated table (NULL column plus each scalar type, every
    # other row NULL) through Arrow and back.
    con = duckdb.connect()
    con.execute(
        "select NULL c_null, (c % 4 = 0)::bool c_bool, (c%128)::tinyint c_tinyint, c::smallint*1000 c_smallint, c::integer*100000 c_integer, c::bigint*1000000000000 c_bigint, c::float c_float, c::double c_double, 'c_' || c::string c_string from (select case when range % 2 == 0 then range else null end as c from range(-10000, 10000)) sq"
    )
    arrow_result = con.fetch_arrow_table()
    arrow_result.validate(full=True)
    arrow_result.combine_chunks()
    arrow_result.validate(full=True)

    round_tripping = duckdb.from_arrow_table(arrow_result).to_arrow_table()
    round_tripping.validate(full=True)
    assert round_tripping.equals(arrow_result, check_metadata=True)
def test_arrow(self, duckdb_cursor):
    """Download userdata1.parquet and compare the Arrow-relation projection
    against DuckDB's parquet reader, including inputs rebuilt from record
    batches of assorted sizes.

    Fix: the downloaded file was never removed, leaking into the working
    directory; it is now cleaned up in a ``finally`` so reruns start fresh.
    """
    if not can_run:
        return
    parquet_filename = 'userdata1.parquet'
    urllib.request.urlretrieve(
        'https://github.com/cwida/duckdb-data/releases/download/v1.0/userdata1.parquet',
        parquet_filename)
    try:
        cols = 'id, first_name, last_name, email, gender, ip_address, cc, country, birthdate, salary, title, comments'

        # TODO timestamp
        userdata_parquet_table = pyarrow.parquet.read_table(parquet_filename)
        userdata_parquet_table.validate(full=True)

        rel_from_arrow = duckdb.arrow(userdata_parquet_table).project(cols).arrow()
        rel_from_arrow.validate(full=True)

        rel_from_duckdb = duckdb.from_parquet(parquet_filename).project(cols).arrow()
        rel_from_duckdb.validate(full=True)

        # batched version, lets use various values for batch size
        for i in [7, 51, 99, 100, 101, 500, 1000, 2000]:
            userdata_parquet_table2 = pyarrow.Table.from_batches(
                userdata_parquet_table.to_batches(i))
            assert userdata_parquet_table.equals(userdata_parquet_table2, check_metadata=True)

            rel_from_arrow2 = duckdb.arrow(userdata_parquet_table2).project(cols).arrow()
            rel_from_arrow2.validate(full=True)

            assert rel_from_arrow.equals(rel_from_arrow2, check_metadata=True)
            assert rel_from_arrow.equals(rel_from_duckdb, check_metadata=True)
    finally:
        # The download lands in the working directory; remove it even on failure.
        os.remove(parquet_filename)
def test_from_arrow(self, duckdb_cursor):
    """Registering an Arrow table on a connection makes it joinable against
    native tables through both ``from_arrow_table`` and its ``arrow`` alias.

    Fix: the bare ``except:`` also swallowed ``KeyboardInterrupt`` and
    ``SystemExit``; narrowed to ``ImportError``, the only expected failure.
    """
    try:
        import pyarrow as pa
    except ImportError:
        # pyarrow is optional for this test; skip silently when unavailable.
        return
    conn = duckdb.connect()
    conn.execute("create table t (a integer)")
    conn.execute("insert into t values (1)")
    test_df = pd.DataFrame.from_dict({"i": [1, 2, 3, 4]})
    test_arrow = pa.Table.from_pandas(test_df)

    # Only i == 1 matches a == 1, so the join yields exactly one row.
    rel = duckdb.from_arrow_table(test_arrow, connection=conn)
    assert rel.query('t_2', 'select count(*) from t inner join t_2 on (a = i)').fetchall()[0] == (1,)
    rel = duckdb.arrow(test_arrow, connection=conn)
    assert rel.query('t_2', 'select count(*) from t inner join t_2 on (a = i)').fetchall()[0] == (1,)