def test_timestamp_overflow(self, duckdb_cursor):
    """INT64-max timestamps round trip at their native resolution, but
    casting them to a finer resolution (which would overflow) must raise."""
    if not can_run:
        return
    max_int64 = 9223372036854775807
    columns = (pa.array([max_int64], pa.timestamp('s')),
               pa.array([max_int64], pa.timestamp('ms')),
               pa.array([max_int64], pa.timestamp('us')))
    arrow_table = pa.Table.from_arrays([columns[0], columns[1], columns[2]], ['a', 'b', 'c'])

    # Round trip at native resolution succeeds and is lossless.
    arrow_from_duck = duckdb.from_arrow_table(arrow_table).arrow()
    for name in ['a', 'b', 'c']:
        assert (arrow_from_duck[name] == arrow_table[name])

    # Casting each column to a finer timestamp unit overflows and must raise.
    for projection in ['a::TIMESTAMP_US', 'b::TIMESTAMP_US', 'c::TIMESTAMP_NS']:
        with pytest.raises(Exception):
            duck_rel = duckdb.from_arrow_table(arrow_table)
            res = duck_rel.project(projection)
            res.fetchone()
def test_dictionary_null_index(self, duckdb_cursor):
    """Dictionary arrays with NULLs in the indices and/or the dictionary
    values decode to the right rows, including runs larger than one vector
    and mixed with a non-dictionary column."""
    if not can_run:
        return
    # NULL index plus a NULL dictionary entry.
    indices = pa.array([None, 1, 0, 1, 2, 1, 0, 2])
    dictionary = pa.array([10, 100, None])
    dict_array = pa.DictionaryArray.from_arrays(indices, dictionary)
    arrow_table = pa.Table.from_arrays([dict_array], ['a'])
    rel = duckdb.from_arrow_table(arrow_table)
    assert rel.execute().fetchall() == [(None,), (100,), (10,), (100,), (None,), (100,), (10,), (None,)]

    # NULL indices with duplicate dictionary values.
    indices = pa.array([None, 1, None, 1, 2, 1, 0])
    dictionary = pa.array([10, 100, 100])
    dict_array = pa.DictionaryArray.from_arrays(indices, dictionary)
    arrow_table = pa.Table.from_arrays([dict_array], ['a'])
    rel = duckdb.from_arrow_table(arrow_table)
    # Fetch once: the previous version re-executed the query and left a
    # stray debug print behind.
    assert rel.execute().fetchall() == [(None,), (100,), (None,), (100,), (100,), (100,), (10,)]

    # Test Big Vector
    indices_list = [None, 1, None, 1, 2, 1, 0]
    indices = pa.array(indices_list * 1000)
    dictionary = pa.array([10, 100, 100])
    dict_array = pa.DictionaryArray.from_arrays(indices, dictionary)
    arrow_table = pa.Table.from_arrays([dict_array], ['a'])
    rel = duckdb.from_arrow_table(arrow_table)
    result = [(None,), (100,), (None,), (100,), (100,), (100,), (10,)] * 1000
    assert rel.execute().fetchall() == result

    # Table with dictionary and normal array
    arrow_table = pa.Table.from_arrays([dict_array, indices], ['a', 'b'])
    rel = duckdb.from_arrow_table(arrow_table)
    result = [(None, None), (100, 1), (None, None), (100, 1), (100, 2), (100, 1), (10, 0)] * 1000
    assert rel.execute().fetchall() == result
def test_max_times(self, duckdb_cursor):
    """Maximum representable time32/time64 values convert to the expected
    microsecond-based times after going through duckdb."""
    if not can_run:
        return
    # Max Sec
    data = pa.array([2147483647000000], type=pa.time64('us'))
    result = pa.Table.from_arrays([data], ['a'])
    data = pa.array([2147483647], type=pa.time32('s'))
    arrow_table = pa.Table.from_arrays([data], ['a'])
    rel = duckdb.from_arrow_table(arrow_table).arrow()
    assert (rel['a'] == result['a'])

    # Max MSec
    data = pa.array([2147483647000], type=pa.time64('us'))
    result = pa.Table.from_arrays([data], ['a'])
    data = pa.array([2147483647], type=pa.time32('ms'))
    arrow_table = pa.Table.from_arrays([data], ['a'])
    rel = duckdb.from_arrow_table(arrow_table).arrow()
    assert (rel['a'] == result['a'])

    # Max NSec: nanoseconds are truncated down to microseconds.
    data = pa.array([9223372036854774], type=pa.time64('us'))
    result = pa.Table.from_arrays([data], ['a'])
    data = pa.array([9223372036854774000], type=pa.time64('ns'))
    arrow_table = pa.Table.from_arrays([data], ['a'])
    rel = duckdb.from_arrow_table(arrow_table).arrow()
    # Leftover debug prints removed; assert directly.
    assert (rel['a'] == result['a'])
def test_dictionary(self, duckdb_cursor):
    """Plain dictionary arrays decode correctly, including inputs larger than
    one duckdb vector and tables mixing dictionary and plain columns."""
    if not can_run:
        return
    small_indices = pa.array([0, 1, 0, 1, 2, 1, 0, 2])
    small_dictionary = pa.array([10, 100, None])
    small_dict_array = pa.DictionaryArray.from_arrays(small_indices, small_dictionary)
    rel = duckdb.from_arrow_table(pa.Table.from_arrays([small_dict_array], ['a']))
    expected = [(10, ), (100, ), (10, ), (100, ), (None, ), (100, ), (10, ), (None, )]
    assert rel.execute().fetchall() == expected

    # Bigger than Vector Size
    indices_list = [0, 1, 0, 1, 2, 1, 0, 2, 3] * 10000
    big_dictionary = pa.array([10, 100, None, 999999])
    big_dict_array = pa.DictionaryArray.from_arrays(pa.array(indices_list), big_dictionary)
    rel = duckdb.from_arrow_table(pa.Table.from_arrays([big_dict_array], ['a']))
    expected = [(10, ), (100, ), (10, ), (100, ), (None, ), (100, ), (10, ), (None, ), (999999, )] * 10000
    assert rel.execute().fetchall() == expected

    # Table with dictionary and normal array
    mixed_table = pa.Table.from_arrays([big_dict_array, pa.array(indices_list)], ['a', 'b'])
    rel = duckdb.from_arrow_table(mixed_table)
    expected = [(10, 0), (100, 1), (10, 0), (100, 1), (None, 2), (100, 1), (10, 0), (None, 2), (999999, 3)] * 10000
    assert rel.execute().fetchall() == expected
def test_list_types(self, duckdb_cursor):
    """Large, fixed-size and regular list columns round trip correctly,
    both standalone and nested inside a struct."""
    if not can_run:
        return
    # Large Lists
    large = pyarrow.array([[1], None, [2]], type=pyarrow.large_list(pyarrow.int64()))
    rel = duckdb.from_arrow_table(pa.Table.from_arrays([large], ['a']))
    assert rel.execute().fetchall() == [([1], ), (None, ), ([2], )]

    # Fixed Size Lists
    fixed = pyarrow.array([[1], None, [2]], type=pyarrow.list_(pyarrow.int64(), 1))
    rel = duckdb.from_arrow_table(pa.Table.from_arrays([fixed], ['a']))
    assert rel.execute().fetchall() == [([1], ), (None, ), ([2], )]

    # Complex nested structures with different list types
    data = [
        pyarrow.array([[1], None, [2]], type=pyarrow.list_(pyarrow.int64(), 1)),
        pyarrow.array([[1], None, [2]], type=pyarrow.large_list(pyarrow.int64())),
        pyarrow.array([[1, 2, 3], None, [2, 1]], type=pyarrow.list_(pyarrow.int64())),
    ]
    rel = duckdb.from_arrow_table(pa.Table.from_arrays([data[0], data[1], data[2]], ['a', 'b', 'c']))
    assert rel.project('a').execute().fetchall() == [([1], ), (None, ), ([2], )]
    assert rel.project('b').execute().fetchall() == [([1], ), (None, ), ([2], )]
    assert rel.project('c').execute().fetchall() == [([1, 2, 3], ), (None, ), ([2, 1], )]

    # Struct Holding different List Types
    struct = [pa.StructArray.from_arrays(data, ['fixed', 'large', 'normal'])]
    rel = duckdb.from_arrow_table(pa.Table.from_arrays(struct, ['a']))
    assert rel.execute().fetchall() == [
        ({'fixed': [1], 'large': [1], 'normal': [1, 2, 3]}, ),
        ({'fixed': None, 'large': None, 'normal': None}, ),
        ({'fixed': [2], 'large': [2], 'normal': [2, 1]}, ),
    ]
def test_unsigned_roundtrip(self, duckdb_cursor):
    """Unsigned integer columns (each ending at the type's max value) survive
    a parquet round trip through both pyarrow and duckdb, and a duckdb result
    round trips through arrow unchanged."""
    if not can_run:
        return
    parquet_filename = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'data', 'unsigned.parquet')
    columns = (pyarrow.array([1, 2, 3, 4, 5, 255], type=pyarrow.uint8()),
               pyarrow.array([1, 2, 3, 4, 5, 65535], type=pyarrow.uint16()),
               pyarrow.array([1, 2, 3, 4, 5, 4294967295], type=pyarrow.uint32()),
               pyarrow.array([1, 2, 3, 4, 5, 18446744073709551615], type=pyarrow.uint64()))
    tbl = pyarrow.Table.from_arrays([columns[0], columns[1], columns[2], columns[3]], ['a', 'b', 'c', 'd'])
    pyarrow.parquet.write_table(tbl, parquet_filename)

    cols = 'a, b, c, d'
    unsigned_parquet_table = pyarrow.parquet.read_table(parquet_filename)
    unsigned_parquet_table.validate(full=True)
    rel_from_arrow = duckdb.arrow(unsigned_parquet_table).project(cols).arrow()
    rel_from_arrow.validate(full=True)
    rel_from_duckdb = duckdb.from_parquet(parquet_filename).project(cols).arrow()
    rel_from_duckdb.validate(full=True)
    assert rel_from_arrow.equals(rel_from_duckdb, check_metadata=True)

    # duckdb result -> arrow -> duckdb -> arrow must be lossless.
    con = duckdb.connect()
    con.execute("select NULL c_null, (c % 4 = 0)::bool c_bool, (c%128)::tinyint c_tinyint, c::smallint*1000 c_smallint, c::integer*100000 c_integer, c::bigint*1000000000000 c_bigint, c::float c_float, c::double c_double, 'c_' || c::string c_string from (select case when range % 2 == 0 then range else null end as c from range(-10000, 10000)) sq")
    arrow_result = con.fetch_arrow_table()
    arrow_result.validate(full=True)
    arrow_result.combine_chunks()
    arrow_result.validate(full=True)
    round_tripping = duckdb.from_arrow_table(arrow_result).to_arrow_table()
    round_tripping.validate(full=True)
    assert round_tripping.equals(arrow_result, check_metadata=True)
def test_dictionary_index_types(self, duckdb_cursor):
    """Every signed and unsigned integer width works as a dictionary index type."""
    if not can_run:
        return
    indices_list = [None, 1, None, 1, 2, 1, 0]
    dictionary = pa.array([10, 100, 100], type=pyarrow.uint8())
    expected = [(None, ), (100, ), (None, ), (100, ), (100, ), (100, ), (10, )] * 10000
    index_type_factories = [
        pyarrow.uint8, pyarrow.uint16, pyarrow.uint32, pyarrow.uint64,
        pyarrow.int8, pyarrow.int16, pyarrow.int32, pyarrow.int64,
    ]
    for factory in index_type_factories:
        indices = pa.array(indices_list * 10000, type=factory())
        dict_array = pa.DictionaryArray.from_arrays(indices, dictionary)
        arrow_table = pa.Table.from_arrays([dict_array], ['a'])
        rel = duckdb.from_arrow_table(arrow_table)
        assert rel.execute().fetchall() == expected
def compare_results(query):
    """Run *query* directly and via a duckdb->arrow->duckdb round trip and
    assert both produce identical rows.

    The previous version ran the query a third time into an unused local
    (`t`); that dead work is removed.
    """
    true_answer = duckdb.query(query).fetchall()
    from_arrow = duckdb.from_arrow_table(duckdb.query(query).arrow()).fetchall()
    assert true_answer == from_arrow
def test_date_null(self, duckdb_cursor):
    """NULL date64 and date32 values both come back as NULL date32 columns."""
    if not can_run:
        return
    columns = (pa.array([None], type=pa.date64()), pa.array([None], type=pa.date32()))
    arrow_table = pa.Table.from_arrays([columns[0], columns[1]], ['a', 'b'])
    round_tripped = duckdb.from_arrow_table(arrow_table).arrow()
    # Both outputs are compared against the date32 column 'b'.
    assert (round_tripped['a'] == arrow_table['b'])
    assert (round_tripped['b'] == arrow_table['b'])
def test_duration_null(self, duckdb_cursor):
    """NULL durations of every resolution normalize to the same NULL column."""
    if not can_run:
        return
    columns = (pa.array([None], type=pa.duration('ns')),
               pa.array([None], type=pa.duration('us')),
               pa.array([None], pa.duration('ms')),
               pa.array([None], pa.duration('s')))
    arrow_table = pa.Table.from_arrays([columns[0], columns[1], columns[2], columns[3]], ['a', 'b', 'c', 'd'])
    round_tripped = duckdb.from_arrow_table(arrow_table).arrow()
    # Every column is compared against the 'ms' column 'c'.
    for name in ['a', 'b', 'c', 'd']:
        assert (round_tripped[name] == arrow_table['c'])
def test_duration_overflow(self, duckdb_cursor):
    """A seconds duration at INT64 max overflows on conversion and must raise."""
    if not can_run:
        return
    # Only seconds can overflow
    seconds = pa.array([9223372036854775807], pa.duration('s'))
    arrow_table = pa.Table.from_arrays([seconds], ['a'])
    with pytest.raises(Exception):
        arrow_from_duck = duckdb.from_arrow_table(arrow_table).arrow()
def test_max_date(self, duckdb_cursor):
    """The maximum date32 value round trips; the equivalent date64 value
    (same day expressed in milliseconds) collapses to the same date32."""
    if not can_run:
        return
    expected = pa.Table.from_arrays(
        [pa.array([2147483647], type=pa.date32()),
         pa.array([2147483647], type=pa.date32())],
        ['a', 'b'])
    # date64 stores milliseconds, so scale the day count accordingly.
    inputs = (pa.array([2147483647*(1000*60*60*24)], type=pa.date64()),
              pa.array([2147483647], type=pa.date32()))
    arrow_table = pa.Table.from_arrays([inputs[0], inputs[1]], ['a', 'b'])
    round_tripped = duckdb.from_arrow_table(arrow_table).arrow()
    assert (round_tripped['a'] == expected['a'])
    assert (round_tripped['b'] == expected['b'])
def test_dictionary_batches(self, duckdb_cursor):
    """Dictionary columns split across many record batches decode correctly,
    alone and next to a plain column."""
    if not can_run:
        return
    indices_list = [None, 1, None, 1, 2, 1, 0]
    indices = pa.array(indices_list * 10000)
    dictionary = pa.array([10, 100, 100])
    dict_array = pa.DictionaryArray.from_arrays(indices, dictionary)

    single_column = pa.Table.from_arrays([dict_array], ['a'])
    batched = pyarrow.Table.from_batches(single_column.to_batches(10))
    rel = duckdb.from_arrow_table(batched)
    expected = [(None,), (100,), (None,), (100,), (100,), (100,), (10,)] * 10000
    assert rel.execute().fetchall() == expected

    # Table with dictionary and normal array
    two_columns = pa.Table.from_arrays([dict_array, indices], ['a', 'b'])
    batched = pyarrow.Table.from_batches(two_columns.to_batches(10))
    rel = duckdb.from_arrow_table(batched)
    expected = [(None, None), (100, 1), (None, None), (100, 1), (100, 2), (100, 1), (10, 0)] * 10000
    assert rel.execute().fetchall() == expected
def test_large_string_type(self, duckdb_cursor):
    """large_string columns are read back like regular strings."""
    if not can_run:
        return
    schema = pa.schema([("data", pa.large_string())])
    values = [pa.array(["foo", "baaaar", "b"], type=pa.large_string())]
    arrow_table = pa.Table.from_arrays(values, schema=schema)
    rel = duckdb.from_arrow_table(arrow_table)
    assert rel.execute().fetchall() == [('foo', ), ('baaaar', ), ('b', )]
def test_dictionary_strings(self, duckdb_cursor):
    """String dictionaries (with NULL indices) decode correctly across
    multiple duckdb vectors."""
    if not can_run:
        return
    indices_list = [None, 0, 1, 2, 3, 4, None]
    indices = pa.array(indices_list * 1000)
    names = pa.array(['Matt Daaaaaaaaamon', 'Alec Baldwin', 'Sean Penn', 'Tim Robbins', 'Samuel L. Jackson'])
    dict_array = pa.DictionaryArray.from_arrays(indices, names)
    rel = duckdb.from_arrow_table(pa.Table.from_arrays([dict_array], ['a']))
    expected = [(None,), ('Matt Daaaaaaaaamon',), ('Alec Baldwin',), ('Sean Penn',),
                ('Tim Robbins',), ('Samuel L. Jackson',), (None,)] * 1000
    assert rel.execute().fetchall() == expected
def test_binary_types(self, duckdb_cursor):
    """Fixed-size, regular and large binary columns all produce bytes rows."""
    if not can_run:
        return
    expected = [(b"foo", ), (b"bar", ), (b"baz", )]
    # Fixed Size Binary, normal Binary, Large Binary — same expected rows.
    for binary_type in [pa.binary(3), pa.binary(), pa.large_binary()]:
        rel = duckdb.from_arrow_table(create_binary_table(binary_type))
        assert rel.execute().fetchall() == expected
def test_dictionary_timestamps(self, duckdb_cursor):
    """Timestamp dictionaries with NULL indices decode into datetime values."""
    if not can_run:
        return
    indices_list = [None, 0, 1, 2, None]
    indices = pa.array(indices_list * 1000)
    dictionary = pa.array([Timestamp(year=2001, month=9, day=25),
                           Timestamp(year=2006, month=11, day=14),
                           Timestamp(year=2012, month=5, day=15),
                           Timestamp(year=2018, month=11, day=2)])
    dict_array = pa.DictionaryArray.from_arrays(indices, dictionary)
    arrow_table = pa.Table.from_arrays([dict_array], ['a'])
    rel = duckdb.from_arrow_table(arrow_table)
    result = [(None,), (datetime.datetime(2001, 9, 25, 0, 0),),
              (datetime.datetime(2006, 11, 14, 0, 0),),
              (datetime.datetime(2012, 5, 15, 0, 0),), (None,)] * 1000
    # Fetch once: the previous version executed the query twice and left a
    # stray debug print behind.
    assert rel.execute().fetchall() == result
def test_arrow(self, duckdb_cursor):
    """End-to-end comparison of reading a downloaded parquet file via pyarrow
    vs duckdb, including re-batched tables of many sizes and a full
    duckdb -> arrow -> duckdb round trip."""
    if not can_run:
        return
    parquet_filename = 'userdata1.parquet'
    urllib.request.urlretrieve(
        'https://github.com/cwida/duckdb-data/releases/download/v1.0/userdata1.parquet',
        parquet_filename)
    cols = 'id, first_name, last_name, email, gender, ip_address, cc, country, birthdate, salary, title, comments'
    # TODO timestamp

    userdata_parquet_table = pyarrow.parquet.read_table(parquet_filename)
    userdata_parquet_table.validate(full=True)
    rel_from_arrow = duckdb.arrow(userdata_parquet_table).project(cols).arrow()
    rel_from_arrow.validate(full=True)
    rel_from_duckdb = duckdb.from_parquet(parquet_filename).project(cols).arrow()
    rel_from_duckdb.validate(full=True)

    # batched version, lets use various values for batch size
    for batch_size in [7, 51, 99, 100, 101, 500, 1000, 2000]:
        rebatched = pyarrow.Table.from_batches(userdata_parquet_table.to_batches(batch_size))
        assert userdata_parquet_table.equals(rebatched, check_metadata=True)
        rel_from_arrow2 = duckdb.arrow(rebatched).project(cols).arrow()
        rel_from_arrow2.validate(full=True)
        assert rel_from_arrow.equals(rel_from_arrow2, check_metadata=True)

    assert rel_from_arrow.equals(rel_from_duckdb, check_metadata=True)

    con = duckdb.connect()
    con.execute(
        "select NULL c_null, (c % 4 = 0)::bool c_bool, (c%128)::tinyint c_tinyint, c::smallint*1000 c_smallint, c::integer*100000 c_integer, c::bigint*1000000000000 c_bigint, c::float c_float, c::double c_double, 'c_' || c::string c_string from (select case when range % 2 == 0 then range else null end as c from range(-10000, 10000)) sq"
    )
    arrow_result = con.fetch_arrow_table()
    arrow_result.validate(full=True)
    arrow_result.combine_chunks()
    arrow_result.validate(full=True)
    round_tripping = duckdb.from_arrow_table(arrow_result).to_arrow_table()
    round_tripping.validate(full=True)
    assert round_tripping.equals(arrow_result, check_metadata=True)
def test_parallel_types_and_different_batches(self, duckdb_cursor):
    """Parallel scans over arrow tables re-batched at many sizes give stable
    filter/aggregate results.

    The unused local `cols` from the previous version is removed.
    """
    if not can_run:
        return
    duckdb_conn = duckdb.connect()
    duckdb_conn.execute("PRAGMA threads=4")
    duckdb_conn.execute("PRAGMA force_parallelism")
    parquet_filename = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), 'data', 'userdata1.parquet')
    userdata_parquet_table = pyarrow.parquet.read_table(parquet_filename)
    for i in [7, 51, 99, 100, 101, 500, 1000, 2000]:
        # NOTE(review): this second relation is never queried — presumably it
        # exists to keep another arrow table registered during the scan; confirm.
        data = (pyarrow.array(np.arange(3, 7), type=pyarrow.int32()))
        tbl = pyarrow.Table.from_arrays([data], ['a'])
        rel_id = duckdb.from_arrow_table(tbl)
        userdata_parquet_table2 = pyarrow.Table.from_batches(
            userdata_parquet_table.to_batches(i))
        rel = duckdb.from_arrow_table(userdata_parquet_table2)
        result = rel.filter("first_name=\'Jose\' and salary > 134708.82").aggregate('count(*)')
        assert (result.execute().fetchone()[0] == 4)
def test_timestamp_types(self, duckdb_cursor):
    """Timestamps of every resolution round trip back to their own columns."""
    if not can_run:
        return
    columns = (pa.array([datetime.datetime.now()], type=pa.timestamp('ns')),
               pa.array([datetime.datetime.now()], type=pa.timestamp('us')),
               pa.array([datetime.datetime.now()], pa.timestamp('ms')),
               pa.array([datetime.datetime.now()], pa.timestamp('s')))
    arrow_table = pa.Table.from_arrays(
        [columns[0], columns[1], columns[2], columns[3]], ['a', 'b', 'c', 'd'])
    round_tripped = duckdb.from_arrow_table(arrow_table).arrow()
    for name in ['a', 'b', 'c', 'd']:
        assert (round_tripped[name] == arrow_table[name])
def test_parallel_fewer_batches_than_threads(self, duckdb_cursor):
    """A parallel scan still works when there are fewer batches than threads."""
    if not can_run:
        return
    duckdb_conn = duckdb.connect()
    duckdb_conn.execute("PRAGMA threads=4")
    duckdb_conn.execute("PRAGMA force_parallelism")
    # 1000 rows in only 2 batches, scanned with 4 threads.
    values = (pyarrow.array(np.random.randint(800, size=1000), type=pyarrow.int32()))
    tbl = pyarrow.Table.from_batches(
        pyarrow.Table.from_arrays([values], ['a']).to_batches(2))
    rel = duckdb.from_arrow_table(tbl)
    # Also test multiple reads
    assert (rel.aggregate("(count(a))::INT").execute().fetchone()[0] == 1000)
def test_multiple_queries_same_relation(self, duckdb_cursor):
    """The same arrow-backed relation can be queried more than once.

    The unused local `cols` from the previous version is removed.
    """
    if not can_run:
        return
    parquet_filename = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), 'data', 'userdata1.parquet')
    userdata_parquet_table = pyarrow.parquet.read_table(parquet_filename)
    userdata_parquet_table.validate(full=True)
    rel = duckdb.from_arrow_table(userdata_parquet_table)
    # Querying twice must produce the same aggregate both times.
    assert (rel.aggregate("(avg(salary))::INT").execute().fetchone()[0] == 149005)
    assert (rel.aggregate("(avg(salary))::INT").execute().fetchone()[0] == 149005)
def test_null_type(self, duckdb_cursor):
    """A pure-NULL (null-typed) column comes back as int32 NULLs."""
    if not can_run:
        return
    null_values = pa.array([None, None, None], type=pa.null())
    source = pa.Table.from_arrays([null_values], schema=pa.schema([("data", pa.null())]))
    duckdb_conn = duckdb.connect()
    duckdb_conn.register("testarrow", source)
    round_tripped = duckdb.from_arrow_table(source).arrow()
    # We turn it to an array of int32 nulls
    expected = pa.Table.from_arrays(
        [pa.array([None, None, None], type=pa.null())],
        schema=pa.schema([("data", pa.int32())]))
    assert round_tripped['data'] == expected['data']
def test_from_arrow(self, duckdb_cursor):
    """An arrow table registered on an explicit connection can be joined
    against native tables, via both from_arrow_table and the arrow alias."""
    try:
        import pyarrow as pa
    except ImportError:
        # Previously a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt; only a missing pyarrow should skip this test.
        return
    conn = duckdb.connect()
    conn.execute("create table t (a integer)")
    conn.execute("insert into t values (1)")
    test_df = pd.DataFrame.from_dict({"i": [1, 2, 3, 4]})
    test_arrow = pa.Table.from_pandas(test_df)
    rel = duckdb.from_arrow_table(test_arrow, connection=conn)
    assert rel.query('t_2', 'select count(*) from t inner join t_2 on (a = i)').fetchall()[0] == (1,)
    rel = duckdb.arrow(test_arrow, connection=conn)
    assert rel.query('t_2', 'select count(*) from t inner join t_2 on (a = i)').fetchall()[0] == (1,)
def run_arrow(commit_hash, column_name, experiment_name, duck_con):
    """Benchmark duckdb->arrow and arrow->duckdb conversion for one column
    and record the median of 5 timed runs (the first run is a warm-up)."""
    import statistics, duckdb
    duck_to_arrow = []
    arrow_to_duck = []
    for iteration in range(6):
        duck_con.execute("select " + column_name + " from t;")
        start_time = time.time()
        result = duck_con.fetch_arrow_table()
        time_duck_to_arrow = time.time() - start_time
        start_time = time.time()
        result = duckdb.from_arrow_table(result)
        time_arrow_to_duck = time.time() - start_time
        # Skip the warm-up iteration when collecting timings.
        if iteration != 0:
            duck_to_arrow.append(time_duck_to_arrow)
            arrow_to_duck.append(time_arrow_to_duck)
    # NOTE(review): `c` and `con` are module-level benchmark-DB handles
    # defined elsewhere in this file — confirm.
    (benchmark_id, groupname) = insert_benchmark_info('duckdb -> arrow ' + experiment_name, 'arrow_integration', '')
    c.execute("INSERT INTO timings (benchmark_id, hash, success, median) VALUES (?, ?, ?, ?)",
              (benchmark_id, commit_hash, True, statistics.median(duck_to_arrow)))
    (benchmark_id, groupname) = insert_benchmark_info('arrow -> duckdb ' + experiment_name, 'arrow_integration', '')
    c.execute("INSERT INTO timings (benchmark_id, hash, success, median) VALUES (?, ?, ?, ?)",
              (benchmark_id, commit_hash, True, statistics.median(arrow_to_duck)))
    con.commit()