def test_dictionary(self, duckdb_cursor):
    """Round-trip Arrow dictionary arrays through DuckDB, including inputs larger than a vector."""
    if not can_run:
        return
    keys = pa.array([0, 1, 0, 1, 2, 1, 0, 2])
    values = pa.array([10, 100, None])
    dict_col = pa.DictionaryArray.from_arrays(keys, values)
    tbl = pa.Table.from_arrays([dict_col], ['a'])
    rel = duckdb.from_arrow(tbl)
    expected = [(10,), (100,), (10,), (100,), (None,), (100,), (10,), (None,)]
    assert rel.execute().fetchall() == expected

    # Bigger than the DuckDB vector size.
    key_list = [0, 1, 0, 1, 2, 1, 0, 2, 3] * 10000
    keys = pa.array(key_list)
    values = pa.array([10, 100, None, 999999])
    dict_col = pa.DictionaryArray.from_arrays(keys, values)
    tbl = pa.Table.from_arrays([dict_col], ['a'])
    rel = duckdb.from_arrow(tbl)
    expected = [(10,), (100,), (10,), (100,), (None,), (100,), (10,), (None,), (999999,)] * 10000
    assert rel.execute().fetchall() == expected

    # Table mixing a dictionary column with a plain column.
    tbl = pa.Table.from_arrays([dict_col, pa.array(key_list)], ['a', 'b'])
    rel = duckdb.from_arrow(tbl)
    expected = [(10, 0), (100, 1), (10, 0), (100, 1), (None, 2), (100, 1), (10, 0),
                (None, 2), (999999, 3)] * 10000
    assert rel.execute().fetchall() == expected
def test_max_times(self, duckdb_cursor):
    """Maximum time32/time64 values survive conversion to DuckDB TIME (time64[us]).

    Fix: removed leftover debug ``print`` calls before the final assertion.
    """
    if not can_run:
        return
    # Max seconds.
    expected = pa.Table.from_arrays([pa.array([2147483647000000], type=pa.time64('us'))], ['a'])
    data = pa.array([2147483647], type=pa.time32('s'))
    arrow_table = pa.Table.from_arrays([data], ['a'])
    round_trip = duckdb.from_arrow(arrow_table).arrow()
    assert (round_trip['a'] == expected['a'])
    # Max milliseconds.
    expected = pa.Table.from_arrays([pa.array([2147483647000], type=pa.time64('us'))], ['a'])
    data = pa.array([2147483647], type=pa.time32('ms'))
    arrow_table = pa.Table.from_arrays([data], ['a'])
    round_trip = duckdb.from_arrow(arrow_table).arrow()
    assert (round_trip['a'] == expected['a'])
    # Max nanoseconds (value chosen so the microsecond result fits exactly).
    expected = pa.Table.from_arrays([pa.array([9223372036854774], type=pa.time64('us'))], ['a'])
    data = pa.array([9223372036854774000], type=pa.time64('ns'))
    arrow_table = pa.Table.from_arrays([data], ['a'])
    round_trip = duckdb.from_arrow(arrow_table).arrow()
    assert (round_trip['a'] == expected['a'])
def test_timestamp_overflow(self, duckdb_cursor):
    """Max-int64 timestamps pass through unchanged, but casting to a finer unit must raise."""
    if not can_run:
        return
    max_int64 = 9223372036854775807
    columns = [pa.array([max_int64], pa.timestamp(unit)) for unit in ('s', 'ms', 'us')]
    arrow_table = pa.Table.from_arrays(columns, ['a', 'b', 'c'])
    arrow_from_duck = duckdb.from_arrow(arrow_table).arrow()
    for name in ('a', 'b', 'c'):
        assert (arrow_from_duck[name] == arrow_table[name])
    # Upscaling to a finer unit would overflow int64, so these casts must fail.
    for column, target in (('a', 'TIMESTAMP_US'), ('b', 'TIMESTAMP_US'), ('c', 'TIMESTAMP_NS')):
        with pytest.raises(Exception):
            duck_rel = duckdb.from_arrow(arrow_table)
            res = duck_rel.project(column + '::' + target)
            res.fetchone()
def test_unsigned_roundtrip_reread(self, duckdb_cursor):
    """Re-read data/unsigned.parquet and check Arrow and DuckDB scans agree, then round-trip mixed types.

    Fix: this method shared the name ``test_unsigned_roundtrip`` with a later method in
    this class; the later definition shadowed this one, so it was silently never collected
    or run by pytest. Renamed so it runs again.
    NOTE(review): assumes data/unsigned.parquet exists on disk (the sibling
    ``test_unsigned_roundtrip`` writes it) -- TODO confirm test ordering / fixture setup.
    """
    if not can_run:
        return
    parquet_filename = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), 'data', 'unsigned.parquet')
    cols = 'a, b, c, d'
    unsigned_parquet_table = pyarrow.parquet.read_table(parquet_filename)
    unsigned_parquet_table.validate(full=True)
    rel_from_arrow = duckdb.arrow(unsigned_parquet_table).project(cols).arrow()
    rel_from_arrow.validate(full=True)
    rel_from_duckdb = duckdb.from_parquet(parquet_filename).project(cols).arrow()
    rel_from_duckdb.validate(full=True)
    assert rel_from_arrow.equals(rel_from_duckdb, check_metadata=True)

    # Round-trip a query result covering every basic type through Arrow and back.
    con = duckdb.connect()
    con.execute(
        "select NULL c_null, (c % 4 = 0)::bool c_bool, (c%128)::tinyint c_tinyint, c::smallint*1000 c_smallint, c::integer*100000 c_integer, c::bigint*1000000000000 c_bigint, c::float c_float, c::double c_double, 'c_' || c::string c_string from (select case when range % 2 == 0 then range else null end as c from range(-10000, 10000)) sq"
    )
    arrow_result = con.fetch_arrow_table()
    arrow_result.validate(full=True)
    arrow_result.combine_chunks()
    arrow_result.validate(full=True)
    round_tripping = duckdb.from_arrow(arrow_result).to_arrow_table()
    round_tripping.validate(full=True)
    assert round_tripping.equals(arrow_result, check_metadata=True)
def test_parallel_reader_default_conn(self, duckdb_cursor):
    """A RecordBatchReader scans correctly once, and yields nothing once consumed."""
    if not can_run:
        return
    parquet_filename = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), 'data', 'userdata1.parquet')
    dataset = pyarrow.dataset.dataset([parquet_filename] * 3, format="parquet")
    batches = list(dataset.to_batches())
    reader = pyarrow.dataset.Scanner.from_batches(batches, dataset.schema).to_reader()
    rel = duckdb.from_arrow(reader)
    first_count = rel.filter("first_name='Jose' and salary > 134708.82").aggregate('count(*)').execute().fetchone()[0]
    assert first_count == 12
    # The reader is already consumed, so a second scan must see zero rows.
    second_count = rel.filter("first_name='Jose' and salary > 134708.82").aggregate('count(*)').execute().fetchone()[0]
    assert second_count == 0
def test_dictionary_index_types(self, duckdb_cursor):
    """Dictionary arrays decode correctly for every signed and unsigned index width."""
    if not can_run:
        return
    base_indices = [None, 1, None, 1, 2, 1, 0]
    dictionary = pa.array([10, 100, 100], type=pyarrow.uint8())
    index_type_factories = (pyarrow.uint8, pyarrow.uint16, pyarrow.uint32, pyarrow.uint64,
                            pyarrow.int8, pyarrow.int16, pyarrow.int32, pyarrow.int64)
    expected = [(None,), (100,), (None,), (100,), (100,), (100,), (10,)] * 10000
    for factory in index_type_factories:
        indices = pa.array(base_indices * 10000, type=factory())
        dict_array = pa.DictionaryArray.from_arrays(indices, dictionary)
        arrow_table = pa.Table.from_arrays([dict_array], ['a'])
        rel = duckdb.from_arrow(arrow_table)
        assert rel.execute().fetchall() == expected
def test_timestamp_timezone_overflow(self, duckdb_cursor):
    """Timezone-aware timestamps at int64 max must fail conversion for coarse units."""
    if not can_run:
        return
    overflow_value = 9223372036854775807
    # Only second and millisecond precision can overflow when upscaled.
    for precision in ('s', 'ms'):
        with pytest.raises(Exception, match='Could not convert'):
            arrow_table = generate_table(overflow_value, precision, 'UTC')
            duckdb.from_arrow(arrow_table).execute().fetchall()
def test_date_null(self, duckdb_cursor):
    """NULL date64 and date32 values both round-trip as NULL date32."""
    if not can_run:
        return
    columns = (pa.array([None], type=pa.date64()),
               pa.array([None], type=pa.date32()))
    arrow_table = pa.Table.from_arrays([columns[0], columns[1]], ['a', 'b'])
    result = duckdb.from_arrow(arrow_table).arrow()
    assert (result['a'] == arrow_table['b'])
    assert (result['b'] == arrow_table['b'])
def test_duration_overflow(self, duckdb_cursor):
    """A max-int64 second duration must raise when converted (only seconds can overflow)."""
    if not can_run:
        return
    seconds = pa.array([9223372036854775807], pa.duration('s'))
    arrow_table = pa.Table.from_arrays([seconds], ['a'])
    with pytest.raises(Exception):
        duckdb.from_arrow(arrow_table).arrow()
def test_time_null(self, duckdb_cursor):
    """NULL times in every unit come back equal to the NULL time64[us] column."""
    if not can_run:
        return
    columns = (pa.array([None], type=pa.time32('s')),
               pa.array([None], type=pa.time32('ms')),
               pa.array([None], pa.time64('us')),
               pa.array([None], pa.time64('ns')))
    arrow_table = pa.Table.from_arrays([columns[0], columns[1], columns[2], columns[3]],
                                       ['a', 'b', 'c', 'd'])
    result = duckdb.from_arrow(arrow_table).arrow()
    for name in ('a', 'b', 'c', 'd'):
        assert (result[name] == arrow_table['c'])
def test_large_string_type(self, duckdb_cursor):
    """large_string columns are read back as regular string values."""
    if not can_run:
        return
    schema = pa.schema([("data", pa.large_string())])
    column = pa.array(["foo", "baaaar", "b"], type=pa.large_string())
    arrow_table = pa.Table.from_arrays([column], schema=schema)
    fetched = duckdb.from_arrow(arrow_table).execute().fetchall()
    assert fetched == [('foo',), ('baaaar',), ('b',)]
def test_multiple_queries_same_relation(self, duckdb_cursor):
    """The same Arrow-backed relation can be aggregated more than once."""
    if not can_run:
        return
    parquet_filename = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), 'data', 'userdata1.parquet')
    cols = 'id, first_name, last_name, email, gender, ip_address, cc, country, birthdate, salary, title, comments'
    userdata_parquet_table = pyarrow.parquet.read_table(parquet_filename)
    userdata_parquet_table.validate(full=True)
    rel = duckdb.from_arrow(userdata_parquet_table)
    # Two identical aggregations must both succeed: the source is not consumed.
    for _ in range(2):
        assert rel.aggregate("(avg(salary))::INT").execute().fetchone()[0] == 149005
def test_binary_types(self, duckdb_cursor):
    """Fixed-size, plain, and large binary types all round-trip as bytes."""
    if not can_run:
        return
    expected = [(b"foo",), (b"bar",), (b"baz",)]
    for binary_type in (pa.binary(3), pa.binary(), pa.large_binary()):
        arrow_table = create_binary_table(binary_type)
        fetched = duckdb.from_arrow(arrow_table).execute().fetchall()
        assert fetched == expected
def test_max_date(self, duckdb_cursor):
    """The maximum date32 value survives conversion from both date64 and date32."""
    if not can_run:
        return
    max_days = 2147483647
    expected = pa.Table.from_arrays(
        [pa.array([max_days], type=pa.date32()),
         pa.array([max_days], type=pa.date32())], ['a', 'b'])
    ms_per_day = 1000 * 60 * 60 * 24
    inputs = (pa.array([max_days * ms_per_day], type=pa.date64()),
              pa.array([max_days], type=pa.date32()))
    arrow_table = pa.Table.from_arrays([inputs[0], inputs[1]], ['a', 'b'])
    round_trip = duckdb.from_arrow(arrow_table).arrow()
    assert (round_trip['a'] == expected['a'])
    assert (round_trip['b'] == expected['b'])
def test_dictionary_null_index(self, duckdb_cursor):
    """Dictionary arrays with NULL indices and NULL dictionary entries decode to NULL.

    Fix: removed a leftover debug ``print(rel.execute().fetchall())`` that executed the
    relation an extra time before the assertion.
    """
    if not can_run:
        return
    indices = pa.array([None, 1, 0, 1, 2, 1, 0, 2])
    dictionary = pa.array([10, 100, None])
    dict_array = pa.DictionaryArray.from_arrays(indices, dictionary)
    arrow_table = pa.Table.from_arrays([dict_array], ['a'])
    rel = duckdb.from_arrow(arrow_table)
    assert rel.execute().fetchall() == [(None,), (100,), (10,), (100,), (None,), (100,), (10,), (None,)]

    indices = pa.array([None, 1, None, 1, 2, 1, 0])
    dictionary = pa.array([10, 100, 100])
    dict_array = pa.DictionaryArray.from_arrays(indices, dictionary)
    arrow_table = pa.Table.from_arrays([dict_array], ['a'])
    rel = duckdb.from_arrow(arrow_table)
    assert rel.execute().fetchall() == [(None,), (100,), (None,), (100,), (100,), (100,), (10,)]

    # Bigger than a single vector.
    indices_list = [None, 1, None, 1, 2, 1, 0]
    indices = pa.array(indices_list * 1000)
    dictionary = pa.array([10, 100, 100])
    dict_array = pa.DictionaryArray.from_arrays(indices, dictionary)
    arrow_table = pa.Table.from_arrays([dict_array], ['a'])
    rel = duckdb.from_arrow(arrow_table)
    result = [(None,), (100,), (None,), (100,), (100,), (100,), (10,)] * 1000
    assert rel.execute().fetchall() == result

    # Table mixing the dictionary column with a plain column.
    arrow_table = pa.Table.from_arrays([dict_array, indices], ['a', 'b'])
    rel = duckdb.from_arrow(arrow_table)
    result = [(None, None), (100, 1), (None, None), (100, 1), (100, 2), (100, 1), (10, 0)] * 1000
    assert rel.execute().fetchall() == result
def test_duration_types(self, duckdb_cursor):
    """One-second durations in ns/us/ms/s units all round-trip to the same value."""
    if not can_run:
        return
    columns = (pa.array([1000000000], type=pa.duration('ns')),
               pa.array([1000000], type=pa.duration('us')),
               pa.array([1000], pa.duration('ms')),
               pa.array([1], pa.duration('s')))
    arrow_table = pa.Table.from_arrays([columns[0], columns[1], columns[2], columns[3]],
                                       ['a', 'b', 'c', 'd'])
    result = duckdb.from_arrow(arrow_table).arrow()
    # Every unit should come back equal to the millisecond column's value.
    for name in ('a', 'b', 'c', 'd'):
        assert (result[name] == arrow_table['c'])
def test_null_type(self, duckdb_cursor):
    """An all-null NULL-typed column is materialized as a column of int32 nulls."""
    if not can_run:
        return
    null_schema = pa.schema([("data", pa.null())])
    null_column = pa.array([None, None, None], type=pa.null())
    arrow_table = pa.Table.from_arrays([null_column], schema=null_schema)
    duckdb_conn = duckdb.connect()
    duckdb_conn.register("testarrow", arrow_table)
    result = duckdb.from_arrow(arrow_table).arrow()
    # DuckDB turns the NULL column into int32 nulls; build the expected table that way.
    int_schema = pa.schema([("data", pa.int32())])
    expected_column = pa.array([None, None, None], type=pa.null())
    expected = pa.Table.from_arrays([expected_column], schema=int_schema)
    assert result['data'] == expected['data']
def test_dictionary_batches(self, duckdb_cursor):
    """Dictionary columns split across many record batches still decode correctly."""
    if not can_run:
        return
    indices_list = [None, 1, None, 1, 2, 1, 0]
    indices = pa.array(indices_list * 10000)
    dictionary = pa.array([10, 100, 100])
    dict_array = pa.DictionaryArray.from_arrays(indices, dictionary)
    arrow_table = pa.Table.from_arrays([dict_array], ['a'])
    # Rebuild the table from many small batches.
    batched = pyarrow.Table.from_batches(arrow_table.to_batches(10))
    expected = [(None,), (100,), (None,), (100,), (100,), (100,), (10,)] * 10000
    assert duckdb.from_arrow(batched).execute().fetchall() == expected

    # Dictionary column alongside a plain column.
    arrow_table = pa.Table.from_arrays([dict_array, indices], ['a', 'b'])
    batched = pyarrow.Table.from_batches(arrow_table.to_batches(10))
    expected = [(None, None), (100, 1), (None, None), (100, 1), (100, 2), (100, 1), (10, 0)] * 10000
    assert duckdb.from_arrow(batched).execute().fetchall() == expected
def test_dictionary_strings(self, duckdb_cursor):
    """String dictionaries decode to their values, with NULL indices becoming None."""
    if not can_run:
        return
    actors = pa.array([
        'Matt Daaaaaaaaamon', 'Alec Baldwin', 'Sean Penn', 'Tim Robbins',
        'Samuel L. Jackson'
    ])
    indices = pa.array([None, 0, 1, 2, 3, 4, None] * 1000)
    dict_array = pa.DictionaryArray.from_arrays(indices, actors)
    arrow_table = pa.Table.from_arrays([dict_array], ['a'])
    expected = [(None,), ('Matt Daaaaaaaaamon',), ('Alec Baldwin',), ('Sean Penn',),
                ('Tim Robbins',), ('Samuel L. Jackson',), (None,)] * 1000
    assert duckdb.from_arrow(arrow_table).execute().fetchall() == expected
def test_dictionary_timestamps(self, duckdb_cursor):
    """Dictionary-encoded timestamps decode to datetime values, NULL indices to None.

    Fix: removed a leftover debug ``print(rel.execute().fetchall())`` that executed the
    relation an extra time before the assertion.
    """
    if not can_run:
        return
    indices = pa.array([None, 0, 1, 2, None] * 1000)
    dictionary = pa.array([
        Timestamp(year=2001, month=9, day=25),
        Timestamp(year=2006, month=11, day=14),
        Timestamp(year=2012, month=5, day=15),
        Timestamp(year=2018, month=11, day=2),
    ])
    dict_array = pa.DictionaryArray.from_arrays(indices, dictionary)
    arrow_table = pa.Table.from_arrays([dict_array], ['a'])
    rel = duckdb.from_arrow(arrow_table)
    expected = [(None,),
                (datetime.datetime(2001, 9, 25, 0, 0),),
                (datetime.datetime(2006, 11, 14, 0, 0),),
                (datetime.datetime(2012, 5, 15, 0, 0),),
                (None,)] * 1000
    assert rel.execute().fetchall() == expected
def test_from_arrow(self, duckdb_cursor):
    """from_arrow/arrow register an Arrow table queryable against an explicit connection.

    Fix: replaced a bare ``except:`` (which swallowed every exception, including
    KeyboardInterrupt) with ``except ImportError`` — the only condition this guard
    is meant to handle is pyarrow being unavailable.
    """
    try:
        import pyarrow as pa
    except ImportError:
        return
    conn = duckdb.connect()
    conn.execute("create table t (a integer)")
    conn.execute("insert into t values (1)")
    test_df = pd.DataFrame.from_dict({"i": [1, 2, 3, 4]})
    test_arrow = pa.Table.from_pandas(test_df)
    # Both entry points should register the table and join against it identically.
    for factory in (duckdb.from_arrow, duckdb.arrow):
        rel = factory(test_arrow, connection=conn)
        assert rel.query('t_2', 'select count(*) from t inner join t_2 on (a = i)').fetchall()[0] == (1,)
def test_parallel_scanner_default_conn(self, duckdb_cursor):
    """A filtered pyarrow Scanner is consumed by DuckDB with the filter applied."""
    if not can_run:
        return
    parquet_filename = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), 'data', 'userdata1.parquet')
    arrow_dataset = pyarrow.dataset.dataset([parquet_filename] * 3, format="parquet")
    scanner_filter = ((pc.field("first_name") == pc.scalar('Jose'))
                      & (pc.field("salary") > pc.scalar(134708.82)))
    arrow_scanner = Scanner.from_dataset(arrow_dataset, filter=scanner_filter)
    rel = duckdb.from_arrow(arrow_scanner)
    assert rel.aggregate('count(*)').execute().fetchone()[0] == 12
def test_unsigned_roundtrip(self, duckdb_cursor):
    """Write unsigned-integer data to parquet, verify Arrow and DuckDB reads agree,
    then round-trip a mixed-type query result through Arrow and back."""
    if not can_run:
        return
    parquet_filename = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), 'data', 'unsigned.parquet')
    columns = (
        pyarrow.array([1, 2, 3, 4, 5, 255], type=pyarrow.uint8()),
        pyarrow.array([1, 2, 3, 4, 5, 65535], type=pyarrow.uint16()),
        pyarrow.array([1, 2, 3, 4, 5, 4294967295], type=pyarrow.uint32()),
        pyarrow.array([1, 2, 3, 4, 5, 18446744073709551615], type=pyarrow.uint64()),
    )
    tbl = pyarrow.Table.from_arrays([columns[0], columns[1], columns[2], columns[3]],
                                    ['a', 'b', 'c', 'd'])
    pyarrow.parquet.write_table(tbl, parquet_filename)

    cols = 'a, b, c, d'
    unsigned_parquet_table = pyarrow.parquet.read_table(parquet_filename)
    unsigned_parquet_table.validate(full=True)
    rel_from_arrow = duckdb.arrow(unsigned_parquet_table).project(cols).arrow()
    rel_from_arrow.validate(full=True)
    rel_from_duckdb = duckdb.from_parquet(parquet_filename).project(cols).arrow()
    rel_from_duckdb.validate(full=True)
    assert rel_from_arrow.equals(rel_from_duckdb, check_metadata=True)

    # Round-trip a query result covering every basic type through Arrow and back.
    con = duckdb.connect()
    con.execute(
        "select NULL c_null, (c % 4 = 0)::bool c_bool, (c%128)::tinyint c_tinyint, c::smallint*1000 c_smallint, c::integer*100000 c_integer, c::bigint*1000000000000 c_bigint, c::float c_float, c::double c_double, 'c_' || c::string c_string from (select case when range % 2 == 0 then range else null end as c from range(-10000, 10000)) sq"
    )
    arrow_result = con.fetch_arrow_table()
    arrow_result.validate(full=True)
    arrow_result.combine_chunks()
    arrow_result.validate(full=True)
    round_tripping = duckdb.from_arrow(arrow_result).to_arrow_table()
    round_tripping.validate(full=True)
    assert round_tripping.equals(arrow_result, check_metadata=True)
def from_arrow():
    """Build a one-million-row float32 Arrow table and import it into DuckDB."""
    values = pa.array(np.random.rand(1_000_000), type=pa.float32())
    table = pa.Table.from_arrays([values], ['a'])
    duckdb.from_arrow(table)