Ejemplo n.º 1
0
    def test_timestamp_overflow(self, duckdb_cursor):
        if not can_run:
            return
        data = (pa.array([9223372036854775807], pa.timestamp('s')),
                pa.array([9223372036854775807], pa.timestamp('ms')),
                pa.array([9223372036854775807], pa.timestamp('us')))
        arrow_table = pa.Table.from_arrays([data[0], data[1], data[2]],
                                           ['a', 'b', 'c'])
        arrow_from_duck = duckdb.from_arrow_table(arrow_table).arrow()
        assert (arrow_from_duck['a'] == arrow_table['a'])
        assert (arrow_from_duck['b'] == arrow_table['b'])
        assert (arrow_from_duck['c'] == arrow_table['c'])

        with pytest.raises(Exception):
            duck_rel = duckdb.from_arrow_table(arrow_table)
            res = duck_rel.project('a::TIMESTAMP_US')
            res.fetchone()

        with pytest.raises(Exception):
            duck_rel = duckdb.from_arrow_table(arrow_table)
            res = duck_rel.project('b::TIMESTAMP_US')
            res.fetchone()

        with pytest.raises(Exception):
            duck_rel = duckdb.from_arrow_table(arrow_table)
            res = duck_rel.project('c::TIMESTAMP_NS')
            res.fetchone()
Ejemplo n.º 2
0
    def test_dictionary_null_index(self,duckdb_cursor):
        if not can_run:
            return
        indices = pa.array([None, 1, 0, 1, 2, 1, 0, 2])
        dictionary = pa.array([10, 100, None])
        dict_array = pa.DictionaryArray.from_arrays(indices, dictionary)
        arrow_table = pa.Table.from_arrays([dict_array],['a'])
        rel = duckdb.from_arrow_table(arrow_table)

        assert rel.execute().fetchall() == [(None,), (100,), (10,), (100,), (None,), (100,), (10,), (None,)]

        indices = pa.array([None, 1, None, 1, 2, 1, 0])
        dictionary = pa.array([10, 100, 100])
        dict_array = pa.DictionaryArray.from_arrays(indices, dictionary)
        arrow_table = pa.Table.from_arrays([dict_array],['a'])
        rel = duckdb.from_arrow_table(arrow_table)
        print (rel.execute().fetchall())
        assert rel.execute().fetchall() == [(None,), (100,), (None,), (100,), (100,), (100,), (10,)]

        # Test Big Vector
        indices_list = [None, 1, None, 1, 2, 1, 0]
        indices = pa.array(indices_list * 1000)
        dictionary = pa.array([10, 100, 100])
        dict_array = pa.DictionaryArray.from_arrays(indices, dictionary)
        arrow_table = pa.Table.from_arrays([dict_array],['a'])
        rel = duckdb.from_arrow_table(arrow_table)
        result = [(None,), (100,), (None,), (100,), (100,), (100,), (10,)] * 1000
        assert rel.execute().fetchall() == result

        #Table with dictionary and normal array
        arrow_table = pa.Table.from_arrays([dict_array,indices],['a','b'])
        rel = duckdb.from_arrow_table(arrow_table)
        result = [(None,None), (100,1), (None,None), (100,1), (100,2), (100,1), (10,0)] * 1000
        assert rel.execute().fetchall() == result
Ejemplo n.º 3
0
    def test_max_times(self, duckdb_cursor):
        if not can_run:
            return
        data = pa.array([2147483647000000], type=pa.time64('us'))
        result = pa.Table.from_arrays([data], ['a'])
        #Max Sec
        data = pa.array([2147483647], type=pa.time32('s'))
        arrow_table = pa.Table.from_arrays([data], ['a'])
        rel = duckdb.from_arrow_table(arrow_table).arrow()
        assert (rel['a'] == result['a'])

        #Max MSec
        data = pa.array([2147483647000], type=pa.time64('us'))
        result = pa.Table.from_arrays([data], ['a'])
        data = pa.array([2147483647], type=pa.time32('ms'))
        arrow_table = pa.Table.from_arrays([data], ['a'])
        rel = duckdb.from_arrow_table(arrow_table).arrow()
        assert (rel['a'] == result['a'])

        #Max NSec
        data = pa.array([9223372036854774], type=pa.time64('us'))
        result = pa.Table.from_arrays([data], ['a'])
        data = pa.array([9223372036854774000], type=pa.time64('ns'))
        arrow_table = pa.Table.from_arrays([data], ['a'])
        rel = duckdb.from_arrow_table(arrow_table).arrow()

        print(rel['a'])
        print(result['a'])
        assert (rel['a'] == result['a'])
Ejemplo n.º 4
0
    def test_dictionary(self, duckdb_cursor):
        if not can_run:
            return
        indices = pa.array([0, 1, 0, 1, 2, 1, 0, 2])
        dictionary = pa.array([10, 100, None])
        dict_array = pa.DictionaryArray.from_arrays(indices, dictionary)
        arrow_table = pa.Table.from_arrays([dict_array], ['a'])
        rel = duckdb.from_arrow_table(arrow_table)

        assert rel.execute().fetchall() == [(10, ), (100, ), (10, ), (100, ),
                                            (None, ), (100, ), (10, ),
                                            (None, )]

        # Bigger than Vector Size
        indices_list = [0, 1, 0, 1, 2, 1, 0, 2, 3] * 10000
        indices = pa.array(indices_list)
        dictionary = pa.array([10, 100, None, 999999])
        dict_array = pa.DictionaryArray.from_arrays(indices, dictionary)
        arrow_table = pa.Table.from_arrays([dict_array], ['a'])
        rel = duckdb.from_arrow_table(arrow_table)
        result = [(10, ), (100, ), (10, ), (100, ), (None, ), (100, ), (10, ),
                  (None, ), (999999, )] * 10000
        assert rel.execute().fetchall() == result

        #Table with dictionary and normal array

        arrow_table = pa.Table.from_arrays(
            [dict_array, pa.array(indices_list)], ['a', 'b'])
        rel = duckdb.from_arrow_table(arrow_table)
        result = [(10, 0), (100, 1), (10, 0), (100, 1), (None, 2), (100, 1),
                  (10, 0), (None, 2), (999999, 3)] * 10000
        assert rel.execute().fetchall() == result
Ejemplo n.º 5
0
    def test_list_types(self, duckdb_cursor):
        if not can_run:
            return

        #Large Lists
        data = pyarrow.array([[1], None, [2]],
                             type=pyarrow.large_list(pyarrow.int64()))
        arrow_table = pa.Table.from_arrays([data], ['a'])
        rel = duckdb.from_arrow_table(arrow_table)
        res = rel.execute().fetchall()
        assert res == [([1], ), (None, ), ([2], )]

        #Fixed Size Lists
        data = pyarrow.array([[1], None, [2]],
                             type=pyarrow.list_(pyarrow.int64(), 1))
        arrow_table = pa.Table.from_arrays([data], ['a'])
        rel = duckdb.from_arrow_table(arrow_table)
        res = rel.execute().fetchall()
        assert res == [([1], ), (None, ), ([2], )]

        #Complex nested structures with different list types
        data = [
            pyarrow.array([[1], None, [2]],
                          type=pyarrow.list_(pyarrow.int64(), 1)),
            pyarrow.array([[1], None, [2]],
                          type=pyarrow.large_list(pyarrow.int64())),
            pyarrow.array([[1, 2, 3], None, [2, 1]],
                          type=pyarrow.list_(pyarrow.int64()))
        ]
        arrow_table = pa.Table.from_arrays([data[0], data[1], data[2]],
                                           ['a', 'b', 'c'])
        rel = duckdb.from_arrow_table(arrow_table)
        res = rel.project('a').execute().fetchall()
        assert res == [([1], ), (None, ), ([2], )]
        res = rel.project('b').execute().fetchall()
        assert res == [([1], ), (None, ), ([2], )]
        res = rel.project('c').execute().fetchall()
        assert res == [([1, 2, 3], ), (None, ), ([2, 1], )]

        #Struct Holding different List Types
        struct = [
            pa.StructArray.from_arrays(data, ['fixed', 'large', 'normal'])
        ]
        arrow_table = pa.Table.from_arrays(struct, ['a'])
        rel = duckdb.from_arrow_table(arrow_table)
        res = rel.execute().fetchall()
        assert res == [({
            'fixed': [1],
            'large': [1],
            'normal': [1, 2, 3]
        }, ), ({
            'fixed': None,
            'large': None,
            'normal': None
        }, ), ({
            'fixed': [2],
            'large': [2],
            'normal': [2, 1]
        }, )]
Ejemplo n.º 6
0
    def test_unsigned_roundtrip(self,duckdb_cursor):
        if not can_run:
            return
        parquet_filename = os.path.join(os.path.dirname(os.path.realpath(__file__)),'data','unsigned.parquet')
        data = (pyarrow.array([1,2,3,4,5,255], type=pyarrow.uint8()),pyarrow.array([1,2,3,4,5,65535], \
            type=pyarrow.uint16()),pyarrow.array([1,2,3,4,5,4294967295], type=pyarrow.uint32()),\
                pyarrow.array([1,2,3,4,5,18446744073709551615], type=pyarrow.uint64()))

        tbl = pyarrow.Table.from_arrays([data[0],data[1],data[2],data[3]],['a','b','c','d'])
        pyarrow.parquet.write_table(tbl, parquet_filename)

        cols = 'a, b, c, d'

        unsigned_parquet_table = pyarrow.parquet.read_table(parquet_filename)
        unsigned_parquet_table.validate(full=True)
        rel_from_arrow = duckdb.arrow(unsigned_parquet_table).project(cols).arrow()
        rel_from_arrow.validate(full=True)

        rel_from_duckdb = duckdb.from_parquet(parquet_filename).project(cols).arrow()
        rel_from_duckdb.validate(full=True)

        assert rel_from_arrow.equals(rel_from_duckdb, check_metadata=True)

        con = duckdb.connect()
        con.execute("select NULL c_null, (c % 4 = 0)::bool c_bool, (c%128)::tinyint c_tinyint, c::smallint*1000 c_smallint, c::integer*100000 c_integer, c::bigint*1000000000000 c_bigint, c::float c_float, c::double c_double, 'c_' || c::string c_string from (select case when range % 2 == 0 then range else null end as c from range(-10000, 10000)) sq")
        arrow_result = con.fetch_arrow_table()
        arrow_result.validate(full=True)
        arrow_result.combine_chunks()
        arrow_result.validate(full=True)

        round_tripping = duckdb.from_arrow_table(arrow_result).to_arrow_table()
        round_tripping.validate(full=True)

        assert round_tripping.equals(arrow_result, check_metadata=True)
Ejemplo n.º 7
0
    def test_dictionary_index_types(self, duckdb_cursor):
        if not can_run:
            return
        indices_list = [None, 1, None, 1, 2, 1, 0]
        dictionary = pa.array([10, 100, 100], type=pyarrow.uint8())
        index_types = []
        index_types.append(pa.array(indices_list * 10000,
                                    type=pyarrow.uint8()))
        index_types.append(
            pa.array(indices_list * 10000, type=pyarrow.uint16()))
        index_types.append(
            pa.array(indices_list * 10000, type=pyarrow.uint32()))
        index_types.append(
            pa.array(indices_list * 10000, type=pyarrow.uint64()))
        index_types.append(pa.array(indices_list * 10000, type=pyarrow.int8()))
        index_types.append(pa.array(indices_list * 10000,
                                    type=pyarrow.int16()))
        index_types.append(pa.array(indices_list * 10000,
                                    type=pyarrow.int32()))
        index_types.append(pa.array(indices_list * 10000,
                                    type=pyarrow.int64()))

        for index_type in index_types:
            dict_array = pa.DictionaryArray.from_arrays(index_type, dictionary)
            arrow_table = pa.Table.from_arrays([dict_array], ['a'])
            rel = duckdb.from_arrow_table(arrow_table)
            result = [(None, ), (100, ), (None, ), (100, ), (100, ), (100, ),
                      (10, )] * 10000
            assert rel.execute().fetchall() == result
Ejemplo n.º 8
0
def compare_results(query):
    true_answer = duckdb.query(query).fetchall()
    t = duckdb.query(query).arrow()
    from_arrow = duckdb.from_arrow_table(
        duckdb.query(query).arrow()).fetchall()

    assert true_answer == from_arrow
Ejemplo n.º 9
0
 def test_date_null(self, duckdb_cursor):
     if not can_run:
         return   
     data = (pa.array([None], type=pa.date64()),pa.array([None], type=pa.date32()))
     arrow_table = pa.Table.from_arrays([data[0],data[1]],['a','b'])
     rel = duckdb.from_arrow_table(arrow_table).arrow()
     assert (rel['a'] == arrow_table['b'])
     assert (rel['b'] == arrow_table['b'])
Ejemplo n.º 10
0
 def test_duration_null(self, duckdb_cursor):
     if not can_run:
         return   
     data = (pa.array([None], type=pa.duration('ns')),pa.array([None], type=pa.duration('us')),pa.array([None], pa.duration('ms')),pa.array([None], pa.duration('s')))
     arrow_table = pa.Table.from_arrays([data[0],data[1],data[2],data[3]],['a','b','c','d'])
     rel = duckdb.from_arrow_table(arrow_table).arrow()
     assert (rel['a'] == arrow_table['c'])
     assert (rel['b'] == arrow_table['c'])
     assert (rel['c'] == arrow_table['c'])
     assert (rel['d'] == arrow_table['c'])
Ejemplo n.º 11
0
    def test_duration_overflow(self, duckdb_cursor):
        if not can_run:
            return

        # Only seconds can overflow
        data = pa.array([9223372036854775807], pa.duration('s'))
        arrow_table = pa.Table.from_arrays([data],['a'])

        with pytest.raises(Exception):
             arrow_from_duck = duckdb.from_arrow_table(arrow_table).arrow()
Ejemplo n.º 12
0
 def test_max_date(self, duckdb_cursor):
     if not can_run:
         return   
     data = (pa.array([2147483647], type=pa.date32()),pa.array([2147483647], type=pa.date32()))
     result = pa.Table.from_arrays([data[0],data[1]],['a','b'])
     data = (pa.array([2147483647*(1000*60*60*24)], type=pa.date64()),pa.array([2147483647], type=pa.date32()))
     arrow_table = pa.Table.from_arrays([data[0],data[1]],['a','b'])
     rel = duckdb.from_arrow_table(arrow_table).arrow()
     assert (rel['a'] == result['a'])
     assert (rel['b'] == result['b'])
Ejemplo n.º 13
0
    def test_dictionary_batches(self,duckdb_cursor):
        if not can_run:
            return

        indices_list = [None, 1, None, 1, 2, 1, 0]
        indices = pa.array(indices_list * 10000)
        dictionary = pa.array([10, 100, 100])
        dict_array = pa.DictionaryArray.from_arrays(indices, dictionary)
        arrow_table = pa.Table.from_arrays([dict_array],['a'])
        batch_arrow_table = pyarrow.Table.from_batches(arrow_table.to_batches(10))
        rel = duckdb.from_arrow_table(batch_arrow_table)
        result = [(None,), (100,), (None,), (100,), (100,), (100,), (10,)] * 10000
        assert rel.execute().fetchall() == result

        #Table with dictionary and normal array
        arrow_table = pa.Table.from_arrays([dict_array,indices],['a','b'])
        batch_arrow_table = pyarrow.Table.from_batches(arrow_table.to_batches(10))
        rel = duckdb.from_arrow_table(batch_arrow_table)
        result = [(None,None), (100,1), (None,None), (100,1), (100,2), (100,1), (10,0)] * 10000
        assert rel.execute().fetchall() == result
Ejemplo n.º 14
0
    def test_large_string_type(self, duckdb_cursor):
        if not can_run:

            return
        schema = pa.schema([("data", pa.large_string())])
        inputs = [pa.array(["foo", "baaaar", "b"], type=pa.large_string())]
        arrow_table = pa.Table.from_arrays(inputs, schema=schema)

        rel = duckdb.from_arrow_table(arrow_table)
        res = rel.execute().fetchall()
        assert res == [('foo', ), ('baaaar', ), ('b', )]
Ejemplo n.º 15
0
    def test_dictionary_strings(self,duckdb_cursor):
        if not can_run:
            return

        indices_list = [None, 0, 1, 2, 3, 4, None]
        indices = pa.array(indices_list * 1000)
        dictionary = pa.array(['Matt Daaaaaaaaamon', 'Alec Baldwin', 'Sean Penn', 'Tim Robbins', 'Samuel L. Jackson'])
        dict_array = pa.DictionaryArray.from_arrays(indices, dictionary)
        arrow_table = pa.Table.from_arrays([dict_array],['a'])
        rel = duckdb.from_arrow_table(arrow_table)
        result = [(None,), ('Matt Daaaaaaaaamon',), ( 'Alec Baldwin',), ('Sean Penn',), ('Tim Robbins',), ('Samuel L. Jackson',), (None,)] * 1000
        assert rel.execute().fetchall() == result
Ejemplo n.º 16
0
    def test_binary_types(self, duckdb_cursor):
        if not can_run:
            return

        # Fixed Size Binary
        arrow_table = create_binary_table(pa.binary(3))
        rel = duckdb.from_arrow_table(arrow_table)
        res = rel.execute().fetchall()
        assert res == [(b"foo", ), (b"bar", ), (b"baz", )]

        # Normal Binary
        arrow_table = create_binary_table(pa.binary())
        rel = duckdb.from_arrow_table(arrow_table)
        res = rel.execute().fetchall()
        assert res == [(b"foo", ), (b"bar", ), (b"baz", )]

        # Large Binary
        arrow_table = create_binary_table(pa.large_binary())
        rel = duckdb.from_arrow_table(arrow_table)
        res = rel.execute().fetchall()
        assert res == [(b"foo", ), (b"bar", ), (b"baz", )]
Ejemplo n.º 17
0
 def test_dictionary_timestamps(self,duckdb_cursor):
     if not can_run:
         return
     indices_list = [None, 0, 1, 2, None]
     indices = pa.array(indices_list * 1000)
     dictionary = pa.array([Timestamp(year=2001, month=9, day=25),Timestamp(year=2006, month=11, day=14),Timestamp(year=2012, month=5, day=15),Timestamp(year=2018, month=11, day=2)])
     dict_array = pa.DictionaryArray.from_arrays(indices, dictionary)
     arrow_table = pa.Table.from_arrays([dict_array],['a'])
     rel = duckdb.from_arrow_table(arrow_table)
     print (rel.execute().fetchall())
     result = [(None,), (datetime.datetime(2001, 9, 25, 0, 0),), (datetime.datetime(2006, 11, 14, 0, 0),), (datetime.datetime(2012, 5, 15, 0, 0),), (None,)] * 1000
     assert rel.execute().fetchall() == result
Ejemplo n.º 18
0
    def test_arrow(self, duckdb_cursor):

        if not can_run:
            return

        parquet_filename = 'userdata1.parquet'
        urllib.request.urlretrieve(
            'https://github.com/cwida/duckdb-data/releases/download/v1.0/userdata1.parquet',
            parquet_filename)

        cols = 'id, first_name, last_name, email, gender, ip_address, cc, country, birthdate, salary, title, comments'

        # TODO timestamp

        userdata_parquet_table = pyarrow.parquet.read_table(parquet_filename)
        userdata_parquet_table.validate(full=True)

        rel_from_arrow = duckdb.arrow(userdata_parquet_table).project(
            cols).arrow()
        rel_from_arrow.validate(full=True)

        rel_from_duckdb = duckdb.from_parquet(parquet_filename).project(
            cols).arrow()
        rel_from_duckdb.validate(full=True)

        # batched version, lets use various values for batch size
        for i in [7, 51, 99, 100, 101, 500, 1000, 2000]:
            userdata_parquet_table2 = pyarrow.Table.from_batches(
                userdata_parquet_table.to_batches(i))
            assert userdata_parquet_table.equals(userdata_parquet_table2,
                                                 check_metadata=True)

            rel_from_arrow2 = duckdb.arrow(userdata_parquet_table2).project(
                cols).arrow()
            rel_from_arrow2.validate(full=True)

            assert rel_from_arrow.equals(rel_from_arrow2, check_metadata=True)
            assert rel_from_arrow.equals(rel_from_duckdb, check_metadata=True)

        con = duckdb.connect()
        con.execute(
            "select NULL c_null, (c % 4 = 0)::bool c_bool, (c%128)::tinyint c_tinyint, c::smallint*1000 c_smallint, c::integer*100000 c_integer, c::bigint*1000000000000 c_bigint, c::float c_float, c::double c_double, 'c_' || c::string c_string from (select case when range % 2 == 0 then range else null end as c from range(-10000, 10000)) sq"
        )
        arrow_result = con.fetch_arrow_table()
        arrow_result.validate(full=True)
        arrow_result.combine_chunks()
        arrow_result.validate(full=True)

        round_tripping = duckdb.from_arrow_table(arrow_result).to_arrow_table()
        round_tripping.validate(full=True)

        assert round_tripping.equals(arrow_result, check_metadata=True)
Ejemplo n.º 19
0
    def test_parallel_types_and_different_batches(self, duckdb_cursor):
        if not can_run:
            return
        duckdb_conn = duckdb.connect()
        duckdb_conn.execute("PRAGMA threads=4")
        duckdb_conn.execute("PRAGMA force_parallelism")

        parquet_filename = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), 'data',
            'userdata1.parquet')
        cols = 'id, first_name, last_name, email, gender, ip_address, cc, country, birthdate, salary, title, comments'

        userdata_parquet_table = pyarrow.parquet.read_table(parquet_filename)
        for i in [7, 51, 99, 100, 101, 500, 1000, 2000]:
            data = (pyarrow.array(np.arange(3, 7), type=pyarrow.int32()))
            tbl = pyarrow.Table.from_arrays([data], ['a'])
            rel_id = duckdb.from_arrow_table(tbl)
            userdata_parquet_table2 = pyarrow.Table.from_batches(
                userdata_parquet_table.to_batches(i))
            rel = duckdb.from_arrow_table(userdata_parquet_table2)
            result = rel.filter("first_name=\'Jose\' and salary > 134708.82"
                                ).aggregate('count(*)')
            assert (result.execute().fetchone()[0] == 4)
Ejemplo n.º 20
0
 def test_timestamp_types(self, duckdb_cursor):
     if not can_run:
         return
     data = (pa.array([datetime.datetime.now()], type=pa.timestamp('ns')),
             pa.array([datetime.datetime.now()], type=pa.timestamp('us')),
             pa.array([datetime.datetime.now()], pa.timestamp('ms')),
             pa.array([datetime.datetime.now()], pa.timestamp('s')))
     arrow_table = pa.Table.from_arrays(
         [data[0], data[1], data[2], data[3]], ['a', 'b', 'c', 'd'])
     rel = duckdb.from_arrow_table(arrow_table).arrow()
     assert (rel['a'] == arrow_table['a'])
     assert (rel['b'] == arrow_table['b'])
     assert (rel['c'] == arrow_table['c'])
     assert (rel['d'] == arrow_table['d'])
Ejemplo n.º 21
0
    def test_parallel_fewer_batches_than_threads(self, duckdb_cursor):
        if not can_run:
            return
        duckdb_conn = duckdb.connect()
        duckdb_conn.execute("PRAGMA threads=4")
        duckdb_conn.execute("PRAGMA force_parallelism")

        data = (pyarrow.array(np.random.randint(800, size=1000),
                              type=pyarrow.int32()))
        tbl = pyarrow.Table.from_batches(
            pyarrow.Table.from_arrays([data], ['a']).to_batches(2))
        rel = duckdb.from_arrow_table(tbl)
        # Also test multiple reads
        assert (
            rel.aggregate("(count(a))::INT").execute().fetchone()[0] == 1000)
Ejemplo n.º 22
0
    def test_multiple_queries_same_relation(self, duckdb_cursor):
        if not can_run:
            return
        parquet_filename = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), 'data',
            'userdata1.parquet')
        cols = 'id, first_name, last_name, email, gender, ip_address, cc, country, birthdate, salary, title, comments'

        userdata_parquet_table = pyarrow.parquet.read_table(parquet_filename)
        userdata_parquet_table.validate(full=True)
        rel = duckdb.from_arrow_table(userdata_parquet_table)
        assert (rel.aggregate("(avg(salary))::INT").execute().fetchone()[0] ==
                149005)
        assert (rel.aggregate("(avg(salary))::INT").execute().fetchone()[0] ==
                149005)
Ejemplo n.º 23
0
    def test_null_type(self, duckdb_cursor):
        if not can_run:
            return
        schema = pa.schema([("data", pa.null())])
        inputs = [pa.array([None, None, None], type=pa.null())]
        arrow_table = pa.Table.from_arrays(inputs, schema=schema)
        duckdb_conn = duckdb.connect()
        duckdb_conn.register("testarrow", arrow_table)
        rel = duckdb.from_arrow_table(arrow_table).arrow()
        # We turn it to an array of int32 nulls
        schema = pa.schema([("data", pa.int32())])
        inputs = [pa.array([None, None, None], type=pa.null())]
        arrow_table = pa.Table.from_arrays(inputs, schema=schema)

        assert rel['data'] == arrow_table['data']
Ejemplo n.º 24
0
    def test_from_arrow(self, duckdb_cursor):
        try:
            import pyarrow as pa
        except:
            return

        conn = duckdb.connect()
        conn.execute("create table t (a integer)")
        conn.execute("insert into t values (1)")
        test_df = pd.DataFrame.from_dict({"i":[1, 2, 3, 4]})
        test_arrow = pa.Table.from_pandas(test_df)
        rel = duckdb.from_arrow_table(test_arrow, connection=conn)
        assert rel.query('t_2','select count(*) from t inner join t_2 on (a = i)').fetchall()[0] ==  (1,)
        rel = duckdb.arrow(test_arrow, connection=conn)
        assert rel.query('t_2','select count(*) from t inner join t_2 on (a = i)').fetchall()[0] ==  (1,)
Ejemplo n.º 25
0
def run_arrow(commit_hash,column_name,experiment_name,duck_con):
    import statistics,duckdb
    duck_to_arrow = []
    arrow_to_duck = []
    for i in range(6):
        duck_con.execute("select " + column_name +  " from t;")

        start_time = time.time()
        result = duck_con.fetch_arrow_table()
        time_duck_to_arrow = time.time() - start_time

        start_time = time.time()
        result = duckdb.from_arrow_table(result)
        time_arrow_to_duck = time.time() - start_time

        if i!= 0:
            duck_to_arrow.append(time_duck_to_arrow)
            arrow_to_duck.append(time_arrow_to_duck)

    (benchmark_id, groupname) = insert_benchmark_info('duckdb -> arrow ' + experiment_name,'arrow_integration','')
    c.execute("INSERT INTO timings (benchmark_id, hash, success, median) VALUES (?, ?, ?, ?)", (benchmark_id, commit_hash, True, statistics.median(duck_to_arrow)))
    (benchmark_id, groupname) = insert_benchmark_info('arrow -> duckdb ' + experiment_name,'arrow_integration','')
    c.execute("INSERT INTO timings (benchmark_id, hash, success, median) VALUES (?, ?, ?, ?)", (benchmark_id, commit_hash, True, statistics.median(arrow_to_duck)))
    con.commit()