def testReadSQLUseArrowDtype(self):
    """Reading through SQL with ``use_arrow_dtype=True`` must surface the
    string column as an arrow-backed dtype, both on the tileable's declared
    dtypes and on the executed result."""
    test_df = pd.DataFrame({
        'a': np.arange(10).astype(np.int64, copy=False),
        'b': [f's{i}' for i in range(10)],
        'c': np.random.rand(10),
        'd': [datetime.fromtimestamp(time.time() + 3600 * (i - 5))
              for i in range(10)],
    })

    with tempfile.TemporaryDirectory() as tmp_dir:
        table_name = 'test'
        uri = 'sqlite:///' + os.path.join(tmp_dir, 'test.db')
        test_df.to_sql(table_name, uri, index=False)

        # whole-table read: column 'b' must come back arrow-backed
        df = md.read_sql_table('test', uri, chunk_size=4,
                               use_arrow_dtype=True)
        executed = self.executor.execute_dataframe(df, concat=True)[0]
        self.assertIsInstance(df.dtypes.iloc[1], md.ArrowStringDtype)
        self.assertIsInstance(executed.dtypes.iloc[1], md.ArrowStringDtype)
        # convert arrow arrays back to objects before comparing to the
        # original plain-object frame
        pd.testing.assert_frame_equal(arrow_array_to_objects(executed),
                                      test_df)

        # test read with sql string and offset method
        df = md.read_sql_query('select * from test where c > 0.5', uri,
                               parse_dates=['d'], chunk_size=4,
                               incremental_index=True,
                               use_arrow_dtype=True)
        executed = self.executor.execute_dataframe(df, concat=True)[0]
        self.assertIsInstance(df.dtypes.iloc[1], md.ArrowStringDtype)
        self.assertIsInstance(executed.dtypes.iloc[1], md.ArrowStringDtype)
        pd.testing.assert_frame_equal(
            arrow_array_to_objects(executed),
            test_df[test_df.c > 0.5].reset_index(drop=True))
def testReadSQLExecution(self):
    """End-to-end coverage of ``read_sql_table`` / ``read_sql_query`` /
    ``read_sql`` against a throwaway SQLite database: chunked reads,
    offset- and partition-based splitting, empty result sets, explicit
    index/column selection, and primary-key tables."""
    import sqlalchemy as sa

    test_df = pd.DataFrame({
        'a': np.arange(10).astype(np.int64, copy=False),
        'b': [f's{i}' for i in range(10)],
        'c': np.random.rand(10),
        'd': [datetime.fromtimestamp(time.time() + 3600 * (i - 5))
              for i in range(10)],
    })

    with tempfile.TemporaryDirectory() as tmp_dir:
        table_name = 'test'
        table_name2 = 'test2'
        uri = 'sqlite:///' + os.path.join(tmp_dir, 'test.db')
        test_df.to_sql(table_name, uri, index=False)

        # plain chunked read by table name
        df = md.read_sql_table('test', uri, chunk_size=4)
        executed = self.executor.execute_dataframe(df, concat=True)[0]
        pd.testing.assert_frame_equal(executed, test_df)

        # SQL-string read split with the offset method
        df = md.read_sql_query('select * from test where c > 0.5', uri,
                               parse_dates=['d'], chunk_size=4,
                               incremental_index=True)
        executed = self.executor.execute_dataframe(df, concat=True)[0]
        pd.testing.assert_frame_equal(
            executed, test_df[test_df.c > 0.5].reset_index(drop=True))

        # SQL-string read partitioned over an integer column
        df = md.read_sql("select * from test where b > 's5'", uri,
                         parse_dates=['d'], partition_col='a',
                         num_partitions=3, incremental_index=True)
        executed = self.executor.execute_dataframe(df, concat=True)[0]
        pd.testing.assert_frame_equal(
            executed, test_df[test_df.b > 's5'].reset_index(drop=True))

        # partitioned over a datetime column, dates parsed via format string
        df = md.read_sql_query("select * from test where b > 's5'", uri,
                               parse_dates={'d': '%Y-%m-%d %H:%M:%S'},
                               partition_col='d', num_partitions=3,
                               incremental_index=True)
        executed = self.executor.execute_dataframe(df, concat=True)[0]
        pd.testing.assert_frame_equal(
            executed, test_df[test_df.b > 's5'].reset_index(drop=True))

        # partitioned over a datetime column that doubles as the index
        df = md.read_sql_query("select * from test where b > 's5'", uri,
                               parse_dates=['d'], partition_col='d',
                               num_partitions=3, index_col='d')
        executed = self.executor.execute_dataframe(df, concat=True)[0]
        pd.testing.assert_frame_equal(
            executed, test_df[test_df.b > 's5'].set_index('d'))

        # a query matching nothing should yield an empty frame with the
        # original columns
        df = md.read_sql_query('select * from test where a > 1000', uri)
        executed = self.executor.execute_dataframe(df, concat=True)[0]
        pd.testing.assert_frame_equal(
            executed, pd.DataFrame(columns=test_df.columns))

        engine = sa.create_engine(uri)
        m = sa.MetaData()
        try:
            # index_col + columns subset through a live connection
            df = md.read_sql_table('test', engine.connect(), chunk_size=4,
                                   index_col='a', columns=['b', 'd'])
            executed = self.executor.execute_dataframe(df, concat=True)[0]
            expected = test_df.copy(deep=True)
            expected.set_index('a', inplace=True)
            del expected['c']
            pd.testing.assert_frame_equal(executed, expected)

            # same read without specifying chunk_size
            df = md.read_sql_table('test', engine.connect(),
                                   index_col='a', columns=['b', 'd'])
            executed = self.executor.execute_dataframe(df, concat=True)[0]
            pd.testing.assert_frame_equal(executed, expected)

            # SQLAlchemy Table object with Column objects for index/columns
            table = sa.Table(table_name, m, autoload=True,
                             autoload_with=engine)
            df = md.read_sql_table(
                table, engine, chunk_size=4,
                index_col=[table.columns['a'], table.columns['b']],
                columns=[table.columns['c'], 'd'])
            executed = self.executor.execute_dataframe(df, concat=True)[0]
            expected = test_df.copy(deep=True)
            expected.set_index(['a', 'b'], inplace=True)
            pd.testing.assert_frame_equal(executed, expected)

            # table with an explicit primary key used as index_col
            sa.Table(table_name2, m,
                     sa.Column('id', sa.Integer, primary_key=True),
                     sa.Column('a', sa.Integer),
                     sa.Column('b', sa.String),
                     sa.Column('c', sa.Float),
                     sa.Column('d', sa.DateTime))
            m.create_all(engine)
            test_df = test_df.copy(deep=True)
            test_df.index.name = 'id'
            test_df.to_sql(table_name2, uri, if_exists='append')

            df = md.read_sql_table(table_name2, engine, chunk_size=4,
                                   index_col='id')
            executed = self.executor.execute_dataframe(df, concat=True)[0]
            pd.testing.assert_frame_equal(executed, test_df)
        finally:
            engine.dispose()