def test_category_nulls(self, duckdb_cursor):
    """Round-trip a categorical integer column containing a null through DuckDB.

    The two non-null entries must come back unchanged and the missing
    entry must come back as NaN.
    """
    source = pd.DataFrame(
        {'int': pd.Series([1, 2, None], dtype="category")})
    select_all = "SELECT * FROM data"

    df_out = duckdb.query_df(source, "data", select_all).df()
    # The query is executed a second time purely to dump the raw rows.
    print(duckdb.query_df(source, "data", select_all).fetchall())

    int_column = df_out['int']
    assert int_column[0] == 1
    assert int_column[1] == 2
    assert numpy.isnan(int_column[2])
def test_category_simple(self, duckdb_cursor):
    """Round-trip a float column and a null-free categorical column."""
    source = pd.DataFrame({
        'float': [1.0, 2.0, 1.0],
        'int': pd.Series([1, 2, 1], dtype="category"),
    })
    select_all = "SELECT * FROM data"

    df_out = duckdb.query_df(source, "data", select_all).df()
    # Debug output: raw rows from a second execution, then the category column.
    print(duckdb.query_df(source, "data", select_all).fetchall())
    print(df_out['int'])

    expected_floats = numpy.array([1.0, 2.0, 1.0])
    expected_ints = numpy.array([1, 2, 1])
    assert numpy.all(df_out['float'] == expected_floats)
    assert numpy.all(df_out['int'] == expected_ints)
def test_query_df(self, duckdb_cursor):
    """Join a DataFrame against a table living on an explicit connection."""
    conn = duckdb.connect()
    setup_statements = (
        "create table t (a integer)",
        "insert into t values (1),(4)",
    )
    for statement in setup_statements:
        conn.execute(statement)

    test_df = pd.DataFrame.from_dict({"i": [1, 2, 3, 4]})
    join_sql = 'select * from t inner join t_2 on (a = i)'
    # The DataFrame is exposed to the query under the name "t_2".
    rel = duckdb.query_df(test_df, 't_2', join_sql, connection=conn)

    first_row = rel.fetchall()[0]
    assert first_row == (1, 1)
def test_object_integer(self, duckdb_cursor):
    """Round-trip pandas nullable integer columns (Int8 .. Int64).

    Each column holds ``[None, 1, -1]``; the expected result is a float64
    masked array per column with the first slot masked.
    """
    # Column name -> pandas nullable-integer dtype used for that column.
    nullable_dtypes = {
        'int8': "Int8",
        'int16': "Int16",
        'int32': "Int32",
        'int64': "Int64",
    }

    df_in = pd.DataFrame({
        column: pd.Series([None, 1, -1], dtype=dtype)
        for column, dtype in nullable_dtypes.items()
    })
    df_expected_res = pd.DataFrame({
        column: np.ma.masked_array([0, 1, -1],
                                   mask=[True, False, False],
                                   dtype='float64')
        for column in nullable_dtypes
    })

    df_out = duckdb.query_df(df_in, "data", "SELECT * FROM data").df()
    pd.testing.assert_frame_equal(df_expected_res, df_out)
def query_df(df, query):
    """Run a simple SELECT against a DataFrame.

    Only a 'select' from a single table is supported (no subqueries, no
    joins). The query is always re-parsed from its string form, the table
    reference is pointed at the DataFrame, and table qualifiers are dropped
    from target, ORDER BY and WHERE identifiers before execution in DuckDB.

    Args:
        df (pandas.DataFrame): data
        query (mindsdb_sql.parser.ast.Select | str): select query

    Returns:
        pandas.DataFrame

    Raises:
        Exception: if the parsed query is not a SELECT from a plain table.
    """
    parsed = parse_sql(str(query), dialect='mysql')
    if not (isinstance(parsed, Select)
            and isinstance(parsed.from_table, Identifier)):
        raise Exception(
            "Only 'SELECT from TABLE' statements supported for internal query")

    # The DataFrame is registered below under this fixed table name.
    parsed.from_table.parts = ['df_table']

    # Keep only the last part of each qualified column name.
    for target in parsed.targets:
        if isinstance(target, Identifier):
            target.parts = [target.parts[-1]]

    ordering = parsed.order_by
    if isinstance(ordering, list):
        for clause in ordering:
            if not isinstance(clause, OrderBy):
                continue
            field = clause.field
            if isinstance(field, Identifier):
                field.parts = [field.parts[-1]]

    _remove_table_name(parsed.where)

    # FIXME https://github.com/mindsdb/mindsdb_sql/issues/130
    # We need a way to dump the query in the postgres dialect; until then
    # the backticks are stripped from the rendered string.
    sql_text = str(parsed).replace('`', '')

    frame = duckdb.query_df(df, 'df_table', sql_text).df()
    return frame.where(pd.notnull(frame), None)
def _filter_by_sql(df: pd.DataFrame, sql: str) -> pd.DataFrame:
    """
    Filter Pandas DataFrame using an SQL query.

    The virtual table name is "data", so queries should look like
    ``SELECT * FROM data;``.

    This implementation is based on DuckDB, so please have a look at its
    SQL documentation.

    - https://duckdb.org/docs/sql/introduction

    After the query has run, each of the from-date, to-date and date
    columns that is present in the result is localized to UTC; columns
    missing from the result are skipped.

    :param df: DataFrame to run the query against.
    :param sql: A SQL expression.
    :return: Filtered DataFrame
    """
    import duckdb

    filtered = duckdb.query_df(df, "data", sql).df()

    date_columns = (
        Columns.FROM_DATE.value,
        Columns.TO_DATE.value,
        Columns.DATE.value,
    )
    for name in date_columns:
        try:
            filtered[name] = filtered[name].dt.tz_localize(pytz.UTC)
        except KeyError:
            # The query did not select this column; nothing to localize.
            continue

    return filtered
def test_pandas_encoded_utf8(self, duckdb_cursor):
    """A UTF-8 encoded bytes value in an object column comes back as ``str(bytes)``."""
    # Unicode data, encoded to bytes before it is placed in the frame.
    encoded = u'\u00c3'.encode('utf8')
    df_in = pd.DataFrame({'object': pd.Series([encoded], dtype='object')})

    first_row = duckdb.query_df(df_in, "data", "SELECT * FROM data").fetchone()
    assert first_row[0] == str(encoded)
def round_trip(data, pandas_type):
    """Assert that a single-column frame survives a DuckDB round trip.

    Args:
        data: values for the column.
        pandas_type: dtype passed to ``pd.Series``.
    """
    column = pd.Series(data, dtype=pandas_type)
    df_in = pd.DataFrame({'object': column})
    df_out = duckdb.query_df(df_in, "data", "SELECT * FROM data").df()

    # Debug output: result first, then the input.
    for frame in (df_out, df_in):
        print(frame)

    assert df_out.equals(df_in)
def test_category_mix(self, duckdb_cursor):
    """Round-trip a float column next to an ordered categorical with nulls."""
    floats = [1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 0.0]
    labels = ['foo', 'bla', None, 'zoo', 'foo', 'foo', None, 'bla']
    df_in = pd.DataFrame({
        'float': floats,
        'x': pd.Categorical(labels, ordered=True),
    })

    result = duckdb.query_df(df_in, "data", "SELECT * FROM data")
    df_out = result.df()
    assert df_out.equals(df_in)
def test_pandas_float32(self, duckdb_cursor):
    """Round-trip a float32 column whose last entry is NaN."""
    values = numpy.array([0.1, 0.32, 0.78, numpy.nan])
    df_in = pd.DataFrame({'object': pd.Series(values, dtype='float32')})
    df_out = duckdb.query_df(df_in, "data", "SELECT * FROM data").df()

    actual, expected = df_out['object'], df_in['object']
    # The first three entries are finite and must match exactly.
    for position in range(3):
        assert actual[position] == expected[position]
    # NaN never compares equal, so the last slot is checked with isnan.
    assert numpy.isnan(actual[3])
def test_pandas_boolean(self, duckdb_cursor):
    """Round-trip a nullable boolean column with three kinds of missing value."""
    raw = numpy.array([True, None, pd.NA, numpy.nan, True])
    df_in = pd.DataFrame({'object': pd.Series(raw, dtype='boolean')})
    df_out = duckdb.query_df(df_in, "data", "SELECT * FROM data").df()

    actual, expected = df_out['object'], df_in['object']
    assert actual[0] == expected[0]
    # None, pd.NA and NaN inputs are all expected to come back as NaN.
    for position in (1, 2, 3):
        assert numpy.isnan(actual[position])
    assert actual[4] == expected[4]
def test_category_nulls(self, duckdb_cursor):
    """Round-trip string and integer categorical columns that end in a null."""
    df_in = pd.DataFrame({
        'string': pd.Series(["foo", "bar", None], dtype="category"),
        'int': pd.Series([1, 2, None], dtype="category"),
    })
    df_out = duckdb.query_df(df_in, "data", "SELECT * FROM data").df()

    # Per column: the two leading values, followed by a NaN in the last slot.
    expectations = (
        ('string', ("foo", "bar")),
        ('int', (1, 2)),
    )
    for column, (first, second) in expectations:
        values = df_out[column]
        assert values[0] == first
        assert values[1] == second
        assert numpy.isnan(values[2])
def test_timestamp_tz(self, duckdb_cursor):
    """Round-trip a frame holding a 'Z'-suffixed timestamp.

    The expected frame carries the same wall-clock timestamp without the
    'Z' suffix.
    """
    with_zone = pd.Timestamp('20180310T11:17:54Z')
    without_zone = pd.Timestamp('20180310T11:17:54')

    df_in = pd.DataFrame({'datetime': [with_zone], 'string': ['foo']})
    df_expected_res = pd.DataFrame({
        'datetime': [without_zone],
        'string': ['foo'],
    })
    # Debug output of the input and the expectation.
    print(df_in)
    print(df_expected_res)

    df_out = duckdb.query_df(df_in, "data", "SELECT * FROM data").df()
    print(df_out)
    pd.testing.assert_frame_equal(df_expected_res, df_out)
def test_pandas_interval(self, duckdb_cursor):
    """Round-trip a timedelta64[ns] column containing NaT.

    The body only runs when pandas is exactly version 1.2.4; on any other
    version the test returns without checking anything.
    """
    if pd.__version__ == '1.2.4':
        raw = numpy.array([2069211000000000, numpy.datetime64("NaT")])
        df_in = pd.DataFrame({
            'object': pd.Series(raw, dtype='timedelta64[ns]'),
        })
        df_out = duckdb.query_df(df_in, "data", "SELECT * FROM data").df()

        actual = df_out['object']
        assert actual[0] == df_in['object'][0]
        assert pd.isnull(actual[1])
def test_categorical_fetchall(self, duckdb_cursor):
    """``fetchall`` on an ordered categorical yields one 1-tuple per label."""
    labels = ['foo', 'bla', None, 'zoo', 'foo', 'foo', None, 'bla']
    df_in = pd.DataFrame({'x': pd.Categorical(labels, ordered=True)})

    rows = duckdb.query_df(df_in, "data", "SELECT * FROM data").fetchall()
    # Missing labels are expected as None, in the original row order.
    assert rows == [(label, ) for label in labels]
def test_pandas_string(self, duckdb_cursor):
    """Round-trip strings stored as object dtype and, if available, StringDtype."""
    strings = numpy.array(['foo', 'bar', 'baz'])
    # https://pandas.pydata.org/pandas-docs/stable/user_guide/text.html
    # StringDtype is only available in pandas 1.0.0 and later.
    has_string_dtype = hasattr(pd, 'StringDtype')

    df_in = pd.DataFrame({'object': pd.Series(strings, dtype='object')})
    if has_string_dtype:
        df_in['string'] = pd.Series(strings, dtype=pd.StringDtype())

    df_out = duckdb.query_df(df_in, "data", "SELECT * FROM data").df()

    assert numpy.all(df_out['object'] == strings)
    if has_string_dtype:
        assert numpy.all(df_out['string'] == strings)
def query_df(df, query):
    """Run a simple SELECT against a DataFrame.

    Only a 'select' from a single table is supported (no subqueries, no
    joins). The table reference is pointed at the DataFrame and table
    qualifiers are dropped from target, ORDER BY and WHERE identifiers;
    the query is then rendered in the postgres dialect and executed by
    DuckDB.

    A query passed in as an AST is copied before it is rewritten, so the
    caller's object is left untouched.

    Args:
        df (pandas.DataFrame): data
        query (mindsdb_sql.parser.ast.Select | str): select query

    Returns:
        pandas.DataFrame: query result with NaN values replaced by None.

    Raises:
        Exception: if the query is not a SELECT from a plain table.
    """
    import copy

    if isinstance(query, str):
        query_ast = parse_sql(query, dialect='mysql')
    else:
        query_ast = query

    if (not isinstance(query_ast, Select)
            or not isinstance(query_ast.from_table, Identifier)):
        raise Exception(
            "Only 'SELECT from TABLE' statements supported for internal query")

    if query_ast is query:
        # The rewrites below modify the tree in place. Work on a private
        # copy so a caller-supplied AST can still be reused afterwards.
        query_ast = copy.deepcopy(query)

    # The DataFrame is registered below under this fixed table name.
    query_ast.from_table.parts = ['df_table']

    # Keep only the last part of each qualified column name.
    for identifier in query_ast.targets:
        if isinstance(identifier, Identifier):
            identifier.parts = [identifier.parts[-1]]

    if isinstance(query_ast.order_by, list):
        for orderby in query_ast.order_by:
            if (isinstance(orderby, OrderBy)
                    and isinstance(orderby.field, Identifier)):
                orderby.field.parts = [orderby.field.parts[-1]]

    _remove_table_name(query_ast.where)

    render = SqlalchemyRender('postgres')
    try:
        query_str = render.get_string(query_ast, with_failback=False)
    except Exception as e:
        # Strict rendering failed: report it and retry with the renderer's
        # fallback enabled.
        print(
            f"Exception during query casting to 'postgres' dialect. Query: {str(query)}. Error: {e}"
        )
        query_str = render.get_string(query_ast, with_failback=True)

    res = duckdb.query_df(df, 'df_table', query_str)
    result_df = res.df()
    result_df = result_df.replace({np.nan: None})
    return result_df
def filter_by_sql(self, sql: str) -> StationsResult:
    """
    Filter the full station listing with an SQL query.

    The listing from ``self.all()`` is exposed to the query as the table
    "data". After the query runs, the from-date and to-date columns are
    localized to ``self.tz``, so the query must keep both columns.

    :param sql: SQL query to run against the "data" table.
    :return: StationsResult wrapping the filtered frame with a fresh index.
    """
    import duckdb

    stations_df = self.all().df
    filtered: pd.DataFrame = duckdb.query_df(stations_df, "data", sql).df()

    for column in (Columns.FROM_DATE.value, Columns.TO_DATE.value):
        filtered.loc[:, column] = filtered.loc[:, column].dt.tz_localize(
            self.tz)

    return StationsResult(stations=self, df=filtered.reset_index(drop=True))
def check_create_table(category):
    """Exercise categorical columns through table creation, joins and casts.

    Builds a two-column ordered-categorical frame from ``category``, checks
    the DataFrame round trip, materialises it into two tables and then runs
    fetches, an insert, joins, a type change and a drop against them.

    NOTE: the CREATE TABLE statements refer to ``df_in`` by name inside the
    SQL text, so that local must keep its name and stay in this function.
    """
    conn = duckdb.connect()
    conn.execute("PRAGMA enable_verification")

    def fetch(sql):
        # Run a statement on the test connection and return all rows.
        return conn.execute(sql).fetchall()

    df_in = pd.DataFrame({
        'x': pd.Categorical(category, ordered=True),
        'y': pd.Categorical(category, ordered=True),
    })
    df_out = duckdb.query_df(df_in, "data", "SELECT * FROM data").df()
    assert df_in.equals(df_out)

    conn.execute("CREATE TABLE t1 AS SELECT * FROM df_in")
    conn.execute("CREATE TABLE t2 AS SELECT * FROM df_in")

    # Check fetchall
    res = fetch("SELECT t1.x FROM t1")
    check_result_list(res, category)

    # Do a insert to trigger string -> cat
    conn.execute("INSERT INTO t1 VALUES ('2','2')")

    res = fetch("SELECT x FROM t1 where x = '1'")
    assert res == [('1', )]

    res = fetch("SELECT t1.x FROM t1 inner join t2 on (t1.x = t2.x)")
    assert res == fetch("SELECT x FROM t1")

    res = fetch("SELECT t1.x FROM t1 inner join t2 on (t1.x = t2.y)")
    assert res == fetch("SELECT x FROM t1")

    # Same comparison once more, re-running the reference query.
    assert res == fetch("SELECT x FROM t1")

    # Triggering the cast with ENUM as a src
    conn.execute("ALTER TABLE t1 ALTER x SET DATA TYPE VARCHAR")

    # We should be able to drop the table without any dependencies
    conn.execute("DROP TABLE t1")
def check_category_equal(category):
    """Assert that an ordered categorical column survives a DuckDB round trip."""
    categorical = pd.Categorical(category, ordered=True)
    df_in = pd.DataFrame({'x': categorical})

    result = duckdb.query_df(df_in, "data", "SELECT * FROM data")
    df_out = result.df()
    assert df_in.equals(df_out)