def pd_writer(table: pandas.io.sql.SQLTable,
              conn: Union['sqlalchemy.engine.Engine', 'sqlalchemy.engine.Connection'],
              keys: Iterable,
              data_iter: Iterable) -> None:
    """This is a wrapper on top of write_pandas to make it compatible with the to_sql method in pandas.

    :Example:

        import pandas as pd
        from snowflake.connector.pandas_tools import pd_writer

        sf_connector_version_df = pd.DataFrame(
            [('snowflake-connector-python', '1.0')],
            columns=['NAME', 'NEWEST_VERSION'])
        sf_connector_version_df.to_sql(
            'driver_versions', engine, index=False, method=pd_writer)

    @param table: Pandas package's table object
    @param conn: SQLAlchemy engine object to talk to Snowflake
    @param keys: Column names that we are trying to insert
    @param data_iter: Iterator over the rows
    @return: None
    """
    sf_connection = conn.connection.connection
    df = pandas.DataFrame(data_iter, columns=keys)
    write_pandas(conn=sf_connection,
                 df=df,
                 # Note: Our sqlalchemy connector creates tables case insensitively
                 table_name=table.name.upper(),
                 schema=table.schema)
def test_auto_create_table_similar_column_names(
    conn_cnx: Callable[..., Generator[SnowflakeConnection, None, None]],
):
    """Tests whether similar column names do not cause issues when auto-creating a table."""
    table_name = random_string(5, "numbas_")
    df_data = [(10, 11), (20, 21)]

    df = pandas.DataFrame(df_data, columns=["number", "Number"])
    select_sql = f'SELECT * FROM "{table_name}"'
    drop_sql = f'DROP TABLE IF EXISTS "{table_name}"'
    with conn_cnx() as cnx:
        try:
            success, nchunks, nrows, _ = write_pandas(
                cnx, df, table_name, quote_identifiers=True, auto_create_table=True
            )

            # Check write_pandas output
            assert success
            assert nrows == len(df_data)
            assert nchunks == 1
            # Check table's contents
            result = cnx.cursor(DictCursor).execute(select_sql).fetchall()
            for row in result:
                assert (
                    row["number"],
                    row["Number"],
                ) in df_data
        finally:
            cnx.execute_string(drop_sql)
def pd_writer(table: 'pandas.io.sql.SQLTable',
              conn: Union['sqlalchemy.engine.Engine', 'sqlalchemy.engine.Connection'],
              keys: Iterable,
              data_iter: Iterable,
              quote_identifiers: bool = True) -> None:
    """This is a wrapper on top of write_pandas to make it compatible with the to_sql method in pandas.

    Example usage:

        import pandas as pd
        from snowflake.connector.pandas_tools import pd_writer

        sf_connector_version_df = pd.DataFrame(
            [('snowflake-connector-python', '1.0')],
            columns=['NAME', 'NEWEST_VERSION'])
        sf_connector_version_df.to_sql(
            'driver_versions', engine, index=False, method=pd_writer)

        # to use quote_identifiers=False
        from functools import partial
        sf_connector_version_df.to_sql(
            'driver_versions', engine, index=False,
            method=partial(pd_writer, quote_identifiers=False))

    Args:
        table: Pandas package's table object.
        conn: SQLAlchemy engine object to talk to Snowflake.
        keys: Column names that we are trying to insert.
        data_iter: Iterator over the rows.
        quote_identifiers: If True (default), quote identifiers passed to Snowflake.
            If False, identifiers are not quoted (and typically coerced to uppercase
            by Snowflake).
    """
    sf_connection = conn.connection.connection
    df = pandas.DataFrame(data_iter, columns=keys)
    write_pandas(conn=sf_connection,
                 df=df,
                 # Note: Our sqlalchemy connector creates tables case insensitively
                 table_name=table.name.upper(),
                 schema=table.schema,
                 quote_identifiers=quote_identifiers)
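# A hedged usage sketch (not part of the module above): the docstring examples
# reference an `engine` without showing how it is built. Assuming the
# snowflake-sqlalchemy package is installed, its URL helper can construct one;
# all account and credential values below are hypothetical placeholders.
def _example_pd_writer_usage():
    import pandas as pd
    from sqlalchemy import create_engine
    from snowflake.sqlalchemy import URL
    from snowflake.connector.pandas_tools import pd_writer

    engine = create_engine(URL(
        account='my_account',      # placeholder account identifier
        user='my_user',            # placeholder credentials
        password='my_password',
        database='my_database',
        schema='public',
        warehouse='my_warehouse',
    ))
    df = pd.DataFrame([('snowflake-connector-python', '1.0')],
                      columns=['NAME', 'NEWEST_VERSION'])
    # quote_identifiers=True (the default) preserves the column names' casing
    df.to_sql('driver_versions', engine, index=False, method=pd_writer)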
def test_default_value_insertion(
    conn_cnx: Callable[..., Generator["SnowflakeConnection", None, None]],
    quote_identifiers: bool,
):
    """Tests whether default values can be successfully inserted with the pandas writeback."""
    table_name = "users"
    df_data = [("Mark", 10), ("Luke", 20)]
    # Create a DataFrame containing data about customers
    df = pandas.DataFrame(df_data, columns=["name", "balance"])
    # The SQL strings below assume quote_identifiers is True; the quotes are stripped otherwise
    create_sql = """CREATE OR REPLACE TABLE "{}"
        ("name" STRING, "balance" INT,
        "id" varchar(36) default uuid_string(),
        "ts" timestamp_ltz default current_timestamp)""".format(
        table_name
    )
    select_sql = 'SELECT * FROM "{}"'.format(table_name)
    drop_sql = 'DROP TABLE IF EXISTS "{}"'.format(table_name)
    if not quote_identifiers:
        create_sql = create_sql.replace('"', "")
        select_sql = select_sql.replace('"', "")
        drop_sql = drop_sql.replace('"', "")
    with conn_cnx() as cnx:  # type: SnowflakeConnection
        cnx.execute_string(create_sql)
        try:
            success, nchunks, nrows, _ = write_pandas(
                cnx, df, table_name, quote_identifiers=quote_identifiers
            )

            # Check write_pandas output
            assert success
            assert nrows == len(df_data)
            assert nchunks == 1
            # Check table's contents
            result = cnx.cursor(DictCursor).execute(select_sql).fetchall()
            for row in result:
                # ID (UUID String)
                assert row["id" if quote_identifiers else "ID"] is not None
                assert len(row["id" if quote_identifiers else "ID"]) == 36
                # TS (Current Timestamp)
                assert row["ts" if quote_identifiers else "TS"] is not None
                assert isinstance(row["ts" if quote_identifiers else "TS"], datetime)
                assert (
                    row["name" if quote_identifiers else "NAME"],
                    row["balance" if quote_identifiers else "BALANCE"],
                ) in df_data
        finally:
            cnx.execute_string(drop_sql)
def test_resultbatches_pandas_functionality(conn_cnx):
    """Fetches ArrowResultBatches as pandas dataframes and checks the combined result."""
    rowcount = 100000
    expected_df = pandas.DataFrame(data={"A": range(rowcount)})
    with conn_cnx() as con:
        with con.cursor() as cur:
            cur.execute(
                f"select seq4() a from table(generator(rowcount => {rowcount}));"
            )
            assert cur._result_set.total_row_index() == rowcount
            result_batches = cur.get_result_batches()
            assert len(result_batches) > 1
    tables = itertools.chain.from_iterable(
        list(b.create_iter(iter_unit=TABLE_UNIT)) for b in result_batches
    )
    final_df = pyarrow.concat_tables(tables).to_pandas()
    assert numpy.array_equal(expected_df, final_df)
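# A hedged illustration (not part of the test suite): each result batch also
# exposes a to_pandas() method, so an equivalent dataframe can be assembled
# without touching pyarrow directly. The helper name and rowcount are made up
# for this sketch.
def _example_concat_result_batches(conn_cnx):
    with conn_cnx() as con:
        with con.cursor() as cur:
            cur.execute("select seq4() a from table(generator(rowcount => 1000))")
            batches = cur.get_result_batches()
            # Each batch downloads and converts its own slice of the result set
            return pandas.concat(
                (batch.to_pandas() for batch in batches), ignore_index=True
            )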
def test_special_name_quoting(
    conn_cnx: Callable[..., Generator[SnowflakeConnection, None, None]],
    auto_create_table: bool,
):
    """Tests whether special column names get quoted as expected."""
    table_name = "users"
    df_data = [("Mark", 10), ("Luke", 20)]

    df = pandas.DataFrame(df_data, columns=["00name", "bAlance"])
    create_sql = (
        f'CREATE OR REPLACE TABLE "{table_name}"'
        '("00name" STRING, "bAlance" INT, "id" INT AUTOINCREMENT)'
    )
    select_sql = f'SELECT * FROM "{table_name}"'
    drop_sql = f'DROP TABLE IF EXISTS "{table_name}"'
    with conn_cnx() as cnx:  # type: SnowflakeConnection
        if not auto_create_table:
            cnx.execute_string(create_sql)
        try:
            success, nchunks, nrows, _ = write_pandas(
                cnx,
                df,
                table_name,
                quote_identifiers=True,
                auto_create_table=auto_create_table,
            )

            # Check write_pandas output
            assert success
            assert nrows == len(df_data)
            assert nchunks == 1
            # Check table's contents
            result = cnx.cursor(DictCursor).execute(select_sql).fetchall()
            for row in result:
                # The auto create table functionality does not auto-create an incrementing ID
                if not auto_create_table:
                    assert row["id"] in (1, 2)
                assert (
                    row["00name"],
                    row["bAlance"],
                ) in df_data
        finally:
            cnx.execute_string(drop_sql)
def test_autoincrement_insertion(
    conn_cnx: Callable[..., Generator[SnowflakeConnection, None, None]],
    quote_identifiers: bool,
):
    """Tests whether autoincrement column values are generated as expected with the pandas writeback."""
    table_name = "users"
    df_data = [("Mark", 10), ("Luke", 20)]
    # Create a DataFrame containing data about customers
    df = pandas.DataFrame(df_data, columns=["name", "balance"])
    # The SQL strings below assume quote_identifiers is True; the quotes are stripped otherwise
    create_sql = (
        'CREATE OR REPLACE TABLE "{}"'
        '("name" STRING, "balance" INT, "id" INT AUTOINCREMENT)'
    ).format(table_name)
    select_sql = f'SELECT * FROM "{table_name}"'
    drop_sql = f'DROP TABLE IF EXISTS "{table_name}"'
    if not quote_identifiers:
        create_sql = create_sql.replace('"', "")
        select_sql = select_sql.replace('"', "")
        drop_sql = drop_sql.replace('"', "")
    with conn_cnx() as cnx:  # type: SnowflakeConnection
        cnx.execute_string(create_sql)
        try:
            success, nchunks, nrows, _ = write_pandas(
                cnx, df, table_name, quote_identifiers=quote_identifiers
            )

            # Check write_pandas output
            assert success
            assert nrows == len(df_data)
            assert nchunks == 1
            # Check table's contents
            result = cnx.cursor(DictCursor).execute(select_sql).fetchall()
            for row in result:
                assert row["id" if quote_identifiers else "ID"] in (1, 2)
                assert (
                    row["name" if quote_identifiers else "NAME"],
                    row["balance" if quote_identifiers else "BALANCE"],
                ) in df_data
        finally:
            cnx.execute_string(drop_sql)
def fetch_pandas(conn_cnx, sql, row_count, col_count, method='one'):
    """Fetches dataframes the original way (row by row) and via the Arrow support, then compares them.

    Args:
        conn_cnx: Connection object.
        sql: SQL command for execution.
        row_count: Number of total rows combining all dataframes.
        col_count: Number of columns in dataframe.
        method: If method is 'batch', we fetch dataframes in batch. If method is 'one', we fetch a single
            dataframe containing all data (Default value = 'one').
    """
    assert row_count != 0, '# of rows should be larger than 0'
    assert col_count != 0, '# of columns should be larger than 0'

    with conn_cnx() as cnx_row:
        with conn_cnx() as cnx_table:
            # fetch dataframe by fetching row by row
            cursor_row = cnx_row.cursor()
            cursor_row.execute(SQL_ENABLE_ARROW)
            cursor_row.execute(sql)

            # build dataframe
            # Note: its execution time would differ from `pd.read_sql()` via sqlalchemy, which most
            # people use; further perf testing can be done separately
            start_time = time.time()
            rows = 0
            if method == 'one':
                df_old = pd.DataFrame(cursor_row.fetchall(),
                                      columns=['c{}'.format(i) for i in range(col_count)])
            else:
                print("use fetchmany")
                while True:
                    dat = cursor_row.fetchmany(10000)
                    if not dat:
                        break
                    else:
                        df_old = pd.DataFrame(dat,
                                              columns=['c{}'.format(i) for i in range(col_count)])
                        rows += df_old.shape[0]
            end_time = time.time()
            print('The original way took {}s'.format(end_time - start_time))
            cursor_row.close()

            # fetch dataframe with new arrow support
            cursor_table = cnx_table.cursor()
            cursor_table.execute(SQL_ENABLE_ARROW)
            cursor_table.execute(sql)

            # build dataframe
            total_rows, total_batches = 0, 0
            start_time = time.time()
            if method == 'one':
                df_new = cursor_table.fetch_pandas_all()
                total_rows = df_new.shape[0]
            else:
                for df_new in cursor_table.fetch_pandas_batches():
                    total_rows += df_new.shape[0]
                    total_batches += 1
            end_time = time.time()
            print('new way (fetching {}) took {}s'.format(method, end_time - start_time))
            if method == 'batch':
                print('new way has # of batches : {}'.format(total_batches))
            cursor_table.close()
            assert total_rows == row_count, 'there should be {} rows, but {} rows'.format(
                row_count, total_rows)

            # verify the correctness
            # only do it when fetching one dataframe
            if method == 'one':
                assert df_old.shape == df_new.shape, \
                    'the shape of old dataframe is {}, the shape of new dataframe is {}, ' \
                    'shapes are not equal'.format(df_old.shape, df_new.shape)

                for i in range(row_count):
                    col_old = df_old.iloc[i]
                    col_new = df_new.iloc[i]
                    for j, (c_old, c_new) in enumerate(zip(col_old, col_new)):
                        assert c_old == c_new, \
                            '{} row, {} column: old value is {}, new value is {}, ' \
                            'values are not equal'.format(i, j, c_old, c_new)
            else:
                assert rows == total_rows, 'the number of rows are not equal {} vs {}'.format(
                    rows, total_rows)
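# A hedged usage sketch for the helper above. The two-column generator query is an
# assumption modeled on the other queries in this file; any SQL producing
# `row_count` rows and `col_count` columns would do.
def _example_fetch_pandas_usage(conn_cnx):
    row_count, col_count = 50000, 2
    sql = 'select seq4() c0, uniform(1, 10, random()) c1 from table(generator(rowcount => {}))'.format(
        row_count)
    # compare row-by-row fetching against fetch_pandas_batches on the same query
    fetch_pandas(conn_cnx, sql, row_count, col_count, method='batch')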
pandas = None
write_pandas = None

MYPY = False
if MYPY:  # from typing import TYPE_CHECKING once 3.5 is deprecated
    from snowflake.connector import SnowflakeConnection

sf_connector_version_data = [
    ("snowflake-connector-python", "1.2.23"),
    ("snowflake-sqlalchemy", "1.1.1"),
    ("snowflake-connector-go", "0.0.1"),
    ("snowflake-go", "1.0.1"),
    ("snowflake-odbc", "3.12.3"),
]

sf_connector_version_df = LazyVar(
    lambda: pandas.DataFrame(
        sf_connector_version_data, columns=["name", "newest_version"]
    )
)


@pytest.mark.parametrize("chunk_size", [5, 4, 3, 2, 1])
@pytest.mark.parametrize("compression", ["gzip", "snappy"])
# Note: since the file will be too small to chunk, this is only testing the put command's syntax
@pytest.mark.parametrize("parallel", [4, 99])
@pytest.mark.parametrize("quote_identifiers", [True, False])
def test_write_pandas(
    conn_cnx: Callable[..., Generator["SnowflakeConnection", None, None]],
    db_parameters: Dict[str, str],
    compression: str,
    parallel: int,
    chunk_size: int,
    quote_identifiers: bool,
):
from ...lazy_var import LazyVar

MYPY = False
if MYPY:  # from typing import TYPE_CHECKING once 3.5 is deprecated
    from snowflake.connector import SnowflakeConnection

sf_connector_version_data = [
    ('snowflake-connector-python', '1.2.23'),
    ('snowflake-sqlalchemy', '1.1.1'),
    ('snowflake-connector-go', '0.0.1'),
    ('snowflake-go', '1.0.1'),
    ('snowflake-odbc', '3.12.3'),
]

sf_connector_version_df = LazyVar(lambda: pandas.DataFrame(
    sf_connector_version_data,
    columns=['name', 'newest_version']))


@pytest.mark.parametrize('chunk_size', [5, 4, 3, 2, 1])
@pytest.mark.parametrize('compression', ['gzip', 'snappy'])
# Note: since the file will be too small to chunk, this is only testing the put command's syntax
@pytest.mark.parametrize('parallel', [4, 99])
@pytest.mark.parametrize('quote_identifiers', [True, False])
def test_write_pandas(conn_cnx: Callable[..., Generator['SnowflakeConnection', None, None]],
                      db_parameters: Dict[str, str],
                      compression: str,
                      parallel: int,
                      chunk_size: int,
                      quote_identifiers: bool):
    num_of_chunks = math.ceil(len(sf_connector_version_data) / chunk_size)

    with conn_cnx(user=db_parameters['user'],
                  account=db_parameters['account'],
def test_all_pandas_types(
    conn_cnx: Callable[..., Generator[SnowflakeConnection, None, None]]
):
    table_name = random_string(5, "all_types_")
    datetime_with_tz = datetime(
        1997, 6, 3, 14, 21, 32, 00, tzinfo=timezone(timedelta(hours=+10))
    )
    datetime_with_ntz = datetime(1997, 6, 3, 14, 21, 32, 00)
    df_data = [
        (1, 1.1, "1string1", True, datetime_with_tz, datetime_with_ntz),
        (2, 2.2, "2string2", False, datetime_with_tz, datetime_with_ntz),
    ]
    df_data_no_timestamps = [
        (
            row[0],
            row[1],
            row[2],
            row[3],
        )
        for row in df_data
    ]
    df = pandas.DataFrame(
        df_data,
        columns=["int", "float", "string", "bool", "timestamp_tz", "timestamp_ntz"],
    )

    select_sql = f'SELECT * FROM "{table_name}"'
    drop_sql = f'DROP TABLE IF EXISTS "{table_name}"'
    with conn_cnx() as cnx:
        try:
            success, nchunks, nrows, _ = write_pandas(
                cnx, df, table_name, quote_identifiers=True, auto_create_table=True
            )

            # Check write_pandas output
            assert success
            assert nrows == len(df_data)
            assert nchunks == 1
            # Check table's contents
            result = cnx.cursor(DictCursor).execute(select_sql).fetchall()
            for row in result:
                assert (
                    row["int"],
                    row["float"],
                    row["string"],
                    row["bool"],
                ) in df_data_no_timestamps
                # TODO: Schema detection on the server-side has bugs dealing with timestamp_ntz and timestamp_tz.
                #  After the bugs are fixed, change the assertion to `data[0]["tm_tz"] == datetime_with_tz`
                #  and `data[0]["tm_ntz"] == datetime_with_ntz`,
                #  JIRA https://snowflakecomputing.atlassian.net/browse/SNOW-524865
                #  JIRA https://snowflakecomputing.atlassian.net/browse/SNOW-359205
                #  JIRA https://snowflakecomputing.atlassian.net/browse/SNOW-507644
                assert row["timestamp_tz"] is not None
                assert row["timestamp_ntz"] is not None
        finally:
            cnx.execute_string(drop_sql)