def __init__(self, **kwargs):
    self.LOGGER = singer.get_logger()
    self.configured_warehouse = kwargs.get('warehouse')
    self.configured_database = kwargs.get('database')
    self.configured_schema = kwargs.get('schema')
    SnowflakeConnection.__init__(self, **kwargs)
def __enter__(self):
    if self._connection is None:
        self._connection = SnowflakeConnection(
            user=self.user,
            password=self.password,
            account=self.account,
            session_parameters={"QUERY_TAG": self._query_tag},
            application="DBND Snowflake plugin {}".format(__version__),
        )
        # if and only if we opened the connection - we should close it
        self._should_close = True
    return self
def patch_connection(
    self,
    con: SnowflakeConnection,
    propagate: bool = True,
) -> Generator[TelemetryCaptureHandler, None, None]:
    original_telemetry = con._telemetry
    new_telemetry = TelemetryCaptureHandler(
        original_telemetry,
        propagate,
    )
    con._telemetry = new_telemetry
    try:
        yield new_telemetry
    finally:
        con._telemetry = original_telemetry
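# Hedged usage sketch for patch_connection: assumes the method is exposed as a
# context manager (e.g. wrapped with contextlib.contextmanager) on some tracker
# object; `tracker` and the credentials below are placeholders, not part of the
# snippet above.
import snowflake.connector

con = snowflake.connector.connect(
    user="<user>", password="<password>", account="<account>"
)
with tracker.patch_connection(con, propagate=True) as telemetry:
    # While this block runs, con._telemetry is the TelemetryCaptureHandler,
    # so telemetry emitted by queries passes through it.
    con.cursor().execute("SELECT CURRENT_VERSION()")
# On exit the original telemetry client is restored.
con.close()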
def _describe(
    self,
    connection: SnowflakeConnection,
    query: str,
) -> Dict[str, str]:
    description_start = timer()
    cursor = connection.cursor(DictCursor)
    describe_res = cursor.describe(query)
    description_time = timer() - description_start
    self.logger.info(
        f'[benchmark][snowflake] - description {description_time} seconds',
        extra={
            'benchmark': {
                'operation': 'describe',
                'description_time': description_time,
                'connector': 'snowflake',
                'query': query,
                'result': json.dumps(describe_res),
            }
        },
    )
    res = {
        r.name: type_code_mapping.get(r.type_code)
        for r in describe_res
    }
    return res
def report_operations(self, connection: SnowflakeConnection, operations: List[SqlOperation]):
    if connection.is_closed():
        # already closed, cannot proceed (and probably already tracked)
        return

    # update the table names
    operations = [op.evolve_table_name(connection) for op in operations]

    # look up the table schemas
    tables = chain.from_iterable(op.tables for op in operations if not op.is_file)
    tables_schemas: Dict[str, DTypes] = {}
    for table in tables:
        table_schema = get_snowflake_table_schema(connection, table)
        if table_schema:
            tables_schemas[table] = table_schema

    operations: List[SqlOperation] = [
        op.evolve_schema(tables_schemas) for op in operations
    ]

    for op in operations:
        log_dataset_op(
            op_path=render_connection_path(connection, op, "snowflake"),
            op_type=op.op_type,
            success=op.success,
            data=op,
            with_schema=True,
            send_metrics=True,
        )
def alive_function(conn: SnowflakeConnection) -> Any:
    logger.debug('Check Snowflake connection')
    if hasattr(conn, 'is_closed'):
        try:
            return not conn.is_closed()
        except Exception:
            raise TypeError('is_closed is not a function')
def snowflake_query(connection: SnowflakeConnection, query: str, params=None):
    try:
        with connection.cursor(DictCursor) as cursor:
            cursor.execute(query, params)
            result = cursor.fetchall()
            return result
    except Exception:
        logger.exception("Error occurred during querying Snowflake, query: %s", query)
        raise
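# Hedged usage sketch for snowflake_query: credentials are placeholders, and
# rows come back as dicts because the helper uses DictCursor internally.
import snowflake.connector

conn = snowflake.connector.connect(
    user="<user>", password="<password>", account="<account>"
)
rows = snowflake_query(
    conn,
    "SELECT table_name FROM information_schema.tables WHERE table_schema = %(schema)s",
    params={"schema": "PUBLIC"},
)
for row in rows:
    print(row["TABLE_NAME"])
conn.close()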
def close_function(conn: SnowflakeConnection) -> None:
    logger.debug('Close Snowflake connection')
    if hasattr(conn, 'close'):
        try:
            close_start = timer()
            r = conn.close()
            close_end = timer()
            logger.info(
                f'[benchmark][snowflake] - close {close_end - close_start} seconds',
                extra={
                    'benchmark': {
                        'operation': 'close',
                        'execution_time': close_end - close_start,
                        'connector': 'snowflake',
                    }
                },
            )
            return r
        except Exception:
            raise TypeError('close is not a function')
def _execute_query_internal(
    self,
    connection: SnowflakeConnection,
    query: str,
    query_parameters: Optional[dict] = None,
) -> pd.DataFrame:
    execution_start = timer()
    cursor = connection.cursor(DictCursor)
    query_res = cursor.execute(query, query_parameters)
    query_generation_time = timer() - execution_start
    self.logger.info(
        f'[benchmark][snowflake] - execute {query_generation_time} seconds',
        extra={
            'benchmark': {
                'operation': 'execute',
                'query_generation_time': query_generation_time,
                'connector': 'snowflake',
                'query': query,
            }
        },
    )
    self.set_query_generation_time(query_generation_time)

    convert_start = timer()
    # Call our customized fetch here
    values = pd.DataFrame.from_dict(query_res.fetchall())
    data_conversion_time = timer() - convert_start
    self.logger.info(
        f'[benchmark][snowflake] - dataframe {data_conversion_time} seconds',
        extra={
            'benchmark': {
                'operation': 'dataframe',
                'data_conversion_time': data_conversion_time,
                'connector': 'snowflake',
            }
        },
    )
    self.set_data_conversion_time(data_conversion_time)

    return values  # do not return metadata for now
class SnowflakeController:
    """Interacts with Snowflake and queries it."""

    def __init__(self, connection_or_connection_string):
        # type: (Union[str, SnowflakeConnection]) -> SnowflakeController
        if isinstance(connection_or_connection_string, SnowflakeConnection):
            self._connection = connection_or_connection_string
            self.account = self._connection.account
            self.user = self._connection.user
            self.password = "******"
        else:
            conn_params = conn_str_to_conn_params(connection_or_connection_string)
            self.account = conn_params["account"]
            self.user = conn_params["user"]
            self.password = conn_params["password"]
            self._connection = None

        self._should_close = False
        self._query_tag = "dbnd-snowflake"
        self._column_types = None

    def __enter__(self):
        if self._connection is None:
            self._connection = SnowflakeConnection(
                user=self.user,
                password=self.password,
                account=self.account,
                session_parameters={"QUERY_TAG": self._query_tag},
                application="DBND Snowflake plugin {}".format(__version__),
            )
            # if and only if we opened the connection - we should close it
            self._should_close = True
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        if self._should_close and self._connection is not None:
            self._connection.close()

    def to_preview(self, table):
        # type: (SnowflakeTable) -> str
        # snowflake.connector does not handle incorrect utf-8 data fetched from db,
        # hence this fiddling with encode/decode
        column_types = self.get_column_types(table)
        columns = ",".join(
            'TRY_HEX_DECODE_STRING(HEX_ENCODE("{0}")) AS {0}'.format(column)
            for column in column_types.keys()
        )
        rows = self.query(
            "select {0} from {1.db_with_schema}.{1.table_name} limit {1.preview_rows}".format(
                columns, table
            )
        )
        preview_table = tabulate(rows, headers="keys") + "\n..."
        return preview_table

    def get_dimensions(self, table):
        # type: (SnowflakeTable) -> Dict
        if table.schema:
            in_cond = "in schema {0.db_with_schema}".format(table)
        else:
            in_cond = "in database {0.database}".format(table)
        table_meta = self.query(
            "SHOW TABLES LIKE '{0}' {1}".format(table.table_name, in_cond)
        )
        if len(table_meta) != 1:
            raise SnowflakeError(
                "Snowflake table not found: '{0.table_name}', DB: '{0.database}'".format(table)
            )
        cols = len(self.get_column_types(table))
        return {
            "rows": table_meta[0]["rows"],
            "cols": cols,
            "bytes": table_meta[0]["bytes"],
        }

    def get_column_types(self, table):
        # type: (SnowflakeTable) -> Dict[str, str]
        if self._column_types is not None:
            return self._column_types

        query = dedent(
            """\
            SELECT column_name, data_type
            FROM {0.database}.information_schema.columns
            WHERE LOWER(table_name) = LOWER('{0.table_name}')"""
        ).format(table)
        if table.schema:
            query += " and LOWER(table_schema) = LOWER('{0.schema}')".format(table)

        results = self.query(query)
        if not results:
            raise SnowflakeError(
                "Table columns not found. Snowflake DB: '{0.database}', "
                "schema: {0.schema} table: '{0.table_name}'\n"
                "Query used: {1}".format(table, query)
            )

        self._column_types = {
            row["COLUMN_NAME"]: row["DATA_TYPE"] for row in results
        }
        return self._column_types

    def query(self, query, params=None):
        try:
            with self._connection.cursor(DictCursor) as cursor:
                cursor.execute(query, params)
                result = cursor.fetchall()
                return result
        except Exception:
            logger.exception("Error occurred during querying Snowflake, query: %s", query)
            raise

    def __str__(self):
        return "snowflake://{user}:***@{account}".format(account=self.account, user=self.user)

    def get_resource_usage(
        self,
        database: str,
        query_id: str,
        session_id: Optional[int],
        key: str,
        history_window: float,
        query_history_result_limit: int,
        delay: int,
        retries: int,
        retry_pause: float,
        raise_on_error: bool,
        config: SnowflakeConfig,
    ) -> Dict:
        if delay > 0:
            logger.info("Delaying search in QUERY_HISTORY for %s seconds", delay)
            sleep(delay)

        result_limit = min(
            query_history_result_limit, config.query_history_result_limit_max_value
        )
        tries, sf_query = 0, ""
        try:
            while (
                result_limit <= config.query_history_result_limit_max_value
                and tries <= retries
            ):
                resource_metrics, sf_query = self._query_resource_usage(
                    database,
                    query_id=query_id,
                    session_id=session_id,
                    key=key,
                    history_window=history_window,
                    query_history_result_limit=result_limit,
                    config=config,
                )
                if resource_metrics:
                    return resource_metrics

                logger.warning(
                    "Metadata not found for session_id '{}', query_id '{}'\n"
                    "Query used to search for resource usage: '{}'".format(
                        session_id, query_id, sf_query
                    )
                )
                result_limit = min(
                    result_limit * RESULT_LIMIT_INC,
                    config.query_history_result_limit_max_value,
                )
                logger.info(
                    "Extending QUERY_HISTORY() search window: RESULT_LIMIT={}".format(result_limit)
                )
                tries += 1
                if retry_pause and retry_pause > 0:
                    logger.info("Sleeping for %s seconds", retry_pause)
                    sleep(retry_pause)
            else:
                logger.info(
                    "No more retries left to fetch Snowflake query resources. Giving up."
                )
        except Exception:
            logger.exception(
                "Failed to log_snowflake_resource_usage.\n"
                "Last query params used to search for resource usage: query_id - '%s', "
                "session_id - '%s', database - '%s', connection - '%s', query - '%s'",
                query_id,
                session_id,
                database,
                self,
                sf_query,
            )
            if raise_on_error:
                raise

        logger.error(
            "Resource metrics were not found for query_id '%s'.\n Query used: %s",
            query_id,
            sf_query,
        )
        return {
            f"{key}.warning": "No resources info found",
            # converting to str, since can be too large for DB int
            f"{key}.session_id": str(session_id),
            f"{key}.query_id": query_id,
        }

    def _query_resource_usage(
        self,
        database,  # type: str
        query_id,  # type: str
        session_id,  # type: Optional[int]
        key,  # type: Optional[str]
        history_window,  # type: float
        query_history_result_limit,  # type: int
        config,  # type: SnowflakeConfig
    ):
        # type: (...) -> Tuple[Dict, str]
        key = key or "snowflake_query"
        query_history = self._build_resource_usage_query(
            database,
            query_id=query_id,
            session_id=session_id,
            history_window=history_window,
            query_history_result_limit=query_history_result_limit,
            config=config,
        )

        result = self.query(query_history)
        if not result:
            return {}, query_history

        metrics = result[0]
        metrics_to_log = {}
        for metric, ui_name in SNOWFLAKE_METRIC_TO_UI_NAME.items():
            if metric in metrics:
                value = metrics[metric]
                # Quick hack to track decimal values. Probably should be handled on a serialization level
                if isinstance(value, Decimal):
                    value = float(value)
                metrics_to_log[key + "." + ui_name] = value
        return metrics_to_log, query_history

    def _build_resource_usage_query(
        self,
        database: str,
        query_id: str,
        session_id: int,
        history_window: float,
        query_history_result_limit: int,
        config: SnowflakeConfig,
    ) -> str:
        time_end = utcnow() - timedelta(minutes=config.query_history_end_time_range_end)
        time_start = time_end - timedelta(
            minutes=history_window or config.query_history_end_time_range_start
        )
        if session_id:
            query_history = dedent(
                """\
                select {metrics}
                from table({database}.information_schema.query_history_by_session(
                    SESSION_ID => {session_id},
                    END_TIME_RANGE_START => '{time_start}'::timestamp_ltz,
                    END_TIME_RANGE_END => '{time_end}'::timestamp_ltz,
                    RESULT_LIMIT => {result_limit}
                ))
                where query_id='{query_id}'
                order by start_time desc limit 1;"""
            ).format(
                metrics=RESOURCE_METRICS,
                database=database,
                minutes=history_window,
                session_id=session_id,
                result_limit=query_history_result_limit,
                time_start=time_start,
                time_end=time_end,
                query_id=query_id,
            )
            return query_history
        else:
            query_history = dedent(
                """\
                select {metrics}
                from table({database}.information_schema.query_history(
                    END_TIME_RANGE_START => '{time_start}'::timestamp_ltz,
                    END_TIME_RANGE_END => '{time_end}'::timestamp_ltz,
                    RESULT_LIMIT => {result_limit}
                ))
                where query_id='{query_id}'
                order by start_time desc limit 1;"""
            ).format(
                metrics=RESOURCE_METRICS,
                database=database,
                minutes=history_window,
                result_limit=query_history_result_limit,
                time_start=time_start,
                time_end=time_end,
                query_id=query_id,
            )
            return query_history
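# Hedged usage sketch for SnowflakeController: the connection-string format
# accepted by conn_str_to_conn_params is an assumption, and the credentials are
# placeholders. Rows come back as dicts because query() uses DictCursor.
conn_str = "snowflake://<user>:<password>@<account>"

with SnowflakeController(conn_str) as snowflake_ctrl:
    rows = snowflake_ctrl.query("SELECT CURRENT_WAREHOUSE() AS wh")
    print(rows[0]["WH"])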
def cursor(self, as_dict=False):
    cursor_class = MillisLoggingCursor
    if as_dict:
        cursor_class = MillisLoggingDictCursor
    return SnowflakeConnection.cursor(self, cursor_class)
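# Hedged usage sketch for the cursor() override above: assumes `conn` is an
# instance of the SnowflakeConnection subclass that defines it, and that
# MillisLoggingDictCursor behaves like snowflake.connector.DictCursor.
cur = conn.cursor(as_dict=True)
try:
    cur.execute("SELECT CURRENT_ROLE() AS role")
    row = cur.fetchone()
    print(row["ROLE"])  # dict-style access thanks to the dict cursor
finally:
    cur.close()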
def run_with_cursor(
    cnx: SnowflakeConnection,
    sql: str,
) -> tuple[SnowflakeCursor, list[tuple] | list[dict]]:
    sql = sql.format(name=db_parameters["name"])
    c = cnx.cursor(DictCursor)
    return c, c.execute(sql).fetchall()
def run(cnx: SnowflakeConnection, sql: str) -> list[tuple]:
    sql = sql.format(name=db_parameters["name"])
    return cnx.cursor().execute(sql).fetchall()
def write_pandas(
    conn: SnowflakeConnection,
    df: pd.DataFrame,
    table_name: str,
    database: Optional[str] = None,
    schema: Optional[str] = None,
    chunk_size: Optional[int] = None,
    compression: str = "gzip",
    on_error: str = "abort_statement",
    parallel: int = 4,
    quote_identifiers: bool = True,
    auto_create_table: bool = False,
    create_temp_table: bool = False,
):
    """Allows users to most efficiently write back a pandas DataFrame to Snowflake.

    It works by dumping the DataFrame into Parquet files, uploading them and finally copying
    their data into the table.

    Returns whether all files were ingested correctly, number of chunks uploaded, and number of rows ingested
    with all of the COPY INTO command's output for debugging purposes.

    Example usage:
        import pandas
        from snowflake.connector.pandas_tools import write_pandas

        df = pandas.DataFrame([('Mark', 10), ('Luke', 20)], columns=['name', 'balance'])
        success, nchunks, nrows, _ = write_pandas(cnx, df, 'customers')

    Args:
        conn: Connection to be used to communicate with Snowflake.
        df: Dataframe we'd like to write back.
        table_name: Table name where we want to insert into.
        database: Database the schema and table are in, if not provided the default one will be used
            (Default value = None).
        schema: Schema the table is in, if not provided the default one will be used (Default value = None).
        chunk_size: Number of elements to be inserted at once, if not provided all elements will be dumped at once
            (Default value = None).
        compression: The compression used on the Parquet files, can only be gzip, or snappy. Gzip gives supposedly a
            better compression, while snappy is faster. Use whichever is more appropriate (Default value = 'gzip').
        on_error: Action to take when COPY INTO statements fail, default follows documentation at:
            https://docs.snowflake.com/en/sql-reference/sql/copy-into-table.html#copy-options-copyoptions
            (Default value = 'abort_statement').
        parallel: Number of threads to be used when uploading chunks, default follows documentation at:
            https://docs.snowflake.com/en/sql-reference/sql/put.html#optional-parameters (Default value = 4).
        quote_identifiers: By default, identifiers, specifically database, schema, table and column names
            (from df.columns) will be quoted. If set to False, identifiers are passed on to Snowflake without quoting,
            i.e. identifiers will be coerced to uppercase by Snowflake. (Default value = True)
        auto_create_table: When true, will automatically create a table with corresponding columns for each column in
            the passed in DataFrame. The table will not be created if it already exists.
        create_temp_table: Will make the auto-created table a temporary table.
    """
    if database is not None and schema is None:
        raise ProgrammingError(
            "Schema has to be provided to write_pandas when a database is provided"
        )
    # This dictionary maps the compression algorithm to Snowflake put copy into command type
    # https://docs.snowflake.com/en/sql-reference/sql/copy-into-table.html#type-parquet
    compression_map = {"gzip": "auto", "snappy": "snappy"}
    if compression not in compression_map.keys():
        raise ProgrammingError(
            "Invalid compression '{}', only acceptable values are: {}".format(
                compression, compression_map.keys()
            )
        )
    if quote_identifiers:
        location = (
            (('"' + database + '".') if database else "")
            + (('"' + schema + '".') if schema else "")
            + ('"' + table_name + '"')
        )
    else:
        location = (
            (database + "." if database else "")
            + (schema + "." if schema else "")
            + table_name
        )
    if chunk_size is None:
        chunk_size = len(df)
    cursor: SnowflakeCursor = conn.cursor()
    stage_name = create_temporary_sfc_stage(cursor)

    with TemporaryDirectory() as tmp_folder:
        for i, chunk in chunk_helper(df, chunk_size):
            chunk_path = os.path.join(tmp_folder, "file{}.txt".format(i))
            # Dump chunk into parquet file
            chunk.to_parquet(
                chunk_path,
                compression=compression,
                use_deprecated_int96_timestamps=True,
            )
            # Upload parquet file
            upload_sql = (
                "PUT /* Python:snowflake.connector.pandas_tools.write_pandas() */ "
                "'file://{path}' @\"{stage_name}\" PARALLEL={parallel}"
            ).format(
                path=chunk_path.replace("\\", "\\\\").replace("'", "\\'"),
                stage_name=stage_name,
                parallel=parallel,
            )
            logger.debug(f"uploading files with '{upload_sql}'")
            cursor.execute(upload_sql, _is_internal=True)
            # Remove chunk file
            os.remove(chunk_path)

    if quote_identifiers:
        columns = '"' + '","'.join(list(df.columns)) + '"'
    else:
        columns = ",".join(list(df.columns))

    if auto_create_table:
        file_format_name = create_file_format(compression, compression_map, cursor)
        infer_schema_sql = f"SELECT COLUMN_NAME, TYPE FROM table(infer_schema(location=>'@\"{stage_name}\"', file_format=>'{file_format_name}'))"
        logger.debug(f"inferring schema with '{infer_schema_sql}'")
        result_cursor = cursor.execute(infer_schema_sql, _is_internal=True)
        if result_cursor is None:
            raise SnowflakeQueryUnknownError(infer_schema_sql)
        result = cast(List[Tuple[str, str]], result_cursor.fetchall())
        column_type_mapping: Dict[str, str] = dict(result)
        # Infer schema can return the columns out of order depending on the chunking we do when uploading,
        # so we have to iterate through the dataframe columns to make sure we create the table with its
        # columns in order
        quote = '"' if quote_identifiers else ""
        create_table_columns = ", ".join(
            [f"{quote}{c}{quote} {column_type_mapping[c]}" for c in df.columns]
        )
        create_table_sql = (
            f"CREATE {'TEMP ' if create_temp_table else ''}TABLE IF NOT EXISTS {location} "
            f"({create_table_columns})"
            f" /* Python:snowflake.connector.pandas_tools.write_pandas() */ "
        )
        logger.debug(f"auto creating table with '{create_table_sql}'")
        cursor.execute(create_table_sql, _is_internal=True)
        drop_file_format_sql = f"DROP FILE FORMAT IF EXISTS {file_format_name}"
        logger.debug(f"dropping file format with '{drop_file_format_sql}'")
        cursor.execute(drop_file_format_sql, _is_internal=True)

    # in Snowflake, all parquet data is stored in a single column, $1, so we must select columns explicitly
    # see (https://docs.snowflake.com/en/user-guide/script-data-load-transform-parquet.html)
    if quote_identifiers:
        parquet_columns = "$1:" + ",$1:".join(f'"{c}"' for c in df.columns)
    else:
        parquet_columns = "$1:" + ",$1:".join(df.columns)
    copy_into_sql = (
        "COPY INTO {location} /* Python:snowflake.connector.pandas_tools.write_pandas() */ "
        "({columns}) "
        'FROM (SELECT {parquet_columns} FROM @"{stage_name}") '
        "FILE_FORMAT=(TYPE=PARQUET COMPRESSION={compression}) "
        "PURGE=TRUE ON_ERROR={on_error}"
    ).format(
        location=location,
        columns=columns,
        parquet_columns=parquet_columns,
        stage_name=stage_name,
        compression=compression_map[compression],
        on_error=on_error,
    )
    logger.debug("copying into with '{}'".format(copy_into_sql))
    # Snowflake returns the original cursor if the query execution succeeded.
    result_cursor = cursor.execute(copy_into_sql, _is_internal=True)
    if result_cursor is None:
        raise SnowflakeQueryUnknownError(copy_into_sql)
    copy_results = result_cursor.fetchall()
    result_cursor.close()
    # Aggregate the COPY INTO output into the tuple promised by the docstring:
    # (all chunks loaded, number of chunks, number of rows, raw COPY results).
    return (
        all(e[1] == "LOADED" for e in copy_results),
        len(copy_results),
        sum(int(e[3]) for e in copy_results),
        copy_results,
    )
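# Hedged usage sketch for the write_pandas above, showing auto_create_table;
# connection parameters are placeholders and the target table need not exist.
import pandas as pd
import snowflake.connector

cnx = snowflake.connector.connect(
    user="<user>", password="<password>", account="<account>",
    warehouse="<warehouse>", database="<database>", schema="PUBLIC",
)
df = pd.DataFrame([("Mark", 10), ("Luke", 20)], columns=["name", "balance"])
success, nchunks, nrows, _ = write_pandas(
    cnx, df, "customers", auto_create_table=True, quote_identifiers=False
)
print(f"loaded={success} chunks={nchunks} rows={nrows}")
cnx.close()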
def execute_snowflake_statement(conn: SnowflakeConnection, query) -> SnowflakeCursor:
    cursor = conn.cursor().execute(query)
    if cursor is None:
        raise SnowflakeQueryUnknownError(query)
    return cursor
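# Hedged usage sketch for execute_snowflake_statement; connection parameters
# are placeholders.
import snowflake.connector

conn = snowflake.connector.connect(
    user="<user>", password="<password>", account="<account>"
)
cursor = execute_snowflake_statement(conn, "SELECT CURRENT_TIMESTAMP()")
print(cursor.fetchone()[0])
conn.close()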