def __init__(self, **kwargs):
        self.LOGGER = singer.get_logger()

        self.configured_warehouse = kwargs.get('warehouse')
        self.configured_database = kwargs.get('database')
        self.configured_schema = kwargs.get('schema')

        SnowflakeConnection.__init__(self, **kwargs)
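
A hypothetical instantiation sketch (the subclass name Connection and all argument values are assumptions; only the __init__ above comes from the source):

    conn = Connection(
        user="<user>",
        password="<password>",
        account="<account>",
        warehouse="COMPUTE_WH",  # read via kwargs.get('warehouse')
        database="ANALYTICS",    # read via kwargs.get('database')
        schema="PUBLIC",         # read via kwargs.get('schema')
    )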
Example #2
    def __enter__(self):
        if self._connection is None:
            self._connection = SnowflakeConnection(
                user=self.user,
                password=self.password,
                account=self.account,
                session_parameters={"QUERY_TAG": self._query_tag},
                application="DBND Snowflake plugin {}".format(__version__),
            )
            # only if we opened the connection ourselves should we close it
            self._should_close = True

        return self
Example #3
    def patch_connection(
        self,
        con: SnowflakeConnection,
        propagate: bool = True,
    ) -> Generator[TelemetryCaptureHandler, None, None]:
        original_telemetry = con._telemetry
        new_telemetry = TelemetryCaptureHandler(
            original_telemetry,
            propagate,
        )
        con._telemetry = new_telemetry
        try:
            yield new_telemetry
        finally:
            con._telemetry = original_telemetry
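
Because this method yields inside a try/finally, it is presumably decorated with contextlib.contextmanager on the class it belongs to; a minimal usage sketch under that assumption (tracker is a hypothetical instance of that class):

    # Temporarily swap in the capturing telemetry handler, then restore it.
    with tracker.patch_connection(con) as telemetry:
        con.cursor().execute("SELECT 1")
    # con._telemetry is restored here; captured events live on `telemetry`.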
Example #4
    def _describe(
        self,
        connection: SnowflakeConnection,
        query: str,
    ) -> Dict[str, str]:
        description_start = timer()
        cursor = connection.cursor(DictCursor)
        describe_res = cursor.describe(query)

        description_time = timer() - description_start
        self.logger.info(
            f'[benchmark][snowflake] - description {description_time} seconds',
            extra={
                'benchmark': {
                    'operation': 'describe',
                    'description_time': description_time,
                    'connector': 'snowflake',
                    'query': query,
                    'result': json.dumps(describe_res),
                }
            },
        )
        res = {
            r.name: type_code_mapping.get(r.type_code)
            for r in describe_res
        }
        return res
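
type_code_mapping is assumed here to be a module-level dict from Snowflake type codes to type names. One plausible way to build it is from the connector's own FIELD_ID_TO_NAME constant (an assumption about how the mapping is sourced, not the original definition):

    from snowflake.connector.constants import FIELD_ID_TO_NAME

    # Maps integer type codes (as returned by cursor.describe) to names
    # such as 'FIXED', 'REAL', 'TEXT'.
    type_code_mapping = dict(FIELD_ID_TO_NAME)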
Example #5
    def report_operations(self, connection: SnowflakeConnection,
                          operations: List[SqlOperation]):
        if connection.is_closed():
            # already closed, cannot proceed (and probably already tracked)
            return
        # update the table names
        operations = [op.evolve_table_name(connection) for op in operations]

        # look up the table schemas
        tables = chain.from_iterable(op.tables for op in operations
                                     if not op.is_file)

        tables_schemas: Dict[str, DTypes] = {}
        for table in tables:
            table_schema = get_snowflake_table_schema(connection, table)
            if table_schema:
                tables_schemas[table] = table_schema

        operations: List[SqlOperation] = [
            op.evolve_schema(tables_schemas) for op in operations
        ]

        for op in operations:
            log_dataset_op(
                op_path=render_connection_path(connection, op, "snowflake"),
                op_type=op.op_type,
                success=op.success,
                data=op,
                with_schema=True,
                send_metrics=True,
            )
Example #6
def alive_function(conn: SnowflakeConnection) -> Any:
    logger.debug('Check Snowflake connection')
    if hasattr(conn, 'is_closed'):
        try:
            return not conn.is_closed()
        except Exception:
            raise TypeError('is_closed is not a function')
Example #7
def snowflake_query(connection: SnowflakeConnection, query: str, params=None):
    try:
        with connection.cursor(DictCursor) as cursor:
            cursor.execute(query, params)
            result = cursor.fetchall()
            return result
    except Exception:
        logger.exception("Error occurred during querying Snowflake, query: %s",
                         query)
        raise
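
The connector defaults to pyformat parameter binding, so a call might look like this (table and column names are illustrative):

    rows = snowflake_query(
        connection,
        "SELECT * FROM orders WHERE status = %(status)s LIMIT %(n)s",
        {"status": "OPEN", "n": 10},
    )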
Example #8
def close_function(conn: SnowflakeConnection) -> None:
    logger.debug('Close Snowflake connection')
    if hasattr(conn, 'close'):
        try:
            close_start = timer()
            r = conn.close()
            close_end = timer()
            logger.info(
                f'[benchmark][snowflake] - close {close_end - close_start} seconds',
                extra={
                    'benchmark': {
                        'operation': 'close',
                        'execution_time': close_end - close_start,
                        'connector': 'snowflake',
                    }
                },
            )
            return r
        except Exception:
            raise TypeError('close is not a function')
Example #9
    def _execute_query_internal(
        self,
        connection: SnowflakeConnection,
        query: str,
        query_parameters: Optional[dict] = None,
    ) -> pd.DataFrame:
        execution_start = timer()
        cursor = connection.cursor(DictCursor)
        query_res = cursor.execute(query, query_parameters)

        query_generation_time = timer() - execution_start
        self.logger.info(
            f'[benchmark][snowflake] - execute {query_generation_time} seconds',
            extra={
                'benchmark': {
                    'operation': 'execute',
                    'query_generation_time': query_generation_time,
                    'connector': 'snowflake',
                    'query': query,
                }
            },
        )
        self.set_query_generation_time(query_generation_time)
        convert_start = timer()
        # Here we call our customized fetch
        values = pd.DataFrame.from_dict(query_res.fetchall())

        data_conversion_time = timer() - convert_start
        self.logger.info(
            f'[benchmark][snowflake] - dataframe {data_conversion_time} seconds',
            extra={
                'benchmark': {
                    'operation': 'dataframe',
                    'data_conversion_time': data_conversion_time,
                    'connector': 'snowflake',
                }
            },
        )
        self.set_data_conversion_time(data_conversion_time)

        return values  # do not return metadata for now
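
As a side note, when pyarrow is installed the connector can build the DataFrame directly from the Arrow result batches, which avoids the per-row dict conversion (a sketch, not the original code):

    # Alternative conversion path, assuming pyarrow is available:
    values = query_res.fetch_pandas_all()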
Example #10
class SnowflakeController:
    """ Interacts with Snowflake, queries it"""
    def __init__(self, connection_or_connection_string):
        # type: (Union[str, SnowflakeConnection]) -> SnowflakeController
        if isinstance(connection_or_connection_string, SnowflakeConnection):
            self._connection = connection_or_connection_string

            self.account = self._connection.account
            self.user = self._connection.user
            self.password = "******"
        else:
            conn_params = conn_str_to_conn_params(
                connection_or_connection_string)

            self.account = conn_params["account"]
            self.user = conn_params["user"]
            self.password = conn_params["password"]

            self._connection = None

        self._should_close = False
        self._query_tag = "dbnd-snowflake"
        self._column_types = None

    def __enter__(self):
        if self._connection is None:
            self._connection = SnowflakeConnection(
                user=self.user,
                password=self.password,
                account=self.account,
                session_parameters={"QUERY_TAG": self._query_tag},
                application="DBND Snowflake plugin {}".format(__version__),
            )
            # only if we opened the connection ourselves should we close it
            self._should_close = True

        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        if self._should_close and self._connection is not None:
            self._connection.close()

    def to_preview(self, table):
        # type: (SnowflakeTable) -> str
        # snowflake.connector does not handle incorrect utf-8 data fetched from the db,
        # hence this fiddling with encode/decode
        column_types = self.get_column_types(table)
        columns = ",".join(
            'TRY_HEX_DECODE_STRING(HEX_ENCODE("{0}")) AS {0}'.format(column)
            for column in column_types.keys())

        rows = self.query(
            "select {0} from {1.db_with_schema}.{1.table_name} limit {1.preview_rows}"
            .format(columns, table))
        preview_table = tabulate(rows, headers="keys") + "\n..."
        return preview_table

    def get_dimensions(self, table):
        # type: (SnowflakeTable) -> Dict
        if table.schema:
            in_cond = "in schema {0.db_with_schema}".format(table)
        else:
            in_cond = "in database {0.database}".format(table)
        table_meta = self.query("SHOW TABLES LIKE '{0}' {1}".format(
            table.table_name, in_cond))
        if len(table_meta) != 1:
            raise SnowflakeError(
                "Snowflake table not found: '{0.table_name}', DB: '{0.database}'"
                .format(table))

        cols = len(self.get_column_types(table))
        return {
            "rows": table_meta[0]["rows"],
            "cols": cols,
            "bytes": table_meta[0]["bytes"],
        }

    def get_column_types(self, table):
        # type: (SnowflakeTable) -> Dict[str, str]
        if self._column_types is not None:
            return self._column_types

        query = dedent("""\
            SELECT column_name, data_type
            FROM {0.database}.information_schema.columns
            WHERE LOWER(table_name) = LOWER('{0.table_name}')""").format(table)
        if table.schema:
            query += " and LOWER(table_schema) = LOWER('{0.schema}')".format(
                table)
        results = self.query(query)
        if not results:
            raise SnowflakeError(
                "Table columns not found. Snowflake DB: '{0.database}', "
                "schema: {0.schema} table: '{0.table_name}'\n"
                "Query used: {1}".format(table, query))

        self._column_types = {
            row["COLUMN_NAME"]: row["DATA_TYPE"]
            for row in results
        }
        return self._column_types

    def query(self, query, params=None):
        try:
            with self._connection.cursor(DictCursor) as cursor:
                cursor.execute(query, params)
                result = cursor.fetchall()
                return result
        except Exception:
            logger.exception(
                "Error occurred during querying Snowflake, query: %s", query)
            raise

    def __str__(self):
        return "snowflake://{user}:***@{account}".format(account=self.account,
                                                         user=self.user)

    def get_resource_usage(
        self,
        database: str,
        query_id: str,
        session_id: Optional[int],
        key: str,
        history_window: float,
        query_history_result_limit: int,
        delay: int,
        retries: int,
        retry_pause: float,
        raise_on_error: bool,
        config: SnowflakeConfig,
    ) -> Dict:
        if delay > 0:
            logger.info("Delaying search in QUERY_HISTORY for %s seconds",
                        delay)
            sleep(delay)
        result_limit = min(query_history_result_limit,
                           config.query_history_result_limit_max_value)
        tries, sf_query = 0, ""
        try:
            while (result_limit <= config.query_history_result_limit_max_value
                   and tries <= retries):
                resource_metrics, sf_query = self._query_resource_usage(
                    database,
                    query_id=query_id,
                    session_id=session_id,
                    key=key,
                    history_window=history_window,
                    query_history_result_limit=result_limit,
                    config=config,
                )
                if resource_metrics:
                    return resource_metrics
                logger.warning(
                    "Metadata not found for session_id '{}', query_id '{}'\n"
                    "Query used to search for resource usage: '{}'".format(
                        session_id, query_id, sf_query))
                result_limit = min(
                    result_limit * RESULT_LIMIT_INC,
                    config.query_history_result_limit_max_value,
                )
                logger.info(
                    "Extending QUERY_HISTORY() search window: RESULT_LIMIT={}".
                    format(result_limit))
                tries += 1
                if retry_pause and retry_pause > 0:
                    logger.info("Sleeping for %s seconds", retry_pause)
                    sleep(retry_pause)
            else:
                logger.info(
                    "No more retries left to fetch Snoflake query resources. Giving up."
                )

        except Exception:
            logger.exception(
                "Failed to log_snowflake_resource_usage\n"
                "Last query params used to search for resource usage: query_id - '%s', "
                "session_id - '%s', database - '%s', connection - '%s', query - '%s'",
                query_id,
                session_id,
                database,
                self,
                sf_query,
            )
            if raise_on_error:
                raise

        logger.error(
            "Resource metrics were not found for query_id '%s'.\n Query used: %s",
            query_id,
            sf_query,
        )
        return {
            f"{key}.warning": "No resources info found",
            # converting to str, since can be too large for DB int
            f"{key}.session_id": str(session_id),
            f"{key}.query_id": query_id,
        }

    def _query_resource_usage(
            self,
            database,  # type: str
            query_id,  # type: str
            session_id,  # type: Optional[int]
            key,  # type: Optional[str]
            history_window,  # type: float
            query_history_result_limit,  # type: int
            config,  # type: SnowflakeConfig
    ):  # type: (...) -> Tuple[Dict, str]
        key = key or "snowflake_query"
        query_history = self._build_resource_usage_query(
            database,
            query_id=query_id,
            session_id=session_id,
            history_window=history_window,
            query_history_result_limit=query_history_result_limit,
            config=config,
        )

        result = self.query(query_history)
        if not result:
            return {}, query_history

        metrics = result[0]

        metrics_to_log = {}
        for metric, ui_name in SNOWFLAKE_METRIC_TO_UI_NAME.items():
            if metric in metrics:
                value = metrics[metric]
                # Quick hack to track Decimal values; probably should be handled at the serialization level
                if isinstance(value, Decimal):
                    value = float(value)
                metrics_to_log[key + "." + ui_name] = value
        return metrics_to_log, query_history

    def _build_resource_usage_query(
        self,
        database: str,
        query_id: str,
        session_id: int,
        history_window: float,
        query_history_result_limit: int,
        config: SnowflakeConfig,
    ) -> str:
        time_end = utcnow() - timedelta(
            minutes=config.query_history_end_time_range_end)
        time_start = time_end - timedelta(
            minutes=history_window or config.query_history_end_time_range_start)
        if session_id:
            query_history = dedent("""\
                select {metrics}
                from table({database}.information_schema.query_history_by_session(
                    SESSION_ID => {session_id},
                    END_TIME_RANGE_START => '{time_start}'::timestamp_ltz,
                    END_TIME_RANGE_END => '{time_end}'::timestamp_ltz,
                    RESULT_LIMIT => {result_limit}
                ))
                where query_id='{query_id}'
                order by start_time desc limit 1;""").format(
                metrics=RESOURCE_METRICS,
                database=database,
                session_id=session_id,
                result_limit=query_history_result_limit,
                time_start=time_start,
                time_end=time_end,
                query_id=query_id,
            )
            return query_history

        else:
            query_history = dedent("""\
                select {metrics}
                from table({database}.information_schema.query_history(
                    END_TIME_RANGE_START => '{time_start}'::timestamp_ltz,
                    END_TIME_RANGE_END => '{time_end}'::timestamp_ltz,
                    RESULT_LIMIT => {result_limit}
                ))
                where query_id='{query_id}'
                order by start_time desc limit 1;""").format(
                metrics=RESOURCE_METRICS,
                database=database,
                result_limit=query_history_result_limit,
                time_start=time_start,
                time_end=time_end,
                query_id=query_id,
            )
            return query_history
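
A minimal usage sketch of the controller (the connection string shape is an assumption about what conn_str_to_conn_params accepts, and `table` stands for a SnowflakeTable instance):

    with SnowflakeController("snowflake://user:password@account") as sf:
        preview = sf.to_preview(table)
        dimensions = sf.get_dimensions(table)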
Example #11

    def cursor(self, as_dict=False):
        cursor_class = MillisLoggingCursor
        if as_dict:
            cursor_class = MillisLoggingDictCursor

        return SnowflakeConnection.cursor(self, cursor_class)
Example #12
def run_with_cursor(
    cnx: SnowflakeConnection, sql: str
) -> tuple[SnowflakeCursor, list[tuple] | list[dict]]:
    sql = sql.format(name=db_parameters["name"])
    c = cnx.cursor(DictCursor)
    return c, c.execute(sql).fetchall()
Example #13
def run(cnx: SnowflakeConnection, sql: str) -> list[tuple]:
    sql = sql.format(name=db_parameters["name"])
    return cnx.cursor().execute(sql).fetchall()
Example #14
def write_pandas(
    conn: SnowflakeConnection,
    df: pd.DataFrame,
    table_name: str,
    database: Optional[str] = None,
    schema: Optional[str] = None,
    chunk_size: Optional[int] = None,
    compression: str = "gzip",
    on_error: str = "abort_statement",
    parallel: int = 4,
    quote_identifiers: bool = True,
    auto_create_table: bool = False,
    create_temp_table: bool = False,
):
    """Allows users to most efficiently write back a pandas DataFrame to Snowflake.

    It works by dumping the DataFrame into Parquet files, uploading them and finally copying their data into the table.

    Returns whether all files were ingested correctly, the number of chunks uploaded, and the number of rows
    ingested, along with the full output of the COPY INTO command for debugging purposes.

        Example usage:
            import pandas
            from snowflake.connector.pandas_tools import write_pandas

            df = pandas.DataFrame([('Mark', 10), ('Luke', 20)], columns=['name', 'balance'])
            success, nchunks, nrows, _ = write_pandas(cnx, df, 'customers')

    Args:
        conn: Connection to be used to communicate with Snowflake.
        df: Dataframe we'd like to write back.
        table_name: Table name where we want to insert into.
        database: Database the schema and table are in; if not provided, the default one will be used (Default value = None).
        schema: Schema the table is in; if not provided, the default one will be used (Default value = None).
        chunk_size: Number of elements to be inserted at once; if not provided, all elements will be dumped at once
            (Default value = None).
        compression: The compression used on the Parquet files; can only be gzip or snappy. Gzip supposedly gives
            better compression, while snappy is faster. Use whichever is more appropriate (Default value = 'gzip').
        on_error: Action to take when COPY INTO statements fail, default follows documentation at:
            https://docs.snowflake.com/en/sql-reference/sql/copy-into-table.html#copy-options-copyoptions
            (Default value = 'abort_statement').
        parallel: Number of threads to be used when uploading chunks, default follows documentation at:
            https://docs.snowflake.com/en/sql-reference/sql/put.html#optional-parameters (Default value = 4).
        quote_identifiers: By default, identifiers, specifically database, schema, table and column names
            (from df.columns) will be quoted. If set to False, identifiers are passed on to Snowflake without quoting.
            I.e. identifiers will be coerced to uppercase by Snowflake.  (Default value = True)
        auto_create_table: When true, automatically creates a table with a corresponding column for each column of
            the passed-in DataFrame. The table will not be created if it already exists.
        create_temp_table: When true, the auto-created table is created as a temporary table.
    """
    if database is not None and schema is None:
        raise ProgrammingError(
            "Schema has to be provided to write_pandas when a database is provided"
        )
    # This dictionary maps the compression algorithm to Snowflake put copy into command type
    # https://docs.snowflake.com/en/sql-reference/sql/copy-into-table.html#type-parquet
    compression_map = {"gzip": "auto", "snappy": "snappy"}
    if compression not in compression_map.keys():
        raise ProgrammingError(
            "Invalid compression '{}', only acceptable values are: {}".format(
                compression, compression_map.keys()))
    if quote_identifiers:
        location = ((('"' + database + '".') if database else "") +
                    (('"' + schema + '".') if schema else "") +
                    ('"' + table_name + '"'))
    else:
        location = ((database + "." if database else "") +
                    (schema + "." if schema else "") + (table_name))
    if chunk_size is None:
        chunk_size = len(df)
    cursor: SnowflakeCursor = conn.cursor()
    stage_name = create_temporary_sfc_stage(cursor)

    with TemporaryDirectory() as tmp_folder:
        for i, chunk in chunk_helper(df, chunk_size):
            chunk_path = os.path.join(tmp_folder, "file{}.txt".format(i))
            # Dump chunk into parquet file
            chunk.to_parquet(
                chunk_path,
                compression=compression,
                use_deprecated_int96_timestamps=True,
            )
            # Upload parquet file
            upload_sql = (
                "PUT /* Python:snowflake.connector.pandas_tools.write_pandas() */ "
                "'file://{path}' @\"{stage_name}\" PARALLEL={parallel}"
            ).format(
                path=chunk_path.replace("\\", "\\\\").replace("'", "\\'"),
                stage_name=stage_name,
                parallel=parallel,
            )
            logger.debug(f"uploading files with '{upload_sql}'")
            cursor.execute(upload_sql, _is_internal=True)
            # Remove chunk file
            os.remove(chunk_path)
    if quote_identifiers:
        columns = '"' + '","'.join(list(df.columns)) + '"'
    else:
        columns = ",".join(list(df.columns))

    if auto_create_table:
        file_format_name = create_file_format(compression, compression_map,
                                              cursor)
        infer_schema_sql = f"SELECT COLUMN_NAME, TYPE FROM table(infer_schema(location=>'@\"{stage_name}\"', file_format=>'{file_format_name}'))"
        logger.debug(f"inferring schema with '{infer_schema_sql}'")
        result_cursor = cursor.execute(infer_schema_sql, _is_internal=True)
        if result_cursor is None:
            raise SnowflakeQueryUnknownError(infer_schema_sql)
        result = cast(List[Tuple[str, str]], result_cursor.fetchall())
        column_type_mapping: Dict[str, str] = dict(result)
        # Infer schema can return the columns out of order depending on the chunking we do when uploading
        # so we have to iterate through the dataframe columns to make sure we create the table with its
        # columns in order
        quote = '"' if quote_identifiers else ""
        create_table_columns = ", ".join([
            f"{quote}{c}{quote} {column_type_mapping[c]}" for c in df.columns
        ])
        create_table_sql = (
            f"CREATE {'TEMP ' if create_temp_table else ''}TABLE IF NOT EXISTS {location} "
            f"({create_table_columns})"
            f" /* Python:snowflake.connector.pandas_tools.write_pandas() */ ")
        logger.debug(f"auto creating table with '{create_table_sql}'")
        cursor.execute(create_table_sql, _is_internal=True)
        drop_file_format_sql = f"DROP FILE FORMAT IF EXISTS {file_format_name}"
        logger.debug(f"dropping file format with '{drop_file_format_sql}'")
        cursor.execute(drop_file_format_sql, _is_internal=True)

    # in Snowflake, all parquet data is stored in a single column, $1, so we must select columns explicitly
    # see (https://docs.snowflake.com/en/user-guide/script-data-load-transform-parquet.html)
    if quote_identifiers:
        parquet_columns = "$1:" + ",$1:".join(f'"{c}"' for c in df.columns)
    else:
        parquet_columns = "$1:" + ",$1:".join(df.columns)
    copy_into_sql = (
        "COPY INTO {location} /* Python:snowflake.connector.pandas_tools.write_pandas() */ "
        "({columns}) "
        'FROM (SELECT {parquet_columns} FROM @"{stage_name}") '
        "FILE_FORMAT=(TYPE=PARQUET COMPRESSION={compression}) "
        "PURGE=TRUE ON_ERROR={on_error}").format(
            location=location,
            columns=columns,
            parquet_columns=parquet_columns,
            stage_name=stage_name,
            compression=compression_map[compression],
            on_error=on_error,
        )
    logger.debug("copying into with '{}'".format(copy_into_sql))
    # Snowflake returns the original cursor if the query execution succeeded.
    result_cursor = cursor.execute(copy_into_sql, _is_internal=True)
    if result_cursor is None:
        raise SnowflakeQueryUnknownError(copy_into_sql)
    copy_results = result_cursor.fetchall()
    result_cursor.close()
    # Return the values promised in the docstring: overall success flag, number
    # of chunks, number of rows ingested, and the raw COPY INTO output rows.
    return (
        all(e[1] == "LOADED" for e in copy_results),
        len(copy_results),
        sum(int(e[3]) for e in copy_results),
        copy_results,
    )
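
An illustrative call that lets write_pandas create the target table (all names are made up):

    import pandas as pd

    df = pd.DataFrame({"name": ["Mark", "Luke"], "balance": [10, 20]})
    success, nchunks, nrows, _ = write_pandas(
        conn, df, "CUSTOMERS",
        database="ANALYTICS", schema="PUBLIC",  # schema is required once database is given
        auto_create_table=True,
    )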
Example #15
def execute_snowflake_statement(conn: SnowflakeConnection,
                                query: str) -> SnowflakeCursor:
    cursor = conn.cursor().execute(query)
    if cursor is None:
        raise SnowflakeQueryUnknownError(query)
    return cursor
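
Illustrative usage (the query text is made up):

    cursor = execute_snowflake_statement(conn, "SELECT CURRENT_VERSION()")
    print(cursor.fetchone())
    cursor.close()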