def test_get_username(
    app_context: AppContext,
    mocker: MockFixture,
    username: Optional[str],
) -> None:
    """get_username() must mirror whatever user is attached to flask.g."""
    # Swap out the `g` proxy seen by superset.utils.core for a mock and
    # hang the looked-up user record off it.
    patched_g = mocker.patch("superset.utils.core.g", spec={})
    patched_g.user = security_manager.find_user(username)

    assert get_username() == username
def mutate_query_from_config(self, sql):
    """Run the configured SQL_QUERY_MUTATOR hook over *sql*, if any.

    Deployments typically use the hook to prepend contextual comments
    (user, source, etc.) to the statement.
    """
    mutator = config.get('SQL_QUERY_MUTATOR')
    if not mutator:
        return sql
    # Hand the statement to the deployment-specific hook together with
    # the current username and database context.
    return mutator(sql, utils.get_username(), security_manager, self.database)
def mutate_query_from_config(self, sql: str) -> str:
    """Run the configured SQL_QUERY_MUTATOR hook over *sql*, if any.

    Deployments typically use the hook to prepend contextual comments
    (user, source, etc.) to the statement.
    """
    # Required config key; missing key raising KeyError is intentional.
    mutator = config["SQL_QUERY_MUTATOR"]
    if not mutator:
        return sql
    return mutator(sql, utils.get_username(), security_manager, self.database)
def _log_query(sql: str) -> None:
    """Forward *sql* to the optional query-logging hook, when configured."""
    if not log_query:
        return
    log_query(
        engine.url,
        sql,
        schema,
        get_username(),
        __name__,
        security_manager,
    )
def get_engine(
    cls,
    database: "Database",
    schema: Optional[str] = None,
    source: Optional[str] = None,
) -> Engine:
    """Build a NullPool SQLAlchemy engine for *database* on behalf of the
    currently logged-in user."""
    return database.get_sqla_engine(
        schema=schema,
        nullpool=True,
        user_name=utils.get_username(),
        source=source,
    )
def get_effective_user(self, object_url: URL) -> Optional[str]:
    """
    Get the effective user, especially during impersonation.

    :param object_url: SQL Alchemy URL object
    :return: The effective username
    """
    # A logged-in user always takes precedence.
    logged_in_username = get_username()
    if logged_in_username:
        return logged_in_username
    # Otherwise fall back to the username embedded in the connection URL,
    # but only when impersonation is enabled for this database.
    if self.impersonate_user:
        return object_url.username
    return None
def get_df(self, sql, schema):
    """Execute ``sql`` against this database and return the last statement's
    result set as a pandas DataFrame.

    All parsed statements are run in order on one raw connection; only the
    final statement's rows are fetched into the frame. Elapsed times for the
    DB round-trip and the DataFrame construction are logged at DEBUG level.

    :param sql: one or more SQL statements (``;``-separated)
    :param schema: schema to bind the engine to
    :return: DataFrame of the final statement's rows
    """
    # Split into individual statements and drop trailing semicolons.
    sqls = [str(s).strip().strip(';') for s in sqlparse.parse(sql)]
    source_key = None
    # Best-effort attribution of where the query originated, via the
    # HTTP referrer (dashboard vs. explore view).
    if request and request.referrer:
        if '/superset/dashboard/' in request.referrer:
            source_key = 'dashboard'
        elif '/superset/explore/' in request.referrer:
            source_key = 'chart'
    engine = self.get_sqla_engine(
        schema=schema, source=utils.sources.get(source_key, None))
    username = utils.get_username()

    def needs_conversion(df_series):
        # Object columns whose first value is a list/dict get JSON-serialized
        # below; an empty series never needs conversion.
        if df_series.empty:
            return False
        if isinstance(df_series[0], (list, dict)):
            return True
        return False

    def _log_query(sql):
        # Forward to the optional query-logging hook, if configured.
        if log_query:
            log_query(engine.url, sql, schema, username, __name__,
                      security_manager)

    with closing(engine.raw_connection()) as conn:
        with closing(conn.cursor()) as cursor:
            st_seconds = datetime.now()
            # Run every statement but the last, discarding their results.
            for sql in sqls[:-1]:
                _log_query(sql)
                self.db_engine_spec.execute(cursor, sql)
                cursor.fetchall()
            _log_query(sqls[-1])
            self.db_engine_spec.execute(cursor, sqls[-1])
            logging.debug('[PERFORMANCE CHECK] query response time from db {0} '.format(datetime.now()-st_seconds))
            st_seconds = datetime.now()
            if cursor.description is not None:
                columns = [col_desc[0] for col_desc in cursor.description]
            else:
                # e.g. statements that produce no result set
                columns = []
            df = pd.DataFrame.from_records(
                data=list(cursor.fetchall()),
                columns=columns,
                coerce_float=True,
            )
            # Serialize list/dict-valued object columns to JSON strings.
            for k, v in df.dtypes.items():
                if v.type == numpy.object_ and needs_conversion(df[k]):
                    df[k] = df[k].apply(utils.json_dumps_w_dates)
            logging.debug('[PERFORMANCE CHECK] pandas data frame formation time after response {0} '.format(datetime.now()-st_seconds))
            return df
def get_df(  # pylint: disable=too-many-locals
    self,
    sql: str,
    schema: Optional[str] = None,
    mutator: Optional[Callable] = None) -> pd.DataFrame:
    """Execute ``sql`` and return the last statement's result set as a
    pandas DataFrame.

    :param sql: one or more SQL statements (``;``-separated)
    :param schema: optional schema to bind the engine to
    :param mutator: optional callable invoked with the DataFrame before
        JSON conversion of object columns
    :return: DataFrame of the final statement's rows
    """
    # Split into statements; trim surrounding spaces and semicolons.
    sqls = [str(s).strip(" ;") for s in sqlparse.parse(sql)]
    source_key = None
    # Best-effort attribution of where the query originated, via referrer.
    if request and request.referrer:
        if "/superset/dashboard/" in request.referrer:
            source_key = "dashboard"
        elif "/superset/explore/" in request.referrer:
            source_key = "chart"
    engine = self.get_sqla_engine(
        schema=schema, source=utils.sources[source_key] if source_key else None)
    username = utils.get_username()

    def needs_conversion(df_series: pd.Series) -> bool:
        # Columns holding lists/dicts need JSON serialization.
        return not df_series.empty and isinstance(df_series[0], (list, dict))

    def _log_query(sql: str) -> None:
        # Forward to the optional query-logging hook, if configured.
        if log_query:
            log_query(engine.url, sql, schema, username, __name__,
                      security_manager)

    with closing(engine.raw_connection()) as conn:
        with closing(conn.cursor()) as cursor:
            # Run every statement but the last, discarding their results.
            for sql_ in sqls[:-1]:
                _log_query(sql_)
                self.db_engine_spec.execute(cursor, sql_)
                cursor.fetchall()
            _log_query(sqls[-1])
            self.db_engine_spec.execute(cursor, sqls[-1])
            if cursor.description is not None:
                columns = [col_desc[0] for col_desc in cursor.description]
            else:
                columns = []
            df = pd.DataFrame.from_records(data=list(cursor.fetchall()),
                                           columns=columns,
                                           coerce_float=True)
            if mutator:
                # NOTE(review): the return value is discarded — this assumes
                # the mutator modifies df in place; confirm against callers.
                mutator(df)
            for k, v in df.dtypes.items():
                if v.type == numpy.object_ and needs_conversion(df[k]):
                    df[k] = df[k].apply(utils.json_dumps_w_dates)
            return df
def get_df(self, sql, schema, mutator=None):
    """Execute ``sql`` and return the last statement's result set as a
    pandas DataFrame.

    :param sql: one or more SQL statements (``;``-separated)
    :param schema: schema to bind the engine to
    :param mutator: optional callable applied to the DataFrame; its return
        value replaces the frame
    :return: DataFrame of the final statement's rows
    """
    # Split into individual statements and drop trailing semicolons.
    sqls = [str(s).strip().strip(";") for s in sqlparse.parse(sql)]
    source_key = None
    # Best-effort attribution of where the query originated, via referrer.
    if request and request.referrer:
        if "/metrix/dashboard/" in request.referrer:
            source_key = "dashboard"
        elif "/metrix/explore/" in request.referrer:
            source_key = "chart"
    engine = self.get_sqla_engine(schema=schema,
                                  source=utils.sources.get(source_key, None))
    username = utils.get_username()

    def needs_conversion(df_series):
        # Object columns whose first value is a list/dict get JSON-serialized.
        if df_series.empty:
            return False
        if isinstance(df_series[0], (list, dict)):
            return True
        return False

    def _log_query(sql):
        # Forward to the optional query-logging hook, if configured.
        if log_query:
            log_query(engine.url, sql, schema, username, __name__,
                      security_manager)

    with closing(engine.raw_connection()) as conn:
        with closing(conn.cursor()) as cursor:
            # Run every statement but the last, discarding their results.
            for sql in sqls[:-1]:
                _log_query(sql)
                self.db_engine_spec.execute(cursor, sql)
                cursor.fetchall()
            _log_query(sqls[-1])
            self.db_engine_spec.execute(cursor, sqls[-1])
            if cursor.description is not None:
                columns = [col_desc[0] for col_desc in cursor.description]
            else:
                columns = []
            df = pd.DataFrame.from_records(data=list(cursor.fetchall()),
                                           columns=columns,
                                           coerce_float=True)
            if mutator:
                df = mutator(df)
            # Serialize list/dict-valued object columns to JSON strings.
            for k, v in df.dtypes.items():
                if v.type == numpy.object_ and needs_conversion(df[k]):
                    df[k] = df[k].apply(utils.json_dumps_w_dates)
            return df
def get_df(self, sql, schema, mutator=None):
    """Execute ``sql`` and return the last statement's result set as a
    pandas DataFrame.

    :param sql: one or more SQL statements (``;``-separated)
    :param schema: schema to bind the engine to
    :param mutator: optional callable applied to the DataFrame; its return
        value replaces the frame
    :return: DataFrame of the final statement's rows
    """
    # Split into individual statements and drop trailing semicolons.
    sqls = [str(s).strip().strip(';') for s in sqlparse.parse(sql)]
    source_key = None
    # Best-effort attribution of where the query originated, via referrer.
    if request and request.referrer:
        if '/superset/dashboard/' in request.referrer:
            source_key = 'dashboard'
        elif '/superset/explore/' in request.referrer:
            source_key = 'chart'
    engine = self.get_sqla_engine(
        schema=schema, source=utils.sources.get(source_key, None))
    username = utils.get_username()

    def needs_conversion(df_series):
        # Object columns whose first value is a list/dict get JSON-serialized.
        if df_series.empty:
            return False
        if isinstance(df_series[0], (list, dict)):
            return True
        return False

    def _log_query(sql):
        # Forward to the optional query-logging hook, if configured.
        if log_query:
            log_query(engine.url, sql, schema, username, __name__,
                      security_manager)

    with closing(engine.raw_connection()) as conn:
        with closing(conn.cursor()) as cursor:
            # Run every statement but the last, discarding their results.
            for sql in sqls[:-1]:
                _log_query(sql)
                self.db_engine_spec.execute(cursor, sql)
                cursor.fetchall()
            _log_query(sqls[-1])
            self.db_engine_spec.execute(cursor, sqls[-1])
            if cursor.description is not None:
                columns = [col_desc[0] for col_desc in cursor.description]
            else:
                columns = []
            df = pd.DataFrame.from_records(
                data=list(cursor.fetchall()),
                columns=columns,
                coerce_float=True,
            )
            if mutator:
                df = mutator(df)
            # Serialize list/dict-valued object columns to JSON strings.
            for k, v in df.dtypes.items():
                if v.type == numpy.object_ and needs_conversion(df[k]):
                    df[k] = df[k].apply(utils.json_dumps_w_dates)
            return df
def _get_sql_results(
    self,
    execution_context: SqlJsonExecutionContext,
    rendered_query: str,
    log_params: Optional[Dict[str, Any]],
) -> Optional[SqlResults]:
    """Run the rendered query synchronously via the shared results task."""
    query = execution_context.query
    store_results = self._is_store_results(execution_context)
    return self._get_sql_results_task(
        query.id,
        rendered_query,
        return_results=True,
        store_results=store_results,
        username=get_username(),
        expand_data=execution_context.expand_data,
        log_params=log_params,
    )
def get_df(  # pylint: disable=too-many-locals
    self,
    sql: str,
    schema: Optional[str] = None,
    mutator: Optional[Callable[[pd.DataFrame], None]] = None,
    username: Optional[str] = None,
) -> pd.DataFrame:
    """Execute ``sql`` and return the final statement's rows as a DataFrame.

    :param sql: one or more SQL statements, split by the engine spec
    :param schema: optional schema to bind the engine to
    :param mutator: optional callable applied to the DataFrame before return
    :param username: effective user for the connection/logging; a logged-in
        user, when present, takes precedence
    :return: DataFrame built from the last statement's result set
    """
    sqls = self.db_engine_spec.parse_sql(sql)
    engine = self.get_sqla_engine(schema=schema, user_name=username)
    # A logged-in user (from flask g) wins over the explicitly passed name.
    username = utils.get_username() or username

    def needs_conversion(df_series: pd.Series) -> bool:
        # Series holding lists/dicts must be JSON-serialized below.
        return (
            not df_series.empty
            and isinstance(df_series, pd.Series)
            and isinstance(df_series[0], (list, dict))
        )

    def _log_query(sql: str) -> None:
        # Forward to the optional query-logging hook, if configured.
        if log_query:
            log_query(engine.url, sql, schema, username, __name__,
                      security_manager)

    with closing(engine.raw_connection()) as conn:
        cursor = conn.cursor()
        # Run every statement but the last, discarding their results.
        for sql_ in sqls[:-1]:
            _log_query(sql_)
            self.db_engine_spec.execute(cursor, sql_)
            cursor.fetchall()
        _log_query(sqls[-1])
        self.db_engine_spec.execute(cursor, sqls[-1])
        data = self.db_engine_spec.fetch_data(cursor)
        result_set = SupersetResultSet(
            data, cursor.description, self.db_engine_spec
        )
        df = result_set.to_pandas_df()
        if mutator:
            # NOTE(review): the annotation says the mutator returns None, yet
            # its result is assigned back to df — confirm whether mutators
            # return a new frame or mutate in place.
            df = mutator(df)
        # Serialize list/dict-valued object columns to JSON strings.
        for col, coltype in df.dtypes.to_dict().items():
            if coltype == numpy.object_ and needs_conversion(df[col]):
                df[col] = df[col].apply(utils.json_dumps_w_dates)
        return df
def execute(
    self,
    execution_context: SqlJsonExecutionContext,
    rendered_query: str,
    log_params: Optional[Dict[str, Any]],
) -> SqlJsonExecutionStatus:
    """Dispatch the rendered query to a Celery worker and return immediately.

    :param execution_context: wraps the ``Query`` model being executed
    :param rendered_query: the final (template-rendered) SQL to run
    :param log_params: extra context forwarded to the results task
    :return: ``SqlJsonExecutionStatus.QUERY_IS_RUNNING`` once queued
    :raises SupersetErrorException: if the task could not be submitted;
        the query row is marked FAILED before re-raising
    """
    query_id = execution_context.query.id
    logger.info("Query %i: Running query on a Celery worker", query_id)
    try:
        task = self._get_sql_results_task.delay(  # type: ignore
            query_id,
            rendered_query,
            return_results=False,
            store_results=not execution_context.select_as_cta,
            username=get_username(),
            start_time=now_as_float(),
            expand_data=execution_context.expand_data,
            log_params=log_params,
        )
        try:
            # Results are delivered out-of-band; drop the backend reference.
            task.forget()
        except NotImplementedError:
            # Fix: the two adjacent literals previously concatenated without
            # a space ("...backenddoes not...").
            logger.warning(
                "Unable to forget Celery task as backend "
                "does not support this operation"
            )
    except Exception as ex:
        logger.exception("Query %i: %s", query_id, str(ex))
        message = __("Failed to start remote query on a worker.")
        error = SupersetError(
            message=message,
            error_type=SupersetErrorType.ASYNC_WORKERS_ERROR,
            level=ErrorLevel.ERROR,
        )
        # Record the failure on the query row so the UI can surface it.
        error_payload = dataclasses.asdict(error)
        query = execution_context.query
        query.set_extra_json_key("errors", [error_payload])
        query.status = QueryStatus.FAILED
        query.error_message = message
        raise SupersetErrorException(error) from ex
    self._query_dao.update_saved_query_exec_info(query_id)
    return SqlJsonExecutionStatus.QUERY_IS_RUNNING
def get_df(self, sql, schema):
    """Execute *sql* on this database and return the final statement's
    result set as a pandas DataFrame."""
    # One entry per statement, trailing semicolons removed.
    statements = [str(s).strip().strip(';') for s in sqlparse.parse(sql)]

    # Tag the engine with where the request came from, when detectable.
    source_key = None
    referrer = request.referrer if request else None
    if referrer:
        if '/superset/dashboard/' in referrer:
            source_key = 'dashboard'
        elif '/superset/explore/' in referrer:
            source_key = 'chart'
    engine = self.get_sqla_engine(
        schema=schema, source=utils.sources.get(source_key, None))
    username = utils.get_username()

    def needs_conversion(df_series):
        # Lists/dicts cannot remain raw in an object column.
        return not df_series.empty and isinstance(df_series[0], (list, dict))

    with closing(engine.raw_connection()) as conn:
        with closing(conn.cursor()) as cursor:
            # Execute everything, but keep only the last result set.
            for statement in statements[:-1]:
                self.db_engine_spec.execute(cursor, statement)
                cursor.fetchall()
            self.db_engine_spec.execute(cursor, statements[-1])

            description = cursor.description
            columns = (
                [col[0] for col in description] if description is not None
                else []
            )
            df = pd.DataFrame.from_records(
                data=list(cursor.fetchall()),
                columns=columns,
                coerce_float=True,
            )
            # JSON-serialize list/dict-valued object columns.
            for col_name, dtype in df.dtypes.items():
                if dtype.type == numpy.object_ and needs_conversion(df[col_name]):
                    df[col_name] = df[col_name].apply(utils.json_dumps_w_dates)
            return df
def get_df(self, sql, schema):
    """Execute *sql* on this database and return the final statement's
    result set as a pandas DataFrame."""
    # One entry per statement, trailing semicolons removed.
    statements = [str(s).strip().strip(';') for s in sqlparse.parse(sql)]
    engine = self.get_sqla_engine(schema=schema)
    username = utils.get_username()

    def needs_conversion(df_series):
        # Lists/dicts cannot remain raw in an object column.
        return not df_series.empty and isinstance(df_series[0], (list, dict))

    def _log_query(statement):
        # Forward to the optional query-logging hook, if configured.
        if log_query:
            log_query(engine.url, statement, schema, username, __name__,
                      security_manager)

    with closing(engine.raw_connection()) as conn:
        with closing(conn.cursor()) as cursor:
            # Execute everything, but keep only the last result set.
            for statement in statements[:-1]:
                _log_query(statement)
                self.db_engine_spec.execute(cursor, statement)
                cursor.fetchall()
            _log_query(statements[-1])
            self.db_engine_spec.execute(cursor, statements[-1])

            description = cursor.description
            columns = (
                [col[0] for col in description] if description is not None
                else []
            )
            df = pd.DataFrame.from_records(
                data=list(cursor.fetchall()),
                columns=columns,
                coerce_float=True,
            )
            # JSON-serialize list/dict-valued object columns.
            for col_name, dtype in df.dtypes.items():
                if dtype.type == numpy.object_ and needs_conversion(df[col_name]):
                    df[col_name] = df[col_name].apply(utils.json_dumps_w_dates)
            return df
def execute_sql_statement(  # pylint: disable=too-many-arguments,too-many-statements
    sql_statement: str,
    query: Query,
    session: Session,
    cursor: Any,
    log_params: Optional[Dict[str, Any]],
    apply_ctas: bool = False,
) -> SupersetResultSet:
    """Executes a single SQL statement

    Applies RLS predicates, CTAS rewriting, row limits and the configured
    SQL mutator before running the statement on *cursor*, then fetches the
    result set. Raises SupersetErrorException for disallowed DML and
    timeouts, SqlLabQueryStoppedException if the query was cancelled, and
    SqlLabException for other execution errors.
    """
    database: Database = query.database
    db_engine_spec = database.db_engine_spec
    parsed_query = ParsedQuery(sql_statement)
    if is_feature_enabled("RLS_IN_SQLLAB"):
        # Insert any applicable RLS predicates
        parsed_query = ParsedQuery(
            str(
                insert_rls(
                    parsed_query._parsed[0],  # pylint: disable=protected-access
                    database.id,
                    query.schema,
                )))
    sql = parsed_query.stripped()
    # This is a test to see if the query is being
    # limited by either the dropdown or the sql.
    # We are testing to see if more rows exist than the limit.
    increased_limit = None if query.limit is None else query.limit + 1
    # Reject any non-read-only statement unless DML is explicitly allowed.
    if not db_engine_spec.is_readonly_query(
            parsed_query) and not database.allow_dml:
        raise SupersetErrorException(
            SupersetError(
                message=__(
                    "Only SELECT statements are allowed against this database."
                ),
                error_type=SupersetErrorType.DML_NOT_ALLOWED_ERROR,
                level=ErrorLevel.ERROR,
            ))
    if apply_ctas:
        # Generate a temp table name from the user id and start time if the
        # client did not supply one.
        if not query.tmp_table_name:
            start_dttm = datetime.fromtimestamp(query.start_time)
            query.tmp_table_name = "tmp_{}_table_{}".format(
                query.user_id, start_dttm.strftime("%Y_%m_%d_%H_%M_%S"))
        sql = parsed_query.as_create_table(
            query.tmp_table_name,
            schema_name=query.tmp_schema_name,
            method=query.ctas_method,
        )
        query.select_as_cta_used = True
    # Do not apply limit to the CTA queries when SQLLAB_CTAS_NO_LIMIT is set to true
    if db_engine_spec.is_select_query(parsed_query) and not (
            query.select_as_cta_used and SQLLAB_CTAS_NO_LIMIT):
        if SQL_MAX_ROW and (not query.limit or query.limit > SQL_MAX_ROW):
            query.limit = SQL_MAX_ROW
        sql = apply_limit_if_exists(database, increased_limit, query, sql)
    # Hook to allow environment-specific mutation (usually comments) to the SQL
    sql = SQL_QUERY_MUTATOR(
        sql,
        user_name=get_username(),  # TODO(john-bodley): Deprecate in 3.0.
        security_manager=security_manager,
        database=database,
    )
    try:
        query.executed_sql = sql
        if log_query:
            log_query(
                query.database.sqlalchemy_uri,
                query.executed_sql,
                query.schema,
                get_username(),
                __name__,
                security_manager,
                log_params,
            )
        # Persist the executed SQL before running it.
        session.commit()
        with stats_timing("sqllab.query.time_executing_query", stats_logger):
            logger.debug("Query %d: Running query: %s", query.id, sql)
            db_engine_spec.execute(cursor, sql, async_=True)
            logger.debug("Query %d: Handling cursor", query.id)
            db_engine_spec.handle_cursor(cursor, query, session)
        with stats_timing("sqllab.query.time_fetching_results", stats_logger):
            logger.debug(
                "Query %d: Fetching data for query object: %s",
                query.id,
                str(query.to_dict()),
            )
            # Fetch one extra row so we can tell whether the limit truncated
            # the result.
            data = db_engine_spec.fetch_data(cursor, increased_limit)
            if query.limit is None or len(data) <= query.limit:
                query.limiting_factor = LimitingFactor.NOT_LIMITED
            else:
                # return 1 row less than increased_query
                data = data[:-1]
    except SoftTimeLimitExceeded as ex:
        query.status = QueryStatus.TIMED_OUT
        logger.warning("Query %d: Time limit exceeded", query.id)
        logger.debug("Query %d: %s", query.id, ex)
        raise SupersetErrorException(
            SupersetError(
                message=__(
                    "The query was killed after %(sqllab_timeout)s seconds. It might "
                    "be too complex, or the database might be under heavy load.",
                    sqllab_timeout=SQLLAB_TIMEOUT,
                ),
                error_type=SupersetErrorType.SQLLAB_TIMEOUT_ERROR,
                level=ErrorLevel.ERROR,
            )) from ex
    except Exception as ex:
        # query is stopped in another thread/worker
        # stopping raises expected exceptions which we should skip
        session.refresh(query)
        if query.status == QueryStatus.STOPPED:
            raise SqlLabQueryStoppedException() from ex
        logger.error("Query %d: %s", query.id, type(ex), exc_info=True)
        logger.debug("Query %d: %s", query.id, ex)
        raise SqlLabException(db_engine_spec.extract_error_message(ex)) from ex
    logger.debug("Query %d: Fetching cursor description", query.id)
    cursor_description = cursor.description
    return SupersetResultSet(data, cursor_description, db_engine_spec)
def get_engine(cls, database, schema=None, source=None):
    """Build a NullPool SQLAlchemy engine for *database* on behalf of the
    currently logged-in user."""
    return database.get_sqla_engine(
        schema=schema,
        nullpool=True,
        user_name=utils.get_username(),
        source=source,
    )
def validate_statement(
    cls,
    statement: str,
    database: Database,
    cursor: Any,
) -> Optional[SQLValidationAnnotation]:
    # pylint: disable=too-many-locals
    """Validate a single SQL statement against Presto.

    The statement is wrapped in ``EXPLAIN (TYPE VALIDATE)`` and executed;
    returns None when valid, or an annotation carrying the error message
    and location when Presto rejects it.
    """
    db_engine_spec = database.db_engine_spec
    parsed_query = ParsedQuery(statement)
    sql = parsed_query.stripped()
    # Hook to allow environment-specific mutation (usually comments) to the SQL
    sql_query_mutator = config["SQL_QUERY_MUTATOR"]
    if sql_query_mutator:
        sql = sql_query_mutator(
            sql,
            user_name=get_username(),  # TODO(john-bodley): Deprecate in 3.0.
            security_manager=security_manager,
            database=database,
        )
    # Transform the final statement to an explain call before sending it on
    # to presto to validate
    sql = f"EXPLAIN (TYPE VALIDATE) {sql}"
    # Invoke the query against presto. NB this deliberately doesn't use the
    # engine spec's handle_cursor implementation since we don't record
    # these EXPLAIN queries done in validation as proper Query objects
    # in the superset ORM.
    # pylint: disable=import-outside-toplevel
    from pyhive.exc import DatabaseError

    try:
        db_engine_spec.execute(cursor, sql)
        # Poll until presto reports the validation query has finished.
        polled = cursor.poll()
        while polled:
            logger.info("polling presto for validation progress")
            stats = polled.get("stats", {})
            if stats:
                state = stats.get("state")
                if state == "FINISHED":
                    break
            time.sleep(0.2)
            polled = cursor.poll()
        db_engine_spec.fetch_data(cursor, MAX_ERROR_ROWS)
        # No exception means the statement validated cleanly.
        return None
    except DatabaseError as db_error:
        # The pyhive presto client yields EXPLAIN (TYPE VALIDATE) responses
        # as though they were normal queries. In other words, it doesn't
        # know that errors here are not exceptional. To map this back to
        # ordinary control flow, we have to trap the category of exception
        # raised by the underlying client, match the exception arguments
        # pyhive provides against the shape of dictionary for a presto query
        # invalid error, and restructure that error as an annotation we can
        # return up.

        # If the first element in the DatabaseError is not a dictionary, but
        # is a string, return that message.
        if db_error.args and isinstance(db_error.args[0], str):
            raise PrestoSQLValidationError(db_error.args[0]) from db_error

        # Confirm the first element in the DatabaseError constructor is a
        # dictionary with error information. This is currently provided by
        # the pyhive client, but may break if their interface changes when
        # we update at some point in the future.
        if not db_error.args or not isinstance(db_error.args[0], dict):
            raise PrestoSQLValidationError(
                "The pyhive presto client returned an unhandled "
                "database error."
            ) from db_error
        error_args: Dict[str, Any] = db_error.args[0]

        # Confirm the two fields we need to be able to present an annotation
        # are present in the error response -- a message, and a location.
        if "message" not in error_args:
            raise PrestoSQLValidationError(
                "The pyhive presto client did not report an error message"
            ) from db_error
        if "errorLocation" not in error_args:
            # Pylint is confused about the type of error_args, despite the hints
            # and checks above.
            # pylint: disable=invalid-sequence-index
            message = error_args["message"] + "\n(Error location unknown)"
            # If we have a message but no error location, return the message and
            # set the location as the beginning.
            return SQLValidationAnnotation(
                message=message, line_number=1, start_column=1, end_column=1
            )

        # pylint: disable=invalid-sequence-index
        message = error_args["message"]
        err_loc = error_args["errorLocation"]
        line_number = err_loc.get("lineNumber", None)
        # NOTE(review): start and end both come from "columnNumber" — presto
        # reports only a single error column, not a span.
        start_column = err_loc.get("columnNumber", None)
        end_column = err_loc.get("columnNumber", None)
        return SQLValidationAnnotation(
            message=message,
            line_number=line_number,
            start_column=start_column,
            end_column=end_column,
        )
    except Exception as ex:
        logger.exception("Unexpected error running validation query: %s", str(ex))
        raise ex
def test_get_username(app_context: AppContext, username: str) -> None:
    """get_username() is None with no user on g, and the username once set."""
    # Fresh app context: nothing has attached a user yet.
    assert not hasattr(g, "user")
    assert get_username() is None

    # Attach a real user record and expect its name back.
    g.user = security_manager.find_user(username)
    assert get_username() == username