def test_display_limit(self, mock_superset_db, mock_results_backend):
    query_mock = mock.Mock()
    query_mock.sql = "SELECT *"
    query_mock.database = 1
    query_mock.schema = "superset"
    mock_superset_db.session.query().filter_by().one_or_none.return_value = query_mock

    data = [{"col_0": i} for i in range(100)]
    payload = {
        "status": utils.QueryStatus.SUCCESS,
        "query": {"rows": 100},
        "data": data,
    }
    # do not apply msgpack serialization
    use_msgpack = app.config["RESULTS_BACKEND_USE_MSGPACK"]
    app.config["RESULTS_BACKEND_USE_MSGPACK"] = False
    serialized_payload = sql_lab._serialize_payload(payload, False)
    compressed = utils.zlib_compress(serialized_payload)
    mock_results_backend.get.return_value = compressed

    # get all results
    result = json.loads(self.get_resp("/superset/results/key/"))
    expected = {"status": "success", "query": {"rows": 100}, "data": data}
    self.assertEqual(result, expected)

    # limit results to 1
    limited_data = data[:1]
    result = json.loads(self.get_resp("/superset/results/key/?rows=1"))
    expected = {
        "status": "success",
        "query": {"rows": 100},
        "data": limited_data,
        "displayLimitReached": True,
    }
    self.assertEqual(result, expected)

    app.config["RESULTS_BACKEND_USE_MSGPACK"] = use_msgpack
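The test above asserts that passing ?rows=1 truncates the cached payload and sets displayLimitReached. The sketch below shows that truncation logic in isolation, assuming a hypothetical helper named apply_display_limit; it is written for illustration and is not the view code the test exercises.

# Minimal sketch of the display-limit behavior; apply_display_limit is hypothetical.
def apply_display_limit(payload, rows=None):
    """Return a copy of a results payload truncated to at most `rows` rows."""
    if not rows or rows >= len(payload["data"]):
        return payload
    limited = dict(payload)
    limited["data"] = payload["data"][:rows]
    # Signal to the client that only part of the result set is being shown.
    limited["displayLimitReached"] = True
    return limited


# Example: truncating a 100-row payload to a single row.
payload = {
    "status": "success",
    "query": {"rows": 100},
    "data": [{"col_0": i} for i in range(100)],
}
assert apply_display_limit(payload, rows=1)["displayLimitReached"] is True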
def execute_sql_statements(
    query_id,
    rendered_query,
    return_results=True,
    store_results=False,
    user_name=None,
    session=None,
    start_time=None,
    expand_data=False,
):  # pylint: disable=too-many-arguments, too-many-locals, too-many-statements
    """Executes the sql query returns the results."""
    if store_results and start_time:
        # only asynchronous queries
        stats_logger.timing("sqllab.query.time_pending", now_as_float() - start_time)

    query = get_query(query_id, session)
    payload = dict(query_id=query_id)
    database = query.database
    db_engine_spec = database.db_engine_spec
    db_engine_spec.patch()

    if database.allow_run_async and not results_backend:
        raise SqlLabException("Results backend isn't configured.")

    # Breaking down into multiple statements
    parsed_query = ParsedQuery(rendered_query)
    statements = parsed_query.get_statements()
    logger.info(f"Query {query_id}: Executing {len(statements)} statement(s)")

    logger.info(f"Query {query_id}: Set query to 'running'")
    query.status = QueryStatus.RUNNING
    query.start_running_time = now_as_float()
    session.commit()

    engine = database.get_sqla_engine(
        schema=query.schema,
        nullpool=True,
        user_name=user_name,
        source=sources.get("sql_lab", None),
    )
    # Sharing a single connection and cursor across the
    # execution of all statements (if many)
    with closing(engine.raw_connection()) as conn:
        with closing(conn.cursor()) as cursor:
            statement_count = len(statements)
            for i, statement in enumerate(statements):
                # Check if stopped
                query = get_query(query_id, session)
                if query.status == QueryStatus.STOPPED:
                    return None

                # Run statement
                msg = f"Running statement {i+1} out of {statement_count}"
                logger.info(f"Query {query_id}: {msg}")
                query.set_extra_json_key("progress", msg)
                session.commit()
                try:
                    cdf = execute_sql_statement(
                        statement, query, user_name, session, cursor
                    )
                except Exception as e:  # pylint: disable=broad-except
                    msg = str(e)
                    if statement_count > 1:
                        msg = f"[Statement {i+1} out of {statement_count}] " + msg
                    payload = handle_query_error(msg, query, session, payload)
                    return payload

    # Success, updating the query entry in database
    query.rows = cdf.size
    query.progress = 100
    query.set_extra_json_key("progress", None)
    if query.select_as_cta:
        query.select_sql = database.select_star(
            query.tmp_table_name,
            limit=query.limit,
            schema=database.force_ctas_schema,
            show_cols=False,
            latest_partition=False,
        )
    query.end_time = now_as_float()

    data, selected_columns, all_columns, expanded_columns = _serialize_and_expand_data(
        cdf, db_engine_spec, store_results and results_backend_use_msgpack, expand_data
    )

    payload.update(
        {
            "status": QueryStatus.SUCCESS,
            "data": data,
            "columns": all_columns,
            "selected_columns": selected_columns,
            "expanded_columns": expanded_columns,
            "query": query.to_dict(),
        }
    )
    payload["query"]["state"] = QueryStatus.SUCCESS

    if store_results and results_backend:
        key = str(uuid.uuid4())
        logger.info(
            f"Query {query_id}: Storing results in results backend, key: {key}"
        )
        with stats_timing("sqllab.query.results_backend_write", stats_logger):
            with stats_timing(
                "sqllab.query.results_backend_write_serialization", stats_logger
            ):
                serialized_payload = _serialize_payload(
                    payload, results_backend_use_msgpack
                )
            cache_timeout = database.cache_timeout
            if cache_timeout is None:
                cache_timeout = config["CACHE_DEFAULT_TIMEOUT"]

            compressed = zlib_compress(serialized_payload)
            logger.debug(
                f"*** serialized payload size: {getsizeof(serialized_payload)}"
            )
            logger.debug(f"*** compressed payload size: {getsizeof(compressed)}")
            results_backend.set(key, compressed, cache_timeout)
        query.results_key = key

    query.status = QueryStatus.SUCCESS
    session.commit()

    if return_results:
        return payload
    return None
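The store-results branch above serializes the payload (msgpack or JSON) and zlib-compresses it before writing to the results backend. Below is a minimal sketch of that round trip using the stdlib json/zlib modules and the msgpack package; it only approximates what Superset's _serialize_payload and zlib_compress helpers do and is not their actual implementation.

# Sketch of serialize-then-compress, under the assumptions stated above.
import json
import zlib

import msgpack


def serialize_payload_sketch(payload, use_msgpack=False):
    # msgpack yields a compact binary blob; JSON yields UTF-8 text.
    if use_msgpack:
        return msgpack.packb(payload, use_bin_type=True)
    return json.dumps(payload, default=str)


def compress_sketch(serialized):
    # Accept str or bytes; encode text before compressing.
    data = serialized.encode("utf-8") if isinstance(serialized, str) else serialized
    return zlib.compress(data)


compressed = compress_sketch(serialize_payload_sketch({"status": "success", "data": []}))
assert zlib.decompress(compressed)  # round-trips back to the serialized bytes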
def execute_sql(
    ctask,
    query_id,
    rendered_query,
    return_results=True,
    store_results=False,
    user_name=None,
    session=None,
    start_time=None,
):
    """Executes the sql query returns the results."""
    if store_results and start_time:
        # only asynchronous queries
        stats_logger.timing('sqllab.query.time_pending', now_as_float() - start_time)
    query = get_query(query_id, session)
    payload = dict(query_id=query_id)

    database = query.database
    db_engine_spec = database.db_engine_spec
    db_engine_spec.patch()

    def handle_error(msg):
        """Local method handling error while processing the SQL"""
        troubleshooting_link = config['TROUBLESHOOTING_LINK']
        query.error_message = msg
        query.status = QueryStatus.FAILED
        query.tmp_table_name = None
        session.commit()
        payload.update({
            'status': query.status,
            'error': msg,
        })
        if troubleshooting_link:
            payload['link'] = troubleshooting_link
        return payload

    if store_results and not results_backend:
        return handle_error("Results backend isn't configured.")

    # Limit enforced only for retrieving the data, not for the CTA queries.
    superset_query = SupersetQuery(rendered_query)
    executed_sql = superset_query.stripped()
    SQL_MAX_ROWS = app.config.get('SQL_MAX_ROW')
    if not superset_query.is_readonly() and not database.allow_dml:
        return handle_error(
            'Only `SELECT` statements are allowed against this database')
    if query.select_as_cta:
        if not superset_query.is_select():
            return handle_error(
                'Only `SELECT` statements can be used with the CREATE TABLE '
                'feature.')
        if not query.tmp_table_name:
            start_dttm = datetime.fromtimestamp(query.start_time)
            query.tmp_table_name = 'tmp_{}_table_{}'.format(
                query.user_id, start_dttm.strftime('%Y_%m_%d_%H_%M_%S'))
        executed_sql = superset_query.as_create_table(query.tmp_table_name)
        query.select_as_cta_used = True
    if superset_query.is_select():
        if SQL_MAX_ROWS and (not query.limit or query.limit > SQL_MAX_ROWS):
            query.limit = SQL_MAX_ROWS
        if query.limit:
            executed_sql = database.apply_limit_to_sql(executed_sql, query.limit)

    # Hook to allow environment-specific mutation (usually comments) to the SQL
    SQL_QUERY_MUTATOR = config.get('SQL_QUERY_MUTATOR')
    if SQL_QUERY_MUTATOR:
        executed_sql = SQL_QUERY_MUTATOR(
            executed_sql, user_name, security_manager, database)

    query.executed_sql = executed_sql
    query.status = QueryStatus.RUNNING
    query.start_running_time = now_as_float()
    session.merge(query)
    session.commit()
    logging.info("Set query to 'running'")
    conn = None
    try:
        engine = database.get_sqla_engine(
            schema=query.schema,
            nullpool=True,
            user_name=user_name,
        )
        conn = engine.raw_connection()
        cursor = conn.cursor()
        logging.info('Running query: \n{}'.format(executed_sql))
        logging.info(query.executed_sql)
        query_start_time = now_as_float()
        if log_query:
            log_query(
                query.database.sqlalchemy_uri,
                query.executed_sql,
                query.schema,
                user_name,
                __name__,
                security_manager,
            )
        db_engine_spec.execute(cursor, query.executed_sql, async_=True)
        logging.info('Handling cursor')
        db_engine_spec.handle_cursor(cursor, query, session)
        logging.info('Fetching data: {}'.format(query.to_dict()))
        stats_logger.timing(
            'sqllab.query.time_executing_query',
            now_as_float() - query_start_time)
        fetching_start_time = now_as_float()
        data = db_engine_spec.fetch_data(cursor, query.limit)
        stats_logger.timing(
            'sqllab.query.time_fetching_results',
            now_as_float() - fetching_start_time)
    except SoftTimeLimitExceeded as e:
        logging.exception(e)
        if conn is not None:
            conn.close()
        return handle_error(
            "SQL Lab timeout. This environment's policy is to kill queries "
            'after {} seconds.'.format(SQLLAB_TIMEOUT))
    except Exception as e:
        logging.exception(e)
        if conn is not None:
            conn.close()
        return handle_error(db_engine_spec.extract_error_message(e))

    logging.info('Fetching cursor description')
    cursor_description = cursor.description
    if conn is not None:
        conn.commit()
        conn.close()

    if query.status == QueryStatus.STOPPED:
        return handle_error('The query has been stopped')

    cdf = dataframe.SupersetDataFrame(data, cursor_description, db_engine_spec)

    query.rows = cdf.size
    query.progress = 100
    query.status = QueryStatus.SUCCESS
    if query.select_as_cta:
        query.select_sql = '{}'.format(
            database.select_star(
                query.tmp_table_name,
                limit=query.limit,
                schema=database.force_ctas_schema,
                show_cols=False,
                latest_partition=False))
    query.end_time = now_as_float()
    session.merge(query)
    session.flush()

    payload.update({
        'status': query.status,
        'data': cdf.data if cdf.data else [],
        'columns': cdf.columns if cdf.columns else [],
        'query': query.to_dict(),
    })

    if store_results:
        key = '{}'.format(uuid.uuid4())
        logging.info('Storing results in results backend, key: {}'.format(key))
        write_to_results_backend_start = now_as_float()
        json_payload = json.dumps(
            payload, default=json_iso_dttm_ser, ignore_nan=True)
        cache_timeout = database.cache_timeout
        if cache_timeout is None:
            cache_timeout = config.get('CACHE_DEFAULT_TIMEOUT', 0)
        results_backend.set(key, zlib_compress(json_payload), cache_timeout)
        query.results_key = key
        stats_logger.timing(
            'sqllab.query.results_backend_write',
            now_as_float() - write_to_results_backend_start)
    session.merge(query)
    session.commit()

    if return_results:
        return payload
def test_zlib_compression(self):
    json_str = '{"test": 1}'
    blob = zlib_compress(json_str)
    got_str = zlib_decompress(blob)
    self.assertEqual(json_str, got_str)
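For reference, a stdlib-only stand-in for the compress/decompress pair this test exercises; these sketches only assume plain zlib and may differ from Superset's real utils in how they handle str versus bytes.

# Hypothetical stand-ins, written for illustration only.
import zlib


def zlib_compress_sketch(data):
    # Encode text before compressing; pass bytes through unchanged.
    if isinstance(data, str):
        data = data.encode("utf-8")
    return zlib.compress(data)


def zlib_decompress_sketch(blob, decode_to_str=True):
    raw = zlib.decompress(blob)
    return raw.decode("utf-8") if decode_to_str else raw


assert zlib_decompress_sketch(zlib_compress_sketch('{"test": 1}')) == '{"test": 1}'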
def execute_sql_statements(  # pylint: disable=too-many-arguments, too-many-locals, too-many-statements, too-many-branches
    query_id: int,
    rendered_query: str,
    return_results: bool,
    store_results: bool,
    user_name: Optional[str],
    session: Session,
    start_time: Optional[float],
    expand_data: bool,
    log_params: Optional[Dict[str, Any]],
) -> Optional[Dict[str, Any]]:
    """Executes the sql query returns the results."""
    if store_results and start_time:
        # only asynchronous queries
        stats_logger.timing("sqllab.query.time_pending", now_as_float() - start_time)

    query = get_query(query_id, session)
    payload: Dict[str, Any] = dict(query_id=query_id)
    database = query.database
    db_engine_spec = database.db_engine_spec
    db_engine_spec.patch()

    if database.allow_run_async and not results_backend:
        raise SupersetErrorException(
            SupersetError(
                message=__("Results backend is not configured."),
                error_type=SupersetErrorType.RESULTS_BACKEND_NOT_CONFIGURED_ERROR,
                level=ErrorLevel.ERROR,
            )
        )

    # Breaking down into multiple statements
    parsed_query = ParsedQuery(rendered_query, strip_comments=True)
    if not db_engine_spec.run_multiple_statements_as_one:
        statements = parsed_query.get_statements()
        logger.info(
            "Query %s: Executing %i statement(s)", str(query_id), len(statements)
        )
    else:
        statements = [rendered_query]
        logger.info("Query %s: Executing query as a single statement", str(query_id))

    logger.info("Query %s: Set query to 'running'", str(query_id))
    query.status = QueryStatus.RUNNING
    query.start_running_time = now_as_float()
    session.commit()

    # Should we create a table or view from the select?
    if (
        query.select_as_cta
        and query.ctas_method == CtasMethod.TABLE
        and not parsed_query.is_valid_ctas()
    ):
        raise SupersetErrorException(
            SupersetError(
                message=__(
                    "CTAS (create table as select) can only be run with a query where "
                    "the last statement is a SELECT. Please make sure your query has "
                    "a SELECT as its last statement. Then, try running your query "
                    "again."
                ),
                error_type=SupersetErrorType.INVALID_CTAS_QUERY_ERROR,
                level=ErrorLevel.ERROR,
            )
        )
    if (
        query.select_as_cta
        and query.ctas_method == CtasMethod.VIEW
        and not parsed_query.is_valid_cvas()
    ):
        raise SupersetErrorException(
            SupersetError(
                message=__(
                    "CVAS (create view as select) can only be run with a query with "
                    "a single SELECT statement. Please make sure your query has only "
                    "a SELECT statement. Then, try running your query again."
                ),
                error_type=SupersetErrorType.INVALID_CVAS_QUERY_ERROR,
                level=ErrorLevel.ERROR,
            )
        )

    engine = database.get_sqla_engine(
        schema=query.schema,
        nullpool=True,
        user_name=user_name,
        source=QuerySource.SQL_LAB,
    )
    # Sharing a single connection and cursor across the
    # execution of all statements (if many)
    with closing(engine.raw_connection()) as conn:
        # closing the connection closes the cursor as well
        cursor = conn.cursor()
        statement_count = len(statements)
        for i, statement in enumerate(statements):
            # Check if stopped
            query = get_query(query_id, session)
            if query.status == QueryStatus.STOPPED:
                return None

            # For CTAS we create the table only on the last statement
            apply_ctas = query.select_as_cta and (
                query.ctas_method == CtasMethod.VIEW
                or (query.ctas_method == CtasMethod.TABLE and i == len(statements) - 1)
            )

            # Run statement
            msg = f"Running statement {i+1} out of {statement_count}"
            logger.info("Query %s: %s", str(query_id), msg)
            query.set_extra_json_key("progress", msg)
            session.commit()
            try:
                result_set = execute_sql_statement(
                    statement,
                    query,
                    user_name,
                    session,
                    cursor,
                    log_params,
                    apply_ctas,
                )
            except Exception as ex:  # pylint: disable=broad-except
                msg = str(ex)
                prefix_message = (
                    f"[Statement {i+1} out of {statement_count}]"
                    if statement_count > 1
                    else ""
                )
                payload = handle_query_error(
                    ex, query, session, payload, prefix_message
                )
                return payload

        # Commit the connection so CTA queries will create the table.
        conn.commit()

    # Success, updating the query entry in database
    query.rows = result_set.size
    query.progress = 100
    query.set_extra_json_key("progress", None)
    if query.select_as_cta:
        query.select_sql = database.select_star(
            query.tmp_table_name,
            schema=query.tmp_schema_name,
            limit=query.limit,
            show_cols=False,
            latest_partition=False,
        )
    query.end_time = now_as_float()

    use_arrow_data = store_results and cast(bool, results_backend_use_msgpack)
    data, selected_columns, all_columns, expanded_columns = _serialize_and_expand_data(
        result_set, db_engine_spec, use_arrow_data, expand_data
    )

    # TODO: data should be saved separately from metadata (likely in Parquet)
    payload.update(
        {
            "status": QueryStatus.SUCCESS,
            "data": data,
            "columns": all_columns,
            "selected_columns": selected_columns,
            "expanded_columns": expanded_columns,
            "query": query.to_dict(),
        }
    )
    payload["query"]["state"] = QueryStatus.SUCCESS

    if store_results and results_backend:
        key = str(uuid.uuid4())
        logger.info(
            "Query %s: Storing results in results backend, key: %s", str(query_id), key
        )
        with stats_timing("sqllab.query.results_backend_write", stats_logger):
            with stats_timing(
                "sqllab.query.results_backend_write_serialization", stats_logger
            ):
                serialized_payload = _serialize_payload(
                    payload, cast(bool, results_backend_use_msgpack)
                )
            cache_timeout = database.cache_timeout
            if cache_timeout is None:
                cache_timeout = config["CACHE_DEFAULT_TIMEOUT"]

            compressed = zlib_compress(serialized_payload)
            logger.debug(
                "*** serialized payload size: %i", getsizeof(serialized_payload)
            )
            logger.debug("*** compressed payload size: %i", getsizeof(compressed))
            results_backend.set(key, compressed, cache_timeout)
        query.results_key = key

    query.status = QueryStatus.SUCCESS
    session.commit()

    if return_results:
        # since we're returning results we need to create non-arrow data
        if use_arrow_data:
            (
                data,
                selected_columns,
                all_columns,
                expanded_columns,
            ) = _serialize_and_expand_data(
                result_set, db_engine_spec, False, expand_data
            )
            payload.update(
                {
                    "data": data,
                    "columns": all_columns,
                    "selected_columns": selected_columns,
                    "expanded_columns": expanded_columns,
                }
            )
        return payload

    return None
def execute_sql_statements(  # pylint: disable=too-many-arguments, too-many-locals, too-many-statements
    query_id: int,
    rendered_query: str,
    return_results: bool,
    store_results: bool,
    user_name: Optional[str],
    session: Session,
    start_time: Optional[float],
    expand_data: bool,
    log_params: Optional[Dict[str, Any]],
) -> Optional[Dict[str, Any]]:
    """Executes the sql query returns the results."""
    if store_results and start_time:
        # only asynchronous queries
        stats_logger.timing("sqllab.query.time_pending", now_as_float() - start_time)

    query = get_query(query_id, session)
    payload: Dict[str, Any] = dict(query_id=query_id)
    database = query.database
    db_engine_spec = database.db_engine_spec
    db_engine_spec.patch()

    if database.allow_run_async and not results_backend:
        raise SqlLabException("Results backend isn't configured.")

    # Breaking down into multiple statements
    parsed_query = ParsedQuery(rendered_query)
    statements = parsed_query.get_statements()
    logger.info("Query %s: Executing %i statement(s)", str(query_id), len(statements))

    logger.info("Query %s: Set query to 'running'", str(query_id))
    query.status = QueryStatus.RUNNING
    query.start_running_time = now_as_float()
    session.commit()

    engine = database.get_sqla_engine(
        schema=query.schema,
        nullpool=True,
        user_name=user_name,
        source=QuerySource.SQL_LAB,
    )
    # Sharing a single connection and cursor across the
    # execution of all statements (if many)
    with closing(engine.raw_connection()) as conn:
        with closing(conn.cursor()) as cursor:
            statement_count = len(statements)
            for i, statement in enumerate(statements):
                # Check if stopped
                query = get_query(query_id, session)
                if query.status == QueryStatus.STOPPED:
                    return None

                # Run statement
                msg = f"Running statement {i+1} out of {statement_count}"
                logger.info("Query %s: %s", str(query_id), msg)
                query.set_extra_json_key("progress", msg)
                session.commit()
                try:
                    result_set = execute_sql_statement(
                        statement, query, user_name, session, cursor, log_params
                    )
                except Exception as ex:  # pylint: disable=broad-except
                    msg = str(ex)
                    if statement_count > 1:
                        msg = f"[Statement {i+1} out of {statement_count}] " + msg
                    payload = handle_query_error(msg, query, session, payload)
                    return payload

            # Commit the connection so CTA queries will create the table.
            conn.commit()

    # Success, updating the query entry in database
    query.rows = result_set.size
    query.progress = 100
    query.set_extra_json_key("progress", None)
    if query.select_as_cta:
        query.select_sql = database.select_star(
            query.tmp_table_name,
            schema=query.tmp_schema_name,
            limit=query.limit,
            show_cols=False,
            latest_partition=False,
        )
    query.end_time = now_as_float()

    use_arrow_data = store_results and cast(bool, results_backend_use_msgpack)
    data, selected_columns, all_columns, expanded_columns = _serialize_and_expand_data(
        result_set, db_engine_spec, use_arrow_data, expand_data
    )

    # TODO: data should be saved separately from metadata (likely in Parquet)
    payload.update(
        {
            "status": QueryStatus.SUCCESS,
            "data": data,
            "columns": all_columns,
            "selected_columns": selected_columns,
            "expanded_columns": expanded_columns,
            "query": query.to_dict(),
        }
    )
    payload["query"]["state"] = QueryStatus.SUCCESS

    if store_results and results_backend:
        key = str(uuid.uuid4())
        logger.info(
            "Query %s: Storing results in results backend, key: %s", str(query_id), key
        )
        with stats_timing("sqllab.query.results_backend_write", stats_logger):
            with stats_timing(
                "sqllab.query.results_backend_write_serialization", stats_logger
            ):
                serialized_payload = _serialize_payload(
                    payload, cast(bool, results_backend_use_msgpack)
                )
            cache_timeout = database.cache_timeout
            if cache_timeout is None:
                cache_timeout = config["CACHE_DEFAULT_TIMEOUT"]

            compressed = zlib_compress(serialized_payload)
            logger.debug(
                "*** serialized payload size: %i", getsizeof(serialized_payload)
            )
            logger.debug("*** compressed payload size: %i", getsizeof(compressed))
            results_backend.set(key, compressed, cache_timeout)
        query.results_key = key

    query.status = QueryStatus.SUCCESS
    session.commit()

    if return_results:
        # since we're returning results we need to create non-arrow data
        if use_arrow_data:
            (
                data,
                selected_columns,
                all_columns,
                expanded_columns,
            ) = _serialize_and_expand_data(
                result_set, db_engine_spec, False, expand_data
            )
            payload.update(
                {
                    "data": data,
                    "columns": all_columns,
                    "selected_columns": selected_columns,
                    "expanded_columns": expanded_columns,
                }
            )
        return payload

    return None
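The stats_timing blocks above wrap payload serialization and the results-backend write. A plausible shape for such a context manager, assuming the stats logger exposes a timing(key, value) method, is sketched below; Superset's real helper lives in its utils module and may differ in detail.

# Hypothetical timing context manager and a stand-in stats logger, for illustration.
from contextlib import contextmanager
import time


class _PrintStatsLogger:
    """Stand-in stats logger with the timing(key, value) interface assumed here."""

    def timing(self, key, value):
        print(f"{key}: {value:.4f}s")


@contextmanager
def stats_timing_sketch(stats_key, stats_logger):
    # Time the wrapped block and always report the elapsed duration.
    start = time.time()
    try:
        yield
    finally:
        stats_logger.timing(stats_key, time.time() - start)


with stats_timing_sketch("sqllab.query.results_backend_write", _PrintStatsLogger()):
    time.sleep(0.01)  # placeholder for the serialize-and-write work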
def execute_sql_statements(
    ctask,
    query_id,
    rendered_query,
    return_results=True,
    store_results=False,
    user_name=None,
    session=None,
    start_time=None,
):
    """Executes the sql query returns the results."""
    if store_results and start_time:
        # only asynchronous queries
        stats_logger.timing("sqllab.query.time_pending", now_as_float() - start_time)
    query = get_query(query_id, session)
    payload = dict(query_id=query_id)
    database = query.database
    db_engine_spec = database.db_engine_spec
    db_engine_spec.patch()

    if store_results and not results_backend:
        raise SqlLabException("Results backend isn't configured.")

    # Breaking down into multiple statements
    parsed_query = ParsedQuery(rendered_query)
    statements = parsed_query.get_statements()
    logging.info(f"Executing {len(statements)} statement(s)")

    logging.info("Set query to 'running'")
    query.status = QueryStatus.RUNNING
    query.start_running_time = now_as_float()

    engine = database.get_sqla_engine(
        schema=query.schema,
        nullpool=True,
        user_name=user_name,
        source=sources.get("sql_lab", None),
    )
    # Sharing a single connection and cursor across the
    # execution of all statements (if many)
    with closing(engine.raw_connection()) as conn:
        with closing(conn.cursor()) as cursor:
            query.connection_id = db_engine_spec.get_connection_id(cursor)
            session.commit()
            statement_count = len(statements)
            for i, statement in enumerate(statements):
                # check if the query was stopped
                session.refresh(query)
                if query.status == QueryStatus.STOPPED:
                    payload.update({"status": query.status})
                    return payload

                msg = f"Running statement {i+1} out of {statement_count}"
                logging.info(msg)
                query.set_extra_json_key("progress", msg)
                session.commit()
                try:
                    cdf = execute_sql_statement(
                        statement, query, user_name, session, cursor
                    )
                    msg = f"Running statement {i+1} out of {statement_count}"
                except Exception as e:
                    # query can be stopped in another thread/worker
                    # but in synchronized mode it may lead to an error
                    # skip the error in such case
                    session.refresh(query)
                    if query.status == QueryStatus.STOPPED:
                        payload.update({"status": query.status})
                        return payload
                    msg = str(e)
                    if statement_count > 1:
                        msg = f"[Statement {i+1} out of {statement_count}] " + msg
                    payload = handle_query_error(msg, query, session, payload)
                    return payload

    # Success, updating the query entry in database
    query.rows = cdf.size
    query.progress = 100
    query.set_extra_json_key("progress", None)
    query.connection_id = None
    if query.select_as_cta:
        query.select_sql = database.select_star(
            query.tmp_table_name,
            limit=query.limit,
            schema=database.force_ctas_schema,
            show_cols=False,
            latest_partition=False,
        )
    query.end_time = now_as_float()

    selected_columns = cdf.columns or []
    data = cdf.data or []
    all_columns, data, expanded_columns = db_engine_spec.expand_data(
        selected_columns, data
    )

    payload.update(
        {
            "status": QueryStatus.SUCCESS,
            "data": data,
            "columns": all_columns,
            "selected_columns": selected_columns,
            "expanded_columns": expanded_columns,
            "query": query.to_dict(),
        }
    )
    payload["query"]["state"] = QueryStatus.SUCCESS

    # go over each row, find bytes columns that start with the magic UAST
    # sequence b'\x00bgr', and replace it with a string containing the
    # UAST in JSON
    for row in payload["data"]:
        for k, v in row.items():
            if isinstance(v, bytes) and len(v) > 4 and v[0:4] == b"\x00bgr":
                try:
                    ctx = decode(v, format=0)
                    row[k] = json.dumps(ctx.load())
                except Exception:
                    pass

    if store_results:
        key = str(uuid.uuid4())
        logging.info(f"Storing results in results backend, key: {key}")
        with stats_timing("sqllab.query.results_backend_write", stats_logger):
            json_payload = json.dumps(
                payload, default=json_iso_dttm_ser, ignore_nan=True
            )
            cache_timeout = database.cache_timeout
            if cache_timeout is None:
                cache_timeout = config.get("CACHE_DEFAULT_TIMEOUT", 0)
            results_backend.set(key, zlib_compress(json_payload), cache_timeout)
        query.results_key = key

    query.status = QueryStatus.SUCCESS
    session.commit()

    if return_results:
        return payload
def execute_sql_statements(
    ctask,
    query_id,
    rendered_query,
    return_results=True,
    store_results=False,
    user_name=None,
    session=None,
    start_time=None,
):
    """Executes the sql query returns the results."""
    if store_results and start_time:
        # only asynchronous queries
        stats_logger.timing(
            'sqllab.query.time_pending', now_as_float() - start_time)
    query = get_query(query_id, session)
    payload = dict(query_id=query_id)
    database = query.database
    db_engine_spec = database.db_engine_spec
    db_engine_spec.patch()

    if store_results and not results_backend:
        raise SqlLabException("Results backend isn't configured.")

    # Breaking down into multiple statements
    parsed_query = ParsedQuery(rendered_query)
    statements = parsed_query.get_statements()
    logging.info(f'Executing {len(statements)} statement(s)')

    logging.info("Set query to 'running'")
    query.status = QueryStatus.RUNNING
    query.start_running_time = now_as_float()

    engine = database.get_sqla_engine(
        schema=query.schema,
        nullpool=True,
        user_name=user_name,
    )
    # Sharing a single connection and cursor across the
    # execution of all statements (if many)
    with closing(engine.raw_connection()) as conn:
        with closing(conn.cursor()) as cursor:
            statement_count = len(statements)
            for i, statement in enumerate(statements):
                # TODO CHECK IF STOPPED
                msg = f'Running statement {i+1} out of {statement_count}'
                logging.info(msg)
                query.set_extra_json_key('progress', msg)
                session.commit()
                is_last_statement = i == len(statements) - 1
                try:
                    cdf = execute_sql_statement(
                        statement, query, user_name, session, cursor,
                        return_results=is_last_statement and return_results)
                    msg = f'Running statement {i+1} out of {statement_count}'
                except Exception as e:
                    msg = str(e)
                    if statement_count > 1:
                        msg = f'[Statement {i+1} out of {statement_count}] ' + msg
                    payload = handle_query_error(msg, query, session, payload)
                    return payload

    # Success, updating the query entry in database
    query.rows = cdf.size
    query.progress = 100
    query.set_extra_json_key('progress', None)
    query.status = QueryStatus.SUCCESS
    if query.select_as_cta:
        query.select_sql = database.select_star(
            query.tmp_table_name,
            limit=query.limit,
            schema=database.force_ctas_schema,
            show_cols=False,
            latest_partition=False)
    query.end_time = now_as_float()
    session.commit()

    payload.update({
        'status': query.status,
        'data': cdf.data if cdf.data else [],
        'columns': cdf.columns if cdf.columns else [],
        'query': query.to_dict(),
    })

    if store_results:
        key = str(uuid.uuid4())
        logging.info(f'Storing results in results backend, key: {key}')
        with stats_timing('sqllab.query.results_backend_write', stats_logger):
            json_payload = json.dumps(
                payload, default=json_iso_dttm_ser, ignore_nan=True)
            cache_timeout = database.cache_timeout
            if cache_timeout is None:
                cache_timeout = config.get('CACHE_DEFAULT_TIMEOUT', 0)
            results_backend.set(key, zlib_compress(json_payload), cache_timeout)
        query.results_key = key

    session.commit()

    if return_results:
        return payload
def execute_sql_statements(
    ctask,
    query_id,
    rendered_query,
    return_results=True,
    store_results=False,
    user_name=None,
    session=None,
    start_time=None,
):
    """Executes the sql query returns the results."""
    if store_results and start_time:
        # only asynchronous queries
        stats_logger.timing('sqllab.query.time_pending', now_as_float() - start_time)
    query = get_query(query_id, session)
    payload = dict(query_id=query_id)
    database = query.database
    db_engine_spec = database.db_engine_spec
    db_engine_spec.patch()

    if store_results and not results_backend:
        raise SqlLabException("Results backend isn't configured.")

    # Breaking down into multiple statements
    parsed_query = ParsedQuery(rendered_query)
    statements = parsed_query.get_statements()
    logging.info(f'Executing {len(statements)} statement(s)')

    logging.info("Set query to 'running'")
    query.status = QueryStatus.RUNNING
    query.start_running_time = now_as_float()

    engine = database.get_sqla_engine(
        schema=query.schema,
        nullpool=True,
        user_name=user_name,
        source=sources.get('sql_lab', None),
    )
    # Sharing a single connection and cursor across the
    # execution of all statements (if many)
    with closing(engine.raw_connection()) as conn:
        with closing(conn.cursor()) as cursor:
            statement_count = len(statements)
            for i, statement in enumerate(statements):
                # TODO CHECK IF STOPPED
                msg = f'Running statement {i+1} out of {statement_count}'
                logging.info(msg)
                query.set_extra_json_key('progress', msg)
                session.commit()
                try:
                    cdf = execute_sql_statement(
                        statement, query, user_name, session, cursor)
                    msg = f'Running statement {i+1} out of {statement_count}'
                except Exception as e:
                    msg = str(e)
                    if statement_count > 1:
                        msg = f'[Statement {i+1} out of {statement_count}] ' + msg
                    payload = handle_query_error(msg, query, session, payload)
                    return payload

    # Success, updating the query entry in database
    query.rows = cdf.size
    query.progress = 100
    query.set_extra_json_key('progress', None)
    query.status = QueryStatus.SUCCESS
    if query.select_as_cta:
        query.select_sql = database.select_star(
            query.tmp_table_name,
            limit=query.limit,
            schema=database.force_ctas_schema,
            show_cols=False,
            latest_partition=False)
    query.end_time = now_as_float()

    payload.update({
        'status': query.status,
        'data': cdf.data if cdf.data else [],
        'columns': cdf.columns if cdf.columns else [],
        'query': query.to_dict(),
    })

    if store_results:
        key = str(uuid.uuid4())
        logging.info(f'Storing results in results backend, key: {key}')
        with stats_timing('sqllab.query.results_backend_write', stats_logger):
            json_payload = json.dumps(
                payload, default=json_iso_dttm_ser, ignore_nan=True)
            cache_timeout = database.cache_timeout
            if cache_timeout is None:
                cache_timeout = config.get('CACHE_DEFAULT_TIMEOUT', 0)
            results_backend.set(key, zlib_compress(json_payload), cache_timeout)
        query.results_key = key

    session.commit()

    if return_results:
        return payload
def test_zlib_compression(self):
    json_str = '{"test": 1}'
    blob = zlib_compress(json_str)
    got_str = zlib_decompress_to_string(blob)
    self.assertEquals(json_str, got_str)
def execute_sql(
    ctask,
    query_id,
    rendered_query,
    return_results=True,
    store_results=False,
    user_name=None,
    session=None,
    start_time=None,
):
    """Executes the sql query returns the results."""
    if store_results and start_time:
        # only asynchronous queries
        stats_logger.timing(
            'sqllab.query.time_pending', now_as_float() - start_time)
    query = get_query(query_id, session)
    payload = dict(query_id=query_id)

    database = query.database
    db_engine_spec = database.db_engine_spec
    db_engine_spec.patch()

    def handle_error(msg):
        """Local method handling error while processing the SQL"""
        troubleshooting_link = config['TROUBLESHOOTING_LINK']
        query.error_message = msg
        query.status = QueryStatus.FAILED
        query.tmp_table_name = None
        session.commit()
        payload.update({
            'status': query.status,
            'error': msg,
        })
        if troubleshooting_link:
            payload['link'] = troubleshooting_link
        return payload

    if store_results and not results_backend:
        return handle_error("Results backend isn't configured.")

    # Limit enforced only for retrieving the data, not for the CTA queries.
    superset_query = SupersetQuery(rendered_query)
    executed_sql = superset_query.stripped()
    SQL_MAX_ROWS = app.config.get('SQL_MAX_ROW')
    if not superset_query.is_readonly() and not database.allow_dml:
        return handle_error(
            'Only `SELECT` statements are allowed against this database')
    if query.select_as_cta:
        if not superset_query.is_select():
            return handle_error(
                'Only `SELECT` statements can be used with the CREATE TABLE '
                'feature.')
        if not query.tmp_table_name:
            start_dttm = datetime.fromtimestamp(query.start_time)
            query.tmp_table_name = 'tmp_{}_table_{}'.format(
                query.user_id, start_dttm.strftime('%Y_%m_%d_%H_%M_%S'))
        executed_sql = superset_query.as_create_table(query.tmp_table_name)
        query.select_as_cta_used = True
    if superset_query.is_select():
        if SQL_MAX_ROWS and (not query.limit or query.limit > SQL_MAX_ROWS):
            query.limit = SQL_MAX_ROWS
        if query.limit:
            executed_sql = database.apply_limit_to_sql(executed_sql, query.limit)

    # Hook to allow environment-specific mutation (usually comments) to the SQL
    SQL_QUERY_MUTATOR = config.get('SQL_QUERY_MUTATOR')
    if SQL_QUERY_MUTATOR:
        executed_sql = SQL_QUERY_MUTATOR(
            executed_sql, user_name, security_manager, database)

    query.executed_sql = executed_sql
    query.status = QueryStatus.RUNNING
    query.start_running_time = now_as_float()
    session.merge(query)
    session.commit()
    logging.info("Set query to 'running'")
    conn = None
    try:
        engine = database.get_sqla_engine(
            schema=query.schema,
            nullpool=True,
            user_name=user_name,
        )
        conn = engine.raw_connection()
        cursor = conn.cursor()
        logging.info('Running query: \n{}'.format(executed_sql))
        logging.info(query.executed_sql)
        query_start_time = now_as_float()
        db_engine_spec.execute(cursor, query.executed_sql, async_=True)
        logging.info('Handling cursor')
        db_engine_spec.handle_cursor(cursor, query, session)
        logging.info('Fetching data: {}'.format(query.to_dict()))
        stats_logger.timing(
            'sqllab.query.time_executing_query',
            now_as_float() - query_start_time)
        fetching_start_time = now_as_float()
        data = db_engine_spec.fetch_data(cursor, query.limit)
        stats_logger.timing(
            'sqllab.query.time_fetching_results',
            now_as_float() - fetching_start_time)
    except SoftTimeLimitExceeded as e:
        logging.exception(e)
        if conn is not None:
            conn.close()
        return handle_error(
            "SQL Lab timeout. This environment's policy is to kill queries "
            'after {} seconds.'.format(SQLLAB_TIMEOUT))
    except Exception as e:
        logging.exception(e)
        if conn is not None:
            conn.close()
        return handle_error(db_engine_spec.extract_error_message(e))

    logging.info('Fetching cursor description')
    cursor_description = cursor.description
    if conn is not None:
        conn.commit()
        conn.close()

    if query.status == QueryStatus.STOPPED:
        return handle_error('The query has been stopped')

    cdf = dataframe.SupersetDataFrame(data, cursor_description, db_engine_spec)

    query.rows = cdf.size
    query.progress = 100
    query.status = QueryStatus.SUCCESS
    if query.select_as_cta:
        query.select_sql = '{}'.format(
            database.select_star(
                query.tmp_table_name,
                limit=query.limit,
                schema=database.force_ctas_schema,
                show_cols=False,
                latest_partition=False))
    query.end_time = now_as_float()
    session.merge(query)
    session.flush()

    payload.update({
        'status': query.status,
        'data': cdf.data if cdf.data else [],
        'columns': cdf.columns if cdf.columns else [],
        'query': query.to_dict(),
    })

    if store_results:
        key = '{}'.format(uuid.uuid4())
        logging.info('Storing results in results backend, key: {}'.format(key))
        write_to_results_backend_start = now_as_float()
        json_payload = json.dumps(
            payload, default=json_iso_dttm_ser, ignore_nan=True)
        cache_timeout = database.cache_timeout
        if cache_timeout is None:
            cache_timeout = config.get('CACHE_DEFAULT_TIMEOUT', 0)
        results_backend.set(key, zlib_compress(json_payload), cache_timeout)
        query.results_key = key
        stats_logger.timing(
            'sqllab.query.results_backend_write',
            now_as_float() - write_to_results_backend_start)
    session.merge(query)
    session.commit()

    if return_results:
        return payload