def validate(cls, sql: str, schema: str, database: Any) -> List[SQLValidationAnnotation]:
    """
    Presto supports query-validation queries by running them with a
    prepended explain. For example, "SELECT 1 FROM default.mytable"
    becomes "EXPLAIN (TYPE VALIDATE) SELECT 1 FROM default.mytable".

    :param sql: raw (possibly multi-statement) SQL submitted by the user
    :param schema: schema to resolve unqualified table names against
    :param database: database model object providing the SQLA engine
    :return: one annotation per statement that produced a validation error
    """
    user_name = g.user.username if g.user else None
    parsed_query = ParsedQuery(sql)
    statements = parsed_query.get_statements()

    logger.info(f"Validating {len(statements)} statement(s)")
    engine = database.get_sqla_engine(
        schema=schema,
        nullpool=True,
        user_name=user_name,
        source=sources.get("sql_lab", None),
    )
    # Sharing a single connection and cursor across the
    # execution of all statements (if many)
    annotations: List[SQLValidationAnnotation] = []
    with closing(engine.raw_connection()) as conn:
        with closing(conn.cursor()) as cursor:
            # Reuse the statements parsed above instead of re-parsing the SQL
            # a second time (the original called get_statements() twice).
            for statement in statements:
                annotation = cls.validate_statement(
                    statement, database, cursor, user_name)
                if annotation:
                    annotations.append(annotation)
    logger.debug(f"Validation found {len(annotations)} error(s)")
    return annotations
def cancel_query(query, user_name):
    """Best-effort cancellation of a running query.

    A query can only be cancelled if we recorded a connection id for it;
    without one there is nothing to target, so we return immediately.
    """
    if not query.connection_id:
        return

    target_db = query.database
    sqla_engine = target_db.get_sqla_engine(
        schema=query.schema,
        nullpool=True,
        user_name=user_name,
        source=sources.get('sql_lab', None),
    )
    engine_spec = target_db.db_engine_spec

    # Open a short-lived raw connection/cursor purely to issue the
    # engine-specific cancel call; `closing` guarantees cleanup.
    with closing(sqla_engine.raw_connection()) as raw_conn, \
            closing(raw_conn.cursor()) as cur:
        engine_spec.cancel_query(cur, query)
def cancel_query(query, user_name):
    """Cancel a running query via its database engine spec, logging each step.

    Skips cancellation (with a log line) when no connection id was recorded
    for the query, since there is nothing to cancel in that case.
    """
    logging.info(
        "Query with id `%s` has connection id `%s`", query.id, query.connection_id
    )
    if not query.connection_id:
        logging.info("No connection id found, query cancellation skipped")
        return

    target_db = query.database
    sqla_engine = target_db.get_sqla_engine(
        schema=query.schema,
        nullpool=True,
        user_name=user_name,
        source=sources.get("sql_lab", None),
    )
    engine_spec = target_db.db_engine_spec

    # Dedicated raw connection/cursor for the cancel call; `closing`
    # ensures both are released even if the cancel itself raises.
    with closing(sqla_engine.raw_connection()) as raw_conn, \
            closing(raw_conn.cursor()) as cur:
        logging.info("Calling `cancel_query` on db engine")
        engine_spec.cancel_query(cur, query)
def execute_sql_statements(
    query_id, rendered_query, return_results=True, store_results=False,
    user_name=None, session=None, start_time=None, expand_data=False,
):  # pylint: disable=too-many-arguments, too-many-locals, too-many-statements
    """Executes the sql query returns the results.

    Splits ``rendered_query`` into individual statements, runs them one by
    one over a single shared connection/cursor, and serializes the result of
    the final statement into a payload that is returned and/or written to
    the results backend.

    :param query_id: id of the tracking Query row (loaded via ``get_query``)
    :param rendered_query: SQL text, already templated/rendered
    :param return_results: when True, return the payload dict to the caller
    :param store_results: when True, persist the payload in ``results_backend``
    :param user_name: effective username used when building the SQLA engine
    :param session: ORM session used for query-state bookkeeping
    :param start_time: enqueue timestamp; only set for asynchronous queries
    :param expand_data: forwarded to ``_serialize_and_expand_data``
    :returns: the payload dict when ``return_results`` is True, else None
    :raises SqlLabException: when async execution is allowed but no results
        backend is configured
    """
    if store_results and start_time:
        # only asynchronous queries
        stats_logger.timing("sqllab.query.time_pending", now_as_float() - start_time)

    query = get_query(query_id, session)
    payload = dict(query_id=query_id)
    database = query.database
    db_engine_spec = database.db_engine_spec
    db_engine_spec.patch()

    # Async execution without a results backend would lose the results, so
    # fail fast before doing any work.
    if database.allow_run_async and not results_backend:
        raise SqlLabException("Results backend isn't configured.")

    # Breaking down into multiple statements
    parsed_query = ParsedQuery(rendered_query)
    statements = parsed_query.get_statements()
    logger.info(f"Query {query_id}: Executing {len(statements)} statement(s)")

    logger.info(f"Query {query_id}: Set query to 'running'")
    query.status = QueryStatus.RUNNING
    query.start_running_time = now_as_float()
    session.commit()

    engine = database.get_sqla_engine(
        schema=query.schema,
        nullpool=True,
        user_name=user_name,
        source=sources.get("sql_lab", None),
    )
    # Sharing a single connection and cursor across the
    # execution of all statements (if many)
    with closing(engine.raw_connection()) as conn:
        with closing(conn.cursor()) as cursor:
            statement_count = len(statements)
            for i, statement in enumerate(statements):
                # Check if stopped: re-read the query row so a STOPPED status
                # set by another worker is observed between statements.
                query = get_query(query_id, session)
                if query.status == QueryStatus.STOPPED:
                    return None

                # Run statement, surfacing per-statement progress to the UI
                # via the query's extra JSON.
                msg = f"Running statement {i+1} out of {statement_count}"
                logger.info(f"Query {query_id}: {msg}")
                query.set_extra_json_key("progress", msg)
                session.commit()
                try:
                    cdf = execute_sql_statement(
                        statement, query, user_name, session, cursor)
                except Exception as e:  # pylint: disable=broad-except
                    # Prefix the error with the statement index so multi-
                    # statement failures are attributable.
                    msg = str(e)
                    if statement_count > 1:
                        msg = f"[Statement {i+1} out of {statement_count}] " + msg
                    payload = handle_query_error(msg, query, session, payload)
                    return payload

    # Success, updating the query entry in database
    # NOTE: only the last statement's dataframe (cdf) is reported.
    query.rows = cdf.size
    query.progress = 100
    query.set_extra_json_key("progress", None)
    if query.select_as_cta:
        # Expose a SELECT over the freshly created CTA table.
        query.select_sql = database.select_star(
            query.tmp_table_name,
            limit=query.limit,
            schema=database.force_ctas_schema,
            show_cols=False,
            latest_partition=False,
        )
    query.end_time = now_as_float()

    data, selected_columns, all_columns, expanded_columns = _serialize_and_expand_data(
        cdf, db_engine_spec, store_results and results_backend_use_msgpack,
        expand_data)

    payload.update({
        "status": QueryStatus.SUCCESS,
        "data": data,
        "columns": all_columns,
        "selected_columns": selected_columns,
        "expanded_columns": expanded_columns,
        "query": query.to_dict(),
    })
    payload["query"]["state"] = QueryStatus.SUCCESS

    if store_results and results_backend:
        key = str(uuid.uuid4())
        logger.info(
            f"Query {query_id}: Storing results in results backend, key: {key}"
        )
        with stats_timing("sqllab.query.results_backend_write", stats_logger):
            # Serialization is timed separately from the backend write.
            with stats_timing(
                    "sqllab.query.results_backend_write_serialization",
                    stats_logger):
                serialized_payload = _serialize_payload(
                    payload, results_backend_use_msgpack)
            cache_timeout = database.cache_timeout
            if cache_timeout is None:
                cache_timeout = config["CACHE_DEFAULT_TIMEOUT"]
            compressed = zlib_compress(serialized_payload)
            logger.debug(
                f"*** serialized payload size: {getsizeof(serialized_payload)}"
            )
            logger.debug(
                f"*** compressed payload size: {getsizeof(compressed)}")
            results_backend.set(key, compressed, cache_timeout)
        query.results_key = key

    query.status = QueryStatus.SUCCESS
    session.commit()

    if return_results:
        return payload
    return None
def execute_sql_statements(
    ctask, query_id, rendered_query, return_results=True,
    store_results=False, user_name=None, session=None, start_time=None,
):
    """Executes the sql query returns the results.

    Runs each statement of ``rendered_query`` over a single shared
    connection/cursor, records the connection id so the query can be
    cancelled from elsewhere, and serializes the final statement's result
    into a payload that is returned and/or stored in the results backend.

    :param ctask: the enclosing task instance (presumably Celery — unused here)
    :param query_id: id of the tracking Query row (loaded via ``get_query``)
    :param rendered_query: SQL text, already templated/rendered
    :param return_results: when True, return the payload dict to the caller
    :param store_results: when True, persist the payload in ``results_backend``
    :param user_name: effective username used when building the SQLA engine
    :param session: ORM session used for query-state bookkeeping
    :param start_time: enqueue timestamp; only set for asynchronous queries
    :returns: the payload dict when ``return_results`` is True, else None
    :raises SqlLabException: when results should be stored but no results
        backend is configured
    """
    if store_results and start_time:
        # only asynchronous queries
        stats_logger.timing("sqllab.query.time_pending", now_as_float() - start_time)

    query = get_query(query_id, session)
    payload = dict(query_id=query_id)
    database = query.database
    db_engine_spec = database.db_engine_spec
    db_engine_spec.patch()

    if store_results and not results_backend:
        raise SqlLabException("Results backend isn't configured.")

    # Breaking down into multiple statements
    parsed_query = ParsedQuery(rendered_query)
    statements = parsed_query.get_statements()
    logging.info(f"Executing {len(statements)} statement(s)")

    logging.info("Set query to 'running'")
    query.status = QueryStatus.RUNNING
    query.start_running_time = now_as_float()

    engine = database.get_sqla_engine(
        schema=query.schema,
        nullpool=True,
        user_name=user_name,
        source=sources.get("sql_lab", None),
    )
    # Sharing a single connection and cursor across the
    # execution of all statements (if many)
    with closing(engine.raw_connection()) as conn:
        with closing(conn.cursor()) as cursor:
            # Persist the backend connection id so cancel_query can target it.
            query.connection_id = db_engine_spec.get_connection_id(cursor)
            session.commit()
            statement_count = len(statements)
            for i, statement in enumerate(statements):
                # check if the query was stopped
                session.refresh(query)
                if query.status == QueryStatus.STOPPED:
                    payload.update({"status": query.status})
                    return payload
                msg = f"Running statement {i+1} out of {statement_count}"
                logging.info(msg)
                query.set_extra_json_key("progress", msg)
                session.commit()
                try:
                    cdf = execute_sql_statement(
                        statement, query, user_name, session, cursor
                    )
                    # NOTE: the original re-assigned `msg` here to the same
                    # "Running statement ..." string; the value was never
                    # read, so the dead assignment has been removed.
                except Exception as e:
                    # query can be stopped in another thread/worker
                    # but in synchronized mode it may lead to an error;
                    # skip the error in such a case
                    session.refresh(query)
                    if query.status == QueryStatus.STOPPED:
                        payload.update({"status": query.status})
                        return payload
                    msg = str(e)
                    if statement_count > 1:
                        msg = f"[Statement {i+1} out of {statement_count}] " + msg
                    payload = handle_query_error(msg, query, session, payload)
                    return payload

    # Success, updating the query entry in database
    # Only the last statement's dataframe (cdf) is reported.
    query.rows = cdf.size
    query.progress = 100
    query.set_extra_json_key("progress", None)
    query.connection_id = None  # connection is gone; nothing left to cancel
    if query.select_as_cta:
        query.select_sql = database.select_star(
            query.tmp_table_name,
            limit=query.limit,
            schema=database.force_ctas_schema,
            show_cols=False,
            latest_partition=False,
        )
    query.end_time = now_as_float()

    selected_columns = cdf.columns or []
    data = cdf.data or []
    all_columns, data, expanded_columns = db_engine_spec.expand_data(
        selected_columns, data
    )

    payload.update(
        {
            "status": QueryStatus.SUCCESS,
            "data": data,
            "columns": all_columns,
            "selected_columns": selected_columns,
            "expanded_columns": expanded_columns,
            "query": query.to_dict(),
        }
    )
    payload["query"]["state"] = QueryStatus.SUCCESS

    # go over each row, find bytes columns that start with the magic UAST
    # sequence b'\x00bgr', and replace it with a string containing the
    # UAST in JSON
    for row in payload["data"]:
        for k, v in row.items():
            if isinstance(v, bytes) and len(v) > 4 and v[0:4] == b"\x00bgr":
                try:
                    ctx = decode(v, format=0)
                    row[k] = json.dumps(ctx.load())
                except Exception:
                    # best-effort decode: leave the raw bytes on failure
                    pass

    if store_results:
        key = str(uuid.uuid4())
        logging.info(f"Storing results in results backend, key: {key}")
        with stats_timing("sqllab.query.results_backend_write", stats_logger):
            json_payload = json.dumps(
                payload, default=json_iso_dttm_ser, ignore_nan=True
            )
            cache_timeout = database.cache_timeout
            if cache_timeout is None:
                cache_timeout = config.get("CACHE_DEFAULT_TIMEOUT", 0)
            results_backend.set(key, zlib_compress(json_payload), cache_timeout)
        query.results_key = key

    query.status = QueryStatus.SUCCESS
    session.commit()

    if return_results:
        return payload
def execute_sql_statements(
    ctask, query_id, rendered_query, return_results=True,
    store_results=False, user_name=None, session=None, start_time=None,
):
    """Executes the sql query returns the results.

    Runs each statement of ``rendered_query`` over a single shared
    connection/cursor and serializes the final statement's result into a
    payload that is returned and/or stored in the results backend.

    :param ctask: the enclosing task instance (presumably Celery — unused here)
    :param query_id: id of the tracking Query row (loaded via ``get_query``)
    :param rendered_query: SQL text, already templated/rendered
    :param return_results: when True, return the payload dict to the caller
    :param store_results: when True, persist the payload in ``results_backend``
    :param user_name: effective username used when building the SQLA engine
    :param session: ORM session used for query-state bookkeeping
    :param start_time: enqueue timestamp; only set for asynchronous queries
    :returns: the payload dict when ``return_results`` is True, else None
    :raises SqlLabException: when results should be stored but no results
        backend is configured
    """
    if store_results and start_time:
        # only asynchronous queries
        stats_logger.timing('sqllab.query.time_pending', now_as_float() - start_time)

    query = get_query(query_id, session)
    payload = dict(query_id=query_id)
    database = query.database
    db_engine_spec = database.db_engine_spec
    db_engine_spec.patch()

    if store_results and not results_backend:
        raise SqlLabException("Results backend isn't configured.")

    # Breaking down into multiple statements
    parsed_query = ParsedQuery(rendered_query)
    statements = parsed_query.get_statements()
    logging.info(f'Executing {len(statements)} statement(s)')

    logging.info("Set query to 'running'")
    query.status = QueryStatus.RUNNING
    query.start_running_time = now_as_float()

    engine = database.get_sqla_engine(
        schema=query.schema,
        nullpool=True,
        user_name=user_name,
        source=sources.get('sql_lab', None),
    )
    # Sharing a single connection and cursor across the
    # execution of all statements (if many)
    with closing(engine.raw_connection()) as conn:
        with closing(conn.cursor()) as cursor:
            statement_count = len(statements)
            for i, statement in enumerate(statements):
                # TODO CHECK IF STOPPED
                msg = f'Running statement {i+1} out of {statement_count}'
                logging.info(msg)
                query.set_extra_json_key('progress', msg)
                session.commit()
                try:
                    cdf = execute_sql_statement(
                        statement, query, user_name, session, cursor)
                    # NOTE: the original re-assigned `msg` here to the same
                    # "Running statement ..." string; the value was never
                    # read, so the dead assignment has been removed.
                except Exception as e:
                    msg = str(e)
                    if statement_count > 1:
                        msg = f'[Statement {i+1} out of {statement_count}] ' + msg
                    payload = handle_query_error(msg, query, session, payload)
                    return payload

    # Success, updating the query entry in database
    # Only the last statement's dataframe (cdf) is reported.
    query.rows = cdf.size
    query.progress = 100
    query.set_extra_json_key('progress', None)
    query.status = QueryStatus.SUCCESS
    if query.select_as_cta:
        query.select_sql = database.select_star(
            query.tmp_table_name,
            limit=query.limit,
            schema=database.force_ctas_schema,
            show_cols=False,
            latest_partition=False)
    query.end_time = now_as_float()

    payload.update({
        'status': query.status,
        'data': cdf.data if cdf.data else [],
        'columns': cdf.columns if cdf.columns else [],
        'query': query.to_dict(),
    })

    if store_results:
        key = str(uuid.uuid4())
        logging.info(f'Storing results in results backend, key: {key}')
        with stats_timing('sqllab.query.results_backend_write', stats_logger):
            json_payload = json.dumps(
                payload, default=json_iso_dttm_ser, ignore_nan=True)
            cache_timeout = database.cache_timeout
            if cache_timeout is None:
                cache_timeout = config.get('CACHE_DEFAULT_TIMEOUT', 0)
            results_backend.set(key, zlib_compress(json_payload), cache_timeout)
        query.results_key = key
    session.commit()

    if return_results:
        return payload
def execute_sql_statements(
    ctask, query_id, rendered_query, return_results=True,
    store_results=False, user_name=None, session=None, start_time=None,
):
    """Executes the sql query returns the results.

    Runs each statement of ``rendered_query`` over a single shared
    connection/cursor; results are only materialized for the last statement
    (and only when the caller asked for results), then serialized into a
    payload that is returned and/or stored in the results backend.

    :param ctask: the enclosing task instance (presumably Celery — unused here)
    :param query_id: id of the tracking Query row (loaded via ``get_query``)
    :param rendered_query: SQL text, already templated/rendered
    :param return_results: when True, return the payload dict to the caller
    :param store_results: when True, persist the payload in ``results_backend``
    :param user_name: effective username used when building the SQLA engine
    :param session: ORM session used for query-state bookkeeping
    :param start_time: enqueue timestamp; only set for asynchronous queries
    :returns: the payload dict when ``return_results`` is True, else None
    :raises SqlLabException: when results should be stored but no results
        backend is configured
    """
    if store_results and start_time:
        # only asynchronous queries
        stats_logger.timing(
            'sqllab.query.time_pending', now_as_float() - start_time)

    query = get_query(query_id, session)
    payload = dict(query_id=query_id)
    database = query.database
    db_engine_spec = database.db_engine_spec
    db_engine_spec.patch()

    if store_results and not results_backend:
        raise SqlLabException("Results backend isn't configured.")

    # Breaking down into multiple statements
    parsed_query = ParsedQuery(rendered_query)
    statements = parsed_query.get_statements()
    logging.info(f'Executing {len(statements)} statement(s)')

    logging.info("Set query to 'running'")
    query.status = QueryStatus.RUNNING
    query.start_running_time = now_as_float()

    engine = database.get_sqla_engine(
        schema=query.schema,
        nullpool=True,
        user_name=user_name,
        source=sources.get('sql_lab', None),
    )
    # Sharing a single connection and cursor across the
    # execution of all statements (if many)
    with closing(engine.raw_connection()) as conn:
        with closing(conn.cursor()) as cursor:
            statement_count = len(statements)
            for i, statement in enumerate(statements):
                # TODO CHECK IF STOPPED
                msg = f'Running statement {i+1} out of {statement_count}'
                logging.info(msg)
                query.set_extra_json_key('progress', msg)
                session.commit()
                # Only fetch rows for the last statement, and only when the
                # caller wants results back.
                is_last_statement = i == statement_count - 1
                try:
                    cdf = execute_sql_statement(
                        statement, query, user_name, session, cursor,
                        return_results=is_last_statement and return_results)
                    # NOTE: the original re-assigned `msg` here to the same
                    # "Running statement ..." string; the value was never
                    # read, so the dead assignment has been removed.
                except Exception as e:
                    msg = str(e)
                    if statement_count > 1:
                        msg = f'[Statement {i+1} out of {statement_count}] ' + msg
                    payload = handle_query_error(msg, query, session, payload)
                    return payload

    # Success, updating the query entry in database
    # Only the last statement's dataframe (cdf) is reported.
    query.rows = cdf.size
    query.progress = 100
    query.set_extra_json_key('progress', None)
    query.status = QueryStatus.SUCCESS
    if query.select_as_cta:
        query.select_sql = database.select_star(
            query.tmp_table_name,
            limit=query.limit,
            schema=database.force_ctas_schema,
            show_cols=False,
            latest_partition=False)
    query.end_time = now_as_float()
    session.commit()

    payload.update({
        'status': query.status,
        'data': cdf.data if cdf.data else [],
        'columns': cdf.columns if cdf.columns else [],
        'query': query.to_dict(),
    })

    if store_results:
        key = str(uuid.uuid4())
        logging.info(f'Storing results in results backend, key: {key}')
        with stats_timing('sqllab.query.results_backend_write', stats_logger):
            json_payload = json.dumps(
                payload, default=json_iso_dttm_ser, ignore_nan=True)
            cache_timeout = database.cache_timeout
            if cache_timeout is None:
                cache_timeout = config.get('CACHE_DEFAULT_TIMEOUT', 0)
            results_backend.set(key, zlib_compress(json_payload), cache_timeout)
        query.results_key = key
    session.commit()

    if return_results:
        return payload
def execute_sql(
    ctask, query_id, rendered_query, return_results=True, store_results=False,
    user_name=None, session=None, start_time=None,
):
    """Executes the sql query returns the results.

    Single-statement execution path: prepares the SQL (DML checks, CTA
    rewriting, row-limit enforcement, optional mutation hook), runs it on a
    raw connection, builds a dataframe from the fetched rows, and returns
    and/or stores a serialized payload.

    :param ctask: the enclosing task instance (presumably Celery — unused here)
    :param query_id: id of the tracking Query row (loaded via ``get_query``)
    :param rendered_query: SQL text, already templated/rendered
    :param return_results: when True, return the payload dict to the caller
    :param store_results: when True, persist the payload in ``results_backend``
    :param user_name: effective username used when building the SQLA engine
    :param session: ORM session used for query-state bookkeeping
    :param start_time: enqueue timestamp; only set for asynchronous queries
    :returns: the payload dict when ``return_results`` is True (error paths
        always return the error payload), else None
    """
    if store_results and start_time:
        # only asynchronous queries
        stats_logger.timing(
            'sqllab.query.time_pending', now_as_float() - start_time)

    query = get_query(query_id, session)
    payload = dict(query_id=query_id)
    database = query.database
    db_engine_spec = database.db_engine_spec
    db_engine_spec.patch()

    def handle_error(msg):
        """Local method handling error while processing the SQL"""
        # Marks the query FAILED, commits, and returns an error payload
        # (optionally carrying the configured troubleshooting link).
        troubleshooting_link = config['TROUBLESHOOTING_LINK']
        query.error_message = msg
        query.status = QueryStatus.FAILED
        query.tmp_table_name = None
        session.commit()
        payload.update({
            'status': query.status,
            'error': msg,
        })
        if troubleshooting_link:
            payload['link'] = troubleshooting_link
        return payload

    if store_results and not results_backend:
        return handle_error("Results backend isn't configured.")

    # Limit enforced only for retrieving the data, not for the CTA queries.
    superset_query = SupersetQuery(rendered_query)
    executed_sql = superset_query.stripped()
    SQL_MAX_ROWS = app.config.get('SQL_MAX_ROW')
    if not superset_query.is_readonly() and not database.allow_dml:
        return handle_error(
            'Only `SELECT` statements are allowed against this database')
    if query.select_as_cta:
        if not superset_query.is_select():
            return handle_error(
                'Only `SELECT` statements can be used with the CREATE TABLE '
                'feature.')
        if not query.tmp_table_name:
            # Derive a deterministic temp table name from user id and the
            # query's start timestamp.
            start_dttm = datetime.fromtimestamp(query.start_time)
            query.tmp_table_name = 'tmp_{}_table_{}'.format(
                query.user_id, start_dttm.strftime('%Y_%m_%d_%H_%M_%S'))
        executed_sql = superset_query.as_create_table(query.tmp_table_name)
        query.select_as_cta_used = True
    if superset_query.is_select():
        # Clamp the user-requested limit to the configured maximum.
        if SQL_MAX_ROWS and (not query.limit or query.limit > SQL_MAX_ROWS):
            query.limit = SQL_MAX_ROWS
        if query.limit:
            executed_sql = database.apply_limit_to_sql(executed_sql, query.limit)

    # Hook to allow environment-specific mutation (usually comments) to the SQL
    SQL_QUERY_MUTATOR = config.get('SQL_QUERY_MUTATOR')
    if SQL_QUERY_MUTATOR:
        executed_sql = SQL_QUERY_MUTATOR(
            executed_sql, user_name, security_manager, database)

    query.executed_sql = executed_sql
    query.status = QueryStatus.RUNNING
    query.start_running_time = now_as_float()
    session.merge(query)
    session.commit()
    logging.info("Set query to 'running'")

    # Connection lifecycle is managed manually here: every exception path
    # below must close `conn` itself.
    conn = None
    try:
        engine = database.get_sqla_engine(
            schema=query.schema,
            nullpool=True,
            user_name=user_name,
            source=sources.get('sql_lab', None),
        )
        conn = engine.raw_connection()
        cursor = conn.cursor()
        logging.info('Running query: \n{}'.format(executed_sql))
        logging.info(query.executed_sql)
        query_start_time = now_as_float()
        db_engine_spec.execute(cursor, query.executed_sql, async_=True)
        logging.info('Handling cursor')
        db_engine_spec.handle_cursor(cursor, query, session)
        logging.info('Fetching data: {}'.format(query.to_dict()))
        stats_logger.timing(
            'sqllab.query.time_executing_query',
            now_as_float() - query_start_time)
        fetching_start_time = now_as_float()
        data = db_engine_spec.fetch_data(cursor, query.limit)
        stats_logger.timing(
            'sqllab.query.time_fetching_results',
            now_as_float() - fetching_start_time)
    except SoftTimeLimitExceeded as e:
        # Raised by the task runner when the soft time limit fires.
        logging.exception(e)
        if conn is not None:
            conn.close()
        return handle_error(
            "SQL Lab timeout. This environment's policy is to kill queries "
            'after {} seconds.'.format(SQLLAB_TIMEOUT))
    except Exception as e:
        logging.exception(e)
        if conn is not None:
            conn.close()
        return handle_error(db_engine_spec.extract_error_message(e))

    logging.info('Fetching cursor description')
    cursor_description = cursor.description
    if conn is not None:
        conn.commit()
        conn.close()

    if query.status == QueryStatus.STOPPED:
        return handle_error('The query has been stopped')

    logging.debug('Cursor description: %s', cursor_description)
    cdf = dataframe.SupersetDataFrame(data, cursor_description, db_engine_spec)

    query.rows = cdf.size
    query.progress = 100
    query.status = QueryStatus.SUCCESS
    if query.select_as_cta:
        # Expose a SELECT over the freshly created CTA table.
        query.select_sql = '{}'.format(
            database.select_star(
                query.tmp_table_name,
                limit=query.limit,
                schema=database.force_ctas_schema,
                show_cols=False,
                latest_partition=False))
    query.end_time = now_as_float()
    session.merge(query)
    session.flush()

    payload.update({
        'status': query.status,
        'data': cdf.data if cdf.data else [],
        'columns': cdf.columns if cdf.columns else [],
        'query': query.to_dict(),
    })
    if store_results:
        key = '{}'.format(uuid.uuid4())
        logging.info('Storing results in results backend, key: {}'.format(key))
        write_to_results_backend_start = now_as_float()
        json_payload = json.dumps(
            payload, default=json_iso_dttm_ser, ignore_nan=True)
        cache_timeout = database.cache_timeout
        if cache_timeout is None:
            cache_timeout = config.get('CACHE_DEFAULT_TIMEOUT', 0)
        results_backend.set(key, zlib_compress(json_payload), cache_timeout)
        query.results_key = key
        stats_logger.timing(
            'sqllab.query.results_backend_write',
            now_as_float() - write_to_results_backend_start)
    session.merge(query)
    session.commit()

    if return_results:
        return payload