def query_execution_to_es(query_execution, data_cell=None, session=None):
    """data_cell is added as a parameter so that bulk insert of query executions
    won't require re-retrieval of data_cell"""
    query_execution_id = query_execution.id
    engine_id = query_execution.engine_id
    engine = get_query_engine_by_id(engine_id, session=session)

    table_names, _ = process_query(
        query_execution.query, language=(engine and engine.language)
    )
    table_names = list(chain.from_iterable(table_names))

    duration = (
        DATETIME_TO_UTC(query_execution.completed_at)
        - DATETIME_TO_UTC(query_execution.created_at)
        if query_execution.completed_at is not None
        else None
    )

    environments = engine.environments
    environment_ids = [env.id for env in environments]

    title = data_cell.meta.get("title", "Untitled") if data_cell else None

    expand_query_execution = {
        "id": query_execution_id,
        "query_type": "query_execution",
        "title": title,
        "environment_id": environment_ids,
        "author_uid": query_execution.uid,
        "engine_id": engine_id,
        "statement_type": get_table_statement_type(query_execution.query),
        "created_at": DATETIME_TO_UTC(query_execution.created_at),
        "duration": duration,
        "full_table_name": table_names,
        "query_text": query_execution.query,
    }

    return expand_query_execution
def query_cell_to_es(query_cell, session=None):
    query_cell_id = query_cell.id
    query_cell_meta = query_cell.meta

    engine_id = query_cell_meta.get("engine")
    engine = get_query_engine_by_id(engine_id, session=session)

    query = query_cell.context
    table_names, _ = process_query(query, language=(engine and engine.language))
    table_names = list(chain.from_iterable(table_names))

    datadoc = query_cell.doc

    expand_query = {
        "id": query_cell_id,
        "query_type": "query_cell",
        "title": query_cell_meta.get("title", "Untitled"),
        "data_doc_id": datadoc and datadoc.id,
        "environment_id": datadoc and datadoc.environment_id,
        "author_uid": datadoc and datadoc.owner_uid,
        "engine_id": engine_id,
        "statement_type": get_table_statement_type(query),
        "created_at": DATETIME_TO_UTC(query_cell.created_at),
        "full_table_name": table_names,
        "query_text": query,
    }

    return expand_query
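# A minimal usage sketch for the two converters above, showing why data_cell
# is passed in rather than re-fetched (see the docstring of
# query_execution_to_es). `fetch_data_cell_for_execution` and `es_bulk_insert`
# are hypothetical helper names, not part of the source.
def index_query_executions(query_executions, session=None):
    docs = []
    for execution in query_executions:
        # Pre-fetch the data cell once so the converter does not hit the DB again
        data_cell = fetch_data_cell_for_execution(execution, session=session)
        docs.append(
            query_execution_to_es(execution, data_cell=data_cell, session=session)
        )
    es_bulk_insert(docs)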
def _get_executor_and_params_by_engine_id(cls, engine_id: int, session=None):
    engine = get_query_engine_by_id(engine_id, session=session)
    executor_params = engine.get_engine_params()
    executor = get_executor_class(engine.language, engine.executor)
    return executor, executor_params, engine.to_dict_admin()
def get_metastore():
    """Lazily initialize metastore_loader from the DB, using an outer-scope
    variable to memoize the initialization.

    Raises:
        LatestPartitionException: if no metastore exists for engine_id

    Returns:
        BaseMetastoreLoader: metastore loader to fetch table/schema info
    """
    nonlocal _metastore_loader
    if _metastore_loader is not None:
        return _metastore_loader

    with DBSession() as session:
        engine = admin_logic.get_query_engine_by_id(engine_id, session=session)
        metastore_id = engine.metastore_id if engine else None
        _metastore_loader = (
            metastore.get_metastore_loader(metastore_id, session=session)
            if metastore_id is not None
            else None
        )
        if _metastore_loader is None:
            raise LatestPartitionException(
                f"Unable to load metastore for engine id {engine_id}"
            )
        return _metastore_loader
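# For context: get_metastore above is a nested function, so it needs an
# enclosing scope that defines engine_id and the _metastore_loader slot it
# memoizes into via nonlocal. A minimal self-contained sketch of that pattern;
# make_metastore_getter and load_metastore_for_engine are illustrative names,
# not from the source:
def make_metastore_getter(engine_id: int):
    _metastore_loader = None  # memoization slot captured by the closure

    def get_metastore():
        nonlocal _metastore_loader
        if _metastore_loader is not None:
            return _metastore_loader  # later calls reuse the cached loader
        # First call does the (expensive) lookup; hypothetical loader function
        _metastore_loader = load_metastore_for_engine(engine_id)
        return _metastore_loader

    return get_metastore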
def _assert_safe_query(query, engine_id, session=None):
    try:
        from lib.metastore.utils import MetastoreTableACLChecker

        table_per_statement, _ = process_query(query)
        all_tables = [
            table for tables in table_per_statement for table in tables
        ]

        query_engine = admin_logic.get_query_engine_by_id(engine_id, session=session)
        if query_engine.metastore_id is None:
            LOG.debug("No metastore for query engine, skipping")
            return

        metastore = admin_logic.get_query_metastore_by_id(
            query_engine.metastore_id, session=session
        )
        acl_checker = MetastoreTableACLChecker(metastore.acl_control)

        for table in all_tables:
            schema_name, table_name = table.split(".")
            if not acl_checker.is_table_valid(schema_name, table_name):
                raise InvalidQueryExecution(
                    f"Table {table} is not allowed by metastore"
                )
    except InvalidQueryExecution as e:
        raise e
    except Exception as e:
        # Swallow unexpected checker errors (only logging them) so a broken
        # ACL check does not block query execution
        LOG.info(e)
def destination_s3_root(self, session=None) -> str:
    """Generate the bucket name + prefix before the table-specific folder.

    Returns:
        str: s3 path consisting of bucket + prefix + schema name
    """
    if "s3_path" in self._exporter_config:
        schema_name, _ = self._fq_table_name
        s3_path: str = self._exporter_config["s3_path"]
        return sanitize_s3_url_with_trailing_slash(s3_path) + schema_name + "/"

    if self._exporter_config.get("use_schema_location", False):
        # Defer import since this is only needed for this option
        from lib.metastore.loaders.hive_metastore_loader import HMSMetastoreLoader

        query_engine = get_query_engine_by_id(self._engine_id, session=session)
        metastore: HMSMetastoreLoader = get_metastore_loader(
            query_engine.metastore_id, session=session
        )
        if metastore is None or not isinstance(metastore, HMSMetastoreLoader):
            raise Exception(
                "Invalid metastore for the use_schema_location option"
            )
        schema_location_uri = metastore.hmc.get_database(
            self._table_config["schema_name"]
        ).locationUri
        return sanitize_s3_url_with_trailing_slash(schema_location_uri)

    raise Exception("Must specify s3_path or set use_schema_location=True")
def _perform_check(cls, engine_id: int) -> EngineStatus:
    with DBSession() as session:
        engine = get_query_engine_by_id(engine_id, session=session)
        executor_params = engine.get_engine_params()
        return check_connection(
            get_executor_class(engine.language, engine.executor), executor_params
        )
def _get_metastore_loader(self, session=None):
    engine = get_query_engine_by_id(self._engine_id, session=session)
    metastore_id = engine.metastore_id
    if metastore_id is None:
        return None
    loader = get_metastore_loader(metastore_id, session=session)
    return loader
def get_query_engine_status(engine_id):
    engine_checker = None

    # Security check
    with DBSession() as session:
        verify_query_engine_permission(engine_id, session=session)
        engine = admin_logic.get_query_engine_by_id(engine_id, session=session)
        engine_checker = get_engine_checker_class(
            engine.get_feature_params().get("status_checker", "NullChecker")
        )

    api_assert(engine_checker is not None, "Invalid engine checker")
    return engine_checker.check(engine_id=engine_id, uid=current_user.id)
def _get_table_create_query(self, session=None) -> str:
    query_engine = get_query_engine_by_id(self._engine_id, session=session)
    schema_name, table_name = self._fq_table_name
    return get_external_create_table_statement(
        query_engine.language,
        table_name,
        self._table_config["column_name_types"],
        self.destination_s3_folder(),
        schema_name,
        self.UPLOAD_FILE_TYPE(),
    )
def _get_sqlalchemy_connection(self, session=None):
    engine = get_query_engine_by_id(self._engine_id, session=session)
    executor = get_executor_class(engine.language, engine.executor)
    executor_params = engine.get_engine_params()
    client = executor._get_client(executor_params)

    if not isinstance(client, SqlAlchemyClient):
        raise ValueError(f"Client instance {client} is not SqlAlchemy based")

    conn = client._engine.connect()
    return conn
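# Callers receive a live SQLAlchemy Connection and are responsible for closing
# it. A minimal usage sketch, assuming a trusted table name; run_table_count is
# an illustrative name, not from the source:
from sqlalchemy import text


def run_table_count(exporter, table: str) -> int:
    conn = exporter._get_sqlalchemy_connection()
    try:
        # text() keeps this compatible with both SQLAlchemy 1.4 and 2.0
        result = conn.execute(text(f"SELECT COUNT(*) FROM {table}"))
        return result.scalar()
    finally:
        conn.close()  # always release the pooled connection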
def _get_table_create_query(self, session=None) -> str:
    query_engine = get_query_engine_by_id(self._engine_id, session=session)
    schema_name, table_name = self._fq_table_name
    is_external = not self._exporter_config.get("use_schema_location", False)
    return get_create_table_statement(
        language=query_engine.language,
        table_name=table_name,
        schema_name=schema_name,
        column_name_types=self._table_config["column_name_types"],
        # if use_schema_location is set, no table location is needed for creation
        file_location=self.destination_s3_folder() if is_external else None,
        file_format=self.UPLOAD_FILE_TYPE(),
        table_properties=self._exporter_config.get("table_properties", []),
    )
def query_execution_to_es(query_execution, data_cell=None, fields=None, session=None):
    """data_cell is added as a parameter so that bulk insert of query executions
    won't require re-retrieval of data_cell"""
    engine_id = query_execution.engine_id
    engine = admin_logic.get_query_engine_by_id(engine_id, session=session)
    datadoc = data_cell.doc if data_cell else None

    def get_duration():
        return (
            DATETIME_TO_UTC(query_execution.completed_at)
            - DATETIME_TO_UTC(query_execution.created_at)
            if query_execution.completed_at is not None
            else None
        )

    field_to_getter = {
        "id": query_execution.id,
        "query_type": "query_execution",
        "title": data_cell.meta.get("title", "Untitled") if data_cell else None,
        "environment_id": [env.id for env in engine.environments],
        "author_uid": query_execution.uid,
        "engine_id": engine_id,
        "statement_type": lambda: get_table_statement_type(query_execution.query),
        "created_at": lambda: DATETIME_TO_UTC(query_execution.created_at),
        "duration": get_duration,
        "full_table_name": lambda: _get_table_names_from_query(
            query_execution.query, language=(engine and engine.language)
        ),
        "query_text": query_execution.query,
        "public": datadoc is None or datadoc.public,
        "readable_user_ids": lambda: _get_datadoc_editors(datadoc, session=session),
    }

    return _get_dict_by_field(field_to_getter, fields=fields)
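# _get_dict_by_field is not shown in this section. A minimal sketch of what it
# plausibly does, given how field_to_getter mixes plain values with
# zero-argument callables; this is an assumption, not the source implementation:
def _get_dict_by_field(field_to_getter, fields=None):
    # Restrict output to the requested fields, defaulting to all of them
    keys = fields if fields is not None else field_to_getter.keys()
    result = {}
    for key in keys:
        getter = field_to_getter[key]
        # Callables are evaluated lazily, so unrequested fields cost nothing
        result[key] = getter() if callable(getter) else getter
    return result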
def get_table_upload_exporter(engine_id, session=None) -> BaseTableUploadExporter:
    query_engine = get_query_engine_by_id(engine_id, session=session)
    if not query_engine:
        raise Exception(f"Invalid query engine id {engine_id}")

    feature_params = query_engine.feature_params
    if "upload_exporter" not in feature_params:
        raise Exception(f"Query engine {query_engine.name} does not have an exporter")

    upload_exporter_name = feature_params["upload_exporter"]
    if upload_exporter_name not in ALL_TABLE_UPLOAD_EXPORTER_BY_NAME:
        raise Exception(f"Invalid table exporter config {upload_exporter_name}")

    exporter: BaseTableUploadExporter = ALL_TABLE_UPLOAD_EXPORTER_BY_NAME[
        upload_exporter_name
    ]
    return exporter
def __call__( self, query: str, engine_id: int, uid: int = None, session=None, ): """Start the query execution progress. If async then it just sets up the necessary variables, if sync then actually execute the query Args: query (str): Query getting executed engine_id (int): The id of the engine uid (int, optional): User id for proxy user. Defaults to None. session (SqlAlchemySession, optional): for querying database Returns: Any[][]: Returns the result if sync, otherwise None """ engine = get_query_engine_by_id(engine_id, session=session) client_settings = { **engine.get_engine_params(), } if uid: user = get_user_by_id(uid, session=session) client_settings["proxy_user"] = user.username executor = get_executor_class(engine.language, engine.executor) if executor.SINGLE_QUERY_QUERY_ENGINE(): statements = [query] else: statements = get_statements(query) if len(statements) == 0: return None # Empty statement, return None cursor = executor._get_client(client_settings).cursor() if self._async: self._async_run(cursor, statements) return None else: return self._sync_run(cursor, statements)
def query_cell_to_es(query_cell, fields=None, session=None):
    query_cell_meta = query_cell.meta
    query = query_cell.context
    datadoc = query_cell.doc

    engine_id = query_cell_meta.get("engine")
    engine = admin_logic.get_query_engine_by_id(engine_id, session=session)

    field_to_getter = {
        "id": query_cell.id,
        "query_type": "query_cell",
        "title": query_cell_meta.get("title", "Untitled"),
        "data_doc_id": datadoc and datadoc.id,
        "environment_id": datadoc and datadoc.environment_id,
        "author_uid": datadoc and datadoc.owner_uid,
        "engine_id": engine_id,
        "statement_type": lambda: get_table_statement_type(query),
        "created_at": lambda: DATETIME_TO_UTC(query_cell.created_at),
        "full_table_name": lambda: _get_table_names_from_query(
            query, language=(engine and engine.language)
        ),
        "query_text": query,
        "public": datadoc is not None and datadoc.public,
        "readable_user_ids": lambda: _get_datadoc_editors(datadoc, session=session),
    }

    return _get_dict_by_field(field_to_getter, fields=fields)
def _get_executor_params_and_engine(query_execution_id, celery_task, session=None):
    query, statement_ranges, uid, engine_id = _get_query_execution_info(
        query_execution_id, session=session
    )
    user = user_logic.get_user_by_id(uid, session=session)
    engine = admin_logic.get_query_engine_by_id(engine_id, session=session)

    if engine.deleted_at is not None:
        raise ArchivedQueryEngine("This query engine is disabled.")

    return (
        {
            "query_execution_id": query_execution_id,
            "celery_task": celery_task,
            "query": query,
            "statement_ranges": statement_ranges,
            "client_setting": {
                **engine.get_engine_params(),
                "proxy_user": user.email,
            },
        },
        engine,
    )
def __call__( self, query: str, engine_id: int, uid: int = None, session=None, ): """Start the query execution progress. If async then it just sets up the necessary variables, if sync then actually execute the query Args: query (str): Query getting executed engine_id (int): The id of the engine uid (int, optional): User id for proxy user. Defaults to None. session (SqlAlchemySession, optional): for querying database Returns: Any[][]: Returns the result if sync, otherwise None """ engine = get_query_engine_by_id(engine_id, session=session) client_settings = get_client_setting_from_engine(engine, uid, session=session) self.executor = get_executor_class(engine.language, engine.executor) statements = parse_statement_from_query(self.executor, query) if len(statements) == 0: # Empty statement, return None return None cursor = self.executor._get_client(client_settings).cursor() if self._async: self._async_run(cursor, statements) return None else: return self._sync_run(cursor, statements)
def _get_executor_params_and_engine(query_execution_id, celery_task, session=None):
    query, statement_ranges, uid, engine_id = _get_query_execution_info(
        query_execution_id, session=session
    )
    engine = admin_logic.get_query_engine_by_id(engine_id, session=session)

    if engine.deleted_at is not None:
        raise ArchivedQueryEngine("This query engine is disabled.")

    client_setting = get_client_setting_from_engine(engine, uid, session=session)

    return (
        {
            "query_execution_id": query_execution_id,
            "celery_task": celery_task,
            "query": query,
            "statement_ranges": statement_ranges,
            "client_setting": client_setting,
        },
        engine,
    )
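# get_client_setting_from_engine and parse_statement_from_query are helpers
# that the two refactored functions above rely on but that are not shown here.
# A plausible reconstruction based on the earlier inline versions (note the
# inline versions disagree on whether proxy_user is the username or the email,
# so that detail is an assumption):
def get_client_setting_from_engine(engine, uid=None, session=None):
    client_settings = {**engine.get_engine_params()}
    if uid:
        user = get_user_by_id(uid, session=session)
        client_settings["proxy_user"] = user.username
    return client_settings


def parse_statement_from_query(executor, query: str):
    # Engines that only accept a single query get the raw text as-is;
    # otherwise the query is split into individual statements
    if executor.SINGLE_QUERY_QUERY_ENGINE():
        return [query]
    return get_statements(query)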