Example #1
def query_execution_to_es(query_execution, data_cell=None, session=None):
    """data_cell is added as a parameter so that bulk insert of query executions won't require
    re-retrieval of data_cell"""
    query_execution_id = query_execution.id

    engine_id = query_execution.engine_id
    engine = get_query_engine_by_id(engine_id, session=session)

    table_names, _ = process_query(query_execution.query,
                                   language=(engine and engine.language))
    table_names = list(chain.from_iterable(table_names))

    duration = (DATETIME_TO_UTC(query_execution.completed_at) -
                DATETIME_TO_UTC(query_execution.created_at)
                if query_execution.completed_at is not None else None)

    environments = engine.environments
    environment_ids = [env.id for env in environments]

    title = data_cell.meta.get("title", "Untitled") if data_cell else None

    expand_query_execution = {
        "id": query_execution_id,
        "query_type": "query_execution",
        "title": title,
        "environment_id": environment_ids,
        "author_uid": query_execution.uid,
        "engine_id": engine_id,
        "statement_type": get_table_statement_type(query_execution.query),
        "created_at": DATETIME_TO_UTC(query_execution.created_at),
        "duration": duration,
        "full_table_name": table_names,
        "query_text": query_execution.query,
    }
    return expand_query_execution
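Note: as the docstring suggests, this function is designed to feed bulk
indexing. A minimal sketch of how the returned dicts might be pushed to
Elasticsearch, assuming an elasticsearch-py client and a hypothetical
"query_executions" index (neither appears in these examples):

from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

def bulk_index_query_executions(query_executions, session=None):
    # Hypothetical helper; the ES host and index name are assumptions.
    es = Elasticsearch(["http://localhost:9200"])
    actions = (
        {
            "_index": "query_executions",
            "_id": doc["id"],
            "_source": doc,
        }
        # Callers that already hold the owning data_cell would pass it
        # along to avoid re-retrieval, as the docstring notes.
        for doc in (query_execution_to_es(qe, session=session)
                    for qe in query_executions)
    )
    bulk(es, actions)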
Example #2
def query_cell_to_es(query_cell, session=None):
    query_cell_id = query_cell.id
    query_cell_meta = query_cell.meta

    engine_id = query_cell_meta.get("engine")
    engine = get_query_engine_by_id(engine_id, session=session)

    query = query_cell.context
    table_names, _ = process_query(query,
                                   language=(engine and engine.language))
    table_names = list(chain.from_iterable(table_names))

    datadoc = query_cell.doc

    expand_query = {
        "id": query_cell_id,
        "query_type": "query_cell",
        "title": query_cell_meta.get("title", "Untitled"),
        "data_doc_id": datadoc and datadoc.id,
        "environment_id": datadoc and datadoc.environment_id,
        "author_uid": datadoc and datadoc.owner_uid,
        "engine_id": engine_id,
        "statement_type": get_table_statement_type(query),
        "created_at": DATETIME_TO_UTC(query_cell.created_at),
        "full_table_name": table_names,
        "query_text": query,
    }
    return expand_query
Example #3
    def _get_executor_and_params_by_engine_id(cls,
                                              engine_id: int,
                                              session=None):
        engine = get_query_engine_by_id(engine_id, session=session)
        executor_params = engine.get_engine_params()
        executor = get_executor_class(engine.language, engine.executor)
        return executor, executor_params, engine.to_dict_admin()
Example #4
    def get_metastore():
        """Lazily initialize metastore_loader from DB.
           Use outer-scope variable to memoized initialization

        Raises:
            LatestPartitionException: If the metastore does not exist for engine_id, throw error

        Returns:
           BaseMetastoreLoader: metastore loader to fetch table/schema info
        """
        nonlocal _metastore_loader
        if _metastore_loader is not None:
            return _metastore_loader

        with DBSession() as session:
            engine = admin_logic.get_query_engine_by_id(engine_id,
                                                        session=session)
            metastore_id = engine.metastore_id if engine else None
            _metastore_loader = (metastore.get_metastore_loader(
                metastore_id, session=session)
                                 if metastore_id is not None else None)

            if _metastore_loader is None:
                raise LatestPartitionException(
                    f"Unable to load metastore for engine id {engine_id}")

        return _metastore_loader
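The nonlocal statement implies get_metastore is nested inside an enclosing
function that owns _metastore_loader. A minimal sketch of that memoization
pattern, with a hypothetical enclosing function (the real outer code is not
shown in this example):

def make_metastore_getter(engine_id):
    # Hypothetical enclosing function; only the memoization pattern is
    # taken from the example above.
    _metastore_loader = None  # outer-scope variable used as the memo slot

    def get_metastore():
        nonlocal _metastore_loader
        if _metastore_loader is not None:
            return _metastore_loader  # already initialized, skip the DB hit
        _metastore_loader = load_loader_from_db(engine_id)  # hypothetical loader
        return _metastore_loader

    return get_metastore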
Example #5
def _assert_safe_query(query, engine_id, session=None):
    try:
        from lib.metastore.utils import MetastoreTableACLChecker

        table_per_statement, _ = process_query(query)
        all_tables = [
            table for tables in table_per_statement for table in tables
        ]

        query_engine = admin_logic.get_query_engine_by_id(engine_id,
                                                          session=session)
        if query_engine.metastore_id is None:
            LOG.debug("No metastore for query engine, skipping")
            return

        metastore = admin_logic.get_query_metastore_by_id(
            query_engine.metastore_id, session=session)
        acl_checker = MetastoreTableACLChecker(metastore.acl_control)

        for table in all_tables:
            schema_name, table_name = table.split(".")
            if not acl_checker.is_table_valid(schema_name, table_name):
                raise InvalidQueryExecution(
                    f"Table {table} is not allowed by metastore")
    except InvalidQueryExecution as e:
        raise e
    except Exception as e:
        LOG.info(e)
Example #6
    def destination_s3_root(self, session=None) -> str:
        """Generate the bucket name + prefix before
           the table specific folder

        Returns:
            str: s3 path consisting bucket + prefix + schema name
        """
        if "s3_path" in self._exporter_config:
            schema_name, _ = self._fq_table_name
            s3_path: str = self._exporter_config["s3_path"]

            return sanitize_s3_url_with_trailing_slash(
                s3_path) + schema_name + "/"

        if self._exporter_config.get("use_schema_location", False):
            # Defer import since this is only needed for this option
            from lib.metastore.loaders.hive_metastore_loader import HMSMetastoreLoader

            query_engine = get_query_engine_by_id(self._engine_id,
                                                  session=session)
            metastore: HMSMetastoreLoader = get_metastore_loader(
                query_engine.metastore_id, session=session)
            if metastore is None or not isinstance(metastore,
                                                   HMSMetastoreLoader):
                raise Exception(
                    "Invalid metastore for the use_schema_location option")
            schema_location_uri = metastore.hmc.get_database(
                self._table_config["schema_name"]).locationUri

            return sanitize_s3_url_with_trailing_slash(schema_location_uri)

        raise Exception("Must specify s3_path or set use_schema_location=True")
Example #7
    def _perform_check(cls, engine_id: int) -> EngineStatus:
        with DBSession() as session:
            engine = get_query_engine_by_id(engine_id, session=session)
            executor_params = engine.get_engine_params()

            return check_connection(
                get_executor_class(engine.language, engine.executor),
                executor_params)
Example #8
    def _get_metastore_loader(self, session=None):
        engine = get_query_engine_by_id(self._engine_id, session=session)
        metastore_id = engine.metastore_id
        if metastore_id is None:
            return None

        loader = get_metastore_loader(metastore_id, session=session)
        return loader
Example #9
def get_query_engine_status(engine_id):
    engine_checker = None
    # Security check
    with DBSession() as session:
        verify_query_engine_permission(engine_id, session=session)
        engine = admin_logic.get_query_engine_by_id(engine_id, session=session)
        engine_checker = get_engine_checker_class(
            engine.get_feature_params().get("status_checker", "NullChecker"))

    api_assert(engine_checker is not None, "Invalid engine checker")
    return engine_checker.check(engine_id=engine_id, uid=current_user.id)
Example #10
    def _get_table_create_query(self, session=None) -> str:
        query_engine = get_query_engine_by_id(self._engine_id, session=session)
        schema_name, table_name = self._fq_table_name
        return get_external_create_table_statement(
            query_engine.language,
            table_name,
            self._table_config["column_name_types"],
            self.destination_s3_folder(),
            schema_name,
            self.UPLOAD_FILE_TYPE(),
        )
Example #11
    def _get_sqlalchemy_connection(self, session=None):
        engine = get_query_engine_by_id(self._engine_id, session=session)
        executor = get_executor_class(engine.language, engine.executor)
        executor_params = engine.get_engine_params()
        client = executor._get_client(executor_params)

        if not isinstance(client, SqlAlchemyClient):
            raise ValueError(
                f"Client instance {client} is not SQLAlchemy-based")

        conn = client._engine.connect()
        return conn
Example #12
    def _get_table_create_query(self, session=None) -> str:
        query_engine = get_query_engine_by_id(self._engine_id, session=session)
        schema_name, table_name = self._fq_table_name
        is_external = not self._exporter_config.get("use_schema_location",
                                                    False)
        return get_create_table_statement(
            language=query_engine.language,
            table_name=table_name,
            schema_name=schema_name,
            column_name_types=self._table_config["column_name_types"],
            # if use_schema_location is set, no table location is needed
            # for creation
            file_location=self.destination_s3_folder()
            if is_external else None,
            file_format=self.UPLOAD_FILE_TYPE(),
            table_properties=self._exporter_config.get("table_properties", []),
        )
Example #13
def query_execution_to_es(query_execution,
                          data_cell=None,
                          fields=None,
                          session=None):
    """data_cell is added as a parameter so that bulk insert of query executions won't require
    re-retrieval of data_cell"""
    engine_id = query_execution.engine_id
    engine = admin_logic.get_query_engine_by_id(engine_id, session=session)
    datadoc = data_cell.doc if data_cell else None

    def get_duration():
        return (DATETIME_TO_UTC(query_execution.completed_at) -
                DATETIME_TO_UTC(query_execution.created_at)
                if query_execution.completed_at is not None else None)

    field_to_getter = {
        "id": query_execution.id,
        "query_type": "query_execution",
        "title": (data_cell.meta.get("title", "Untitled")
                  if data_cell else None),
        "environment_id": [env.id for env in engine.environments],
        "author_uid": query_execution.uid,
        "engine_id": engine_id,
        "statement_type": lambda: get_table_statement_type(
            query_execution.query),
        "created_at": lambda: DATETIME_TO_UTC(query_execution.created_at),
        "duration": get_duration,
        "full_table_name": lambda: _get_table_names_from_query(
            query_execution.query, language=(engine and engine.language)),
        "query_text": query_execution.query,
        "public": datadoc is None or datadoc.public,
        "readable_user_ids": lambda: _get_datadoc_editors(
            datadoc, session=session),
    }

    return _get_dict_by_field(field_to_getter, fields=fields)
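_get_dict_by_field is not shown here. Given the mix of plain values and
zero-argument callables above, a plausible reconstruction is a helper that
filters to the requested fields and only invokes the callables it actually
needs (the exact implementation is an assumption):

def _get_dict_by_field(field_to_getter, fields=None):
    # Default to all known fields; resolve lazily so unrequested
    # expensive getters (e.g. the lambdas above) are never called.
    fields = fields if fields is not None else field_to_getter.keys()
    result = {}
    for field in fields:
        getter = field_to_getter[field]
        result[field] = getter() if callable(getter) else getter
    return result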
Example #14
def get_table_upload_exporter(engine_id,
                              session=None) -> BaseTableUploadExporter:
    query_engine = get_query_engine_by_id(engine_id, session=session)
    if not query_engine:
        raise Exception(f"Invalid query engine id {engine_id}")
    feature_params = query_engine.feature_params

    if "upload_exporter" not in feature_params:
        raise Exception(
            f"Query engine {query_engine.name} does not have an exporter")

    upload_exporter_name = feature_params["upload_exporter"]
    if upload_exporter_name not in ALL_TABLE_UPLOAD_EXPORTER_BY_NAME:
        raise Exception(
            f"Invalid table exporter configured: {upload_exporter_name}")

    exporter: BaseTableUploadExporter = ALL_TABLE_UPLOAD_EXPORTER_BY_NAME[
        upload_exporter_name]
    return exporter
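ALL_TABLE_UPLOAD_EXPORTER_BY_NAME is not defined in these examples. A
plausible minimal registry keyed by exporter name (both names below are
assumptions):

ALL_TABLE_UPLOAD_EXPORTERS = []  # hypothetical: populated from plugin config
ALL_TABLE_UPLOAD_EXPORTER_BY_NAME = {
    exporter.name: exporter  # the "name" attribute is an assumption
    for exporter in ALL_TABLE_UPLOAD_EXPORTERS
}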
Example #15
    def __call__(
        self, query: str, engine_id: int, uid: int = None, session=None,
    ):
        """Start the query execution progress. If async then
           it just sets up the necessary variables, if sync
           then actually execute the query

        Args:
            query (str): Query getting executed
            engine_id (int): The id of the engine
            uid (int, optional): User id for proxy user. Defaults to None.
            session (SqlAlchemySession, optional): for querying database

        Returns:
            Any[][]: Returns the result if sync, otherwise None
        """
        engine = get_query_engine_by_id(engine_id, session=session)

        client_settings = {
            **engine.get_engine_params(),
        }
        if uid:
            user = get_user_by_id(uid, session=session)
            client_settings["proxy_user"] = user.username

        executor = get_executor_class(engine.language, engine.executor)

        if executor.SINGLE_QUERY_QUERY_ENGINE():
            statements = [query]
        else:
            statements = get_statements(query)

        if len(statements) == 0:
            return None  # Empty statement, return None

        cursor = executor._get_client(client_settings).cursor()
        if self._async:
            self._async_run(cursor, statements)
            return None
        else:
            return self._sync_run(cursor, statements)
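A hedged sketch of the sync/async contract described in the docstring. Only
__call__'s signature comes from the example; the class name and its _async
flag setup are assumptions:

runner = QueryRunner(async_run=False)  # hypothetical construction
rows = runner("SELECT 1", engine_id=3, uid=42)  # sync: returns result rows

async_runner = QueryRunner(async_run=True)
assert async_runner("SELECT 1", engine_id=3) is None  # async: returns None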
Example #16
def query_cell_to_es(query_cell, fields=None, session=None):
    query_cell_meta = query_cell.meta
    query = query_cell.context
    datadoc = query_cell.doc

    engine_id = query_cell_meta.get("engine")
    engine = admin_logic.get_query_engine_by_id(engine_id, session=session)

    field_to_getter = {
        "id": query_cell.id,
        "query_type": "query_cell",
        "title": query_cell_meta.get("title", "Untitled"),
        "data_doc_id": datadoc and datadoc.id,
        "environment_id": datadoc and datadoc.environment_id,
        "author_uid": datadoc and datadoc.owner_uid,
        "engine_id": engine_id,
        "statement_type": lambda: get_table_statement_type(query),
        "created_at": lambda: DATETIME_TO_UTC(query_cell.created_at),
        "full_table_name": lambda: _get_table_names_from_query(
            query, language=(engine and engine.language)),
        "query_text": query,
        "public": datadoc is not None and datadoc.public,
        "readable_user_ids": lambda: _get_datadoc_editors(
            datadoc, session=session),
    }

    return _get_dict_by_field(field_to_getter, fields=fields)
Example #17
def _get_executor_params_and_engine(query_execution_id,
                                    celery_task,
                                    session=None):
    query, statement_ranges, uid, engine_id = _get_query_execution_info(
        query_execution_id, session=session)
    user = user_logic.get_user_by_id(uid, session=session)
    engine = admin_logic.get_query_engine_by_id(engine_id, session=session)
    if engine.deleted_at is not None:
        raise ArchivedQueryEngine("This query engine is disabled.")

    return (
        {
            "query_execution_id": query_execution_id,
            "celery_task": celery_task,
            "query": query,
            "statement_ranges": statement_ranges,
            "client_setting": {
                **engine.get_engine_params(),
                "proxy_user": user.email,
            },
        },
        engine,
    )
Example #18
    def __call__(
        self,
        query: str,
        engine_id: int,
        uid: int = None,
        session=None,
    ):
        """Start the query execution progress. If async then
           it just sets up the necessary variables, if sync
           then actually execute the query

        Args:
            query (str): Query getting executed
            engine_id (int): The id of the engine
            uid (int, optional): User id for proxy user. Defaults to None.
            session (SqlAlchemySession, optional): for querying database

        Returns:
            Any[][]: Returns the result if sync, otherwise None
        """
        engine = get_query_engine_by_id(engine_id, session=session)
        client_settings = get_client_setting_from_engine(engine,
                                                         uid,
                                                         session=session)
        self.executor = get_executor_class(engine.language, engine.executor)

        statements = parse_statement_from_query(self.executor, query)
        if len(statements) == 0:
            # Empty statement, return None
            return None

        cursor = self.executor._get_client(client_settings).cursor()
        if self._async:
            self._async_run(cursor, statements)
            return None
        else:
            return self._sync_run(cursor, statements)
Example #19
def _get_executor_params_and_engine(query_execution_id,
                                    celery_task,
                                    session=None):
    query, statement_ranges, uid, engine_id = _get_query_execution_info(
        query_execution_id, session=session)

    engine = admin_logic.get_query_engine_by_id(engine_id, session=session)
    if engine.deleted_at is not None:
        raise ArchivedQueryEngine("This query engine is disabled.")

    client_setting = get_client_setting_from_engine(engine,
                                                    uid,
                                                    session=session)

    return (
        {
            "query_execution_id": query_execution_id,
            "celery_task": celery_task,
            "query": query,
            "statement_ranges": statement_ranges,
            "client_setting": client_setting,
        },
        engine,
    )