Esempio n. 1
0
def get_if_schema_and_table_exists(metastore_id, schema_name,
                                   table_name) -> Tuple[bool, bool]:
    """
    Report whether the given schema and table can be found for a metastore.

    Lookups are attempted in this order, stopping at the first hit:
      1. table via DataTableFinder (the cached view)
      2. table via the metastore loader
      3. schema via DataTableFinder
      4. schema via the metastore loader

    Returns [schema_exists, table_exists]
    """
    verify_metastore_permission(metastore_id)

    with DataTableFinder(metastore_id) as finder:
        # A table that exists implies its schema exists as well.
        if finder.get_table_by_name(schema_name, table_name):
            return [True, True]

        # The loader is only fetched once the cached lookup has missed.
        loader = get_metastore_loader(metastore_id)
        if loader.check_if_table_exists(schema_name, table_name):
            return [True, True]

        # Table not found anywhere; fall back to schema-only checks.
        # `or` short-circuits, so the loader is asked only on a cache miss.
        schema_found = (finder.get_schema_by_name(schema_name)
                        or loader.check_if_schema_exists(schema_name))
        if schema_found:
            return [True, False]

    return [False, False]
Esempio n. 2
0
    def get_metastore():
        """Lazily initialize metastore_loader from DB.
           Use outer-scope variable to memoized initialization

        Raises:
            LatestPartitionException: If the metastore does not exist for engine_id, throw error

        Returns:
           BaseMetastoreLoader: metastore loader to fetch table/schema info
        """
        nonlocal _metastore_loader
        if _metastore_loader is not None:
            return _metastore_loader

        with DBSession() as session:
            engine = admin_logic.get_query_engine_by_id(engine_id,
                                                        session=session)
            metastore_id = engine.metastore_id if engine else None
            _metastore_loader = (metastore.get_metastore_loader(
                metastore_id, session=session)
                                 if metastore_id is not None else None)

            if _metastore_loader is None:
                raise LatestPartitionException(
                    f"Unable to load metastore for engine id {engine_id}")

        return _metastore_loader
Esempio n. 3
0
    def destination_s3_root(self, session=None) -> str:
        """Build the S3 location that sits above the table-specific folder.

        Two exporter config options are supported, checked in this order:
          * "s3_path": the sanitized path followed by the schema name and "/"
          * "use_schema_location": the sanitized location URI of the schema
            as reported by the Hive metastore client

        Args:
            session: optional DB session forwarded to the engine and
                metastore lookups

        Raises:
            Exception: If neither option is configured, or if
                use_schema_location is set but the engine's metastore loader
                is not an HMSMetastoreLoader

        Returns:
            str: s3 root path for the export
        """
        if "s3_path" in self._exporter_config:
            schema_name, _table_name = self._fq_table_name
            sanitized_root = sanitize_s3_url_with_trailing_slash(
                self._exporter_config["s3_path"])
            return sanitized_root + schema_name + "/"

        if not self._exporter_config.get("use_schema_location", False):
            raise Exception(
                "Must specify s3_path or set use_schema_location=True")

        # Defer import since this is only needed for this option
        from lib.metastore.loaders.hive_metastore_loader import HMSMetastoreLoader

        engine = get_query_engine_by_id(self._engine_id, session=session)
        loader = get_metastore_loader(engine.metastore_id, session=session)

        # isinstance() is False for None, so this also rejects a missing loader.
        if not isinstance(loader, HMSMetastoreLoader):
            raise Exception(
                "Invalid metastore to use use_schema_location option")

        database = loader.hmc.get_database(self._table_config["schema_name"])
        return sanitize_s3_url_with_trailing_slash(database.locationUri)
Esempio n. 4
0
    def _get_metastore_loader(self, session=None):
        """Return the metastore loader for this object's query engine.

        Args:
            session: optional DB session forwarded to the lookups

        Returns:
            The loader from get_metastore_loader, or None when the engine
            has no metastore_id.
        """
        query_engine = get_query_engine_by_id(self._engine_id,
                                              session=session)
        engine_metastore_id = query_engine.metastore_id

        # An engine without a metastore has nothing to load.
        if engine_metastore_id is None:
            return None

        return get_metastore_loader(engine_metastore_id, session=session)
Esempio n. 5
0
def refresh_table_from_metastore(table_id):
    """Re-sync a single table from its metastore and return the refreshed row.

    Permission on the table is verified first. The table is then synced
    through its schema's metastore loader and reloaded in the session.
    """
    with DBSession() as session:
        verify_data_table_permission(table_id, session=session)

        data_table = logic.get_table_by_id(table_id, session=session)
        data_schema = data_table.data_schema

        loader = get_metastore_loader(data_schema.metastore_id,
                                      session=session)
        loader.sync_create_or_update_table(
            data_schema.name,
            data_table.name,
            session=session,
        )

        # Reload the object so the caller sees the synced state.
        session.refresh(data_table)
        return data_table
def sync_table_to_metastore(table_per_statement,
                            statement_types,
                            metastore_id,
                            session=None):
    """Apply the table changes implied by a list of statements.

    table_per_statement and statement_types are walked in lockstep. Each
    entry of table_per_statement is an iterable of "schema.table" names
    touched by the statement whose type is at the same position.

    Statements are processed in order, so a later statement overrides an
    earlier one for the same table:
      * "DROP" marks the table for deletion.
      * "CREATE" / "ALTER" marks the table for create-or-update.
      * Any other non-None type marks the table for create-or-update only
        when m_logic.get_table_by_name does not find it.
      * A None type is ignored.

    Deletions are synced first, then create-or-updates.
    """
    loader = get_metastore_loader(metastore_id, session=session)
    assert loader is not None

    pending_upserts = set()
    pending_deletes = set()

    for statement_tables, kind in zip(table_per_statement, statement_types):
        if kind is None:
            continue

        if kind == "DROP":
            for full_name in statement_tables:
                pending_upserts.discard(full_name)
                pending_deletes.add(full_name)
            continue

        # Any other DML/DDL
        always_upsert = kind in ("CREATE", "ALTER")
        for full_name in statement_tables:
            pending_deletes.discard(full_name)

            # Already scheduled: skip the (possibly costly) lookup below.
            if full_name in pending_upserts:
                continue

            if always_upsert:
                pending_upserts.add(full_name)
                continue

            # For things like insert/select, only sync when the table
            # is not found by the lookup.
            schema_name, table_name = full_name.split(".")
            known_table = m_logic.get_table_by_name(
                schema_name,
                table_name,
                metastore_id=metastore_id,
                session=session,
            )
            if not known_table:
                pending_upserts.add(full_name)

    for full_name in pending_deletes:
        schema_name, table_name = full_name.split(".")
        loader.sync_delete_table(schema_name, table_name, session=session)

    for full_name in pending_upserts:
        schema_name, table_name = full_name.split(".")
        loader.sync_create_or_update_table(schema_name,
                                           table_name,
                                           session=session)
def log_table_per_statement(
    table_per_statement,
    statement_types,
    query_execution_id,
    metastore_id,
    cell_id,
    session=None,
):
    """Record which tables a query execution touched.

    Only tables from "SELECT" and "INSERT" statements are considered. For
    each such "schema.table" name found by m_logic.get_table_by_name, the
    old execution log is deleted via m_logic (called with commit=False)
    and a new table/query-execution log entry is created.
    """
    loader = get_metastore_loader(metastore_id, session=session)
    assert loader is not None

    # Collect the distinct tables from SELECT / INSERT statements.
    logged_tables = set()
    for statement_tables, kind in zip(table_per_statement, statement_types):
        if kind not in ("SELECT", "INSERT"):
            continue
        logged_tables.update(statement_tables)

    for full_name in logged_tables:
        schema_name, table_name = full_name.split(".")
        table_record = m_logic.get_table_by_name(
            schema_name,
            table_name,
            metastore_id=metastore_id,
            session=session,
        )

        # Sanity check: skip names the lookup does not find.
        if not table_record:
            continue

        m_logic.delete_old_able_query_execution_log(
            cell_id=cell_id,
            query_execution_id=query_execution_id,
            commit=False,
            session=session,
        )
        m_logic.create_table_query_execution_log(
            table_id=table_record.id,
            cell_id=cell_id,
            query_execution_id=query_execution_id,
            session=session,
        )