コード例 #1
0
 def __init__(self, db_connections, schematizer_client):
     """Reset the cache, keep the client, build the tracker, set up PII.

     Args:
         db_connections: handed unchanged to SchemaTracker.
         schematizer_client: stored as-is on the instance.
     """
     self.reset_cache()
     self.schematizer_client = schematizer_client
     tracker = SchemaTracker(db_connections)
     self.schema_tracker = tracker
     self._set_pii_identifier()
コード例 #2
0
class SchemaWrapper(object):
    """Cache-backed wrapper for interacting with the schematizer.

    Indexing an instance with a table (``wrapper[table]``) returns the cached
    SchemaWrapperEntry for that table; when the cache has no entry yet the
    table's schema is registered through the schematizer client first.

    Args:
        db_connections: handed unchanged to SchemaTracker.
        schematizer_client(SchematizerClient object): a client that interacts
        with Schematizer APIs with built-in caching features.
    """

    __metaclass__ = SchemaWrapperSingleton
    _notify_email = "*****@*****.**"

    def __init__(self, db_connections, schematizer_client):
        # Start from an empty cache before anything else is wired up.
        self.reset_cache()
        self.schematizer_client = schematizer_client
        tracker = SchemaTracker(db_connections)
        self.schema_tracker = tracker
        self._set_pii_identifier()

    @classmethod
    def is_pii_supported(cls):
        """Return True when the PIIIdentifier component can be imported."""
        try:
            # TODO(DATAPIPE-1509|abrar): Currently we have
            # force_avoid_internal_packages as a means of simulating an absence
            # of a yelp's internal package. And all references
            # of force_avoid_internal_packages have to be removed from
            # RH after we are completely ready for open source.
            if is_avoid_internal_packages_set():
                raise ImportError
            from pii_generator.components.pii_identifier import PIIIdentifier  # NOQA
        except ImportError:
            return False
        else:
            return True

    def _set_pii_identifier(self):
        """Attach a PIIIdentifier when supported, otherwise store None."""
        if not SchemaWrapper.is_pii_supported():
            self.pii_identifier = None
            return
        from pii_generator.components.pii_identifier import PIIIdentifier  # NOQA
        self.pii_identifier = PIIIdentifier(env_config.pii_yaml_path)

    def __getitem__(self, table):
        """Return the cache entry for ``table``, fetching it on a miss."""
        if table in self.cache:
            return self.cache[table]
        log.info("table '{}' is not in the cache".format(table))
        self._fetch_schema_for_table(table)
        return self.cache[table]

    def _fetch_schema_for_table(self, table):
        """Register the table's current SHOW CREATE statement.

        The schematizer registers schemas idempotently, so this either
        creates a new schema or fills the cache with the existing one.
        """
        log.info("fetching schema for table '{}'".format(table))
        create_stmt = self.schema_tracker.get_show_create_statement(
            table).query
        self.register_with_schema_store(
            table, new_create_table_stmt=create_stmt)

    def register_with_schema_store(self,
                                   table,
                                   new_create_table_stmt,
                                   old_create_table_stmt=None,
                                   alter_table_stmt=None):
        """Register a table with the schema store and cache the response.

        Serves both create and alter statements. In dry-run mode
        (``env_config.register_dry_run``) nothing is sent; the cache entry is
        set to the dry-run placeholder instead.
        """
        log.info("registering {} with schema store".format(table))
        if env_config.register_dry_run:
            self.cache[table] = self._dry_run_schema
            return

        namespace = "{0}.{1}.{2}".format(
            env_config.namespace, table.cluster_name, table.database_name)
        # Without a PII identifier the table is reported as PII-free.
        if self.pii_identifier:
            contains_pii = self.pii_identifier.table_has_pii(
                database_name=table.database_name,
                table_name=table.table_name)
        else:
            contains_pii = False

        table_stmt_kwargs = {
            'namespace': namespace,
            'source': table.table_name,
            'source_owner_email': self._notify_email,
            'contains_pii': contains_pii,
            'new_create_table_stmt': new_create_table_stmt
        }
        # The optional statements are only forwarded when they are truthy.
        optional_stmts = (
            ("old_create_table_stmt", old_create_table_stmt),
            ("alter_table_stmt", alter_table_stmt),
        )
        for kwarg_name, stmt in optional_stmts:
            if stmt:
                table_stmt_kwargs[kwarg_name] = stmt

        log.debug(
            "Calling schematizer_client.register_schema_from_mysql_stmts "
            "with kwargs: {}".format(table_stmt_kwargs))
        resp = self.schematizer_client.register_schema_from_mysql_stmts(
            **table_stmt_kwargs)
        log.debug("Got response of {0} from schematizer for table: {1}".format(
            resp, table.table_name))
        self._populate_schema_cache(table, resp)

    def reset_cache(self):
        """Replace the cache with a fresh empty dict."""
        self.cache = {}

    def _populate_schema_cache(self, table, resp):
        """Cache the schema id plus the columns whose type needs transforming.

        A column is kept when its type starts with 'set', 'timestamp',
        'datetime' or 'time'.
        """
        transformable_prefixes = ('set', 'timestamp', 'datetime', 'time')
        column_type_map = self.schema_tracker.get_column_type_map(table)
        transformation_map = {}
        for column_name, column_type in column_type_map.iteritems():
            if column_type.startswith(transformable_prefixes):
                transformation_map[column_name] = column_type

        entry = SchemaWrapperEntry(
            schema_id=resp.schema_id, transformation_map=transformation_map)
        self.cache[table] = entry

    @property
    def _dry_run_schema(self):
        """Placeholder entry (schema id 1, no transformations) for dry runs."""
        placeholder = SchemaWrapperEntry(schema_id=1, transformation_map={})
        return placeholder
コード例 #3
0
class SchemaWrapper(object):
    """Wrapper around the schematizer client with a per-table cache.

    ``wrapper[table]`` yields the SchemaWrapperEntry cached for the table and
    registers the table's schema on demand when no entry is cached.

    Args:
        db_connections: forwarded to SchemaTracker without modification.
        schematizer_client(SchematizerClient object): a client that interacts
        with Schematizer APIs with built-in caching features.
    """

    __metaclass__ = SchemaWrapperSingleton
    _notify_email = "*****@*****.**"

    def __init__(self, db_connections, schematizer_client):
        self.reset_cache()
        self.schematizer_client = schematizer_client
        self.schema_tracker = SchemaTracker(db_connections)
        self._set_pii_identifier()

    @classmethod
    def is_pii_supported(cls):
        """Tell whether the PIIIdentifier component is importable."""
        try:
            from pii_generator.components.pii_identifier import PIIIdentifier  # NOQA
        except ImportError:
            return False
        return True

    def _set_pii_identifier(self):
        """Store a PIIIdentifier on the instance, or None when unsupported."""
        identifier = None
        if SchemaWrapper.is_pii_supported():
            from pii_generator.components.pii_identifier import PIIIdentifier  # NOQA
            identifier = PIIIdentifier(env_config.pii_yaml_path)
        self.pii_identifier = identifier

    def __getitem__(self, table):
        """Look ``table`` up in the cache, registering it first on a miss."""
        cache_miss = table not in self.cache
        if cache_miss:
            log.info("table '{}' is not in the cache".format(table))
            self._fetch_schema_for_table(table)
        return self.cache[table]

    def _fetch_schema_for_table(self, table):
        """Register the table using its current SHOW CREATE statement.

        Registration is idempotent on the schematizer side, so this creates
        the schema if it is new and otherwise just populates the cache.
        """
        log.info("fetching schema for table '{}'".format(table))
        show_create = self.schema_tracker.get_show_create_statement(table)
        self.register_with_schema_store(
            table, new_create_table_stmt=show_create.query)

    def register_with_schema_store(
        self,
        table,
        new_create_table_stmt,
        old_create_table_stmt=None,
        alter_table_stmt=None
    ):
        """Register with the schema store and populate the cache.

        One entry point for both create and alter statements. When
        ``env_config.register_dry_run`` is set, only the dry-run placeholder
        is cached and the schematizer is not called.
        """
        log.info("registering {} with schema store".format(table))
        if env_config.register_dry_run:
            self.cache[table] = self._dry_run_schema
            return

        namespace = "{0}.{1}.{2}".format(
            env_config.namespace, table.cluster_name, table.database_name)
        # No identifier available means the table is flagged as PII-free.
        contains_pii = False
        if self.pii_identifier:
            contains_pii = self.pii_identifier.table_has_pii(
                database_name=table.database_name,
                table_name=table.table_name)

        table_stmt_kwargs = {
            'namespace': namespace,
            'source': table.table_name,
            'source_owner_email': self._notify_email,
            'contains_pii': contains_pii,
            'new_create_table_stmt': new_create_table_stmt
        }
        # Falsy optional statements are left out of the request entirely.
        if old_create_table_stmt:
            table_stmt_kwargs["old_create_table_stmt"] = old_create_table_stmt
        if alter_table_stmt:
            table_stmt_kwargs["alter_table_stmt"] = alter_table_stmt

        request_message = (
            "Calling schematizer_client.register_schema_from_mysql_stmts "
            "with kwargs: {}".format(table_stmt_kwargs)
        )
        log.debug(request_message)
        resp = self.schematizer_client.register_schema_from_mysql_stmts(
            **table_stmt_kwargs)
        response_message = (
            "Got response of {0} from schematizer for table: {1}".format(
                resp, table.table_name)
        )
        log.debug(response_message)
        self._populate_schema_cache(table, resp)

    def reset_cache(self):
        """Drop every cached entry by installing a new empty dict."""
        self.cache = {}

    def _populate_schema_cache(self, table, resp):
        """Cache the response's schema id with the transformable columns.

        Columns whose type starts with 'set', 'timestamp', 'datetime' or
        'time' make up the transformation map.
        """
        prefixes = ('set', 'timestamp', 'datetime', 'time')
        transformation_map = {}
        column_types = self.schema_tracker.get_column_type_map(table)
        for name, type_name in column_types.iteritems():
            if not type_name.startswith(prefixes):
                continue
            transformation_map[name] = type_name

        self.cache[table] = SchemaWrapperEntry(
            schema_id=resp.schema_id, transformation_map=transformation_map)

    @property
    def _dry_run_schema(self):
        """Entry used in dry-run mode: schema id 1, empty transformation map."""
        return SchemaWrapperEntry(schema_id=1, transformation_map={})
コード例 #4
0
 def __init__(self, *args, **kwargs):
     """Take 'register_dry_run' out of kwargs, then initialise the base.

     The remaining positional and keyword arguments go to the parent
     constructor; the schema tracker and the dump handler are built
     afterwards from ``self.db_connections``.

     Raises:
         KeyError: if 'register_dry_run' is not present in kwargs.
     """
     dry_run_flag = kwargs.pop('register_dry_run')
     self.register_dry_run = dry_run_flag
     super(SchemaEventHandler, self).__init__(*args, **kwargs)
     tracker = SchemaTracker(self.db_connections)
     self.schema_tracker = tracker
     dump_handler = MySQLDumpHandler(self.db_connections)
     self.mysql_dump_handler = dump_handler
コード例 #5
0
class SchemaEventHandler(BaseEventHandler):
    """Process all incoming schema changes.

    Each supported query is executed through the SchemaTracker; ALTER TABLE
    queries that do not rename the table are additionally registered with
    the schema store through ``self.schema_wrapper``.
    """

    def __init__(self, *args, **kwargs):
        """Pop 'register_dry_run' from kwargs and initialise the base class.

        Raises:
            KeyError: if 'register_dry_run' is missing from kwargs.
        """
        # Removed from kwargs before they are forwarded to the base class.
        self.register_dry_run = kwargs.pop('register_dry_run')
        super(SchemaEventHandler, self).__init__(*args, **kwargs)
        # NOTE(review): db_connections is presumably set by
        # BaseEventHandler.__init__ -- confirm.
        self.schema_tracker = SchemaTracker(self.db_connections)
        self.mysql_dump_handler = MySQLDumpHandler(self.db_connections)

    def handle_event(self, event, position):
        """Handles schema change queries. For queries that alter schema,
        it also registers the altered schemas with the schematizer.
        If the event is blacklisted or the query is skippable or the
        query statement is not supported, the method doesn't handle it.
        Args:
            event: The event containing the query
            position: The current position (for saving state)
        """
        statement = mysql_statement_factory(event.query)
        if self._event_can_be_skipped(event, statement):
            return
        query = event.query
        schema = event.schema

        logger.info("Processing supported query {q}".format(q=query))

        if self.stats_counter:
            self.stats_counter.increment(query)

        # Flush the producer and save its checkpoint position before the
        # schema change itself is applied.
        logger.info("Flushing all messages from producer and saving position")
        self.producer.flush()
        save_position(
            position_data=self.producer.get_checkpoint_position_data(),
            state_session=self.db_connections.state_session
        )

        if not self.mysql_dump_handler.mysql_dump_exists():
            # For first time schema event backup
            self.mysql_dump_handler.create_schema_dump()
            self.mysql_dump_handler.persist_schema_dump()

        if self._is_query_alter_and_not_rename_table(statement):
            # TODO: DATAPIPE-1963
            # Use the statement's own database name when the event carries
            # no (or only a blank) schema.
            if schema is None or not schema.strip():
                database_name = statement.database_name
            else:
                database_name = schema

            if self.is_blacklisted(event, database_name):
                # This blacklist check needs to be called again here, because if
                # the statement doesn't have a concrete schema assigned, we
                # won't know if it should be executed until this point.
                logger.info("Query {e} is blacklisted, skip processing".format(
                    e=event.query
                ))
                return

            table = Table(
                cluster_name=self.db_connections.source_cluster_name,
                database_name=database_name,
                table_name=statement.table
            )
            self._process_alter_table_event(
                query=query,
                table=table
            )

            self._checkpoint(
                position=position.to_dict(),
                event_type=EventType.SCHEMA_EVENT,
                cluster_name=table.cluster_name,
                database_name=table.database_name,
                table_name=table.table_name
            )
        else:
            # Every other supported statement is executed without being
            # registered; a rename first clears the schema cache.
            if self._does_query_rename_table(statement):
                logger.info(
                    "Rename query {q} detected, clearing schema cache".format(
                        q=query
                    )
                )
                self.schema_wrapper.reset_cache()

            database_name = self._get_db_for_statement(statement, schema)
            self._execute_query(query=query, database_name=database_name)

            # Note: the checkpoint records the event's schema, not the
            # database_name computed above (which is None for CREATE DATABASE).
            self._checkpoint(
                position=position.to_dict(),
                event_type=EventType.SCHEMA_EVENT,
                cluster_name=self.db_connections.source_cluster_name,
                database_name=schema,
                table_name=None
            )

    def _get_db_for_statement(self, statement, schema):
        """Return None for a CreateDatabaseStatement, otherwise ``schema``."""
        database_name = None if isinstance(statement, CreateDatabaseStatement) \
            else schema
        return database_name

    def _event_can_be_skipped(self, event, statement):
        """Return True when the event should not be processed.

        That is the case when the query is exactly 'BEGIN' or 'COMMIT', when
        the event is blacklisted for its schema, or when the statement
        reports itself as unsupported.
        """
        skippable_queries = {'BEGIN', 'COMMIT'}
        if event.query in skippable_queries:
            return True

        if self.is_blacklisted(event=event, schema=event.schema):
            return True

        if not statement.is_supported():
            logger.debug("The statement {s} is not supported".format(
                s=type(statement)
            ))
            return True
        return False

    def _process_alter_table_event(self, query, table):
        """
        This executes the alter table query and registers the query with
        the schematizer.
        Args:
            query: Has to be an AlterTable query
            table: Table on which the query has to be executed on
        """
        logger.info("Processing an alter table query {q}".format(q=query))
        # Capture SHOW CREATE output on both sides of the ALTER so the old
        # and new definitions can be registered together.
        table_before_processing = self.schema_tracker.get_show_create_statement(
            table=table
        )
        self._execute_query(query=query, database_name=table.database_name)
        table_after_processing = self.schema_tracker.get_show_create_statement(
            table=table
        )
        self.schema_wrapper.register_with_schema_store(
            table=table,
            new_create_table_stmt=table_after_processing.query,
            old_create_table_stmt=table_before_processing.query,
            alter_table_stmt=query
        )

    def _execute_query(self, query, database_name):
        """Run ``query`` through the schema tracker for ``database_name``."""
        self.schema_tracker.execute_query(
            query=query,
            database_name=database_name
        )

    def _checkpoint(
        self,
        position,
        event_type,
        cluster_name,
        database_name,
        table_name,
    ):
        """Create a schema dump, upsert the global event state, persist the dump.

        Returns:
            Whatever ``mysql_dump_handler.persist_schema_dump()`` returns.
        """
        # Split creating and persisting dump to minimize time between updated
        # global event state and new dump being saved.
        self.mysql_dump_handler.create_schema_dump()
        with self.db_connections.state_session.connect_begin(ro=False) as session:
            GlobalEventState.upsert(
                session=session,
                position=position,
                event_type=event_type,
                cluster_name=cluster_name,
                database_name=database_name,
                table_name=table_name
            )
        return self.mysql_dump_handler.persist_schema_dump()

    def _is_query_alter_and_not_rename_table(self, statement):
        """Return True for an AlterTableStatement that does not rename."""
        return isinstance(
            statement,
            AlterTableStatement
        ) and not statement.does_rename_table()

    def _does_query_rename_table(self, statement):
        """Return True for a renaming ALTER TABLE or a RenameTableStatement."""
        # `and` binds tighter than `or`:
        # (is AlterTableStatement and renames) or is RenameTableStatement.
        return isinstance(
            statement,
            AlterTableStatement
        ) and statement.does_rename_table() or isinstance(
            statement,
            RenameTableStatement
        )
コード例 #6
0
 def __init__(self, *args, **kwargs):
     """Extract the dry-run flag, initialise the parent, build the helpers.

     'register_dry_run' is removed from kwargs before the rest is passed to
     the parent constructor. The SchemaTracker and MySQLDumpHandler are
     created afterwards from ``self.db_connections``.

     Raises:
         KeyError: when kwargs has no 'register_dry_run' entry.
     """
     register_dry_run = kwargs.pop('register_dry_run')
     self.register_dry_run = register_dry_run
     super(SchemaEventHandler, self).__init__(*args, **kwargs)
     schema_tracker = SchemaTracker(self.db_connections)
     self.schema_tracker = schema_tracker
     mysql_dump_handler = MySQLDumpHandler(self.db_connections)
     self.mysql_dump_handler = mysql_dump_handler
コード例 #7
0
class SchemaEventHandler(BaseEventHandler):
    """Process all incoming schema changes.

    Each supported query is executed through the SchemaTracker; ALTER TABLE
    queries that do not rename the table are additionally registered with
    the schema store through ``self.schema_wrapper``.
    """
    def __init__(self, *args, **kwargs):
        """Pop 'register_dry_run' from kwargs and initialise the base class.

        Raises:
            KeyError: if 'register_dry_run' is missing from kwargs.
        """
        # Removed from kwargs before they are forwarded to the base class.
        self.register_dry_run = kwargs.pop('register_dry_run')
        super(SchemaEventHandler, self).__init__(*args, **kwargs)
        # NOTE(review): db_connections is presumably set by
        # BaseEventHandler.__init__ -- confirm.
        self.schema_tracker = SchemaTracker(self.db_connections)
        self.mysql_dump_handler = MySQLDumpHandler(self.db_connections)

    def handle_event(self, event, position):
        """Handles schema change queries. For queries that alter schema,
        it also registers the altered schemas with the schematizer.
        If the event is blacklisted or the query is skippable or the
        query statement is not supported, the method doesn't handle it.
        Args:
            event: The event containing the query
            position: The current position (for saving state)
        """
        statement = mysql_statement_factory(event.query)
        if self._event_can_be_skipped(event, statement):
            return

        query = event.query
        schema = event.schema

        logger.info("Processing supported query {q}".format(q=query))

        if self.stats_counter:
            self.stats_counter.increment(query)

        # Flush the producer and save its checkpoint position before the
        # schema change itself is applied.
        logger.info("Flushing all messages from producer and saving position")
        self.producer.flush()
        save_position(
            position_data=self.producer.get_checkpoint_position_data(),
            state_session=self.db_connections.state_session)

        # A schema dump is created and persisted before the query runs;
        # _checkpoint deletes the persisted dump afterwards.
        self.mysql_dump_handler.create_and_persist_schema_dump()

        if self._is_query_alter_and_not_rename_table(statement):
            # TODO: DATAPIPE-1963
            # Use the statement's own database name when the event carries
            # no (or only a blank) schema.
            if schema is None or not schema.strip():
                database_name = statement.database_name
            else:
                database_name = schema

            if self.is_blacklisted(event, database_name):
                # This blacklist check needs to be called again here, because if
                # the statement doesn't have a concrete schema assigned, we
                # won't know if it should be executed until this point.
                logger.info("Query {e} is blacklisted, skip processing".format(
                    e=event.query))
                return

            table = Table(cluster_name=self.db_connections.source_cluster_name,
                          database_name=database_name,
                          table_name=statement.table)
            self._process_alter_table_event(query=query, table=table)

            self._checkpoint(position=position.to_dict(),
                             event_type=EventType.SCHEMA_EVENT,
                             cluster_name=table.cluster_name,
                             database_name=table.database_name,
                             table_name=table.table_name)
        else:
            # Every other supported statement is executed without being
            # registered; a rename first clears the schema cache.
            if self._does_query_rename_table(statement):
                logger.info(
                    "Rename query {q} detected, clearing schema cache".format(
                        q=query))
                self.schema_wrapper.reset_cache()

            database_name = self._get_db_for_statement(statement, schema)
            self._execute_query(query=query, database_name=database_name)

            # Note: the checkpoint records the event's schema, not the
            # database_name computed above (which is None for CREATE DATABASE).
            self._checkpoint(
                position=position.to_dict(),
                event_type=EventType.SCHEMA_EVENT,
                cluster_name=self.db_connections.source_cluster_name,
                database_name=schema,
                table_name=None)

    def _get_db_for_statement(self, statement, schema):
        """Return None for a CreateDatabaseStatement, otherwise ``schema``."""
        database_name = None if isinstance(statement, CreateDatabaseStatement) \
            else schema
        return database_name

    def _event_can_be_skipped(self, event, statement):
        """Return True when the event should not be processed.

        That is the case when the query is exactly 'BEGIN' or 'COMMIT', when
        the event is blacklisted for its schema, or when the statement
        reports itself as unsupported.
        """
        skippable_queries = {'BEGIN', 'COMMIT'}
        if event.query in skippable_queries:
            return True

        if self.is_blacklisted(event=event, schema=event.schema):
            return True

        if not statement.is_supported():
            logger.debug(
                "The statement {s} is not supported".format(s=type(statement)))
            return True
        return False

    def _process_alter_table_event(self, query, table):
        """
        This executes the alter table query and registers the query with
        the schematizer.
        Args:
            query: Has to be an AlterTable query
            table: Table on which the query has to be executed on
        """
        logger.info("Processing an alter table query {q}".format(q=query))
        # Capture SHOW CREATE output on both sides of the ALTER so the old
        # and new definitions can be registered together.
        table_before_processing = self.schema_tracker.get_show_create_statement(
            table=table)
        self._execute_query(query=query, database_name=table.database_name)
        table_after_processing = self.schema_tracker.get_show_create_statement(
            table=table)
        self.schema_wrapper.register_with_schema_store(
            table=table,
            new_create_table_stmt=table_after_processing.query,
            old_create_table_stmt=table_before_processing.query,
            alter_table_stmt=query)

    def _execute_query(self, query, database_name):
        """Run ``query`` through the schema tracker for ``database_name``."""
        self.schema_tracker.execute_query(query=query,
                                          database_name=database_name)

    def _checkpoint(
        self,
        position,
        event_type,
        cluster_name,
        database_name,
        table_name,
    ):
        """Upsert the global event state and delete the persisted dump.

        Both calls are made inside the same ``connect_begin(ro=False)``
        block and share its session object.
        """
        with self.db_connections.state_session.connect_begin(
                ro=False) as session:
            GlobalEventState.upsert(session=session,
                                    position=position,
                                    event_type=event_type,
                                    cluster_name=cluster_name,
                                    database_name=database_name,
                                    table_name=table_name)
            # NOTE(review): sharing the session presumably makes the state
            # update and the dump deletion commit together -- confirm against
            # connect_begin's semantics.
            self.mysql_dump_handler.delete_persisted_dump(
                active_session=session)

    def _is_query_alter_and_not_rename_table(self, statement):
        """Return True for an AlterTableStatement that does not rename."""
        return isinstance(
            statement,
            AlterTableStatement) and not statement.does_rename_table()

    def _does_query_rename_table(self, statement):
        """Return True for a renaming ALTER TABLE or a RenameTableStatement."""
        # `and` binds tighter than `or`:
        # (is AlterTableStatement and renames) or is RenameTableStatement.
        return isinstance(statement, AlterTableStatement
                          ) and statement.does_rename_table() or isinstance(
                              statement, RenameTableStatement)
コード例 #8
0
 def base_schema_tracker(self, mock_db_connections):
     """Build a SchemaTracker on top of the supplied mock connections."""
     tracker = SchemaTracker(mock_db_connections)
     return tracker