    def test_recovery_from_schema_dump(self, create_table_query,
                                       setup_db_and_get_cursor,
                                       mock_db_connections):
        """Inserts two table schemas in schema tracker db. Then tests if the
        dump is created successfully and is persisted in the state db.
        Then deletes one table and checks if the recovery process works.
        """
        tracker_cursor = setup_db_and_get_cursor
        mock_mysql_dump_handler = MySQLDumpHandler(mock_db_connections)
        dump_exists = mock_mysql_dump_handler.mysql_dump_exists()
        assert not dump_exists

        mock_mysql_dump_handler.create_schema_dump()
        mock_mysql_dump_handler.persist_schema_dump()
        dump_exists = mock_mysql_dump_handler.mysql_dump_exists()
        assert dump_exists

        tracker_cursor.execute('drop table one')
        tracker_cursor.execute('show tables')
        all_tables = tracker_cursor.fetchall()
        assert ('one', ) not in all_tables
        assert ('two', ) in all_tables

        mock_mysql_dump_handler.recover()
        tracker_cursor.execute('show tables')
        all_tables = tracker_cursor.fetchall()
        assert ('one', ) in all_tables
        assert ('two', ) in all_tables

        self.cleanup(mock_mysql_dump_handler, mock_db_connections)
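The fixtures used by these tests (create_table_query, mock_db_connections, setup_db_and_get_cursor) are not shown on this page. A minimal sketch of what the setup fixture might provide, assuming create_table_query is a template with a {table_name} placeholder and that a get_tracker_cursor() helper exists analogous to the get_source_cursor() used by RecoveryHandler further down; the real fixtures may differ:

import pytest


@pytest.fixture
def setup_db_and_get_cursor(mock_db_connections, create_table_query):
    # Hypothetical setup: create the two tables ('one' and 'two') in the
    # schema tracker database so the dump/recovery tests have schemas to
    # capture, then hand the open cursor to the test.
    with mock_db_connections.get_tracker_cursor() as cursor:
        cursor.execute(create_table_query.format(table_name='one'))
        cursor.execute(create_table_query.format(table_name='two'))
        yield cursor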
    def test_double_create_dump(self, create_table_query, mock_db_connections,
                                setup_db_and_get_cursor):
        mock_mysql_dump_handler = MySQLDumpHandler(mock_db_connections)
        mock_mysql_dump_handler.create_schema_dump()
        dump_exists = mock_mysql_dump_handler.mysql_dump_exists()
        assert not dump_exists
        with pytest.raises(ValueError):
            mock_mysql_dump_handler.create_schema_dump()
    def test_create_and_persist_dump(self, create_table_query,
                                     mock_db_connections,
                                     setup_db_and_get_cursor):
        mock_mysql_dump_handler = MySQLDumpHandler(mock_db_connections)

        dump_exists = mock_mysql_dump_handler.mysql_dump_exists()
        assert not dump_exists

        mock_mysql_dump_handler.create_schema_dump()
        mock_mysql_dump_handler.persist_schema_dump()
        dump_exists = mock_mysql_dump_handler.mysql_dump_exists()
        assert dump_exists

        self.cleanup(mock_mysql_dump_handler, mock_db_connections)
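Taken together, the dump tests on this page pin down the MySQLDumpHandler contract. The stand-in below illustrates that contract only; the real implementation shells out to mysqldump and stores the dump in the state db, and everything here beyond the method names and the ValueError behavior is an assumption:

class MySQLDumpHandlerSketch(object):
    def __init__(self, db_connections):
        self.db_connections = db_connections
        self._pending_dump = None    # created but not yet persisted
        self._persisted_dump = None  # stand-in for the state-db record

    def create_schema_dump(self):
        # Creating a second dump without persisting the first is an
        # error (see test_double_create_dump above).
        if self._pending_dump is not None:
            raise ValueError('a schema dump is already pending')
        self._pending_dump = '-- mysqldump output placeholder --'

    def persist_schema_dump(self):
        # Persisting with no pending dump is an error (see
        # test_persist_with_no_dump below). Persisting replaces any
        # earlier dump, so at most one is ever stored
        # (see test_create_two_dumps below).
        if self._pending_dump is None:
            raise ValueError('no schema dump to persist')
        self._persisted_dump, self._pending_dump = self._pending_dump, None

    def mysql_dump_exists(self):
        # True only once a dump has been persisted to the state db.
        return self._persisted_dump is not None

    def recover(self):
        # Replays the persisted dump against the tracker db; the dump
        # stays in place until explicitly deleted.
        assert self._persisted_dump is not None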
class SchemaEventHandler(BaseEventHandler):
    """Process all incoming schema changes
    """

    def __init__(self, *args, **kwargs):
        self.register_dry_run = kwargs.pop('register_dry_run')
        super(SchemaEventHandler, self).__init__(*args, **kwargs)
        self.schema_tracker = SchemaTracker(self.db_connections)
        self.mysql_dump_handler = MySQLDumpHandler(self.db_connections)

    def handle_event(self, event, position):
        """Handles schema change queries. For queries that alter schema,
        it also registers the altered schemas with the schematizer.
        If the event is blacklisted, the query is skippable, or the
        statement is not supported, the method skips the event.
        Args:
            event: The event containing the query
            position: The current position (for saving state)
        """
        statement = mysql_statement_factory(event.query)
        if self._event_can_be_skipped(event, statement):
            return
        query = event.query
        schema = event.schema

        logger.info("Processing supported query {q}".format(q=query))

        if self.stats_counter:
            self.stats_counter.increment(query)

        logger.info("Flushing all messages from producer and saving position")
        self.producer.flush()
        save_position(
            position_data=self.producer.get_checkpoint_position_data(),
            state_session=self.db_connections.state_session
        )

        if not self.mysql_dump_handler.mysql_dump_exists():
            # For first time schema event backup
            self.mysql_dump_handler.create_schema_dump()
            self.mysql_dump_handler.persist_schema_dump()

        if self._is_query_alter_and_not_rename_table(statement):
            # TODO: DATAPIPE-1963
            if schema is None or not schema.strip():
                database_name = statement.database_name
            else:
                database_name = schema

            if self.is_blacklisted(event, database_name):
                # This blacklist check needs to be called again here, because if
                # the statement doesn't have a concrete schema assigned, we
                # won't know if it should be executed until this point.
                logger.info("Query {e} is blacklisted, skip processing".format(
                    e=event.query
                ))
                return

            table = Table(
                cluster_name=self.db_connections.source_cluster_name,
                database_name=database_name,
                table_name=statement.table
            )
            self._process_alter_table_event(
                query=query,
                table=table
            )

            self._checkpoint(
                position=position.to_dict(),
                event_type=EventType.SCHEMA_EVENT,
                cluster_name=table.cluster_name,
                database_name=table.database_name,
                table_name=table.table_name
            )
        else:
            if self._does_query_rename_table(statement):
                logger.info(
                    "Rename query {q} detected, clearing schema cache".format(
                        q=query
                    )
                )
                self.schema_wrapper.reset_cache()

            database_name = self._get_db_for_statement(statement, schema)
            self._execute_query(query=query, database_name=database_name)

            self._checkpoint(
                position=position.to_dict(),
                event_type=EventType.SCHEMA_EVENT,
                cluster_name=self.db_connections.source_cluster_name,
                database_name=schema,
                table_name=None
            )

    def _get_db_for_statement(self, statement, schema):
        database_name = None if isinstance(statement, CreateDatabaseStatement) \
            else schema
        return database_name

    def _event_can_be_skipped(self, event, statement):
        skippable_queries = {'BEGIN', 'COMMIT'}
        if event.query in skippable_queries:
            return True

        if self.is_blacklisted(event=event, schema=event.schema):
            return True

        if not statement.is_supported():
            logger.debug("The statement {s} is not supported".format(
                s=type(statement)
            ))
            return True
        return False

    def _process_alter_table_event(self, query, table):
        """
        This executes the alter table query and registers the query with
        the schematizer.
        Args:
            query: Has to be an ALTER TABLE query
            table: Table on which the query is to be executed
        """
        logger.info("Processing an alter table query {q}".format(q=query))
        table_before_processing = self.schema_tracker.get_show_create_statement(
            table=table
        )
        self._execute_query(query=query, database_name=table.database_name)
        table_after_processing = self.schema_tracker.get_show_create_statement(
            table=table
        )
        self.schema_wrapper.register_with_schema_store(
            table=table,
            new_create_table_stmt=table_after_processing.query,
            old_create_table_stmt=table_before_processing.query,
            alter_table_stmt=query
        )

    def _execute_query(self, query, database_name):
        self.schema_tracker.execute_query(
            query=query,
            database_name=database_name
        )

    def _checkpoint(
        self,
        position,
        event_type,
        cluster_name,
        database_name,
        table_name,
    ):
        # Creating and persisting the dump are done in separate steps to
        # minimize the time between the global event state being updated and
        # the new dump being saved.
        self.mysql_dump_handler.create_schema_dump()
        with self.db_connections.state_session.connect_begin(ro=False) as session:
            GlobalEventState.upsert(
                session=session,
                position=position,
                event_type=event_type,
                cluster_name=cluster_name,
                database_name=database_name,
                table_name=table_name
            )
        return self.mysql_dump_handler.persist_schema_dump()

    def _is_query_alter_and_not_rename_table(self, statement):
        return isinstance(
            statement,
            AlterTableStatement
        ) and not statement.does_rename_table()

    def _does_query_rename_table(self, statement):
        return (
            isinstance(statement, AlterTableStatement) and
            statement.does_rename_table()
        ) or isinstance(statement, RenameTableStatement)
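A hedged sketch of how handle_event might be driven. FakeQueryEvent is a stand-in for the binlog query event (the handler only reads .query and .schema from it), and schema_event_handler and position are assumed to exist already: the handler as constructed above, and the position object yielded by the stream (handle_event only calls position.to_dict() on it):

from collections import namedtuple

FakeQueryEvent = namedtuple('FakeQueryEvent', ['query', 'schema'])

event = FakeQueryEvent(
    query='ALTER TABLE business ADD COLUMN phone VARCHAR(32)',
    schema='yelp',
)
schema_event_handler.handle_event(event, position)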
class SchemaEventHandler(BaseEventHandler):
    """Process all incoming schema changes
    """
    def __init__(self, *args, **kwargs):
        self.register_dry_run = kwargs.pop('register_dry_run')
        super(SchemaEventHandler, self).__init__(*args, **kwargs)
        self.schema_tracker = SchemaTracker(self.db_connections)
        self.mysql_dump_handler = MySQLDumpHandler(self.db_connections)

    def handle_event(self, event, position):
        """Handles schema change queries. For queries that alter schema,
        it also registers the altered schemas with the schematizer.
        If the event is blacklisted, the query is skippable, or the
        statement is not supported, the method skips the event.
        Args:
            event: The event containing the query
            position: The current position (for saving state)
        """
        statement = mysql_statement_factory(event.query)
        if self._event_can_be_skipped(event, statement):
            return

        query = event.query
        schema = event.schema

        logger.info("Processing supported query {q}".format(q=query))

        if self.stats_counter:
            self.stats_counter.increment(query)

        logger.info("Flushing all messages from producer and saving position")
        self.producer.flush()
        save_position(
            position_data=self.producer.get_checkpoint_position_data(),
            state_session=self.db_connections.state_session)

        self.mysql_dump_handler.create_and_persist_schema_dump()

        if self._is_query_alter_and_not_rename_table(statement):
            # TODO: DATAPIPE-1963
            if schema is None or not schema.strip():
                database_name = statement.database_name
            else:
                database_name = schema

            if self.is_blacklisted(event, database_name):
                # This blacklist check needs to be called again here, because if
                # the statement doesn't have a concrete schema assigned, we
                # won't know if it should be executed until this point.
                logger.info("Query {e} is blacklisted, skip processing".format(
                    e=event.query))
                return

            table = Table(cluster_name=self.db_connections.source_cluster_name,
                          database_name=database_name,
                          table_name=statement.table)
            self._process_alter_table_event(query=query, table=table)

            self._checkpoint(position=position.to_dict(),
                             event_type=EventType.SCHEMA_EVENT,
                             cluster_name=table.cluster_name,
                             database_name=table.database_name,
                             table_name=table.table_name)
        else:
            if self._does_query_rename_table(statement):
                logger.info(
                    "Rename query {q} detected, clearing schema cache".format(
                        q=query))
                self.schema_wrapper.reset_cache()

            database_name = self._get_db_for_statement(statement, schema)
            self._execute_query(query=query, database_name=database_name)

            self._checkpoint(
                position=position.to_dict(),
                event_type=EventType.SCHEMA_EVENT,
                cluster_name=self.db_connections.source_cluster_name,
                database_name=schema,
                table_name=None)

    def _get_db_for_statement(self, statement, schema):
        database_name = None if isinstance(statement, CreateDatabaseStatement) \
            else schema
        return database_name

    def _event_can_be_skipped(self, event, statement):
        skippable_queries = {'BEGIN', 'COMMIT'}
        if event.query in skippable_queries:
            return True

        if self.is_blacklisted(event=event, schema=event.schema):
            return True

        if not statement.is_supported():
            logger.debug(
                "The statement {s} is not supported".format(s=type(statement)))
            return True
        return False

    def _process_alter_table_event(self, query, table):
        """
        This executes the alter table query and registers the query with
        the schematizer.
        Args:
            query: Has to be an ALTER TABLE query
            table: Table on which the query is to be executed
        """
        logger.info("Processing an alter table query {q}".format(q=query))
        table_before_processing = self.schema_tracker.get_show_create_statement(
            table=table)
        self._execute_query(query=query, database_name=table.database_name)
        table_after_processing = self.schema_tracker.get_show_create_statement(
            table=table)
        self.schema_wrapper.register_with_schema_store(
            table=table,
            new_create_table_stmt=table_after_processing.query,
            old_create_table_stmt=table_before_processing.query,
            alter_table_stmt=query)

    def _execute_query(self, query, database_name):
        self.schema_tracker.execute_query(query=query,
                                          database_name=database_name)

    def _checkpoint(
        self,
        position,
        event_type,
        cluster_name,
        database_name,
        table_name,
    ):
        with self.db_connections.state_session.connect_begin(
                ro=False) as session:
            GlobalEventState.upsert(session=session,
                                    position=position,
                                    event_type=event_type,
                                    cluster_name=cluster_name,
                                    database_name=database_name,
                                    table_name=table_name)
            self.mysql_dump_handler.delete_persisted_dump(
                active_session=session)

    def _is_query_alter_and_not_rename_table(self, statement):
        return isinstance(
            statement,
            AlterTableStatement) and not statement.does_rename_table()

    def _does_query_rename_table(self, statement):
        return (
            isinstance(statement, AlterTableStatement) and
            statement.does_rename_table()
        ) or isinstance(statement, RenameTableStatement)
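This variant differs from the previous SchemaEventHandler mainly in where the schema dump sits in the event lifecycle. Reduced to annotated Python, grounded in the handle_event and _checkpoint bodies shown above:

def dump_lifecycle_v1(handler):
    # First variant: dump lazily, only when no dump is persisted yet.
    if not handler.mysql_dump_handler.mysql_dump_exists():
        handler.mysql_dump_handler.create_schema_dump()
        handler.mysql_dump_handler.persist_schema_dump()
    # ...apply the statement; then _checkpoint takes a fresh dump,
    # upserts the global event state, and persists the new dump, so the
    # stored dump always reflects the schema after the last completed
    # event.


def dump_lifecycle_v2(handler):
    # Second variant: dump and persist before every schema event.
    handler.mysql_dump_handler.create_and_persist_schema_dump()
    # ...apply the statement; then _checkpoint upserts the global event
    # state and deletes the persisted dump inside the same state-session
    # transaction, so a dump only survives an event that never finished,
    # which is exactly the case RecoveryHandler.need_recovery looks for.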
Example #9
class RecoveryHandler(object):
    """ This class handles the recovery process, including recreate table and position
    stream to correct offset, and publish left over messages. When recover process finishes,
    the stream should be ready to be consumed.

    Args:
      stream(SimpleBinlogStreamReaderWrapper object): a stream reader
      producer(data_pipe.producer.Producer object): producer object from data pipeline, since
        we might need to publish unpublished messages.
      schema_wrapper(SchemaWrapper object): a wrapper for communication with schematizer.
      db_connections(BaseConnection object): a wrapper for communication with all Databases.
      is_clean_shutdown(boolean): whether the last operation was cleanly stopped.
      pending_schema_event(SchemaEventState object): schema event that has a pending state
      register_dry_run(boolean): whether a schema has to be registered for a message to be published.
      publish_dry_run(boolean): whether actually publishing a message or not.
      changelog_mode(boolean): If True, executes change_log flow (default: false)
    """

    def __init__(
        self,
        stream,
        producer,
        schema_wrapper,
        db_connections,
        is_clean_shutdown=False,
        register_dry_run=False,
        publish_dry_run=False,
        changelog_mode=False,
        gtid_enabled=False
    ):
        self.db_connections = db_connections
        log.info("Recovery Handler Starting: %s" % json.dumps(dict(
            is_clean_shutdown=is_clean_shutdown,
            source_cluster_name=self.db_connections.source_cluster_name,
            register_dry_run=register_dry_run,
            publish_dry_run=publish_dry_run,
            changelog_mode=changelog_mode,
            gtid_enabled=gtid_enabled
        )))

        self.stream = stream
        self.producer = producer
        self.is_clean_shutdown = is_clean_shutdown
        self.register_dry_run = register_dry_run
        self.publish_dry_run = publish_dry_run
        self.schema_wrapper = schema_wrapper
        self.latest_source_log_position = self.get_latest_source_log_position()
        self.changelog_mode = changelog_mode
        self.gtid_enabled = gtid_enabled
        self.transaction_id_schema_id = get_transaction_id_schema_id(gtid_enabled)
        self.changelog_schema_wrapper = self._get_changelog_schema_wrapper()
        self.mysql_dump_handler = MySQLDumpHandler(db_connections)

    @property
    def need_recovery(self):
        """Determine if recovery procedure is needed.
        """
        return not self.is_clean_shutdown or self.mysql_dump_handler.mysql_dump_exists()

    def _get_changelog_schema_wrapper(self):
        """Get schema wrapper object for changelog flow. Note schema wrapper
        for this flow is independent of event (and hence, independent of table)
        """
        if not self.changelog_mode:
            return None
        change_log_data_event_handler = ChangeLogDataEventHandler(
            db_connections=self.db_connections,
            producer=self.producer,
            schema_wrapper=self.schema_wrapper,
            stats_counter=None,
            register_dry_run=self.register_dry_run,
            gtid_enabled=self.gtid_enabled
        )
        return change_log_data_event_handler.schema_wrapper_entry

    def get_latest_source_log_position(self):
        with self.db_connections.get_source_cursor() as cursor:
            cursor.execute("show master status")
            result = cursor.fetchone()
        # result is a tuple with file name at pos 0, and position at pos 1.
        log.info("The latest master log position is {log_file}: {log_pos}".format(
            log_file=result[0],
            log_pos=result[1],
        ))
        return LogPosition(log_file=result[0], log_pos=result[1])

    def recover(self):
        """ Handles the recovery procedure. """
        if self.mysql_dump_handler.mysql_dump_exists():
            self.mysql_dump_handler.recover()
        self._handle_unclean_shutdown()

    def _handle_unclean_shutdown(self):
        if not self.is_clean_shutdown:
            self._recover_from_unclean_shutdown(self.stream)

    def _recover_from_unclean_shutdown(self, stream):
        events = []
        log.info("Recovering from unclean shutdown.")
        while len(events) < env_config.recovery_queue_size:
            event = stream.peek().event
            if not isinstance(event, DataEvent):
                if self._is_unsupported_query_event(event):
                    stream.next()
                    continue
                # On encountering a supported non-data event, stop accumulating more events.
                log.info("Recovery halted for non-data event: %s %s" % (
                    repr(event), event.query
                ))
                break
            log.info("Recovery event for %s" % event.table)
            replication_handler_event = stream.next()
            events.append(replication_handler_event)
            if self._already_caught_up(replication_handler_event):
                break
        log.info("Recovering with %s events" % len(events))
        if events:
            self._ensure_message_published_and_checkpoint(events)

    def _ensure_message_published_and_checkpoint(self, events):
        topic_offsets = self._get_topic_offsets_map_for_cluster()
        messages = self._build_messages(events)
        self.producer.ensure_messages_published(messages, topic_offsets)
        position_data = self.producer.get_checkpoint_position_data()
        save_position(
            state_session=self.db_connections.state_session,
            position_data=position_data
        )

    def _already_caught_up(self, rh_event):
        # when we catch up with the latest position, we should stop accumulating more events.
        if (
            rh_event.position.log_file == self.latest_source_log_position.log_file and
            rh_event.position.log_pos >= self.latest_source_log_position.log_pos
        ):
            log.info("We caught up with real time, halt recovery.")
            return True
        return False

    def _is_unsupported_query_event(self, event):
        if (
            isinstance(event, QueryEvent) and
            not mysql_statement_factory(event.query).is_supported()
        ):
            log.info("Filtered unsupported query event: {} {}".format(
                repr(event),
                event.query
            ))
            return True
        return False

    def _get_schema_wrapper(self, event):
        """Get schema wrapper object for the current event.
        """
        table = Table(
            cluster_name=self.db_connections.source_cluster_name,
            table_name=event.event.table,
            database_name=event.event.schema
        )
        return self.schema_wrapper[table]

    def _build_messages(self, events):
        messages = []
        Builder = (MessageBuilder
                   if not self.changelog_mode else ChangeLogMessageBuilder)
        for event in events:
            # event here is ReplicationHandlerEvent
            schema_wrapper = (self._get_schema_wrapper(event)
                              if not self.changelog_mode else self.changelog_schema_wrapper)
            builder = Builder(
                schema_wrapper,
                event.event,
                self.transaction_id_schema_id,
                event.position,
                self.register_dry_run,
            )

            messages.append(builder.build_message(
                self.db_connections.source_cluster_name
            ))
        return messages

    def _get_topic_offsets_map_for_cluster(self):
        with self.db_connections.state_session.connect_begin(ro=True) as session:
            topic_offsets = DataEventCheckpoint.get_topic_to_kafka_offset_map(
                session,
                self.db_connections.source_cluster_name
            )
        return topic_offsets
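A usage sketch of RecoveryHandler at replication-handler startup, assuming the collaborators (stream, producer, schema_wrapper, db_connections) have been built elsewhere; the keyword values are illustrative:

recovery_handler = RecoveryHandler(
    stream=stream,
    producer=producer,
    schema_wrapper=schema_wrapper,
    db_connections=db_connections,
    is_clean_shutdown=False,  # e.g. read back from saved global state
)
# recover() replays any persisted schema dump and, after an unclean
# shutdown, re-publishes the unpublished tail of the binlog stream.
if recovery_handler.need_recovery:
    recovery_handler.recover()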
    def test_persist_with_no_dump(self, create_table_query,
                                  mock_db_connections,
                                  setup_db_and_get_cursor):
        mock_mysql_dump_handler = MySQLDumpHandler(mock_db_connections)
        with pytest.raises(ValueError):
            mock_mysql_dump_handler.persist_schema_dump()
    def test_create_two_dumps(self, create_table_query, mock_db_connections,
                              setup_db_and_get_cursor):
        mock_mysql_dump_handler = MySQLDumpHandler(mock_db_connections)
        dump_exists = mock_mysql_dump_handler.mysql_dump_exists()
        assert not dump_exists

        mock_mysql_dump_handler.create_schema_dump()
        mock_mysql_dump_handler.persist_schema_dump()
        dump_exists = mock_mysql_dump_handler.mysql_dump_exists()
        assert dump_exists

        mock_mysql_dump_handler.recover()
        dump_exists = mock_mysql_dump_handler.mysql_dump_exists()
        assert dump_exists

        mock_mysql_dump_handler.create_schema_dump()
        mock_mysql_dump_handler.persist_schema_dump()
        # Creating another dump should cause us to only have one dump
        assert self.get_number_of_dumps(mock_db_connections) == 1

        self.cleanup(mock_mysql_dump_handler, mock_db_connections)
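The cleanup and get_number_of_dumps helpers these tests call are not shown on this page. A hypothetical sketch, assuming the persisted dump is stored as rows of a MySQLDumps model in the state db (the model name is an assumption); the session helpers mirror the connect_begin usage above:

    def get_number_of_dumps(self, db_connections):
        # Hypothetical: count persisted dump rows in the state db.
        with db_connections.state_session.connect_begin(ro=True) as session:
            return session.query(MySQLDumps).count()

    def cleanup(self, mysql_dump_handler, db_connections):
        # Hypothetical: delete the persisted dump so later tests start
        # from a clean state.
        with db_connections.state_session.connect_begin(ro=False) as session:
            mysql_dump_handler.delete_persisted_dump(active_session=session)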