Code example #1
File: interface.py  Project: tx2z/monger
    def retry_until_wtimeout(self, insert_fn):
        """Retry until wtimeout reached.

        Given a callback function representing an insert operation on
        the primary, handle any connection failures, and keep retrying
        the operation for up to 'AWAIT_REPL_TIMEOUT_MINS' minutes.

        The insert operation callback should take an argument for the
        number of remaining seconds to provide as the timeout for the
        operation.
        """

        deadline = time.time() + ReplFixture.AWAIT_REPL_TIMEOUT_MINS * 60

        while True:
            try:
                remaining = deadline - time.time()
                insert_fn(remaining)
                break
            except pymonger.errors.ConnectionFailure:
                remaining = deadline - time.time()
                if remaining <= 0.0:
                    message = "Failed to connect to {} within {} minutes".format(
                        self.get_driver_connection_url(), ReplFixture.AWAIT_REPL_TIMEOUT_MINS)
                    self.logger.error(message)
                    raise errors.ServerFailure(message)
            except pymonger.errors.WTimeoutError:
                message = "Replication of write operation timed out."
                self.logger.error(message)
                raise errors.ServerFailure(message)
            except pymonger.errors.PyMongerError as err:
                message = "Write operation on {} failed: {}".format(
                    self.get_driver_connection_url(), err)
                raise errors.ServerFailure(message)
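
The page does not show a caller for retry_until_wtimeout. Below is a minimal sketch of how a fixture might supply the 'insert_fn' callback, assuming pymonger mirrors PyMongo's write-concern API; the method name, database, and collection names are hypothetical:

    def _insert_with_deadline(self, client, num_nodes):
        # Illustrative only: insert one document with a deadline-aware wtimeout.
        def insert_fn(remaining_secs):
            # Convert the remaining seconds into a wtimeout (milliseconds) so the
            # w=<num_nodes> write concern honours the overall deadline.
            coll = client.resmoke.await_repl.with_options(
                write_concern=pymonger.WriteConcern(
                    w=num_nodes, wtimeout=int(remaining_secs * 1000)))
            coll.insert_one({"await_repl": 1})

        self.retry_until_wtimeout(insert_fn)
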
Code example #2
File: shardedcluster.py  Project: tx2z/monger
    def _do_teardown(self):
        if self.mongers is None:
            self.logger.warning("The mongers fixture has not been set up yet.")
            return  # Teardown is still a success even if nothing is running.

        self.logger.info("Stopping mongers on port %d with pid %d...",
                         self.port, self.mongers.pid)
        if not self.is_running():
            exit_code = self.mongers.poll()
            msg = (
                "mongers on port {:d} was expected to be running, but wasn't. "
                "Process exited with code {:d}").format(self.port, exit_code)
            self.logger.warning(msg)
            raise errors.ServerFailure(msg)

        self.mongers.stop()
        exit_code = self.mongers.wait()

        if exit_code == 0:
            self.logger.info(
                "Successfully stopped the mongers on port {:d}".format(
                    self.port))
        else:
            self.logger.warning("Stopped the mongers on port {:d}. "
                                "Process exited with code {:d}.".format(
                                    self.port, exit_code))
            raise errors.ServerFailure(
                "mongers on port {:d} with pid {:d} exited with code {:d}".
                format(self.port, self.mongers.pid, exit_code))
Code example #3
    def await_ready(self):
        """Block until the fixture can be used for testing."""
        deadline = time.time() + MongerDFixture.AWAIT_READY_TIMEOUT_SECS

        # Wait until the mongerd is accepting connections. The retry logic is necessary to support
        # versions of PyMonger <3.0 that immediately raise a ConnectionFailure if a connection cannot
        # be established.
        while True:
            # Check whether the mongerd exited for some reason.
            exit_code = self.mongerd.poll()
            if exit_code is not None:
                raise errors.ServerFailure(
                    "Could not connect to mongerd on port {}, process ended"
                    " unexpectedly with code {}.".format(self.port, exit_code))

            try:
                # Use a shorter connection timeout to more closely satisfy the requested deadline.
                client = self.monger_client(timeout_millis=500)
                client.admin.command("ping")
                break
            except pymonger.errors.ConnectionFailure:
                remaining = deadline - time.time()
                if remaining <= 0.0:
                    raise errors.ServerFailure(
                        "Failed to connect to mongerd on port {} after {} seconds"
                        .format(self.port,
                                MongerDFixture.AWAIT_READY_TIMEOUT_SECS))

                self.logger.info("Waiting to connect to mongerd on port %d.",
                                 self.port)
                time.sleep(0.1)  # Wait a little bit before trying again.

        self.logger.info("Successfully contacted the mongerd on port %d.",
                         self.port)
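
The monger_client(timeout_millis=500) helper used above is not shown on this page. A hypothetical sketch of what it might do, assuming pymonger exposes PyMongo's MongoClient and its standard timeout options:

    def monger_client(self, timeout_millis=30 * 1000):
        # Build a client whose connection attempts are bounded by 'timeout_millis'
        # so callers such as await_ready() can poll on a short interval.
        kwargs = {
            "connectTimeoutMS": timeout_millis,
            "serverSelectionTimeoutMS": timeout_millis,
        }
        return pymonger.MongoClient(host="localhost", port=self.port, **kwargs)
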
Code example #4
File: shardedcluster.py  Project: tx2z/monger
    def setup(self):
        """Set up the sharded cluster."""
        if "port" not in self.mongers_options:
            self.mongers_options[
                "port"] = core.network.PortAllocator.next_fixture_port(
                    self.job_num)
        self.port = self.mongers_options["port"]

        mongers = core.programs.mongers_program(
            self.logger,
            executable=self.mongers_executable,
            **self.mongers_options)
        try:
            self.logger.info("Starting mongers on port %d...\n%s", self.port,
                             mongers.as_command())
            mongers.start()
            self.logger.info("mongers started on port %d with pid %d.",
                             self.port, mongers.pid)
        except Exception as err:
            msg = "Failed to start mongers on port {:d}: {}".format(
                self.port, err)
            self.logger.exception(msg)
            raise errors.ServerFailure(msg)

        self.mongers = mongers
Code example #5
File: shardedcluster.py  Project: tx2z/monger
    def _do_teardown(self):
        """Shut down the sharded cluster."""
        self.logger.info("Stopping all members of the sharded cluster...")

        running_at_start = self.is_running()
        if not running_at_start:
            self.logger.warning(
                "All members of the sharded cluster were expected to be running, "
                "but weren't.")

        if self.enable_balancer:
            self.stop_balancer()

        teardown_handler = interface.FixtureTeardownHandler(self.logger)

        if self.configsvr is not None:
            teardown_handler.teardown(self.configsvr, "config server")

        for mongers in self.mongers:
            teardown_handler.teardown(mongers, "mongers")

        for shard in self.shards:
            teardown_handler.teardown(shard, "shard")

        if teardown_handler.was_successful():
            self.logger.info(
                "Successfully stopped all members of the sharded cluster.")
        else:
            self.logger.error("Stopping the sharded cluster fixture failed.")
            raise errors.ServerFailure(teardown_handler.get_error_message())
Code example #6
    def _check_secondaries_and_restart_fixture(self):
        preserve_dbpaths = []
        for node in self.fixture.nodes:
            preserve_dbpaths.append(node.preserve_dbpath)
            node.preserve_dbpath = True

        for secondary in self.fixture.get_secondaries():
            self._check_invariants_as_standalone(secondary)

            self.logger.info(
                "Restarting the secondary on port %d as a replica set node with"
                " its data files intact...", secondary.port)
            # Start the 'secondary' mongerd back up as part of the replica set and wait for it to
            # reach state SECONDARY.
            secondary.setup()
            secondary.await_ready()
            self._await_secondary_state(secondary)

            try:
                secondary.teardown()
            except errors.ServerFailure:
                raise errors.ServerFailure(
                    "{} did not exit cleanly after reconciling the end of its oplog".format(
                        secondary))

        self.logger.info("Starting the fixture back up again with its data files intact for final"
                         " validation...")

        try:
            self.fixture.setup()
            self.fixture.await_ready()
        finally:
            for (i, node) in enumerate(self.fixture.nodes):
                node.preserve_dbpath = preserve_dbpaths[i]
Code example #7
    def _kill_secondaries(self):
        for secondary in self.fixture.get_secondaries():
            # Disable the "rsSyncApplyStop" failpoint on the secondary to have it resume applying
            # oplog entries.
            self._hook._disable_rssyncapplystop(secondary)  # pylint: disable=protected-access

            # Wait a little bit for the secondary to start applying oplog entries so that we are more
            # likely to kill the mongerd process while it is partway into applying a batch.
            time.sleep(0.1)

            # Check that the secondary is still running before forcibly terminating it. This ensures
            # we still detect some cases in which the secondary has already crashed.
            if not secondary.is_running():
                raise errors.ServerFailure(
                    "mongerd on port {} was expected to be running in"
                    " PeriodicKillSecondaries.after_test(), but wasn't.".format(secondary.port))

            self.logger.info("Killing the secondary on port %d...", secondary.port)
            secondary.mongerd.stop(kill=True)

        try:
            self.fixture.teardown()
        except errors.ServerFailure:
            # Teardown may or may not be considered a success as a result of killing a secondary,
            # so we ignore ServerFailure raised during teardown.
            pass
Code example #8
    def setup(self):
        """Set up the mongerd."""
        if not self.preserve_dbpath and os.path.lexists(self._dbpath):
            utils.rmtree(self._dbpath, ignore_errors=False)

        try:
            os.makedirs(self._dbpath)
        except os.error:
            # Directory already exists.
            pass

        if "port" not in self.mongerd_options:
            self.mongerd_options[
                "port"] = core.network.PortAllocator.next_fixture_port(
                    self.job_num)
        self.port = self.mongerd_options["port"]

        mongerd = core.programs.mongerd_program(
            self.logger,
            executable=self.mongerd_executable,
            **self.mongerd_options)
        try:
            self.logger.info("Starting mongerd on port %d...\n%s", self.port,
                             mongerd.as_command())
            mongerd.start()
            self.logger.info("mongerd started on port %d with pid %d.",
                             self.port, mongerd.pid)
        except Exception as err:
            msg = "Failed to start mongerd on port {:d}: {}".format(
                self.port, err)
            self.logger.exception(msg)
            raise errors.ServerFailure(msg)

        self.mongerd = mongerd
Code example #9
 def _enable_rssyncapplystop(self, secondary):
     # Enable the "rsSyncApplyStop" failpoint on the secondary to prevent it from
     # applying any oplog entries while the test is running.
     client = secondary.monger_client()
     try:
         client.admin.command(
             bson.SON([("configureFailPoint", "rsSyncApplyStop"), ("mode", "alwaysOn")]))
     except pymonger.errors.OperationFailure as err:
         self.logger.exception("Unable to disable oplog application on the mongerd on port %d",
                               secondary.port)
         raise errors.ServerFailure(
             "Unable to disable oplog application on the mongerd on port {}: {}".format(
                 secondary.port, err.args[0]))
Code example #10
 def _disable_rssyncapplystop(self, secondary):
     # Disable the "rsSyncApplyStop" failpoint on the secondary to have it resume applying
     # oplog entries.
     client = secondary.monger_client()
     try:
         client.admin.command(
             bson.SON([("configureFailPoint", "rsSyncApplyStop"), ("mode", "off")]))
     except pymonger.errors.OperationFailure as err:
         self.logger.exception("Unable to re-enable oplog application on the mongerd on port %d",
                               secondary.port)
         raise errors.ServerFailure(
             "Unable to re-enable oplog application on the mongerd on port {}: {}".format(
                 secondary.port, err.args[0]))
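
Code examples #9 and #10 differ only in the failpoint mode ("alwaysOn" vs. "off"). The shared pattern could be factored into a generic helper; the sketch below is hypothetical and not part of the original project:

 def _set_failpoint(self, node, failpoint_name, mode):
     # Toggle an arbitrary failpoint via the configureFailPoint admin command.
     client = node.monger_client()
     try:
         client.admin.command(
             bson.SON([("configureFailPoint", failpoint_name), ("mode", mode)]))
     except pymonger.errors.OperationFailure as err:
         raise errors.ServerFailure(
             "Unable to set failpoint '{}' to mode '{}' on the mongerd on port {}: {}".format(
                 failpoint_name, mode, node.port, err.args[0]))
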
Code example #11
 def _await_secondary_state(self, secondary):
     client = secondary.monger_client()
     try:
         client.admin.command(
             bson.SON([
                 ("replSetTest", 1),
                 ("waitForMemberState", 2),  # 2 = SECONDARY
                 ("timeoutMillis", fixture.ReplFixture.AWAIT_REPL_TIMEOUT_MINS * 60 * 1000)
             ]))
     except pymonger.errors.OperationFailure as err:
         self.logger.exception(
             "mongerd on port %d failed to reach state SECONDARY after %d seconds",
             secondary.port, fixture.ReplFixture.AWAIT_REPL_TIMEOUT_MINS * 60)
         raise errors.ServerFailure(
             "mongerd on port {} failed to reach state SECONDARY after {} seconds: {}".format(
                 secondary.port, fixture.ReplFixture.AWAIT_REPL_TIMEOUT_MINS * 60, err.args[0]))
Code example #12
    def _restart_and_clear_fixture(self):
        # We restart the fixture after setting 'preserve_dbpath' back to its original value in order
        # to clear the contents of the data directory if desired. The CleanEveryN hook cannot be
        # used in combination with the PeriodicKillSecondaries hook because we may attempt to call
        # Fixture.teardown() while the "rsSyncApplyStop" failpoint is still enabled on the
        # secondaries, causing them to exit with a non-zero return code.
        self.logger.info("Finished verifying data consistency, stopping the fixture...")

        try:
            self.fixture.teardown()
        except errors.ServerFailure:
            raise errors.ServerFailure(
                "{} did not exit cleanly after verifying data consistency".format(self.fixture))

        self.logger.info("Starting the fixture back up again with no data...")
        self.fixture.setup()
        self.fixture.await_ready()
Code example #13
    def _check_invariants_as_standalone(self, secondary):  # pylint: disable=too-many-locals
        # pylint: disable=too-many-branches,too-many-statements
        # We remove the --replSet option in order to start the node as a standalone.
        replset_name = secondary.mongerd_options.pop("replSet")
        self.logger.info(
            "Restarting the secondary on port %d as a standalone node with"
            " its data files intact...", secondary.port)

        try:
            secondary.setup()
            secondary.await_ready()

            client = secondary.monger_client()
            minvalid_doc = client.local["replset.minvalid"].find_one()
            oplog_truncate_after_doc = client.local["replset.oplogTruncateAfterPoint"].find_one()
            recovery_timestamp_res = client.admin.command("replSetTest",
                                                          getLastStableRecoveryTimestamp=True)
            latest_oplog_doc = client.local["oplog.rs"].find_one(sort=[("$natural",
                                                                        pymonger.DESCENDING)])

            self.logger.info("Checking invariants: minValid: {}, oplogTruncateAfterPoint: {},"
                             " stable recovery timestamp: {}, latest oplog doc: {}".format(
                                 minvalid_doc, oplog_truncate_after_doc, recovery_timestamp_res,
                                 latest_oplog_doc))

            null_ts = bson.Timestamp(0, 0)

            # We wait for a stable recovery timestamp at setup, so we must have an oplog.
            latest_oplog_entry_ts = null_ts
            if latest_oplog_doc is None:
                raise errors.ServerFailure("No latest oplog entry")
            latest_oplog_entry_ts = latest_oplog_doc.get("ts")
            if latest_oplog_entry_ts is None:
                raise errors.ServerFailure(
                    "Latest oplog entry had no 'ts' field: {}".format(latest_oplog_doc))

            # The "oplogTruncateAfterPoint" document may not exist at startup. If so, we default
            # it to null.
            oplog_truncate_after_ts = null_ts
            if oplog_truncate_after_doc is not None:
                oplog_truncate_after_ts = oplog_truncate_after_doc.get(
                    "oplogTruncateAfterPoint", null_ts)

            # The "lastStableRecoveryTimestamp" field is present if the storage engine supports
            # "recover to a timestamp". If it's a null timestamp on a durable storage engine, that
            # means we do not yet have a stable checkpoint timestamp and must be restarting at the
            # top of the oplog. Since we wait for a stable recovery timestamp at test fixture setup,
            # we should never encounter a null timestamp here.
            recovery_timestamp = recovery_timestamp_res.get("lastStableRecoveryTimestamp")
            if recovery_timestamp == null_ts:
                raise errors.ServerFailure(
                    "Received null stable recovery timestamp {}".format(recovery_timestamp_res))
            # On a storage engine that doesn't support "recover to a timestamp", we default to null.
            if recovery_timestamp is None:
                recovery_timestamp = null_ts

            # last stable recovery timestamp <= top of oplog
            if not recovery_timestamp <= latest_oplog_entry_ts:
                raise errors.ServerFailure("The condition last stable recovery timestamp <= top"
                                           " of oplog ({} <= {}) doesn't hold:"
                                           " getLastStableRecoveryTimestamp result={},"
                                           " latest oplog entry={}".format(
                                               recovery_timestamp, latest_oplog_entry_ts,
                                               recovery_timestamp_res, latest_oplog_doc))

            if minvalid_doc is not None:
                applied_through_ts = minvalid_doc.get("begin", {}).get("ts", null_ts)
                minvalid_ts = minvalid_doc.get("ts", null_ts)

                # The "appliedThrough" value should always equal the "last stable recovery
                # timestamp", AKA the stable checkpoint for durable engines, on server restart.
                #
                # The written "appliedThrough" time is updated with the latest timestamp at the end
                # of each batch application, and batch boundaries are the only valid stable
                # timestamps on secondaries. Therefore, a non-null appliedThrough timestamp must
                # equal the checkpoint timestamp, because any stable timestamp that the checkpoint
                # could use includes an equal persisted appliedThrough timestamp.
                if (recovery_timestamp != null_ts and applied_through_ts != null_ts
                        and (not recovery_timestamp == applied_through_ts)):
                    raise errors.ServerFailure(
                        "The condition last stable recovery timestamp ({}) == appliedThrough ({})"
                        " doesn't hold: minValid document={},"
                        " getLastStableRecoveryTimestamp result={}, last oplog entry={}".format(
                            recovery_timestamp, applied_through_ts, minvalid_doc,
                            recovery_timestamp_res, latest_oplog_doc))

                if applied_through_ts == null_ts:
                    # We clear "appliedThrough" to represent having applied through the top of the
                    # oplog in PRIMARY state or immediately after "rollback via refetch".
                    # If we are using a storage engine that supports "recover to a timestamp,"
                    # then we will have a "last stable recovery timestamp" and we should use that
                    # as our "appliedThrough" (similarly to why we assert their equality above).
                    # If both are null, then we are in PRIMARY state on a storage engine that does
                    # not support "recover to a timestamp" or in RECOVERING immediately after
                    # "rollback via refetch". Since we do not update "minValid" in PRIMARY state,
                    # we leave "appliedThrough" as null so that the invariants below hold, rather
                    # than substituting the latest oplog entry for the "appliedThrough" value.
                    applied_through_ts = recovery_timestamp

                if minvalid_ts == null_ts:
                    # The server treats the "ts" field in the minValid document as missing when its
                    # value is the null timestamp.
                    minvalid_ts = applied_through_ts

                if latest_oplog_entry_ts == null_ts:
                    # If the oplog is empty, we treat the "minValid" as the latest oplog entry.
                    latest_oplog_entry_ts = minvalid_ts

                if oplog_truncate_after_ts == null_ts:
                    # The server treats the "oplogTruncateAfterPoint" field as missing when its
                    # value is the null timestamp. When it is null, the oplog is complete and
                    # should not be truncated, so it is effectively the top of the oplog.
                    oplog_truncate_after_ts = latest_oplog_entry_ts

                # Check the ordering invariants before the secondary has reconciled the end of
                # its oplog.
                # The "oplogTruncateAfterPoint" is set to the first timestamp of each batch of
                # oplog entries before they are written to the oplog. Thus, it can be ahead
                # of the top of the oplog before any oplog entries are written, and behind it
                # after some are written. Thus, we cannot compare it to the top of the oplog.

                # appliedThrough <= minValid
                # appliedThrough represents the end of the previous batch, so it is always the
                # earliest.
                if applied_through_ts > minvalid_ts:
                    raise errors.ServerFailure(
                        "The condition appliedThrough <= minValid ({} <= {}) doesn't hold: minValid"
                        " document={}, latest oplog entry={}".format(
                            applied_through_ts, minvalid_ts, minvalid_doc, latest_oplog_doc))

                # minValid <= oplogTruncateAfterPoint
                # This is true because this hook is never run after a rollback. Thus, we only
                # move "minValid" to the end of each batch after the batch is written to the oplog.
                # We reset the "oplogTruncateAfterPoint" to null before we move "minValid" from
                # the end of the previous batch to the end of the current batch. Thus "minValid"
                # must be less than or equal to the "oplogTruncateAfterPoint".
                if minvalid_ts > oplog_truncate_after_ts:
                    raise errors.ServerFailure(
                        "The condition minValid <= oplogTruncateAfterPoint ({} <= {}) doesn't"
                        " hold: minValid document={}, oplogTruncateAfterPoint document={},"
                        " latest oplog entry={}".format(minvalid_ts, oplog_truncate_after_ts,
                                                        minvalid_doc, oplog_truncate_after_doc,
                                                        latest_oplog_doc))

                # minvalid <= latest oplog entry
                # "minValid" is set to the end of a batch after the batch is written to the oplog.
                # Thus it is always less than or equal to the top of the oplog.
                if minvalid_ts > latest_oplog_entry_ts:
                    raise errors.ServerFailure(
                        "The condition minValid <= top of oplog ({} <= {}) doesn't"
                        " hold: minValid document={}, latest oplog entry={}".format(
                            minvalid_ts, latest_oplog_entry_ts, minvalid_doc, latest_oplog_doc))

            try:
                secondary.teardown()
            except errors.ServerFailure:
                raise errors.ServerFailure(
                    "{} did not exit cleanly after being started up as a standalone".format(
                        secondary))
        except pymonger.errors.OperationFailure as err:
            self.logger.exception(
                "Failed to read the minValid document, the oplogTruncateAfterPoint document,"
                " the last stable recovery timestamp, or the latest oplog entry from the"
                " mongerd on port %d", secondary.port)
            raise errors.ServerFailure(
                "Failed to read the minValid document, the oplogTruncateAfterPoint document,"
                " the last stable recovery timestamp, or the latest oplog entry from the"
                " mongerd on port {}: {}".format(secondary.port, err.args[0]))
        finally:
            # Set the secondary's options back to their original values.
            secondary.mongerd_options["replSet"] = replset_name
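
Taken together, the ordering checks in code example #13 reduce to a chain of timestamp comparisons. A compact restatement as a standalone sketch (hypothetical helper, applied after the null-timestamp substitutions described in the comments above):

def _assert_ordering_invariants(applied_through_ts, minvalid_ts,
                                oplog_truncate_after_ts, latest_oplog_entry_ts):
    # appliedThrough <= minValid <= oplogTruncateAfterPoint, and
    # minValid <= top of the oplog; the timestamps are compared exactly as in
    # the checks above.
    if not (applied_through_ts <= minvalid_ts <= oplog_truncate_after_ts):
        raise errors.ServerFailure(
            "appliedThrough <= minValid <= oplogTruncateAfterPoint does not hold")
    if minvalid_ts > latest_oplog_entry_ts:
        raise errors.ServerFailure("minValid <= top of oplog does not hold")
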
Code example #14
File: stepdown.py  Project: tx2z/monger
    def _step_down(self, rs_fixture):
        try:
            primary = rs_fixture.get_primary(
                timeout_secs=self._stepdown_interval_secs)
        except errors.ServerFailure:
            # We ignore the ServerFailure exception because it means a primary wasn't available.
            # We'll try again after self._stepdown_interval_secs seconds.
            return

        secondaries = rs_fixture.get_secondaries()

        # Check that the fixture is still running before stepping down or killing the primary.
        # This ensures we still detect some cases in which the fixture has already crashed.
        if not rs_fixture.is_running():
            raise errors.ServerFailure(
                "ReplicaSetFixture {} expected to be running in"
                " ContinuousStepdown, but wasn't.".format(
                    rs_fixture.replset_name))

        if self._terminate:
            should_kill = self._kill and random.choice([True, False])
            action = "Killing" if should_kill else "Terminating"
            self.logger.info("%s the primary on port %d of replica set '%s'.",
                             action, primary.port, rs_fixture.replset_name)

            # We send the mongerd process the signal to exit but don't immediately wait for it to
            # exit because clean shutdown may take a while and we want to restore write availability
            # as quickly as possible.
            primary.mongerd.stop(kill=should_kill)
        elif not self._stepdown_via_heartbeats:
            self.logger.info(
                "Stepping down the primary on port %d of replica set '%s'.",
                primary.port, rs_fixture.replset_name)
            try:
                client = primary.monger_client()
                client.admin.command(
                    bson.SON([
                        ("replSetStepDown", self._stepdown_duration_secs),
                        ("force", True),
                    ]))
            except pymonger.errors.AutoReconnect:
                # AutoReconnect exceptions are expected as connections are closed during stepdown.
                pass
            except pymonger.errors.PyMongerError:
                self.logger.exception(
                    "Error while stepping down the primary on port %d of replica set '%s'.",
                    primary.port, rs_fixture.replset_name)
                raise

        # If we want to step up secondaries instead, we have skipped stepping down the primary
        # above. Here, we simply need to pick an arbitrary secondary to run for election, which
        # will lead to an unconditional stepdown on the primary.
        #
        # If we have terminated/killed/stepped down the primary above, write availability has been lost.
        # We pick an arbitrary secondary to run for election immediately in order to avoid a long
        # period where the replica set doesn't have write availability. If none of the secondaries
        # are eligible, or their election attempt fails, then we'll run the replSetStepUp command on
        # 'primary' to ensure we have write availability sooner than the
        # self._stepdown_duration_secs duration expires.
        while secondaries:
            chosen = random.choice(secondaries)

            self.logger.info(
                "Attempting to step up the secondary on port %d of replica set '%s'.",
                chosen.port, rs_fixture.replset_name)

            try:
                client = chosen.monger_client()
                client.admin.command("replSetStepUp")
                break
            except pymonger.errors.OperationFailure:
                # OperationFailure exceptions are expected when the election attempt fails due to
                # not receiving enough votes. This can happen when the 'chosen' secondary's opTime
                # is behind that of other secondaries. We handle this by attempting to elect a
                # different secondary.
                self.logger.info(
                    "Failed to step up the secondary on port %d of replica set '%s'.",
                    chosen.port, rs_fixture.replset_name)
                secondaries.remove(chosen)

        if self._terminate:
            self.logger.info(
                "Waiting for the old primary on port %d of replica set '%s' to exit.",
                primary.port, rs_fixture.replset_name)

            primary.mongerd.wait()

            self.logger.info(
                "Attempting to restart the old primary on port %d of replica set '%s.",
                primary.port, rs_fixture.replset_name)

            # Restart the mongerd on the old primary and wait until we can contact it again. Keep the
            # original preserve_dbpath to restore after restarting the mongerd.
            original_preserve_dbpath = primary.preserve_dbpath
            primary.preserve_dbpath = True
            try:
                primary.setup()
                primary.await_ready()
            finally:
                primary.preserve_dbpath = original_preserve_dbpath
        elif not self._stepdown_via_heartbeats:
            # If we chose to step up a secondary instead, the primary was never stepped down via the
            # replSetStepDown command and thus will not have a stepdown period. So we can skip
            # running {replSetFreeze: 0}. Otherwise, the replSetStepDown command run earlier
            # introduced a stepdown period on the former primary and so we have to run the
            # {replSetFreeze: 0} command to ensure the former primary is electable in the next
            # round of _step_down().
            client = primary.monger_client()
            client.admin.command({"replSetFreeze": 0})
        elif secondaries:
            # We successfully stepped up a secondary, wait for the former primary to step down via
            # heartbeats. We need to wait for the former primary to step down to complete this step
            # down round and to avoid races between the ContinuousStepdown hook and other test hooks
            # that may depend on the health of the replica set.
            self.logger.info(
                "Successfully stepped up the secondary on port %d of replica set '%s'.",
                chosen.port, rs_fixture.replset_name)
            while True:
                try:
                    client = primary.monger_client()
                    is_secondary = client.admin.command(
                        "isMaster")["secondary"]
                    if is_secondary:
                        break
                except pymonger.errors.AutoReconnect:
                    pass
                self.logger.info(
                    "Waiting for primary on port %d of replica set '%s' to step down.",
                    primary.port, rs_fixture.replset_name)
                time.sleep(0.2)  # Wait a little bit before trying again.
            self.logger.info(
                "Primary on port %d of replica set '%s' stepped down.",
                primary.port, rs_fixture.replset_name)

        if not secondaries:
            # If we failed to step up one of the secondaries, then we run the replSetStepUp to try
            # and elect the former primary again. This way we don't need to wait
            # self._stepdown_duration_secs seconds to restore write availability to the cluster.
            # Since the former primary may have been killed, we need to wait until it has been
            # restarted by retrying replSetStepUp.
            retry_time_secs = rs_fixture.AWAIT_REPL_TIMEOUT_MINS * 60
            retry_start_time = time.time()
            while True:
                try:
                    client = primary.monger_client()
                    client.admin.command("replSetStepUp")
                    break
                except pymonger.errors.OperationFailure:
                    self._wait(0.2)
                if time.time() - retry_start_time > retry_time_secs:
                    raise errors.ServerFailure(
                        "The old primary on port {} of replica set {} did not step up in"
                        " {} seconds.".format(client.port,
                                              rs_fixture.replset_name,
                                              retry_time_secs))

        # Bump the counter for the chosen secondary to indicate that the replSetStepUp command
        # executed successfully.
        key = "{}/{}".format(
            rs_fixture.replset_name,
            chosen.get_internal_connection_string() if secondaries else "none")
        self._step_up_stats[key] += 1
Code example #15
File: stepdown.py  Project: tx2z/monger
 def _check_thread(self):
     if not self._stepdown_thread.is_alive():
         msg = "The stepdown thread is not running."
         self.logger.error(msg)
         raise errors.ServerFailure(msg)