def test_mark_failed(self, client):
        """Mark a node that is not in the cluster as failed.

        Builds a synchronizer from ``self.plugin`` and ``self.watcher_ip``,
        calls ``mark_node_failed()`` on it and asserts that the etcd client's
        ``write`` was never called, i.e. nothing is written to etcd.

        ``client`` is unused in the body; presumably it is a mock injected by
        a patch decorator outside this view -- TODO confirm.
        """
        e = EtcdSynchronizer(self.plugin, self.watcher_ip)
        e.start_thread()

        # Always terminate the synchronizer, even when the assertion fails,
        # so a failing test does not skip the cleanup.
        try:
            e.mark_node_failed()
            e._client.write.assert_not_called()
        finally:
            e.terminate()
    def test_mark_failed(self, client):
        """Mark a node that is not in the cluster as failed.

        Builds a synchronizer from ``self.plugin`` and ``self.watcher_ip``,
        calls ``mark_node_failed()`` on it and asserts that the etcd client's
        ``write`` was never called, i.e. nothing is written to etcd.

        ``client`` is unused in the body; presumably it is a mock injected by
        a patch decorator outside this view -- TODO confirm.
        """
        e = EtcdSynchronizer(self.plugin, self.watcher_ip)
        e.start_thread()

        # Always terminate the synchronizer, even when the assertion fails,
        # so a failing test does not skip the cleanup.
        try:
            e.mark_node_failed()
            e._client.write.assert_not_called()
        finally:
            e.terminate()
    def test_leaving(self, client):
        """Ask a node that is not in the cluster to leave the cluster.

        Builds a synchronizer from ``self.plugin`` and ``self.watcher_ip``,
        calls ``leave_cluster()`` on it and asserts that the etcd client's
        ``write`` was never called, i.e. nothing is written to etcd.

        ``client`` is unused in the body; presumably it is a mock injected by
        a patch decorator outside this view -- TODO confirm.
        """
        e = EtcdSynchronizer(self.plugin, self.watcher_ip)
        e.start_thread()

        # Always terminate the synchronizer, even when the assertion fails,
        # so a failing test does not skip the cleanup.
        try:
            e.leave_cluster()
            e._client.write.assert_not_called()
        finally:
            e.terminate()
    def test_leaving(self, client):
        """Ask a node that is not in the cluster to leave the cluster.

        Builds a synchronizer from ``self.plugin`` and ``self.watcher_ip``,
        calls ``leave_cluster()`` on it and asserts that the etcd client's
        ``write`` was never called, i.e. nothing is written to etcd.

        ``client`` is unused in the body; presumably it is a mock injected by
        a patch decorator outside this view -- TODO confirm.
        """
        e = EtcdSynchronizer(self.plugin, self.watcher_ip)
        e.start_thread()

        # Always terminate the synchronizer, even when the assertion fails,
        # so a failing test does not skip the cleanup.
        try:
            e.leave_cluster()
            e._client.write.assert_not_called()
        finally:
            e.terminate()
    def test_watcher(self):
        """Create a new 3-node cluster with one plugin not in the cluster and
        check that the main three all end up in NORMAL state.

        The watcher synchronizer is built from ``self.plugin`` and
        ``self.watcher_ip``.  After the three cluster members are all
        "normal", the test checks that the watcher plugin's
        ``on_stable_cluster_called`` flag is set and that the value stored
        under "/test" has no entry for "10.1.1.1".
        """

        e = EtcdSynchronizer(self.plugin, self.watcher_ip)
        e.start_thread()

        # Always terminate the watcher synchronizer, even when a wait or an
        # assertion below fails, so a failing test does not skip the cleanup.
        try:
            self.make_and_start_synchronizers(3)
            mock_client = self.syncs[0]._client
            self.wait_for_all_normal(mock_client, required_number=3)

            # Pause for one second - the watcher plugin might be called just
            # after all other nodes enter 'normal' state
            sleep(1)
            self.assertTrue(self.plugin.on_stable_cluster_called)

            end = json.loads(mock_client.read("/test").value)
            self.assertEqual("normal", end.get("10.0.0.0"))
            self.assertEqual("normal", end.get("10.0.0.1"))
            self.assertEqual("normal", end.get("10.0.0.2"))
            self.assertEqual(None, end.get("10.1.1.1"))
        finally:
            e.terminate()
    def test_watcher(self):
        """Create a new 3-node cluster with one plugin not in the cluster and
        check that the main three all end up in NORMAL state.

        The watcher synchronizer is built from ``self.plugin`` and
        ``self.watcher_ip``.  After the three cluster members are all
        "normal", the test checks that the watcher plugin's
        ``on_stable_cluster_called`` flag is set and that the value stored
        under "/test" has no entry for "10.1.1.1".
        """

        e = EtcdSynchronizer(self.plugin, self.watcher_ip)
        e.start_thread()

        # Always terminate the watcher synchronizer, even when a wait or an
        # assertion below fails, so a failing test does not skip the cleanup.
        try:
            self.make_and_start_synchronizers(3)
            mock_client = self.syncs[0]._client
            self.wait_for_all_normal(mock_client, required_number=3)

            # Pause for one second - the watcher plugin might be called just
            # after all other nodes enter 'normal' state
            sleep(1)
            self.assertTrue(self.plugin.on_stable_cluster_called)

            end = json.loads(mock_client.read("/test").value)
            self.assertEqual("normal", end.get("10.0.0.0"))
            self.assertEqual("normal", end.get("10.0.0.1"))
            self.assertEqual("normal", end.get("10.0.0.2"))
            self.assertEqual(None, end.get("10.1.1.1"))
        finally:
            e.terminate()
Exemple #7
0
    def test_failure(self):
        """Check that a cluster stabilises once a failed node is removed.

        Three synchronizers are started, one of them using a FailPlugin.
        After three seconds none of the nodes may be "normal".  The failing
        synchronizer is then terminated and a replacement for 10.0.0.2
        (NullPlugin, ``force_leave=True``) marks the node failed and leaves
        the cluster, after which 10.0.0.1 and 10.0.0.3 must be "normal" and
        10.0.0.2 must be absent from the value stored under "/test".
        """

        # Create synchronisers, using a FailPlugin for one which will crash and
        # not complete (simulating a failed node)
        sync1 = EtcdSynchronizer(DummyPlugin(None), '10.0.0.1')
        sync2 = EtcdSynchronizer(FailPlugin(None), '10.0.0.2')
        sync3 = EtcdSynchronizer(DummyPlugin(None), '10.0.0.3')
        mock_client = sync1._client

        # Synchronizers that still need terminate() called on them; kept up to
        # date so the finally block cleans up exactly those, in the same
        # order as the success path always did (sync1, sync3, error_syncer).
        running = [sync1, sync2, sync3]
        for s in running:
            s.start_thread()

        try:
            # After a few seconds, the scale-up will still not have completed
            sleep(3)
            end = json.loads(mock_client.read("/test").value)
            self.assertNotEqual("normal", end.get("10.0.0.1"))
            self.assertNotEqual("normal", end.get("10.0.0.2"))
            self.assertNotEqual("normal", end.get("10.0.0.3"))

            # Start a synchroniser to take 10.0.0.2's place
            sync2.terminate()
            running.remove(sync2)
            error_syncer = EtcdSynchronizer(NullPlugin('/test'),
                                            '10.0.0.2',
                                            force_leave=True)
            error_syncer.mark_node_failed()
            error_syncer.leave_cluster()
            error_syncer.start_thread()
            running.append(error_syncer)

            # 10.0.0.2 will be removed from the cluster, and the cluster will
            # stabilise
            self.wait_for_all_normal(mock_client, required_number=2, tries=50)
            end = json.loads(mock_client.read("/test").value)
            self.assertEqual("normal", end.get("10.0.0.1"))
            self.assertEqual("normal", end.get("10.0.0.3"))
            self.assertEqual(None, end.get("10.0.0.2"))
        finally:
            # Runs on failure as well as success, so a failed assertion does
            # not skip the terminate() calls.
            for s in running:
                s.terminate()
    def test_failure(self):
        """Check that a cluster stabilises once a failed node is removed.

        Three synchronizers are started, one of them using a FailPlugin.
        After three seconds none of the nodes may be "normal".  The failing
        synchronizer is then terminated and a replacement for 10.0.0.2
        (NullPlugin, ``force_leave=True``) marks the node failed and leaves
        the cluster, after which 10.0.0.1 and 10.0.0.3 must be "normal" and
        10.0.0.2 must be absent from the value stored under "/test".
        """

        # Create synchronisers, using a FailPlugin for one which will crash and
        # not complete (simulating a failed node)
        sync1 = EtcdSynchronizer(DummyPlugin(None), '10.0.0.1')
        sync2 = EtcdSynchronizer(FailPlugin(None), '10.0.0.2')
        sync3 = EtcdSynchronizer(DummyPlugin(None), '10.0.0.3')
        mock_client = sync1._client

        # Synchronizers that still need terminate() called on them; kept up to
        # date so the finally block cleans up exactly those, in the same
        # order as the success path always did (sync1, sync3, error_syncer).
        running = [sync1, sync2, sync3]
        for s in running:
            s.start_thread()

        try:
            # After a few seconds, the scale-up will still not have completed
            sleep(3)
            end = json.loads(mock_client.read("/test").value)
            self.assertNotEqual("normal", end.get("10.0.0.1"))
            self.assertNotEqual("normal", end.get("10.0.0.2"))
            self.assertNotEqual("normal", end.get("10.0.0.3"))

            # Start a synchroniser to take 10.0.0.2's place
            sync2.terminate()
            running.remove(sync2)
            error_syncer = EtcdSynchronizer(NullPlugin('/test'),
                                            '10.0.0.2',
                                            force_leave=True)
            error_syncer.mark_node_failed()
            error_syncer.leave_cluster()
            error_syncer.start_thread()
            running.append(error_syncer)

            # 10.0.0.2 will be removed from the cluster, and the cluster will
            # stabilise
            self.wait_for_all_normal(mock_client, required_number=2, tries=50)
            end = json.loads(mock_client.read("/test").value)
            self.assertEqual("normal", end.get("10.0.0.1"))
            self.assertEqual("normal", end.get("10.0.0.3"))
            self.assertEqual(None, end.get("10.0.0.2"))
        finally:
            # Runs on failure as well as success, so a failed assertion does
            # not skip the terminate() calls.
            for s in running:
                s.terminate()
def main(args):
    """Run the cluster manager.

    Parses ``args`` with docopt against the module docstring, optionally
    daemonizes, configures logging, takes the pidfile lock, loads the
    plugins and starts one EtcdSynchronizer thread per usable plugin.  It
    then waits for those threads to finish and for a quit flag
    (``utils.should_quit`` or ``should_quit``) to be set before returning.

    Args:
        args: argument list handed to docopt as ``argv``.

    Raises:
        DocoptExit: re-raised after logging when the arguments are invalid.

    Calls ``exit(1)`` when the etcd cluster key is the empty string or when
    the pidfile lock cannot be taken (IOError).
    """
    syslog.openlog("cluster-manager", syslog.LOG_PID)
    pdlogs.STARTUP.log()
    try:
        arguments = docopt(__doc__, argv=args)
    except DocoptExit:
        pdlogs.EXITING_BAD_CONFIG.log()
        raise

    mgmt_ip = arguments['--mgmt-local-ip']
    sig_ip = arguments['--sig-local-ip']
    local_site_name = arguments['--local-site']
    remote_site_name = arguments['--remote-site']
    remote_cassandra_seeds = arguments['--remote-cassandra-seeds']
    # The seeds arrive as one comma-separated string; plugins get a list.
    if remote_cassandra_seeds:
        remote_cassandra_seeds = remote_cassandra_seeds.split(',')
    else:
        remote_cassandra_seeds = []
    signaling_namespace = arguments.get('--signaling-namespace')
    local_uuid = UUID(arguments['--uuid'])
    etcd_key = arguments.get('--etcd-key')
    etcd_cluster_key = arguments.get('--etcd-cluster-key')
    cluster_manager_enabled = arguments['--cluster-manager-enabled']
    log_dir = arguments['--log-directory']
    log_level = LOG_LEVELS.get(arguments['--log-level'], logging.DEBUG)

    stdout_err_log = os.path.join(log_dir, "cluster-manager.output.log")

    # Check that there's an etcd_cluster_key value passed to the cluster
    # manager
    if etcd_cluster_key == "":
        # The etcd_cluster_key isn't valid, and possibly get weird entries in
        # the etcd database if we allow the cluster_manager to start
        pdlogs.EXITING_MISSING_ETCD_CLUSTER_KEY.log()
        exit(1)

    if not arguments['--foreground']:
        utils.daemonize(stdout_err_log)

    # Process names are limited to 15 characters, so abbreviate
    prctl.prctl(prctl.NAME, "cw-cluster-mgr")

    logging_config.configure_logging(log_level, log_dir, "cluster-manager", show_thread=True)

    # urllib3 logs a WARNING log whenever it recreates a connection, but our
    # etcd usage does this frequently (to allow watch timeouts), so deliberately
    # ignore this log
    urllib_logger = logging.getLogger('urllib3')
    urllib_logger.setLevel(logging.ERROR)

    utils.install_sigusr1_handler("cluster-manager")

    # Drop a pidfile. We must keep a reference to the file object here, as this keeps
    # the file locked and provides extra protection against two processes running at
    # once.
    pidfile_lock = None
    try:
        pidfile_lock = utils.lock_and_write_pid_file(arguments['--pidfile']) # noqa
    except IOError:
        # We failed to take the lock - another process is already running
        exit(1)

    plugins_dir = "/usr/share/clearwater/clearwater-cluster-manager/plugins/"
    plugins = load_plugins_in_dir(plugins_dir,
                                  PluginParams(ip=sig_ip,
                                               mgmt_ip=mgmt_ip,
                                               local_site=local_site_name,
                                               remote_site=remote_site_name,
                                               remote_cassandra_seeds=remote_cassandra_seeds,
                                               signaling_namespace=signaling_namespace,
                                               uuid=local_uuid,
                                               etcd_key=etcd_key,
                                               etcd_cluster_key=etcd_cluster_key))
    plugins.sort(key=lambda x: x.key())
    plugins_to_use = []
    files = []
    for plugin in plugins:
        # Decide afresh for every plugin.  The flag used to be initialised
        # once before the loop, so after the first conflicting plugin every
        # later plugin was skipped as well, even without any conflict.
        skip = False
        for plugin_file in plugin.files():
            if plugin_file in files:
                _log.info("Skipping plugin {} because {} "
                          "is already managed by another plugin"
                          .format(plugin, plugin_file))
                skip = True

        if not skip:
            plugins_to_use.append(plugin)
            files.extend(plugin.files())

    synchronizers = []
    threads = []

    if cluster_manager_enabled == "N":
        # Don't start any threads as we don't want the cluster manager to run
        pdlogs.DO_NOT_START.log()
    elif etcd_cluster_key == "DO_NOT_CLUSTER":
        # Don't start any threads as we don't want this box to cluster
        pdlogs.DO_NOT_CLUSTER.log()
    else:
        for plugin in plugins_to_use:
            syncer = EtcdSynchronizer(plugin, sig_ip, etcd_ip=mgmt_ip)
            syncer.start_thread()

            synchronizers.append(syncer)
            threads.append(syncer.thread)
            _log.info("Loaded plugin %s" % plugin)

    install_sigquit_handler(synchronizers)
    utils.install_sigterm_handler(synchronizers)

    # Join with a one-second timeout rather than blocking indefinitely on a
    # single thread, re-checking all threads on every pass.
    # NOTE(review): isAlive() is the legacy spelling of is_alive(); confirm
    # what type syncer.thread is before modernising.
    while any(thread.isAlive() for thread in threads):
        for thread in threads:
            if thread.isAlive():
                thread.join(1)

    _log.info("No plugin threads running, waiting for a SIGTERM or SIGQUIT")
    while not utils.should_quit and not should_quit:
        sleep(1)
    _log.info("Quitting")
    _log.debug("%d threads outstanding at exit" % activeCount())
    pdlogs.EXITING.log()
    syslog.closelog()
                    level=logging.DEBUG)

# Positional command-line arguments, in order: local IP (used as the etcd
# IP), site, node type, datastore, and the IP of the dead node to remove.
local_ip = sys.argv[1]
site = sys.argv[2]
node_type = sys.argv[3]
datastore = sys.argv[4]
dead_node_ip = sys.argv[5]

key = make_key(site, node_type, datastore)
logging.info("Using etcd key %s", key)

error_syncer = EtcdSynchronizer(NullPlugin(key), dead_node_ip, etcd_ip=local_ip, force_leave=True)

# print is written in the parenthesised single-argument form, which behaves
# the same on Python 2 and is also valid syntax on Python 3.
print("Marking node as failed and removing it from the cluster - will take at least 30 seconds")
# Move the dead node into ERROR state to allow in-progress operations to
# complete
error_syncer.mark_node_failed()

# Move the dead node out of the cluster
error_syncer.start_thread()
error_syncer.leave_cluster()

# Wait for it to leave
error_syncer.thread.join()
print("Process complete - %s has left the cluster" % dead_node_ip)

# Read the key back from etcd on port 4000 and log the resulting state.
c = etcd.Client(local_ip, 4000)
new_state = c.get(key).value

logging.info("New etcd state (after removing %s) is %s", dead_node_ip, new_state)
Exemple #11
0
def main(args):
    """Run the cluster manager.

    Parses ``args`` with docopt against the module docstring, optionally
    daemonizes, configures logging, takes the pidfile lock, loads the
    plugins and starts one EtcdSynchronizer thread per usable plugin.  It
    then waits for those threads to finish and for ``should_quit`` to be
    set before returning.

    Args:
        args: argument list handed to docopt as ``argv``.

    Raises:
        DocoptExit: re-raised after logging when the arguments are invalid.

    Calls ``exit(1)`` when the etcd cluster key is the empty string or when
    the pidfile lock cannot be taken (IOError).
    """
    syslog.openlog("cluster-manager", syslog.LOG_PID)
    pdlogs.STARTUP.log()
    try:
        arguments = docopt(__doc__, argv=args)
    except DocoptExit:
        pdlogs.EXITING_BAD_CONFIG.log()
        raise

    mgmt_ip = arguments['--mgmt-local-ip']
    sig_ip = arguments['--sig-local-ip']
    local_site_name = arguments['--local-site']
    remote_site_name = arguments['--remote-site']
    remote_cassandra_seeds = arguments['--remote-cassandra-seeds']
    # The seeds arrive as one comma-separated string; plugins get a list.
    if remote_cassandra_seeds:
        remote_cassandra_seeds = remote_cassandra_seeds.split(',')
    else:
        remote_cassandra_seeds = []
    signaling_namespace = arguments.get('--signaling-namespace')
    local_uuid = UUID(arguments['--uuid'])
    etcd_key = arguments.get('--etcd-key')
    etcd_cluster_key = arguments.get('--etcd-cluster-key')
    cluster_manager_enabled = arguments['--cluster-manager-enabled']
    log_dir = arguments['--log-directory']
    log_level = LOG_LEVELS.get(arguments['--log-level'], logging.DEBUG)

    stdout_err_log = os.path.join(log_dir, "cluster-manager.output.log")

    # Check that there's an etcd_cluster_key value passed to the cluster
    # manager
    if etcd_cluster_key == "":
        # The etcd_cluster_key isn't valid, and possibly get weird entries in
        # the etcd database if we allow the cluster_manager to start
        pdlogs.EXITING_MISSING_ETCD_CLUSTER_KEY.log()
        exit(1)

    if not arguments['--foreground']:
        utils.daemonize(stdout_err_log)

    # Process names are limited to 15 characters, so abbreviate
    prctl.prctl(prctl.NAME, "cw-cluster-mgr")

    logging_config.configure_logging(log_level,
                                     log_dir,
                                     "cluster-manager",
                                     show_thread=True)

    # urllib3 logs a WARNING log whenever it recreates a connection, but our
    # etcd usage does this frequently (to allow watch timeouts), so deliberately
    # ignore this log
    urllib_logger = logging.getLogger('urllib3')
    urllib_logger.setLevel(logging.ERROR)

    utils.install_sigusr1_handler("cluster-manager")

    # Drop a pidfile. We must keep a reference to the file object here, as this keeps
    # the file locked and provides extra protection against two processes running at
    # once.
    pidfile_lock = None
    try:
        pidfile_lock = utils.lock_and_write_pid_file(
            arguments['--pidfile'])  # noqa
    except IOError:
        # We failed to take the lock - another process is already running
        exit(1)

    plugins_dir = "/usr/share/clearwater/clearwater-cluster-manager/plugins/"
    plugins = load_plugins_in_dir(
        plugins_dir,
        PluginParams(ip=sig_ip,
                     mgmt_ip=mgmt_ip,
                     local_site=local_site_name,
                     remote_site=remote_site_name,
                     remote_cassandra_seeds=remote_cassandra_seeds,
                     signaling_namespace=signaling_namespace,
                     uuid=local_uuid,
                     etcd_key=etcd_key,
                     etcd_cluster_key=etcd_cluster_key))
    plugins.sort(key=lambda x: x.key())
    plugins_to_use = []
    files = []
    for plugin in plugins:
        # Decide afresh for every plugin.  The flag used to be initialised
        # once before the loop, so after the first conflicting plugin every
        # later plugin was skipped as well, even without any conflict.
        skip = False
        for plugin_file in plugin.files():
            if plugin_file in files:
                _log.info("Skipping plugin {} because {} "
                          "is already managed by another plugin".format(
                              plugin, plugin_file))
                skip = True

        if not skip:
            plugins_to_use.append(plugin)
            files.extend(plugin.files())

    synchronizers = []
    threads = []

    if cluster_manager_enabled == "N":
        # Don't start any threads as we don't want the cluster manager to run
        pdlogs.DO_NOT_START.log()
    elif etcd_cluster_key == "DO_NOT_CLUSTER":
        # Don't start any threads as we don't want this box to cluster
        pdlogs.DO_NOT_CLUSTER.log()
    else:
        for plugin in plugins_to_use:
            syncer = EtcdSynchronizer(plugin, sig_ip, etcd_ip=mgmt_ip)
            syncer.start_thread()

            synchronizers.append(syncer)
            threads.append(syncer.thread)
            _log.info("Loaded plugin %s" % plugin)

    install_sigquit_handler(synchronizers)
    install_sigterm_handler(synchronizers)

    # Join with a one-second timeout rather than blocking indefinitely on a
    # single thread, re-checking all threads on every pass.
    # NOTE(review): isAlive() is the legacy spelling of is_alive(); confirm
    # what type syncer.thread is before modernising.
    while any(thread.isAlive() for thread in threads):
        for thread in threads:
            if thread.isAlive():
                thread.join(1)

    _log.info("No plugin threads running, waiting for a SIGTERM or SIGQUIT")
    while not should_quit:
        sleep(1)
    _log.info("Quitting")
    _log.debug("%d threads outstanding at exit" % activeCount())
    pdlogs.EXITING.log()
    syslog.closelog()
Exemple #12
0
def main(args):
    """Run the cluster manager.

    Parses ``args`` with docopt against the module docstring, optionally
    daemonizes, configures logging, writes the pidfile, loads the plugins
    and starts one EtcdSynchronizer thread per usable plugin.  It then waits
    for those threads to finish and for ``should_quit`` to be set, and
    finally ends the process with ``os._exit(0)`` (so it never returns).

    Args:
        args: argument list handed to docopt as ``argv``.

    Raises:
        DocoptExit: re-raised after logging when the arguments are invalid.
    """
    syslog.openlog("cluster-manager", syslog.LOG_PID)
    pdlogs.STARTUP.log()
    try:
        arguments = docopt(__doc__, argv=args)
    except DocoptExit:
        pdlogs.EXITING_BAD_CONFIG.log()
        raise

    mgmt_ip = arguments['--mgmt-local-ip']
    sig_ip = arguments['--sig-local-ip']
    local_site_name = arguments['--local-site']
    remote_site_name = arguments['--remote-site']
    signaling_namespace = arguments.get('--signaling-namespace')
    log_dir = arguments['--log-directory']
    log_level = LOG_LEVELS.get(arguments['--log-level'], logging.DEBUG)

    stdout_err_log = os.path.join(log_dir, "cluster-manager.output.log")

    if not arguments['--foreground']:
        utils.daemonize(stdout_err_log)

    logging_config.configure_logging(log_level, log_dir, "cluster-manager", show_thread=True)

    # urllib3 logs a WARNING log whenever it recreates a connection, but our
    # etcd usage does this frequently (to allow watch timeouts), so deliberately
    # ignore this log
    urllib_logger = logging.getLogger('urllib3')
    urllib_logger.setLevel(logging.ERROR)

    utils.install_sigusr1_handler("cluster-manager")

    # Drop a pidfile.
    pid = os.getpid()
    with open(arguments['--pidfile'], "w") as pidfile:
        pidfile.write(str(pid) + "\n")

    plugins_dir = "/usr/share/clearwater/clearwater-cluster-manager/plugins/"
    plugins = load_plugins_in_dir(plugins_dir,
                                  PluginParams(ip=sig_ip,
                                               mgmt_ip=mgmt_ip,
                                               local_site=local_site_name,
                                               remote_site=remote_site_name,
                                               signaling_namespace=signaling_namespace))
    plugins.sort(key=lambda x: x.key())
    plugins_to_use = []
    files = []
    for plugin in plugins:
        # Decide afresh for every plugin.  The flag used to be initialised
        # once before the loop, so after the first conflicting plugin every
        # later plugin was skipped as well, even without any conflict.
        skip = False
        for plugin_file in plugin.files():
            if plugin_file in files:
                _log.info("Skipping plugin {} because {} "
                          "is already managed by another plugin"
                          .format(plugin, plugin_file))
                skip = True

        if not skip:
            plugins_to_use.append(plugin)
            files.extend(plugin.files())

    synchronizers = []
    threads = []
    for plugin in plugins_to_use:
        syncer = EtcdSynchronizer(plugin, sig_ip, etcd_ip=mgmt_ip)
        syncer.start_thread()

        synchronizers.append(syncer)
        threads.append(syncer.thread)
        _log.info("Loaded plugin %s" % plugin)

    install_sigquit_handler(synchronizers)
    install_sigterm_handler(synchronizers)

    # Join with a one-second timeout rather than blocking indefinitely on a
    # single thread, re-checking all threads on every pass.
    # NOTE(review): isAlive() is the legacy spelling of is_alive(); confirm
    # what type syncer.thread is before modernising.
    while any(thread.isAlive() for thread in threads):
        for thread in threads:
            if thread.isAlive():
                thread.join(1)

    _log.info("No plugin threads running, waiting for a SIGTERM or SIGQUIT")
    while not should_quit:
        sleep(1)
    _log.info("Quitting")
    _log.debug("%d threads outstanding at exit" % activeCount())
    pdlogs.EXITING.log()
    syslog.closelog()
    # Use os.exit to skip exit handlers - otherwise the concurrent.futures exit
    # handler waits for an infinite wait to end
    os._exit(0)
                                        etcd_ip=local_ip,
                                        force_leave=True)
    except ImportError:
        print "You must run mark_node_failed on a node that has Cassandra installed to remove a node from a Cassandra cluster"
        sys.exit(1)
else:
    error_syncer = EtcdSynchronizer(NullPlugin(key),
                                    dead_node_ip,
                                    etcd_ip=local_ip,
                                    force_leave=True)

# print is written in the parenthesised single-argument form, which behaves
# the same on Python 2 and is also valid syntax on Python 3.
print("Marking node as failed and removing it from the cluster - will take at least 30 seconds")
# Move the dead node into ERROR state to allow in-progress operations to
# complete
error_syncer.mark_node_failed()

# Move the dead node out of the cluster
error_syncer.start_thread()
error_syncer.leave_cluster()

# Wait for it to leave
error_syncer.thread.join()

print("Process complete - %s has left the cluster" % dead_node_ip)

# Read the key back from etcd on port 4000 and log the resulting state.
c = etcd.Client(local_ip, 4000)
new_state = c.get(key).value

logging.info("New etcd state (after removing %s) is %s",
             dead_node_ip, new_state)