Example #1
0
 def __init__(self, cluster):
     # A lot of things are wrong in this method. It assumes that the IP
     # 127.0.0.<nbnode> is free and uses standard ports without asking.
     # It should probably be fixed, but will be good enough for now.
     addr = "127.0.0.%d" % (len(cluster.nodes) + 1)
     self.path = tempfile.mkdtemp(prefix="bulkloader-")
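     # Node positional args: name, cluster, auto_bootstrap, thrift_interface,
     # storage_interface, jmx_port, remote_debug_port, initial_token.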
     Node.__init__(self, "bulkloader", cluster, False, (addr, 9160), (addr, 7000), str(9042), 2000, None)
Example #2
0
 def copy_config_files(self):
     Node.copy_config_files(self)
     conf_pattern = os.path.join(self.get_tools_java_dir(), 'conf',
                                 "jvm*.options")
     for filename in glob.glob(conf_pattern):
         if os.path.isfile(filename):
             shutil.copy(filename, self.get_conf_dir())
Example #3
0
    def test_upgrade_legacy_table(self):
        """
        Upgrade while bringing up the legacy tables after the newer nodes (without legacy tables)
        were started.

        @jira_ticket CASSANDRA-12813
        """

        cluster = self.cluster

        # Forcing cluster version on purpose
        cluster.set_install_dir(version="2.1.16")
        cluster.populate(3).start()

        node1, node2, node3 = cluster.nodelist()

        # Wait for default user to get created on one of the nodes
        time.sleep(15)

        # Upgrade to current version
        for node in [node1, node2, node3]:
            node.drain()
            node.watch_log_for("DRAINED")
            node.stop(gently=True)
            self.set_node_to_current_version(node)

        cluster.start()

        # Make sure the system_auth keyspace will get replicated to the node that we're going to replace
        session = self.patient_cql_connection(node1, user='******', password='******')
        session.execute("ALTER KEYSPACE system_auth WITH REPLICATION = { 'class' : 'SimpleStrategy', 'replication_factor' : 3 };")
        cluster.repair()

        cluster.stop()

        # Replace the node
        cluster.seeds.remove(node1)
        cluster.remove(node1)

        replacement_address = node1.address()
        replacement_node = Node('replacement', cluster=self.cluster, auto_bootstrap=True,
                                thrift_interface=(replacement_address, 9160), storage_interface=(replacement_address, 7000),
                                jmx_port='7400', remote_debug_port='0', initial_token=None, binary_interface=(replacement_address, 9042))
        self.set_node_to_current_version(replacement_node)

        cluster.add(replacement_node, True)
        replacement_node.start(wait_for_binary_proto=True)

        node2.start(wait_for_binary_proto=True)
        node3.start(wait_for_binary_proto=True)

        replacement_node.watch_log_for('Initializing system_auth.credentials')
        replacement_node.watch_log_for('Initializing system_auth.permissions')
        replacement_node.watch_log_for('Initializing system_auth.users')

        cluster.repair()
        replacement_node.watch_log_for('Repair command')

        # Should succeed. Will throw an NPE on pre-12813 code.
        self.patient_cql_connection(replacement_node, user='******', password='******')
Example #4
0
    def replace_nonexistent_node_test(self):
        debug("Starting cluster with 3 nodes.")
        cluster = self.cluster
        cluster.populate(3).start()
        [node1, node2, node3] = cluster.nodelist()

        debug("Inserting Data...")
        if cluster.version() < "2.1":
            node1.stress(['-o', 'insert', '--num-keys=10000', '--replication-factor=3'])
        else:
            node1.stress(['write', 'n=10000', '-schema', 'replication(factor=3)'])
        cursor = self.patient_cql_connection(node1)
        stress_table = 'keyspace1.standard1' if self.cluster.version() >= '2.1' else '"Keyspace1"."Standard1"'
        query = SimpleStatement('select * from %s LIMIT 1' % stress_table, consistency_level=ConsistencyLevel.THREE)
        initialData = cursor.execute(query)

        debug('Start node 4 and replace an address with no node')
        node4 = Node('node4', cluster, True, ('127.0.0.4', 9160), ('127.0.0.4', 7000), '7400', '0', None, ('127.0.0.4', 9042))
        cluster.add(node4, False)

        # try to replace an unassigned IP address
        with self.assertRaises(NodeError):
            try:
                node4.start(replace_address='127.0.0.5', wait_for_binary_proto=True)
            except (NodeError, TimeoutError):
                raise NodeError("Node could not start.")
Example #5
0
    def replace_active_node_test(self):

        debug("Starting cluster with 3 nodes.")
        cluster = self.cluster
        cluster.populate(3).start()
        [node1, node2, node3] = cluster.nodelist()

        debug("Inserting Data...")
        if cluster.version() < "2.1":
            node1.stress(['-o', 'insert', '--num-keys=10000', '--replication-factor=3'])
        else:
            node1.stress(['write', 'n=10000', '-schema', 'replication(factor=3)'])
        cursor = self.patient_cql_connection(node1)
        stress_table = 'keyspace1.standard1' if self.cluster.version() >= '2.1' else '"Keyspace1"."Standard1"'
        query = SimpleStatement('select * from %s LIMIT 1' % stress_table, consistency_level=ConsistencyLevel.THREE)
        initialData = cursor.execute(query)

        # replace active node 3 with node 4
        debug("Starting node 4 to replace active node 3")
        node4 = Node('node4', cluster, True, ('127.0.0.4', 9160), ('127.0.0.4', 7000), '7400', '0', None, ('127.0.0.4', 9042))
        cluster.add(node4, False)

        with self.assertRaises(NodeError):
            try:
                node4.start(replace_address='127.0.0.3', wait_for_binary_proto=True)
            except (NodeError, TimeoutError):
                raise NodeError("Node could not start.")

        checkError = node4.grep_log("java.lang.UnsupportedOperationException: Cannot replace a live node...")
        self.assertEqual(len(checkError), 1)
Example #6
0
    def decommission_node_test(self):
        debug("decommission_node_test()")
        cluster = self.cluster

        cluster.populate(1)
        # create and add a new node; it must not be a seed, otherwise
        # we get schema disagreement issues for a while after decommissioning it.
        node2 = Node('node2', cluster, True, ('127.0.0.2', 9160),
                     ('127.0.0.2', 7000), '7200', '0', None)
        cluster.add(node2, False)

        [node1, node2] = cluster.nodelist()
        node1.start()
        node2.start()
        wait(2)

        cursor = self.cql_connection(node1).cursor()
        self.prepare_for_changes(cursor)

        node2.decommission()
        wait(30)

        self.validate_schema_consistent(node1)
        self.make_schema_changes(cursor, namespace='ns1')

        # create and add a new node
        node3 = Node('node3', cluster, True, ('127.0.0.3', 9160),
                     ('127.0.0.3', 7000), '7300', '0', None)

        cluster.add(node3, True)
        node3.start()

        wait(30)
        self.validate_schema_consistent(node1)
Example #7
0
    def replace_nonexistent_node_test(self):
        debug("Starting cluster with 3 nodes.")
        cluster = self.cluster
        cluster.populate(3).start()
        [node1, node2, node3] = cluster.nodelist()

        debug("Inserting Data...")
        if cluster.version() < "2.1":
            node1.stress(
                ['-o', 'insert', '--num-keys=10000', '--replication-factor=3'])
        else:
            node1.stress(
                ['write', 'n=10000', '-schema', 'replication(factor=3)'])
        cursor = self.patient_cql_connection(node1)
        stress_table = ('keyspace1.standard1' if self.cluster.version() >= '2.1'
                        else '"Keyspace1"."Standard1"')
        query = SimpleStatement('select * from %s LIMIT 1' % stress_table,
                                consistency_level=ConsistencyLevel.THREE)
        initialData = cursor.execute(query)

        debug('Start node 4 and replace an address with no node')
        node4 = Node('node4', cluster, True, ('127.0.0.4', 9160),
                     ('127.0.0.4', 7000), '7400', '0', None,
                     ('127.0.0.4', 9042))
        cluster.add(node4, False)

        # try to replace an unassigned IP address
        with self.assertRaises(NodeError):
            try:
                node4.start(replace_address='127.0.0.5',
                            wait_for_binary_proto=True)
            except (NodeError, TimeoutError):
                raise NodeError("Node could not start.")
Example #8
0
    def _do_replace(self, same_address=False, jvm_option='replace_address',
                    wait_other_notice=False, wait_for_binary_proto=True,
                    replace_address=None, opts=None, data_center=None,
                    extra_jvm_args=None):
        if replace_address is None:
            replace_address = self.replaced_node.address()

        # only create node if it's not yet created
        if self.replacement_node is None:
            replacement_address = '127.0.0.4'
            if same_address:
                replacement_address = self.replaced_node.address()
                self.cluster.remove(self.replaced_node)

            logger.debug("Starting replacement node {} with jvm_option '{}={}'".format(replacement_address, jvm_option, replace_address))
            self.replacement_node = Node('replacement', cluster=self.cluster, auto_bootstrap=True,
                                         thrift_interface=None, storage_interface=(replacement_address, 7000),
                                         jmx_port='7400', remote_debug_port='0', initial_token=None, binary_interface=(replacement_address, 9042))
            if opts is not None:
                logger.debug("Setting options on replacement node: {}".format(opts))
                self.replacement_node.set_configuration_options(opts)
            self.cluster.add(self.replacement_node, False, data_center=data_center)

        if extra_jvm_args is None:
            extra_jvm_args = []
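        # replace_address points the replacement at the dead node's address; the
        # shortened ring delay and broadcast interval just speed the test up.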
        extra_jvm_args.extend(["-Dcassandra.{}={}".format(jvm_option, replace_address),
                               "-Dcassandra.ring_delay_ms=10000",
                               "-Dcassandra.broadcast_interval_ms=10000"])

        self.replacement_node.start(jvm_args=extra_jvm_args,
                                    wait_for_binary_proto=wait_for_binary_proto, wait_other_notice=wait_other_notice)

        if self.cluster.cassandra_version() >= '2.2.8' and same_address:
            self.replacement_node.watch_log_for("Writes will not be forwarded to this node during replacement",
                                                timeout=60)
Example #9
0
    def _test_disk_balance_replace(self, same_address):
        logger.debug("Creating cluster")
        cluster = self.cluster
        if self.dtest_config.use_vnodes:
            cluster.set_configuration_options(values={'num_tokens': 256})
        # apparently we have legitimate errors in the log when bootstrapping (see bootstrap_test.py)
        self.fixture_dtest_setup.allow_log_errors = True
        cluster.populate(4).start(wait_for_binary_proto=True)
        node1 = cluster.nodes['node1']

        logger.debug("Populating")
        node1.stress(['write', 'n=50k', 'no-warmup', '-rate', 'threads=100', '-schema', 'replication(factor=3)', 'compaction(strategy=SizeTieredCompactionStrategy,enabled=false)'])
        cluster.flush()

        logger.debug("Stopping and removing node2")
        node2 = cluster.nodes['node2']
        node2.stop(gently=False)
        self.cluster.remove(node2)

        node5_address = node2.address() if same_address else '127.0.0.5'
        logger.debug("Starting replacement node")
        node5 = Node('node5', cluster=self.cluster, auto_bootstrap=True,
                     thrift_interface=None, storage_interface=(node5_address, 7000),
                     jmx_port='7500', remote_debug_port='0', initial_token=None,
                     binary_interface=(node5_address, 9042))
        self.cluster.add(node5, False)
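        # replace_address_first_boot only applies on the node's first start, so
        # later restarts won't re-trigger the replacement.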
        node5.start(jvm_args=["-Dcassandra.replace_address_first_boot={}".format(node2.address())],
                    wait_for_binary_proto=True,
                    wait_other_notice=True)

        logger.debug("Checking replacement node is balanced")
        self.assert_balanced(node5)
Example #10
0
    def test_replace(self):
        main_session = self.patient_cql_connection(self.node1)

        # We want the node being replaced to have no data on it so the replacement definitely fetches all the data
        self.node2.stop(wait_other_notice=True)

        for i in range(0, 40):
            print("Inserting " + str(i))
            self.insert_row(i, i, i, main_session)

        replacement_address = self.node2.address()
        self.node2.stop(wait_other_notice=True)
        self.cluster.remove(self.node2)
        self.node2 = Node('replacement',
                          cluster=self.cluster,
                          auto_bootstrap=True,
                          thrift_interface=None,
                          storage_interface=(replacement_address, 7000),
                          jmx_port='7400',
                          remote_debug_port='0',
                          initial_token=None,
                          binary_interface=(replacement_address, 9042))
        patch_start(self.node2)
        nodes = [self.node1, self.node2, self.node3]
        self.cluster.add(self.node2, False, data_center='datacenter1')
        jvm_args = [
            "-Dcassandra.replace_address=%s" % replacement_address,
            "-Dcassandra.ring_delay_ms=10000",
            "-Dcassandra.broadcast_interval_ms=10000"
        ]
        self.node2.start(jvm_args=jvm_args,
                         wait_for_binary_proto=True,
                         wait_other_notice=True)

        sessions = [
            self.exclusive_cql_connection(node)
            for node in [self.node1, self.node2, self.node3]
        ]

        # Everyone should have everything
        expected = [
            gen_expected(range(0, 40)),
            gen_expected(range(0, 40)),
            gen_expected(range(0, 40))
        ]

        self.check_replication(sessions, exactly=3)
        self.check_expected(sessions, expected)

        repair_nodes(nodes)
        cleanup_nodes(nodes)

        self.check_replication(sessions, exactly=2)

        expected = [
            gen_expected(range(0, 11), range(21, 40)),
            gen_expected(range(0, 21), range(31, 40)),
            gen_expected(range(11, 31))
        ]
        self.check_expected(sessions, expected)
Example #11
0
 def __init__(self, cluster):
     # A lot of things are wrong in this method. It assumes that the IP
     # 127.0.0.<nbnode> is free and uses standard ports without asking.
     # It should probably be fixed, but will be good enough for now.
     addr = '127.0.0.%d' % (len(cluster.nodes) + 1)
     self.path = tempfile.mkdtemp(prefix='bulkloader-')
     Node.__init__(self, 'bulkloader', cluster, False, (addr, 9160), (addr, 7000), str(9042), 2000, None)
Example #12
0
    def multiple_repair_test(self):
        cluster = self.cluster
        cluster.populate(3).start()
        node1, node2, node3 = cluster.nodelist()

        session = self.patient_cql_connection(node1)
        self.create_ks(session, 'ks', 3)
        self.create_cf(session, 'cf', read_repair=0.0, columns={'c1': 'text', 'c2': 'text'})

        debug("insert data")

        insert_c1c2(session, keys=range(1, 50), consistency=ConsistencyLevel.ALL)
        node1.flush()

        debug("bringing down node 3")
        node3.flush()
        node3.stop(gently=False)

        debug("inserting additional data into node 1 and 2")
        insert_c1c2(session, keys=range(50, 100), consistency=ConsistencyLevel.TWO)
        node1.flush()
        node2.flush()

        debug("restarting and repairing node 3")
        node3.start(wait_for_binary_proto=True)

        if cluster.version() >= "2.2":
            node3.repair()
        else:
            node3.nodetool("repair -par -inc")

        # wait for stream handlers to be closed on Windows
        # after the session is finished (see CASSANDRA-10644)
        if is_win:
            time.sleep(2)

        debug("stopping node 2")
        node2.stop(gently=False)

        debug("inserting data in nodes 1 and 3")
        insert_c1c2(session, keys=range(100, 150), consistency=ConsistencyLevel.TWO)
        node1.flush()
        node3.flush()

        debug("start and repair node 2")
        node2.start(wait_for_binary_proto=True)

        if cluster.version() >= "2.2":
            node2.repair()
        else:
            node2.nodetool("repair -par -inc")

        debug("replace node and check data integrity")
        node3.stop(gently=False)
        node5 = Node('node5', cluster, True, ('127.0.0.5', 9160), ('127.0.0.5', 7000), '7500', '0', None, ('127.0.0.5', 9042))
        cluster.add(node5, False)
        node5.start(replace_address='127.0.0.3', wait_other_notice=True)

        assert_one(session, "SELECT COUNT(*) FROM ks.cf LIMIT 200", [149])
Example #13
0
    def test_rf_gt_nodes_multidc_should_succeed(self):
        """
        Validating a KS with RF > N on multi DC doesn't break bootstrap
        @jira_ticket CASSANDRA-16296 CASSANDRA-16411
        """
        cluster = self.cluster
        cluster.set_environment_variable(
            'CASSANDRA_TOKEN_PREGENERATION_DISABLED', 'True')
        cluster.populate([1, 1])
        cluster.start()

        node1 = cluster.nodelist()[0]
        node2 = cluster.nodelist()[1]
        session = self.patient_exclusive_cql_connection(node1)
        session.execute(
            "CREATE KEYSPACE k WITH REPLICATION = {'class' : 'NetworkTopologyStrategy', 'dc1' : '3'}"
        )

        if cluster.version() >= '4.0':
            warning = 'Your replication factor 3 for keyspace k is higher than the number of nodes 1 for datacenter dc1'
            assert len(node1.grep_log(warning)) == 1
            assert len(node2.grep_log(warning)) == 0

        session.execute(
            "ALTER KEYSPACE k WITH REPLICATION = {'class' : 'NetworkTopologyStrategy', 'dc1' : '2'}"
        )
        session.execute(
            "CREATE TABLE k.testgtrfmultidc (KEY text PRIMARY KEY)")
        session.execute(
            "INSERT INTO k.testgtrfmultidc (KEY) VALUES ('test_rf_gt_nodes_multidc_should_succeed')"
        )

        if cluster.version() >= '4.0':
            warning = 'Your replication factor 2 for keyspace k is higher than the number of nodes 1 for datacenter dc1'
            assert len(node1.grep_log(warning)) == 1
            assert len(node2.grep_log(warning)) == 0

        marks = map(lambda n: n.mark_log(), cluster.nodelist())
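        # (each mark is the current end of a node's log; the from_mark greps
        # below only look at lines written after this point)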
        node3 = Node(name='node3',
                     cluster=cluster,
                     auto_bootstrap=True,
                     thrift_interface=('127.0.0.3', 9160),
                     storage_interface=('127.0.0.3', 7000),
                     jmx_port='7300',
                     remote_debug_port='0',
                     initial_token=None,
                     binary_interface=('127.0.0.3', 9042))
        cluster.add(node3, is_seed=False, data_center="dc1")
        node3.start(wait_for_binary_proto=True)
        if cluster.version() >= '4.0':
            warning = 'is higher than the number of nodes'
            for (node, mark) in zip(cluster.nodelist(), marks):
                assert len(node.grep_log(warning, from_mark=mark)) == 0

        session3 = self.patient_exclusive_cql_connection(node3)
        assert_one(session3, "SELECT * FROM k.testgtrfmultidc",
                   ["test_rf_gt_nodes_multidc_should_succeed"])
Example #14
0
    def multiple_repair_test(self):
        cluster = self.cluster
        cluster.populate(3).start()
        [node1, node2, node3] = cluster.nodelist()

        cursor = self.patient_cql_connection(node1)
        self.create_ks(cursor, 'ks', 3)
        self.create_cf(cursor, 'cf', read_repair=0.0, columns={'c1': 'text', 'c2': 'text'})

        debug("insert data")

        for x in range(1, 50):
            insert_c1c2(cursor, x, ConsistencyLevel.ALL)
        node1.flush()

        debug("bringing down node 3")
        node3.flush()
        node3.stop(gently=False)

        debug("inserting additional data into node 1 and 2")
        for y in range(50, 100):
            insert_c1c2(cursor, y, ConsistencyLevel.TWO)
        node1.flush()
        node2.flush()

        debug("restarting and repairing node 3")
        node3.start()

        if cluster.version() >= "3.0":
            node3.repair()
        else:
            node3.nodetool("repair -par -inc")

        debug("stopping node 2")
        node2.stop(gently=False)

        debug("inserting data in nodes 1 and 3")
        for z in range(100, 150):
            insert_c1c2(cursor, z, ConsistencyLevel.TWO)
        node1.flush()
        node3.flush()

        debug("start and repair node 2")
        node2.start()

        if cluster.version() >= "3.0":
            node2.repair()
        else:
            node2.nodetool("repair -par -inc")

        debug("replace node and check data integrity")
        node3.stop(gently=False)
        node5 = Node('node5', cluster, True, ('127.0.0.5', 9160), ('127.0.0.5', 7000), '7500', '0', None, ('127.0.0.5', 9042))
        cluster.add(node5, False)
        node5.start(replace_address='127.0.0.3', wait_other_notice=True)

        assert_one(cursor, "SELECT COUNT(*) FROM ks.cf LIMIT 200", [149])
Example #15
0
    def resumable_replace_test(self):
        """Test resumable bootstrap while replacing node"""

        cluster = self.cluster
        cluster.populate(3).start()
        node1, node2, node3 = cluster.nodelist()

        node1.stress(['write', 'n=100000', '-schema', 'replication(factor=3)'])

        session = self.patient_cql_connection(node1)
        stress_table = 'keyspace1.standard1'
        query = SimpleStatement('select * from %s LIMIT 1' % stress_table,
                                consistency_level=ConsistencyLevel.THREE)
        initialData = list(session.execute(query))

        node3.stop(gently=False)

        # kill node1 in the middle of streaming to let it fail
        t = InterruptBootstrap(node1)
        t.start()
        # replace node 3 with node 4
        debug("Starting node 4 to replace node 3")
        node4 = Node('node4',
                     cluster,
                     True, ('127.0.0.4', 9160), ('127.0.0.4', 7000),
                     '7400',
                     '0',
                     None,
                     binary_interface=('127.0.0.4', 9042))
        cluster.add(node4, False)
        try:
            node4.start(
                jvm_args=["-Dcassandra.replace_address_first_boot=127.0.0.3"])
        except NodeError:
            pass  # node doesn't start as expected
        t.join()

        # bring back node1 and invoke nodetool bootstrap to resume bootstrapping
        node1.start()
        node4.nodetool('bootstrap resume')
        # check if we skipped already retrieved ranges
        node4.watch_log_for("already available. Skipping streaming.")
        # wait for node4 to be ready to query
        node4.watch_log_for("Listening for thrift clients...")

        # check if 2nd bootstrap succeeded
        session = self.exclusive_cql_connection(node4)
        rows = list(
            session.execute(
                "SELECT bootstrapped FROM system.local WHERE key='local'"))
        assert len(rows) == 1
        assert rows[0][0] == 'COMPLETED', rows[0][0]

        # query should work again
        debug("Verifying querying works again.")
        finalData = list(session.execute(query))
        self.assertListEqual(initialData, finalData)
Example #16
0
        def configure_node(self, node: Node, index):
            # set dc/rack manually, since CCM doesn't support custom racks
            node.set_configuration_options({
                'endpoint_snitch':
                'GossipingPropertyFileSnitch',
            })

            rackdc_path = Path(
                node.get_conf_dir()) / 'cassandra-rackdc.properties'

            with open(rackdc_path, 'w') as f:
                f.write(f'dc={node.dc}\nrack={node.rack}\n')
Example #17
0
    def decommission_node_test(self):
        debug("decommission_node_test()")
        cluster = self.cluster

        cluster.populate(1)
        # create and add a new node; it must not be a seed, otherwise
        # we get schema disagreement issues for a while after decommissioning it.
        node2 = Node("node2", cluster, True, ("127.0.0.2", 9160), ("127.0.0.2", 7000), "7200", "0", None)
        cluster.add(node2, False)

        [node1, node2] = cluster.nodelist()
        node1.start()
        node2.start()
        wait(2)

        cursor = self.cql_connection(node1).cursor()
        self.prepare_for_changes(cursor)

        node2.decommission()
        wait(30)

        self.validate_schema_consistent(node1)
        self.make_schema_changes(cursor, namespace="ns1")

        # create and add a new node
        node3 = Node("node3", cluster, True, ("127.0.0.3", 9160), ("127.0.0.3", 7000), "7300", None)

        cluster.add(node3, True)
        node3.start()

        wait(30)
        self.validate_schema_consistent(node1)
Example #18
0
    def test_add_and_remove_node(self):
        """
        Test that NEW_NODE and REMOVED_NODE are sent correctly as nodes join and leave.
        @jira_ticket CASSANDRA-11038
        """
        self.cluster.populate(1).start()
        node1 = self.cluster.nodelist()[0]

        waiter = NotificationWaiter(self, node1,
                                    ["STATUS_CHANGE", "TOPOLOGY_CHANGE"])

        # need to block for up to 2 notifications (NEW_NODE and UP) so that these notifications
        # don't confuse the state below
        logger.debug("Waiting for unwanted notifications...")
        waiter.wait_for_notifications(timeout=30, num_notifications=2)
        waiter.clear_notifications()

        session = self.patient_cql_connection(node1)
        # reduce system_distributed and system_traces RF to 1 so we don't require forceful decommission
        session.execute(
            "ALTER KEYSPACE system_distributed WITH REPLICATION = {'class':'SimpleStrategy', 'replication_factor':'1'};"
        )
        session.execute(
            "ALTER KEYSPACE system_traces WITH REPLICATION = {'class':'SimpleStrategy', 'replication_factor':'1'};"
        )

        logger.debug("Adding second node...")
        node2 = Node('node2',
                     self.cluster,
                     True,
                     None, ('127.0.0.2', 7000),
                     '7200',
                     '0',
                     None,
                     binary_interface=('127.0.0.2', 9042))
        self.cluster.add(node2, False)
        node2.start()
        logger.debug("Waiting for notifications from {}".format(
            waiter.address))
        notifications = waiter.wait_for_notifications(timeout=120.0,
                                                      num_notifications=2)
        assert 2 == len(notifications), notifications
        for notification in notifications:
            assert get_ip_from_node(node2) == notification["address"][0]
            assert "NEW_NODE" == notifications[0]["change_type"]
            assert "UP" == notifications[1]["change_type"]

        logger.debug("Removing second node...")
        waiter.clear_notifications()
        node2.decommission()
        node2.stop(gently=False)
        logger.debug("Waiting for notifications from {}".format(
            waiter.address))
        notifications = waiter.wait_for_notifications(timeout=120.0,
                                                      num_notifications=2)
        assert 2 == len(notifications), notifications
        for notification in notifications:
            assert get_ip_from_node(node2) == notification["address"][0]
            assert "REMOVED_NODE" == notifications[0]["change_type"]
            assert "DOWN" == notifications[1]["change_type"]
Example #19
0
    def issue_150_test(self):
        self.cluster = Cluster(CLUSTER_PATH, "150", cassandra_version='2.0.9')
        self.cluster.populate([1, 2], use_vnodes=True)
        self.cluster.start()
        dcs = [node.data_center for node in self.cluster.nodelist()]
        dcs.append('dc2')

        node4 = Node('node4', self.cluster, True, ('127.0.0.4', 9160),
                     ('127.0.0.4', 7000), '7400', '2000', None)
        self.cluster.add(node4, False, 'dc2')
        node4.start()

        dcs_2 = [node.data_center for node in self.cluster.nodelist()]
        self.assertItemsEqual(dcs, dcs_2)
        node4.nodetool('status')
Example #20
0
    def test_decommission_node(self):
        logger.debug("decommission_node_test()")
        cluster = self.cluster

        cluster.populate(1)
        # create and add a new node; it must not be a seed, otherwise
        # we get schema disagreement issues for a while after decommissioning it.
        node2 = Node('node2',
                     cluster,
                     True,
                     ('127.0.0.2', 9160),
                     ('127.0.0.2', 7000),
                     '7200',
                     '0',
                     None,
                     binary_interface=('127.0.0.2', 9042))
        cluster.add(node2, False)

        node1, node2 = cluster.nodelist()
        node1.start(wait_for_binary_proto=True)
        node2.start(wait_for_binary_proto=True)
        wait(2)

        session = self.patient_cql_connection(node1)
        self.prepare_for_changes(session)

        node2.decommission()
        wait(30)

        self.validate_schema_consistent(node1)
        self.make_schema_changes(session, namespace='ns1')

        # create and add a new node
        node3 = Node('node3',
                     cluster,
                     True,
                     ('127.0.0.3', 9160),
                     ('127.0.0.3', 7000),
                     '7300',
                     '0',
                     None,
                     binary_interface=('127.0.0.3', 9042))

        cluster.add(node3, True)
        node3.start(wait_for_binary_proto=True)

        wait(30)
        self.validate_schema_consistent(node1)
Example #21
0
    def _do_replace(self, same_address=False, jvm_option='replace_address',
                    wait_other_notice=False, wait_for_binary_proto=True,
                    replace_address=None, opts=None, data_center=None,
                    extra_jvm_args=None):
        if replace_address is None:
            replace_address = self.replaced_node.address()

        # only create node if it's not yet created
        if self.replacement_node is None:
            replacement_address = '127.0.0.4'
            if same_address:
                replacement_address = self.replaced_node.address()
                self.cluster.remove(self.replaced_node)

            logger.debug("Starting replacement node {} with jvm_option '{}={}'".format(replacement_address, jvm_option, replace_address))
            self.replacement_node = Node('replacement', cluster=self.cluster, auto_bootstrap=True,
                                         thrift_interface=None, storage_interface=(replacement_address, 7000),
                                         jmx_port='7400', remote_debug_port='0', initial_token=None, binary_interface=(replacement_address, 9042))
            if opts is not None:
                logger.debug("Setting options on replacement node: {}".format(opts))
                self.replacement_node.set_configuration_options(opts)
            self.cluster.add(self.replacement_node, False, data_center=data_center)

        if extra_jvm_args is None:
            extra_jvm_args = []
        extra_jvm_args.extend(["-Dcassandra.{}={}".format(jvm_option, replace_address),
                               "-Dcassandra.ring_delay_ms=10000",
                               "-Dcassandra.broadcast_interval_ms=10000"])

        self.replacement_node.start(jvm_args=extra_jvm_args,
                                    wait_for_binary_proto=wait_for_binary_proto, wait_other_notice=wait_other_notice)

        if self.cluster.cassandra_version() >= '2.2.8' and same_address:
            self.replacement_node.watch_log_for("Writes will not be forwarded to this node during replacement",
                                                timeout=60)
Example #22
0
def bootstrap(node, data_center=None, token=None):
    log.debug('called bootstrap(node={node}, data_center={data_center}, '
              'token={token})'.format(node=node, data_center=data_center,
                                      token=token))
    node_instance = Node('node%s' % node,
                         get_cluster(),
                         auto_bootstrap=False,
                         thrift_interface=(IP_FORMAT % node, 9160),
                         storage_interface=(IP_FORMAT % node, 7000),
                         binary_interface=(IP_FORMAT % node, 9042),
                         jmx_port=str(7000 + 100 * node),
                         remote_debug_port=0,
                         initial_token=token if token else node * 10)
    get_cluster().add(node_instance, is_seed=False, data_center=data_center)

    try:
        start(node)
    except Exception as e0:
        log.debug('failed 1st bootstrap attempt with: \n{}'.format(e0))
        # Try only twice
        try:
            start(node)
        except Exception as e1:
            log.debug('failed 2nd bootstrap attempt with: \n{}'.format(e1))
            log.error('Added node failed to start twice.')
            raise e1
Example #23
0
 def run(self):
     try:
         if self.options.dse_node:
             node = DseNode(self.name,
                            self.cluster,
                            self.options.bootstrap,
                            self.thrift,
                            self.storage,
                            self.jmx_port,
                            self.remote_debug_port,
                            self.initial_token,
                            binary_interface=self.binary)
         else:
             node = Node(self.name,
                         self.cluster,
                         self.options.bootstrap,
                         self.thrift,
                         self.storage,
                         self.jmx_port,
                         self.remote_debug_port,
                         self.initial_token,
                         binary_interface=self.binary)
         self.cluster.add(node, self.options.is_seed,
                          self.options.data_center)
     except common.ArgumentError as e:
         print_(str(e), file=sys.stderr)
         exit(1)
Example #24
0
    def replace_nonexistent_node_test(self):
        debug("Starting cluster with 3 nodes.")
        cluster = self.cluster
        cluster.populate(3).start()
        node1, node2, node3 = cluster.nodelist()

        debug('Start node 4 and replace an address with no node')
        node4 = Node('node4', cluster, True, ('127.0.0.4', 9160), ('127.0.0.4', 7000), '7400', '0', None, ('127.0.0.4', 9042))
        cluster.add(node4, False)

        # try to replace an unassigned IP address
        with self.assertRaises(NodeError):
            try:
                node4.start(replace_address='127.0.0.5', wait_for_binary_proto=True)
            except (NodeError, TimeoutError):
                raise NodeError("Node could not start.")
Example #25
0
    def load(path, name):
        cluster_path = os.path.join(path, name)
        filename = os.path.join(cluster_path, 'cluster.conf')
        with open(filename, 'r') as f:
            data = yaml.safe_load(f)
        try:
            cassandra_dir = None
            if 'cassandra_dir' in data:
                cassandra_dir = data['cassandra_dir']
                repository.validate(cassandra_dir)

            cluster = Cluster(path, data['name'], cassandra_dir=cassandra_dir, create_directory=False)
            node_list = data['nodes']
            seed_list = data['seeds']
            if 'partitioner' in data:
                cluster.partitioner = data['partitioner']
            if 'config_options' in data:
                cluster._config_options = data['config_options']
            if 'log_level' in data:
                cluster.__log_level = data['log_level']
        except KeyError as k:
            raise common.LoadError("Error Loading " + filename + ", missing property:" + k)

        for node_name in node_list:
            cluster.nodes[node_name] = Node.load(cluster_path, node_name, cluster)
        for seed_name in seed_list:
            cluster.seeds.append(cluster.nodes[seed_name])

        return cluster
Example #26
0
 def create_node(self,
                 name,
                 auto_bootstrap,
                 thrift_interface,
                 storage_interface,
                 jmx_port,
                 remote_debug_port,
                 initial_token,
                 save=True,
                 binary_interface=None,
                 byteman_port='0',
                 environment_variables=None,
                 derived_cassandra_version=None):
     return Node(name,
                 self,
                 auto_bootstrap,
                 thrift_interface,
                 storage_interface,
                 jmx_port,
                 remote_debug_port,
                 initial_token,
                 save,
                 binary_interface,
                 byteman_port,
                 environment_variables,
                 derived_cassandra_version=derived_cassandra_version)
Example #27
0
    def load(path, name):
        cluster_path = os.path.join(path, name)
        filename = os.path.join(cluster_path, 'cluster.conf')
        with open(filename, 'r') as f:
            data = yaml.safe_load(f)
        try:
            cassandra_dir = None
            if 'cassandra_dir' in data:
                cassandra_dir = data['cassandra_dir']
                repository.validate(cassandra_dir)

            cluster = Cluster(path, data['name'], cassandra_dir=cassandra_dir, create_directory=False)
            node_list = data['nodes']
            seed_list = data['seeds']
            if 'partitioner' in data:
                cluster.partitioner = data['partitioner']
            if 'config_options' in data:
                cluster._config_options = data['config_options']
            if 'log_level' in data:
                cluster.__log_level = data['log_level']
        except KeyError as k:
            raise common.LoadError("Error Loading " + filename + ", missing property:" + k)

        for node_name in node_list:
            cluster.nodes[node_name] = Node.load(cluster_path, node_name, cluster)
        for seed_name in seed_list:
            cluster.seeds.append(cluster.nodes[seed_name])

        return cluster
Example #28
0
    def load(path, name):
        cluster_path = os.path.join(path, name)
        filename = os.path.join(cluster_path, 'cluster.conf')
        with open(filename, 'r') as f:
            data = yaml.safe_load(f)
        try:
            install_dir = None
            if 'install_dir' in data:
                install_dir = data['install_dir']
                repository.validate(install_dir)
            if install_dir is None and 'cassandra_dir' in data:
                install_dir = data['cassandra_dir']
                repository.validate(install_dir)

            cassandra_version = None
            if 'cassandra_version' in data:
                cassandra_version = LooseVersion(data['cassandra_version'])

            if common.isDse(install_dir):
                cluster = DseCluster(
                    path,
                    data['name'],
                    install_dir=install_dir,
                    create_directory=False,
                    derived_cassandra_version=cassandra_version)
            else:
                cluster = Cluster(path,
                                  data['name'],
                                  install_dir=install_dir,
                                  create_directory=False,
                                  derived_cassandra_version=cassandra_version)
            node_list = data['nodes']
            seed_list = data['seeds']
            if 'partitioner' in data:
                cluster.partitioner = data['partitioner']
            if 'config_options' in data:
                cluster._config_options = data['config_options']
            if 'dse_config_options' in data:
                cluster._dse_config_options = data['dse_config_options']
            if 'misc_config_options' in data:
                cluster._misc_config_options = data['misc_config_options']
            if 'log_level' in data:
                cluster.__log_level = data['log_level']
            if 'use_vnodes' in data:
                cluster.use_vnodes = data['use_vnodes']
            if 'datadirs' in data:
                cluster.data_dir_count = int(data['datadirs'])
            extension.load_from_cluster_config(cluster, data)
        except KeyError as k:
            raise common.LoadError("Error Loading " + filename +
                                   ", missing property:" + k)

        for node_name in node_list:
            cluster.nodes[node_name] = Node.load(cluster_path, node_name,
                                                 cluster)
        for seed in seed_list:
            cluster.seeds.append(seed)

        return cluster
Example #29
0
    def replace_active_node_test(self):
        debug("Starting cluster with 3 nodes.")
        cluster = self.cluster
        cluster.populate(3).start()
        node1, node2, node3 = cluster.nodelist()

        # replace active node 3 with node 4
        debug("Starting node 4 to replace active node 3")
        node4 = Node('node4',
                     cluster=cluster,
                     auto_bootstrap=True,
                     thrift_interface=('127.0.0.4', 9160),
                     storage_interface=('127.0.0.4', 7000),
                     jmx_port='7400',
                     remote_debug_port='0',
                     initial_token=None,
                     binary_interface=('127.0.0.4', 9042))
        cluster.add(node4, False)

        mark = node4.mark_log()
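        # watch_log_for(from_mark=mark) below only matches log output produced
        # after this point, i.e. by the failed start attempt.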
        node4.start(replace_address='127.0.0.3', wait_other_notice=False)
        node4.watch_log_for(
            "java.lang.UnsupportedOperationException: Cannot replace a live node...",
            from_mark=mark)
        assert_not_running(node4)
Example #30
0
    def replace_nonexistent_node_test(self):
        debug("Starting cluster with 3 nodes.")
        cluster = self.cluster
        cluster.populate(3).start()
        node1, node2, node3 = cluster.nodelist()

        debug('Start node 4 and replace an address with no node')
        node4 = Node('node4',
                     cluster=cluster,
                     auto_bootstrap=True,
                     thrift_interface=('127.0.0.4', 9160),
                     storage_interface=('127.0.0.4', 7000),
                     jmx_port='7400',
                     remote_debug_port='0',
                     initial_token=None,
                     binary_interface=('127.0.0.4', 9042))
        cluster.add(node4, False)

        # try to replace an unassigned ip address
        mark = node4.mark_log()
        node4.start(replace_address='127.0.0.5', wait_other_notice=False)
        node4.watch_log_for(
            "java.lang.RuntimeException: Cannot replace_address /127.0.0.5 because it doesn't exist in gossip",
            from_mark=mark)
        assert_not_running(node4)
Example #31
0
    def resumable_replace_test(self):
        """Test resumable bootstrap while replacing node"""

        cluster = self.cluster
        cluster.populate(3).start()
        node1, node2, node3 = cluster.nodelist()

        node1.stress(['write', 'n=100000', '-schema', 'replication(factor=3)'])

        session = self.patient_cql_connection(node1)
        stress_table = 'keyspace1.standard1'
        query = SimpleStatement('select * from %s LIMIT 1' % stress_table, consistency_level=ConsistencyLevel.THREE)
        initialData = list(session.execute(query))

        node3.stop(gently=False)

        # kill node1 in the middle of streaming to let it fail
        t = InterruptBootstrap(node1)
        t.start()
        # replace node 3 with node 4
        debug("Starting node 4 to replace node 3")
        node4 = Node('node4', cluster, True, ('127.0.0.4', 9160), ('127.0.0.4', 7000), '7400', '0', None, binary_interface=('127.0.0.4', 9042))
        cluster.add(node4, False)
        try:
            node4.start(jvm_args=["-Dcassandra.replace_address_first_boot=127.0.0.3"])
        except NodeError:
            pass  # node doesn't start as expected
        t.join()

        # bring back node1 and invoke nodetool bootstrap to resume bootstrapping
        node1.start()
        node4.nodetool('bootstrap resume')
        # check if we skipped already retrieved ranges
        node4.watch_log_for("already available. Skipping streaming.")
        # wait for node4 to be ready to query
        node4.watch_log_for("Listening for thrift clients...")

        # check if 2nd bootstrap succeeded
        session = self.exclusive_cql_connection(node4)
        rows = list(session.execute("SELECT bootstrapped FROM system.local WHERE key='local'"))
        assert len(rows) == 1
        assert rows[0][0] == 'COMPLETED', rows[0][0]

        # query should work again
        debug("Verifying querying works again.")
        finalData = list(session.execute(query))
        self.assertListEqual(initialData, finalData)
Example #32
0
    def test_replace(self):
        main_session = self.patient_cql_connection(self.node1)

        # We want the node being replaced to have no data on it so the replacement definitely fetches all the data
        self.node2.stop(wait_other_notice=True)

        for i in range(0, 40):
            self.insert_row(i, i, i, main_session)

        replacement_address = self.node2.address()
        self.node2.stop(wait_other_notice=True)
        self.cluster.remove(self.node2)
        self.node2 = Node('replacement',
                          cluster=self.cluster,
                          auto_bootstrap=True,
                          thrift_interface=None,
                          storage_interface=(replacement_address, 7000),
                          jmx_port='7400',
                          remote_debug_port='0',
                          initial_token=None,
                          binary_interface=(replacement_address, 9042))
        patch_start(self.node2)
        nodes = [self.node1, self.node2, self.node3]
        self.cluster.add(self.node2, False, data_center='datacenter1')
        jvm_args = [
            "-Dcassandra.replace_address=%s" % replacement_address,
            "-Dcassandra.ring_delay_ms=10000",
            "-Dcassandra.broadcast_interval_ms=10000"
        ]
        self.node2.start(jvm_args=jvm_args, wait_for_binary_proto=True)

        sessions = [
            self.exclusive_cql_connection(node)
            for node in [self.node1, self.node2, self.node3]
        ]

        self._everyone_should_have_everything(sessions)

        repair_nodes(nodes)
        cleanup_nodes(nodes)

        self._nodes_have_proper_ranges_after_repair_and_cleanup(sessions)
Example #33
0
    def replace_active_node_test(self):

        debug("Starting cluster with 3 nodes.")
        cluster = self.cluster
        cluster.populate(3).start()
        node1, node2, node3 = cluster.nodelist()

        # replace active node 3 with node 4
        debug("Starting node 4 to replace active node 3")
        node4 = Node('node4', cluster, True, ('127.0.0.4', 9160), ('127.0.0.4', 7000), '7400', '0', None, ('127.0.0.4', 9042))
        cluster.add(node4, False)

        with self.assertRaises(NodeError):
            try:
                node4.start(replace_address='127.0.0.3', wait_for_binary_proto=True)
            except (NodeError, TimeoutError):
                raise NodeError("Node could not start.")

        checkError = node4.grep_log("java.lang.UnsupportedOperationException: Cannot replace a live node...")
        self.assertEqual(len(checkError), 1)
Example #34
0
    def replace_with_insufficient_replicas_test(self):
        """
        Test that replace fails when there are insufficient replicas
        @jira_ticket CASSANDRA-11848
        """
        debug("Starting cluster with 3 nodes.")
        cluster = self.cluster
        cluster.populate(3).start()
        node1, node2, node3 = cluster.nodelist()

        if DISABLE_VNODES:
            num_tokens = 1
        else:
            # a little hacky but grep_log returns the whole line...
            num_tokens = int(node3.get_conf_option('num_tokens'))

        debug("testing with num_tokens: {}".format(num_tokens))

        debug("Inserting Data...")
        node1.stress(['write', 'n=10K', 'no-warmup', '-schema', 'replication(factor=2)'])

        # stop node to replace
        debug("Stopping node to replace.")
        node3.stop(wait_other_notice=True)

        # stop other replica
        debug("Stopping node2 (other replica)")
        node2.stop(wait_other_notice=True)

        # replace node 3 with node 4
        debug("Starting node 4 to replace node 3")

        node4 = Node('node4', cluster=cluster, auto_bootstrap=True, thrift_interface=('127.0.0.4', 9160),
                     storage_interface=('127.0.0.4', 7000), jmx_port='7400', remote_debug_port='0',
                     initial_token=None, binary_interface=('127.0.0.4', 9042))
        cluster.add(node4, False)
        node4.start(replace_address='127.0.0.3', wait_for_binary_proto=False, wait_other_notice=False)

        # replace should fail due to insufficient replicas
        node4.watch_log_for("Unable to find sufficient sources for streaming range")
        assert_not_running(node4)
Example #35
0
def new_node(cluster, bootstrap=True, token=None, remote_debug_port='2000'):
    i = len(cluster.nodes) + 1
    node = Node('node%s' % i,
                cluster,
                bootstrap,
                ('127.0.0.%s' % i, 9160),
                ('127.0.0.%s' % i, 7000),
                str(7000 + i * 100),
                remote_debug_port,
                token)
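    # a node that won't bootstrap is added as a seed (second arg to add() is is_seed)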
    cluster.add(node, not bootstrap)
    return node
Example #36
0
    def populate(self,
                 nodes,
                 debug=False,
                 tokens=None,
                 use_vnodes=False,
                 ipprefix='127.0.0.'):
        node_count = nodes
        dcs = []
        if isinstance(nodes, list):
            self.set_configuration_options(
                values={
                    'endpoint_snitch':
                    'org.apache.cassandra.locator.PropertyFileSnitch'
                })
            node_count = 0
            i = 0
            for c in nodes:
                i = i + 1
                node_count = node_count + c
                for x in xrange(0, c):
                    dcs.append('dc%d' % i)

        if node_count < 1:
            raise common.ArgumentError('invalid node count %s' % nodes)

        for i in xrange(1, node_count + 1):
            if 'node%s' % i in self.nodes:
                raise common.ArgumentError(
                    'Cannot create existing node node%s' % i)

        if tokens is None and not use_vnodes:
            tokens = self.balanced_tokens(node_count)

        for i in xrange(1, node_count + 1):
            tk = None
            if tokens is not None and i - 1 < len(tokens):
                tk = tokens[i - 1]
            dc = dcs[i - 1] if i - 1 < len(dcs) else None

            binary = None
            if self.version() >= '1.2':
                binary = ('%s%s' % (ipprefix, i), 9042)
            node = Node('node%s' % i,
                        self,
                        False, ('%s%s' % (ipprefix, i), 9160),
                        ('%s%s' % (ipprefix, i), 7000),
                        str(7000 + i * 100),
                        str(2000 + i * 100) if debug else '0',
                        tk,
                        binary_interface=binary)
            self.add(node, True, dc)
            self.__update_config()
        return self
Example #37
0
 def _init_new_loading_node(self, ks_name, create_stmt, use_thrift=False):
     loading_node = Node(
         name='node2',
         cluster=self.cluster,
         auto_bootstrap=False,
         thrift_interface=('127.0.0.2', 9160) if use_thrift else None,
         storage_interface=('127.0.0.2', 7000),
         jmx_port='7400',
         remote_debug_port='0',
         initial_token=None,
         binary_interface=('127.0.0.2', 9042)
     )
     logger.debug('adding node')
     self.cluster.add(loading_node, is_seed=True)
     logger.debug('starting new node')
     loading_node.start(wait_for_binary_proto=True)
     logger.debug('recreating ks and table')
     loading_session = self.patient_exclusive_cql_connection(loading_node)
     create_ks(loading_session, ks_name, rf=1)
     logger.debug('creating new table')
     loading_session.execute(create_stmt)
     logger.debug('stopping new node')
     loading_session.cluster.shutdown()
     loading_node.stop()
     return loading_node
Example #38
0
def new_node(cluster, bootstrap=True, token=None, remote_debug_port='0', data_center=None):
    i = len(cluster.nodes) + 1
    node = Node('node%s' % i,
                cluster,
                bootstrap,
                ('127.0.0.%s' % i, 9160),
                ('127.0.0.%s' % i, 7000),
                str(7000 + i * 100),
                remote_debug_port,
                token,
                binary_interface=('127.0.0.%s' % i, 9042))
    cluster.add(node, not bootstrap, data_center=data_center)
    return node
Example #39
0
    def replace_active_node_test(self):

        debug("Starting cluster with 3 nodes.")
        cluster = self.cluster
        cluster.populate(3).start()
        [node1, node2, node3] = cluster.nodelist()

        debug("Inserting Data...")
        if cluster.version() < "2.1":
            node1.stress(
                ['-o', 'insert', '--num-keys=10000', '--replication-factor=3'])
        else:
            node1.stress(
                ['write', 'n=10000', '-schema', 'replication(factor=3)'])
        cursor = self.patient_cql_connection(node1)
        stress_table = ('keyspace1.standard1' if self.cluster.version() >= '2.1'
                        else '"Keyspace1"."Standard1"')
        query = SimpleStatement('select * from %s LIMIT 1' % stress_table,
                                consistency_level=ConsistencyLevel.THREE)
        initialData = cursor.execute(query)

        # replace active node 3 with node 4
        debug("Starting node 4 to replace active node 3")
        node4 = Node('node4', cluster, True, ('127.0.0.4', 9160),
                     ('127.0.0.4', 7000), '7400', '0', None,
                     ('127.0.0.4', 9042))
        cluster.add(node4, False)

        with self.assertRaises(NodeError):
            try:
                node4.start(replace_address='127.0.0.3',
                            wait_for_binary_proto=True)
            except (NodeError, TimeoutError):
                raise NodeError("Node could not start.")

        checkError = node4.grep_log(
            "java.lang.UnsupportedOperationException: Cannot replace a live node..."
        )
        self.assertEqual(len(checkError), 1)
Example #40
0
 def create_node(self,
                 name,
                 auto_bootstrap,
                 thrift_interface,
                 storage_interface,
                 jmx_port,
                 remote_debug_port,
                 initial_token,
                 save=True,
                 binary_interface=None):
     return Node(name, self, auto_bootstrap, thrift_interface,
                 storage_interface, jmx_port, remote_debug_port,
                 initial_token, save, binary_interface)
Example #41
0
    def load(path, name):
        cluster_path = os.path.join(path, name)
        filename = os.path.join(cluster_path, 'cluster.conf')
        with open(filename, 'r') as f:
            data = yaml.safe_load(f)
        try:
            install_dir = None
            if 'install_dir' in data:
                install_dir = data['install_dir']
                repository.validate(install_dir)
            if install_dir is None and 'cassandra_dir' in data:
                install_dir = data['cassandra_dir']
                repository.validate(install_dir)

            if common.isScylla(install_dir):
                cluster = ScyllaCluster(path,
                                        data['name'],
                                        install_dir=install_dir,
                                        create_directory=False)
            elif common.isDse(install_dir):
                cluster = DseCluster(path,
                                     data['name'],
                                     install_dir=install_dir,
                                     create_directory=False)
            else:
                cluster = Cluster(path,
                                  data['name'],
                                  install_dir=install_dir,
                                  create_directory=False)
            node_list = data['nodes']
            seed_list = data['seeds']
            if 'partitioner' in data:
                cluster.partitioner = data['partitioner']
            if 'config_options' in data:
                cluster._config_options = data['config_options']
            if 'log_level' in data:
                cluster.__log_level = data['log_level']
            if 'use_vnodes' in data:
                cluster.use_vnodes = data['use_vnodes']
        except KeyError as k:
            raise common.LoadError("Error Loading " + filename +
                                   ", missing property:" + str(k))

        for node_name in node_list:
            cluster.nodes[node_name] = Node.load(cluster_path, node_name,
                                                 cluster)
        for seed_name in seed_list:
            cluster.seeds.append(cluster.nodes[seed_name])

        return cluster
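For reference, the parsed cluster.conf data this loader expects has roughly the following shape. This is a hypothetical example covering only the keys read above; real files are written out by ccm and may contain more fields:

    data = {
        'name': 'test',
        'install_dir': '/path/to/cassandra',   # or the legacy 'cassandra_dir' key
        'nodes': ['node1', 'node2', 'node3'],
        'seeds': ['node1'],
        'partitioner': 'org.apache.cassandra.dht.Murmur3Partitioner',
        'config_options': {'num_tokens': 256},
        'log_level': 'INFO',
        'use_vnodes': True,
    }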
    def decommission_node_schema_check_test(self):
        cluster = self.cluster

        cluster.populate(1)
        # create and add a non-seed node.
        node2 = Node('node2',
                     cluster,
                     True,
                     ('127.0.0.2', 9160),
                     ('127.0.0.2', 7000),
                     '7200',
                     '0',
                     None)
        cluster.add(node2, False)

        [node1, node2] = cluster.nodelist()
        node1.start()
        node2.start()
        time.sleep(2)

        node2.decommission()
        time.sleep(30)

        self.validate_schema_consistent(node1)
    def add_and_remove_node_test(self):
        """
        Test that NEW_NODE and REMOVED_NODE are sent correctly as nodes join and leave.
        @jira_ticket CASSANDRA-11038
        """
        self.cluster.populate(1).start(wait_for_binary_proto=True)
        node1 = self.cluster.nodelist()[0]

        waiter = NotificationWaiter(self, node1,
                                    ["STATUS_CHANGE", "TOPOLOGY_CHANGE"])

        # need to block for up to 2 notifications (NEW_NODE and UP) so that these notifications
        # don't confuse the state below
        debug("Waiting for unwanted notifications...")
        waiter.wait_for_notifications(timeout=30, num_notifications=2)
        waiter.clear_notifications()

        debug("Adding second node...")
        node2 = Node('node2', self.cluster, True, ('127.0.0.2', 9160),
                     ('127.0.0.2', 7000), '7200', '0', None,
                     binary_interface=('127.0.0.2', 9042))
        self.cluster.add(node2, False)
        node2.start(wait_other_notice=True)
        debug("Waiting for notifications from {}".format(waiter.address))
        notifications = waiter.wait_for_notifications(timeout=60.0,
                                                      num_notifications=2)
        self.assertEquals(2, len(notifications), notifications)
        for notification in notifications:
            self.assertEquals(self.get_ip_from_node(node2),
                              notification["address"][0])
            self.assertEquals("NEW_NODE", notifications[0]["change_type"])
            self.assertEquals("UP", notifications[1]["change_type"])

        debug("Removing second node...")
        waiter.clear_notifications()
        node2.decommission()
        node2.stop(gently=False)
        debug("Waiting for notifications from {}".format(waiter.address))
        notifications = waiter.wait_for_notifications(timeout=60.0,
                                                      num_notifications=2)
        self.assertEquals(2, len(notifications), notifications)
        for notification in notifications:
            self.assertEquals(self.get_ip_from_node(node2),
                              notification["address"][0])
            self.assertEquals("REMOVED_NODE", notifications[0]["change_type"])
            self.assertEquals("DOWN", notifications[1]["change_type"])
Example #44
0
    def load(path, name):
        cluster_path = os.path.join(path, name)
        filename = os.path.join(cluster_path, 'cluster.conf')
        with open(filename, 'r') as f:
            data = yaml.safe_load(f)
        try:
            install_dir = None
            if 'install_dir' in data:
                install_dir = data['install_dir']
                repository.validate(install_dir)
            if install_dir is None and 'cassandra_dir' in data:
                install_dir = data['cassandra_dir']
                repository.validate(install_dir)

            cassandra_version = None
            if 'cassandra_version' in data:
                cassandra_version = LooseVersion(data['cassandra_version'])

            if common.isDse(install_dir):
                cluster = DseCluster(path, data['name'], install_dir=install_dir, create_directory=False, derived_cassandra_version=cassandra_version)
            else:
                cluster = Cluster(path, data['name'], install_dir=install_dir, create_directory=False, derived_cassandra_version=cassandra_version)
            node_list = data['nodes']
            seed_list = data['seeds']
            if 'partitioner' in data:
                cluster.partitioner = data['partitioner']
            if 'config_options' in data:
                cluster._config_options = data['config_options']
            if 'dse_config_options' in data:
                cluster._dse_config_options = data['dse_config_options']
            if 'misc_config_options' in data:
                cluster._misc_config_options = data['misc_config_options']
            if 'log_level' in data:
                cluster.__log_level = data['log_level']
            if 'use_vnodes' in data:
                cluster.use_vnodes = data['use_vnodes']
            if 'datadirs' in data:
                cluster.data_dir_count = int(data['datadirs'])
            extension.load_from_cluster_config(cluster, data)
        except KeyError as k:
            raise common.LoadError("Error Loading " + filename + ", missing property:" + str(k))

        for node_name in node_list:
            cluster.nodes[node_name] = Node.load(cluster_path, node_name, cluster)
        for seed in seed_list:
            cluster.seeds.append(seed)

        return cluster
    def replace_nonexistent_node_test(self):
        debug("Starting cluster with 3 nodes.")
        cluster = self.cluster
        cluster.populate(3).start()
        node1, node2, node3 = cluster.nodelist()

        debug('Start node 4 and replace an address with no node')
        node4 = Node('node4', cluster, True, ('127.0.0.4', 9160), ('127.0.0.4', 7000), '7400', '0', None, binary_interface=('127.0.0.4', 9042))
        cluster.add(node4, False)

        # try to replace an unassigned ip address
        mark = node4.mark_log()
        node4.start(replace_address='127.0.0.5', wait_other_notice=False)
        node4.watch_log_for("java.lang.RuntimeException: Cannot replace_address /127.0.0.5 because it doesn't exist in gossip", from_mark=mark)
        self.check_not_running(node4)
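The mark_log()/watch_log_for() pairing seen here is the idiom these tests use to assert on messages logged after a given point in time; condensed, and with an illustrative pattern:

    mark = node4.mark_log()                   # remember the current log offset
    node4.start(replace_address='127.0.0.5',  # action expected to fail
                wait_other_notice=False)
    # only matches lines written after `mark`
    node4.watch_log_for("Cannot replace_address", from_mark=mark)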
    def test_replace(self):
        main_session = self.patient_cql_connection(self.node1)

        # We want the node being replaced to have no data on it so the replacement definitely fetches all the data
        self.node2.stop(wait_other_notice=True)

        for i in range(0, 40):
            print("Inserting " + str(i))
            self.insert_row(i, i, i, main_session)

        replacement_address = self.node2.address()
        self.node2.stop(wait_other_notice=True)
        self.cluster.remove(self.node2)
        self.node2 = Node('replacement', cluster=self.cluster, auto_bootstrap=True,
                                         thrift_interface=None, storage_interface=(replacement_address, 7000),
                                         jmx_port='7400', remote_debug_port='0', initial_token=None, binary_interface=(replacement_address, 9042))
        patch_start(self.node2)
        nodes = [self.node1, self.node2, self.node3]
        self.cluster.add(self.node2, False, data_center='datacenter1')
        jvm_args = ["-Dcassandra.replace_address=%s" % replacement_address,
                    "-Dcassandra.ring_delay_ms=10000",
                    "-Dcassandra.broadcast_interval_ms=10000"]
        self.node2.start(jvm_args=jvm_args, wait_for_binary_proto=True, wait_other_notice=True)

        sessions = [self.exclusive_cql_connection(node) for node in [self.node1, self.node2, self.node3]]

        # Everyone should have everything
        expected = [gen_expected(range(0, 40)),
                    gen_expected(range(0, 40)),
                    gen_expected(range(0, 40))]

        self.check_replication(sessions, exactly=3)
        self.check_expected(sessions, expected)

        repair_nodes(nodes)
        cleanup_nodes(nodes)

        self.check_replication(sessions, exactly=2)

        expected = [gen_expected(range(0, 11), range(21, 40)),
                    gen_expected(range(0, 21), range(31, 40)),
                    gen_expected(range(11, 31))]
        self.check_expected(sessions, expected)
    def test_add_and_remove_node(self):
        """
        Test that NEW_NODE and REMOVED_NODE are sent correctly as nodes join and leave.
        @jira_ticket CASSANDRA-11038
        """
        self.cluster.populate(1).start(wait_for_binary_proto=True)
        node1 = self.cluster.nodelist()[0]

        waiter = NotificationWaiter(self, node1, ["STATUS_CHANGE", "TOPOLOGY_CHANGE"])

        # need to block for up to 2 notifications (NEW_NODE and UP) so that these notifications
        # don't confuse the state below
        logger.debug("Waiting for unwanted notifications...")
        waiter.wait_for_notifications(timeout=30, num_notifications=2)
        waiter.clear_notifications()

        session = self.patient_cql_connection(node1)
        # reduce system_distributed and system_traces RF to 1 so we don't require forceful decommission
        session.execute("ALTER KEYSPACE system_distributed WITH REPLICATION = {'class':'SimpleStrategy', 'replication_factor':'1'};")
        session.execute("ALTER KEYSPACE system_traces WITH REPLICATION = {'class':'SimpleStrategy', 'replication_factor':'1'};")

        logger.debug("Adding second node...")
        node2 = Node('node2', self.cluster, True, None, ('127.0.0.2', 7000), '7200', '0', None, binary_interface=('127.0.0.2', 9042))
        self.cluster.add(node2, False)
        node2.start(wait_other_notice=True)
        logger.debug("Waiting for notifications from {}".format(waiter.address))
        notifications = waiter.wait_for_notifications(timeout=60.0, num_notifications=2)
        assert 2 == len(notifications), notifications
        for notification in notifications:
            assert get_ip_from_node(node2) == notification["address"][0]
            assert "NEW_NODE" == notifications[0]["change_type"]
            assert "UP" == notifications[1]["change_type"]

        logger.debug("Removing second node...")
        waiter.clear_notifications()
        node2.decommission()
        node2.stop(gently=False)
        logger.debug("Waiting for notifications from {}".format(waiter.address))
        notifications = waiter.wait_for_notifications(timeout=60.0, num_notifications=2)
        assert 2 == len(notifications), notifications
        for notification in notifications:
            assert get_ip_from_node(node2) == notification["address"][0]
            assert "REMOVED_NODE" == notifications[0]["change_type"]
            assert "DOWN" == notifications[1]["change_type"]
    def replace_active_node_test(self):

        debug("Starting cluster with 3 nodes.")
        cluster = self.cluster
        cluster.populate(3).start()
        node1, node2, node3 = cluster.nodelist()

        # replace active node 3 with node 4
        debug("Starting node 4 to replace active node 3")
        node4 = Node('node4', cluster, True, ('127.0.0.4', 9160), ('127.0.0.4', 7000), '7400', '0', None, binary_interface=('127.0.0.4', 9042))
        cluster.add(node4, False)

        mark = node4.mark_log()
        node4.start(replace_address='127.0.0.3', wait_other_notice=False)
        node4.watch_log_for("java.lang.UnsupportedOperationException: Cannot replace a live node...", from_mark=mark)
        self.check_not_running(node4)
    def replace_nonexistent_node_test(self):
        debug("Starting cluster with 3 nodes.")
        cluster = self.cluster
        cluster.populate(3).start()
        node1, node2, node3 = cluster.nodelist()

        debug('Start node 4 and replace an address with no node')
        node4 = Node('node4', cluster=cluster, auto_bootstrap=True, thrift_interface=('127.0.0.4', 9160),
                     storage_interface=('127.0.0.4', 7000), jmx_port='7400', remote_debug_port='0',
                     initial_token=None, binary_interface=('127.0.0.4', 9042))
        cluster.add(node4, False)

        # try to replace an unassigned ip address
        mark = node4.mark_log()
        node4.start(replace_address='127.0.0.5', wait_other_notice=False)
        node4.watch_log_for("java.lang.RuntimeException: Cannot replace_address /127.0.0.5 because it doesn't exist in gossip", from_mark=mark)
        assert_not_running(node4)
    def replace_active_node_test(self):
        debug("Starting cluster with 3 nodes.")
        cluster = self.cluster
        cluster.populate(3).start()
        node1, node2, node3 = cluster.nodelist()

        # replace active node 3 with node 4
        debug("Starting node 4 to replace active node 3")
        node4 = Node('node4', cluster=cluster, auto_bootstrap=True, thrift_interface=('127.0.0.4', 9160),
                     storage_interface=('127.0.0.4', 7000), jmx_port='7400', remote_debug_port='0',
                     initial_token=None, binary_interface=('127.0.0.4', 9042))
        cluster.add(node4, False)

        mark = node4.mark_log()
        node4.start(replace_address='127.0.0.3', wait_other_notice=False)
        node4.watch_log_for("java.lang.UnsupportedOperationException: Cannot replace a live node...", from_mark=mark)
        assert_not_running(node4)
Example #51
0
    def load(path, name):
        cluster_path = os.path.join(path, name)
        filename = os.path.join(cluster_path, "cluster.conf")
        with open(filename, "r") as f:
            data = yaml.safe_load(f)
        try:
            install_dir = None
            if "install_dir" in data:
                install_dir = data["install_dir"]
                repository.validate(install_dir)
            if install_dir is None and "cassandra_dir" in data:
                install_dir = data["cassandra_dir"]
                repository.validate(install_dir)

            if common.isDse(install_dir):
                cluster = DseCluster(path, data["name"], install_dir=install_dir, create_directory=False)
            else:
                cluster = Cluster(path, data["name"], install_dir=install_dir, create_directory=False)
            node_list = data["nodes"]
            seed_list = data["seeds"]
            if "partitioner" in data:
                cluster.partitioner = data["partitioner"]
            if "config_options" in data:
                cluster._config_options = data["config_options"]
            if "log_level" in data:
                cluster.__log_level = data["log_level"]
            if "use_vnodes" in data:
                cluster.use_vnodes = data["use_vnodes"]
        except KeyError as k:
            raise common.LoadError("Error Loading " + filename + ", missing property:" + str(k))

        for node_name in node_list:
            cluster.nodes[node_name] = Node.load(cluster_path, node_name, cluster)
        for seed_name in seed_list:
            cluster.seeds.append(cluster.nodes[seed_name])

        return cluster
    def add_and_remove_node_test(self):
        """
        Test that NEW_NODE and REMOVED_NODE are sent correctly as nodes join and leave.
        @jira_ticket CASSANDRA-11038
        """
        self.cluster.populate(1).start(wait_for_binary_proto=True)
        node1 = self.cluster.nodelist()[0]

        waiter = NotificationWaiter(self, node1, ["STATUS_CHANGE", "TOPOLOGY_CHANGE"])

        # need to block for up to 2 notifications (NEW_NODE and UP) so that these notifications
        # don't confuse the state below
        debug("Waiting for unwanted notifications...")
        waiter.wait_for_notifications(timeout=30, num_notifications=2)
        waiter.clear_notifications()

        debug("Adding second node...")
        node2 = Node('node2', self.cluster, True, None, ('127.0.0.2', 7000), '7200', '0', None, binary_interface=('127.0.0.2', 9042))
        self.cluster.add(node2, False)
        node2.start(wait_other_notice=True)
        debug("Waiting for notifications from {}".format(waiter.address))
        notifications = waiter.wait_for_notifications(timeout=60.0, num_notifications=2)
        self.assertEquals(2, len(notifications), notifications)
        for notification in notifications:
            self.assertEquals(get_ip_from_node(node2), notification["address"][0])
            self.assertEquals("NEW_NODE", notifications[0]["change_type"])
            self.assertEquals("UP", notifications[1]["change_type"])

        debug("Removing second node...")
        waiter.clear_notifications()
        node2.decommission()
        node2.stop(gently=False)
        debug("Waiting for notifications from {}".format(waiter.address))
        notifications = waiter.wait_for_notifications(timeout=60.0, num_notifications=2)
        self.assertEquals(2, len(notifications), notifications)
        for notification in notifications:
            self.assertEquals(get_ip_from_node(node2), notification["address"][0])
            self.assertEquals("REMOVED_NODE", notifications[0]["change_type"])
            self.assertEquals("DOWN", notifications[1]["change_type"])
    def _replace_node_test(self, gently):
        """
        Check that the replace address function correctly replaces a node that has failed in a cluster.
        Create a cluster, cause a node to fail, and bring up a new node with the replace_address parameter.
        Check that tokens are migrated and that data is replicated properly.
        """
        debug("Starting cluster with 3 nodes.")
        cluster = self.cluster
        cluster.populate(3).start()
        node1, node2, node3 = cluster.nodelist()

        if DISABLE_VNODES:
            numNodes = 1
        else:
            # a little hacky but grep_log returns the whole line...
            numNodes = int(node3.get_conf_option('num_tokens'))

        debug(numNodes)

        debug("Inserting Data...")
        node1.stress(['write', 'n=10K', '-schema', 'replication(factor=3)'])

        session = self.patient_cql_connection(node1)
        session.default_timeout = 45
        stress_table = 'keyspace1.standard1'
        query = SimpleStatement('select * from %s LIMIT 1' % stress_table, consistency_level=ConsistencyLevel.THREE)
        initialData = list(session.execute(query))

        # stop node, query should not work with consistency 3
        debug("Stopping node 3.")
        node3.stop(gently=gently, wait_other_notice=True)

        debug("Testing node stoppage (query should fail).")
        with self.assertRaises(NodeUnavailable):
            try:
                query = SimpleStatement('select * from %s LIMIT 1' % stress_table, consistency_level=ConsistencyLevel.THREE)
                session.execute(query)
            except (Unavailable, ReadTimeout):
                raise NodeUnavailable("Node could not be queried.")

        # replace node 3 with node 4
        debug("Starting node 4 to replace node 3")

        node4 = Node('node4', cluster, True, ('127.0.0.4', 9160), ('127.0.0.4', 7000), '7400', '0', None, binary_interface=('127.0.0.4', 9042))
        cluster.add(node4, False)
        node4.start(replace_address='127.0.0.3', wait_for_binary_proto=True)

        # query should work again
        debug("Verifying querying works again.")
        query = SimpleStatement('select * from %s LIMIT 1' % stress_table, consistency_level=ConsistencyLevel.THREE)
        finalData = list(session.execute(query))
        self.assertListEqual(initialData, finalData)

        debug("Verifying tokens migrated sucessfully")
        movedTokensList = node4.grep_log("Token .* changing ownership from /127.0.0.3 to /127.0.0.4")
        debug(movedTokensList[0])
        self.assertEqual(len(movedTokensList), numNodes)

        # check that restarting node 3 doesn't work
        debug("Try to restart node 3 (should fail)")
        node3.start(wait_other_notice=False)
        checkCollision = node1.grep_log("between /127.0.0.3 and /127.0.0.4; /127.0.0.4 is the new owner")
        debug(checkCollision)
        self.assertEqual(len(checkCollision), 1)
    def replace_with_reset_resume_state_test(self):
        """Test replace with resetting bootstrap progress"""

        cluster = self.cluster
        cluster.populate(3).start()
        node1, node2, node3 = cluster.nodelist()

        node1.stress(['write', 'n=100000', '-schema', 'replication(factor=3)'])

        session = self.patient_cql_connection(node1)
        stress_table = 'keyspace1.standard1'
        query = SimpleStatement('select * from %s LIMIT 1' % stress_table, consistency_level=ConsistencyLevel.THREE)
        initialData = list(session.execute(query))

        node3.stop(gently=False)

        # kill node1 in the middle of streaming to let it fail
        t = InterruptBootstrap(node1)
        t.start()
        # replace node 3 with node 4
        debug("Starting node 4 to replace node 3")
        node4 = Node('node4', cluster, True, ('127.0.0.4', 9160), ('127.0.0.4', 7000), '7400', '0', None, binary_interface=('127.0.0.4', 9042))
        # keep timeout low so that test won't hang
        node4.set_configuration_options(values={'streaming_socket_timeout_in_ms': 1000})
        cluster.add(node4, False)
        try:
            node4.start(jvm_args=["-Dcassandra.replace_address_first_boot=127.0.0.3"], wait_other_notice=False)
        except NodeError:
            pass  # node doesn't start as expected
        t.join()
        node1.start()

        # restart node4 bootstrap with resetting bootstrap state
        node4.stop()
        mark = node4.mark_log()
        node4.start(jvm_args=[
                    "-Dcassandra.replace_address_first_boot=127.0.0.3",
                    "-Dcassandra.reset_bootstrap_progress=true"
                    ])
        # check if we reset bootstrap state
        node4.watch_log_for("Resetting bootstrap progress to start fresh", from_mark=mark)
        # wait for node4 to be ready to serve clients
        node4.watch_log_for("Listening for thrift clients...", from_mark=mark)

        # check if 2nd bootstrap succeeded
        session = self.exclusive_cql_connection(node4)
        rows = list(session.execute("SELECT bootstrapped FROM system.local WHERE key='local'"))
        assert len(rows) == 1
        assert rows[0][0] == 'COMPLETED', rows[0][0]

        # query should work again
        debug("Verifying querying works again.")
        finalData = list(session.execute(query))
        self.assertListEqual(initialData, finalData)
    def replace_first_boot_test(self):
        debug("Starting cluster with 3 nodes.")
        cluster = self.cluster
        cluster.populate(3).start()
        node1, node2, node3 = cluster.nodelist()

        if DISABLE_VNODES:
            numNodes = 1
        else:
            # a little hacky but grep_log returns the whole line...
            numNodes = int(node3.get_conf_option('num_tokens'))

        debug(numNodes)

        debug("Inserting Data...")
        node1.stress(['write', 'n=10K', '-schema', 'replication(factor=3)'])

        session = self.patient_cql_connection(node1)
        stress_table = 'keyspace1.standard1'
        query = SimpleStatement('select * from %s LIMIT 1' % stress_table, consistency_level=ConsistencyLevel.THREE)
        initialData = list(session.execute(query))

        # stop node, query should not work with consistency 3
        debug("Stopping node 3.")
        node3.stop(gently=False)

        debug("Testing node stoppage (query should fail).")
        with self.assertRaises(NodeUnavailable):
            try:
                session.execute(query, timeout=30)
            except (Unavailable, ReadTimeout):
                raise NodeUnavailable("Node could not be queried.")

        # replace node 3 with node 4
        debug("Starting node 4 to replace node 3")
        node4 = Node('node4', cluster, True, ('127.0.0.4', 9160), ('127.0.0.4', 7000), '7400', '0', None, binary_interface=('127.0.0.4', 9042))
        cluster.add(node4, False)
        node4.start(jvm_args=["-Dcassandra.replace_address_first_boot=127.0.0.3"], wait_for_binary_proto=True)

        # query should work again
        debug("Verifying querying works again.")
        finalData = list(session.execute(query))
        self.assertListEqual(initialData, finalData)

        debug("Verifying tokens migrated sucessfully")
        movedTokensList = node4.grep_log("Token .* changing ownership from /127.0.0.3 to /127.0.0.4")
        debug(movedTokensList[0])
        self.assertEqual(len(movedTokensList), numNodes)

        # check that restarting node 3 doesn't work
        debug("Try to restart node 3 (should fail)")
        node3.start(wait_other_notice=False)
        checkCollision = node1.grep_log("between /127.0.0.3 and /127.0.0.4; /127.0.0.4 is the new owner")
        debug(checkCollision)
        self.assertEqual(len(checkCollision), 1)

        # restart node4 (if this errors, num_tokens may need to change)
        node4.stop(gently=False)
        node4.start(wait_for_binary_proto=True, wait_other_notice=False)

        debug("Verifying querying works again.")
        finalData = list(session.execute(query))
        self.assertListEqual(initialData, finalData)

        # repeat the check: restarting the node should not move tokens again, so the count should be unchanged
        debug("Verifying tokens migrated successfully")
        movedTokensList = node4.grep_log("Token .* changing ownership from /127.0.0.3 to /127.0.0.4")
        debug(movedTokensList[0])
        self.assertEqual(len(movedTokensList), numNodes)
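A note on the two replacement mechanisms used across these examples: replace_address takes effect on every start, whereas -Dcassandra.replace_address_first_boot is, as the name suggests, only honored on the node's first boot, so it is less likely to trigger an accidental re-replacement on a later restart. Condensed from the calls above:

    # Applies on every start:
    node4.start(replace_address='127.0.0.3', wait_for_binary_proto=True)
    # Honored only on first boot; ignored on subsequent restarts:
    node4.start(jvm_args=["-Dcassandra.replace_address_first_boot=127.0.0.3"],
                wait_for_binary_proto=True)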