Example no. 1
    def _common_test_body(self, moxi=False):
        master = self.servers[0]
        rest = RestConnection(master)
        creds = self.input.membase_settings
        bucket_data = RebalanceBaseTest.bucket_data_init(rest)

        for server in self.servers[1:]:
            self.log.info("current nodes : {0}".format(RebalanceHelper.getOtpNodeIds(master)))
            self.log.info("adding node {0}:{1} and rebalance afterwards".format(server.ip, server.port))
            otpNode = rest.add_node(creds.rest_username, creds.rest_password, server.ip, server.port)
            msg = "unable to add node {0} to the cluster {1}"
            self.assertTrue(otpNode, msg.format(server.ip, master.ip))
            for name in bucket_data:
                inserted_keys, rejected_keys = \
                MemcachedClientHelper.load_bucket_and_return_the_keys(servers=[self.servers[0]],
                    name=name,
                    ram_load_ratio=-1,
                    number_of_items=self.keys_count,
                    number_of_threads=1,
                    write_only=True)
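                # Pass every current node as a known otpNode with nothing ejected;
                # monitorRebalance() below blocks until the rebalance completes or fails.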
                rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()], ejectedNodes=[])
                self.assertTrue(rest.monitorRebalance(),
                    msg="rebalance operation failed after adding node {0}".format(server.ip))
                self.log.info("completed rebalancing in server {0}".format(server))
                IncrementalRebalanceWithParallelReadTests._reader_thread(self, inserted_keys, bucket_data, moxi=moxi)
                self.assertTrue(rest.monitorRebalance(),
                    msg="rebalance operation failed after adding node {0}".format(server.ip))
                break
Example no. 2
    def _failover_swap_rebalance(self):
        master = self.servers[0]
        rest = RestConnection(master)
        creds = self.input.membase_settings
        num_initial_servers = self.num_initial_servers
        initial_servers = self.servers[:num_initial_servers]

        self.log.info("CREATE BUCKET PHASE")
        SwapRebalanceBase.create_buckets(self)

        # Cluster the starting set of servers
        self.log.info("INITIAL REBALANCE PHASE")
        status, servers_rebalanced = RebalanceHelper.rebalance_in(initial_servers, len(initial_servers) - 1)
        self.assertTrue(status, msg="Rebalance failed")

        self.log.info("DATA LOAD PHASE")
        self.loaders = SwapRebalanceBase.start_load_phase(self, master)

        # Wait till load phase is over
        SwapRebalanceBase.stop_load(self.loaders, do_stop=False)
        self.log.info("DONE LOAD PHASE")

        # Start the swap rebalance
        self.log.info("current nodes : {0}".format(RebalanceHelper.getOtpNodeIds(master)))
        toBeEjectedNodes = RebalanceHelper.pick_nodes(master, howmany=self.failover_factor)
        optNodesIds = [node.id for node in toBeEjectedNodes]
        if self.fail_orchestrator:
            status, content = ClusterOperationHelper.find_orchestrator(master)
            self.assertTrue(status, msg="Unable to find orchestrator: {0}:{1}".\
            format(status, content))
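            # Replace the first pick with the orchestrator so the orchestrator
            # itself is among the nodes being failed over.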
            optNodesIds[0] = content

        self.log.info("FAILOVER PHASE")
        # Failover selected nodes
        for node in optNodesIds:
            self.log.info("failover node {0} and rebalance afterwards".format(node))
            rest.fail_over(node)
            self.assertTrue(rest.monitorRebalance(),
                msg="failed after failover of {0}".format(node))

        new_swap_servers = self.servers[num_initial_servers:num_initial_servers + self.failover_factor]
        for server in new_swap_servers:
            otpNode = rest.add_node(creds.rest_username, creds.rest_password, server.ip, server.port)
            msg = "unable to add node {0} to the cluster"
            self.assertTrue(otpNode, msg.format(server.ip))

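        # If the orchestrator was failed over, re-point the REST connection and
        # master at a newly added node that is guaranteed to stay in the cluster.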
        if self.fail_orchestrator:
            rest = RestConnection(new_swap_servers[0])
            master = new_swap_servers[0]

        self.log.info("DATA ACCESS PHASE")
        self.loaders = SwapRebalanceBase.start_access_phase(self, master)

        rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()], \
            ejectedNodes=optNodesIds)

        self.assertTrue(rest.monitorRebalance(),
            msg="rebalance operation failed after adding node {0}".format(new_swap_servers))

        SwapRebalanceBase.verification_phase(self, master)
Example no. 3
    def _failover_swap_rebalance(self):
        master = self.servers[0]
        rest = RestConnection(master)
        creds = self.input.membase_settings
        num_initial_servers = self.num_initial_servers
        initial_servers = self.servers[:num_initial_servers]

        self.log.info("CREATE BUCKET PHASE")
        SwapRebalanceBase.create_buckets(self)

        # Cluster the starting set of servers
        self.log.info("INITIAL REBALANCE PHASE")
        status, servers_rebalanced = RebalanceHelper.rebalance_in(initial_servers, len(initial_servers) - 1)
        self.assertTrue(status, msg="Rebalance failed")

        self.log.info("DATA LOAD PHASE")
        self.loaders = SwapRebalanceBase.start_load_phase(self, master)

        # Wait till load phase is over
        SwapRebalanceBase.stop_load(self.loaders, do_stop=False)
        self.log.info("DONE LOAD PHASE")

        # Start the swap rebalance
        self.log.info("current nodes : {0}".format(RebalanceHelper.getOtpNodeIds(master)))
        toBeEjectedNodes = RebalanceHelper.pick_nodes(master, howmany=self.failover_factor)
        optNodesIds = [node.id for node in toBeEjectedNodes]
        if self.fail_orchestrator:
            status, content = ClusterOperationHelper.find_orchestrator(master)
            self.assertTrue(status, msg="Unable to find orchestrator: {0}:{1}"
                            .format(status, content))
            optNodesIds[0] = content

        self.log.info("FAILOVER PHASE")
        # Failover selected nodes
        for node in optNodesIds:
            self.log.info("failover node {0} and rebalance afterwards".format(node))
            rest.fail_over(node)

        new_swap_servers = self.servers[num_initial_servers:num_initial_servers + self.failover_factor]
        for server in new_swap_servers:
            otpNode = rest.add_node(creds.rest_username, creds.rest_password, server.ip)
            msg = "unable to add node {0} to the cluster"
            self.assertTrue(otpNode, msg.format(server.ip))

        if self.fail_orchestrator:
            rest = RestConnection(new_swap_servers[0])
            master = new_swap_servers[0]

        self.log.info("DATA ACCESS PHASE")
        self.loaders = SwapRebalanceBase.start_access_phase(self, master)

        rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()], \
            ejectedNodes=optNodesIds)

        self.assertTrue(rest.monitorRebalance(),
            msg="rebalance operation failed after adding node {0}".format(new_swap_servers))

        SwapRebalanceBase.verification_phase(self, master)
Example no. 4
    def _common_test_body(self):
        master = self.servers[0]
        rest = RestConnection(master)
        creds = self.input.membase_settings
        rebalanced_servers = [master]
        bucket_data = RebalanceBaseTest.bucket_data_init(rest)

        self.log.info("INTIAL LOAD")
        RebalanceBaseTest.load_all_buckets_task(rest, self.task_manager, bucket_data, self.load_ratio,
            keys_count=self.keys_count)

        for name in bucket_data:
            for thread in bucket_data[name]["threads"]:
                bucket_data[name]["items_inserted_count"] += thread.inserted_keys_count()

        for server in self.servers[1:]:
            self.log.info("current nodes : {0}".format(RebalanceHelper.getOtpNodeIds(master)))
            # For each of the num_rebalance iterations: add the node, rebalance, fail it over, remove it, and rebalance again
            for i in range(0, self.num_rebalance):
                distribution = RebalanceBaseTest.get_distribution(self.load_ratio)
                RebalanceBaseTest.load_data_for_buckets(rest, self.load_ratio, distribution, [master], bucket_data,
                    self)
                self.log.info("adding node {0} and rebalance afterwards".format(server.ip))
                otpNode = rest.add_node(creds.rest_username, creds.rest_password, server.ip, server.port)
                msg = "unable to add node {0} to the cluster {1}"
                self.assertTrue(otpNode, msg.format(server.ip, master.ip))
                rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()], ejectedNodes=[])
                self.assertTrue(rest.monitorRebalance(),
                    msg="rebalance operation failed after adding node {0}".format(server.ip))
                rebalanced_servers.append(server)
                RebalanceBaseTest.replication_verification(master, bucket_data, self.replica, self, True)
                rest.fail_over(otpNode.id)
                self.log.info("failed over {0}".format(otpNode.id))
                time.sleep(10)
                rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                    ejectedNodes=[otpNode.id])
                msg = "rebalance failed while removing failover nodes {0}".format(otpNode.id)
                self.assertTrue(rest.monitorRebalance(), msg=msg)
                # now verify the numbers again
                RebalanceBaseTest.replication_verification(master, bucket_data, self.replica, self, True)
                #wait 6 minutes
                time.sleep(6 * 60)

            self.log.info("adding node {0} and rebalance afterwards".format(server.ip))
            otpNode = rest.add_node(creds.rest_username, creds.rest_password, server.ip, server.port)
            msg = "unable to add node {0} to the cluster {1}"
            self.assertTrue(otpNode, msg.format(server.ip, master.ip))
            distribution = RebalanceBaseTest.get_distribution(self.load_ratio)
            RebalanceBaseTest.load_data_for_buckets(rest, self.load_ratio, distribution, rebalanced_servers, bucket_data, self)
            rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()], ejectedNodes=[])
            self.assertTrue(rest.monitorRebalance(),
                msg="rebalance operation failed after adding node {0}".format(server.ip))
            rebalanced_servers.append(server)
            RebalanceBaseTest.replication_verification(master, bucket_data, self.replica, self, True)
Example no. 5
    def _common_test_body(self):
        master = self.servers[0]
        rest = RestConnection(master)
        bucket_data = RebalanceBaseTest.bucket_data_init(rest)

        self.log.info("INTIAL LOAD")
        RebalanceBaseTest.load_all_buckets_task(rest, self.task_manager, bucket_data, self.load_ratio,
            keys_count=self.keys_count)

        rebalance_out = False
        for server in self.servers[1:]:
            if rebalance_out:
                # Pick a node to rebalance out, other than master
                ejectedNodes = [RebalanceHelper.pick_node(master)]
            else:
                ejectedNodes = []
            current_nodes = RebalanceHelper.getOtpNodeIds(master)
            self.log.info("current nodes : {0}".format(current_nodes))
            self.log.info("adding node {0}, removing node {1} and rebalance afterwards".format(server.ip,
                [node.ip for node in ejectedNodes]))

            self.log.info("START PARALLEL LOAD")
            RebalanceBaseTest.tasks_for_buckets(rest, self.task_manager, bucket_data,
                DELETE_RATIO=self.delete_ratio,
                ACCESS_RATIO=self.access_ratio, EXPIRY_RATIO=self.expiry_ratio)

            self.log.info("INCREMENTAL REBALANCE IN/OUT")
            # rebalance in/out a server
            RebalanceTaskHelper.add_rebalance_task(self.task_manager,
                [master],
                [server],
                ejectedNodes, do_stop=self.do_stop)
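            # The rebalance task runs asynchronously under the task manager,
            # concurrently with the load/delete/access tasks queued above.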
            # wait for loading tasks to finish
            RebalanceBaseTest.finish_all_bucket_tasks(rest, bucket_data)

            # Make sure we have at least 3 nodes, for replica=2
            if len(current_nodes) > 2:
                rebalance_out = True

        if self.do_verify:
            self.log.info("VERIFICATION")
            RebalanceBaseTest.do_kv_and_replica_verification(master,
                self.task_manager,
                bucket_data,
                self.replica,
                self)
        else:
            self.log.info("NO VERIFICATION")
Example no. 6
    def test_failover_source_sets(self):
        replication_type = "continuous"
        self.log.info("Force initial rebalance.")

        # This test starts with a 2-2 unidirectional replication from cluster a
        # to cluster b; during the replication, we trigger failover of one node
        # on source cluster , resulting a  1-2 replication.
        # After all loading finish, verify data and rev on both clusters.
        replication_type = "continuous"
        self.log.info("Force initial rebalance.")

        cluster_ref_a = "cluster_ref_a"
        master_a = self._input.clusters.get(0)[0]
        rest_conn_a = RestConnection(master_a)

        cluster_ref_b = "cluster_ref_b"
        master_b = self._input.clusters.get(1)[0]
        rest_conn_b = RestConnection(master_b)

        self.log.info("START XDC replication...")

        # Start replication
        rest_conn_a.add_remote_cluster(master_b.ip, master_b.port,
                                       master_b.rest_username,
                                       master_b.rest_password, cluster_ref_b)
        (rep_database, rep_id) = rest_conn_a.start_replication(replication_type,
                                                               self._buckets[0],
                                                               cluster_ref_b)
        self._state.append((rest_conn_a, cluster_ref_b, rep_database, rep_id))

        # Start load
        self.log.info("START loading data...")
        load_thread_list = []
        kvstore = ClientKeyValueStore()
        self._params["ops"] = "set"
        task_def = RebalanceDataGenerator.create_loading_tasks(self._params)
        load_thread = RebalanceDataGenerator.start_load(rest_conn_a,
            self._buckets[0],
            task_def, kvstore)
        # Track the loader so the join loop below actually waits for it
        load_thread_list.append(load_thread)
        load_thread.start()
        # sleep a while to allow more data to load
        time.sleep(5)

        self.log.info("current nodes on source cluster: {0}".format(RebalanceHelper.getOtpNodeIds(master_a)))

        # Trigger failover: fail over one node at a time until only one node remains
        self.log.info("DURING replication, start failover...")
        self.log.info("FAILOVER nodes on Cluster A ...")
        nodes_a = rest_conn_a.node_statuses()
        while len(nodes_a) > 1:
            toBeFailedOverNode = RebalanceHelper.pick_node(master_a)
            self.log.info("failover node {0}".format(toBeFailedOverNode.id))
            rest_conn_a.fail_over(toBeFailedOverNode.id)
            self.log.info("rebalance after failover")
            rest_conn_a.rebalance(otpNodes=[node.id for node in rest_conn_a.node_statuses()], \
                ejectedNodes=[toBeFailedOverNode.id])
            self.assertTrue(rest_conn_a.monitorRebalance(),
                msg="rebalance operation failed after removing node {0}".format(toBeFailedOverNode.id))
            nodes_a = rest_conn_a.node_statuses()

        self.log.info("ALL failed over done...")

        # Wait for loading threads to finish
        for lt in load_thread_list:
            lt.join()
        self.log.info("All loading threads finished")

        # Verify replication
        self.log.info("START data verification at cluster A...")
        self.assertTrue(XDCRBaseTest.verify_replicated_data(rest_conn_a,
            self._buckets[0],
            kvstore,
            self._poll_sleep,
            self._poll_timeout),
            "Verification of replicated data failed")

        self.log.info("START data verification at cluster B...")
        self.assertTrue(XDCRBaseTest.verify_replicated_data(rest_conn_b,
            self._buckets[0],
            kvstore,
            self._poll_sleep,
            self._poll_timeout),
            "Verification of replicated data failed")

        self.log.info("START revision verification on both clusters...")
        self.assertTrue(XDCRBaseTest.verify_replicated_revs(rest_conn_a,
            rest_conn_b,
            self._buckets[0],
            self._poll_sleep,
            self._poll_timeout),
            "Verification of replicated revisions failed")
Example no. 7
    def _common_test_body_failed_swap_rebalance(self):
        master = self.servers[0]
        rest = RestConnection(master)
        num_initial_servers = self.num_initial_servers
        creds = self.input.membase_settings
        initial_servers = self.servers[:num_initial_servers]

        self.log.info("CREATE BUCKET PHASE")
        SwapRebalanceBase.create_buckets(self)

        # Cluster the starting set of servers
        self.log.info("INITIAL REBALANCE PHASE")
        status, servers_rebalanced = RebalanceHelper.rebalance_in(initial_servers, len(initial_servers) - 1)
        self.assertTrue(status, msg="Rebalance failed")

        self.log.info("DATA LOAD PHASE")
        self.loaders = SwapRebalanceBase.start_load_phase(self, master)

        # Wait till load phase is over
        SwapRebalanceBase.stop_load(self.loaders, do_stop=False)
        self.log.info("DONE LOAD PHASE")

        # Start the swap rebalance
        current_nodes = RebalanceHelper.getOtpNodeIds(master)
        self.log.info("current nodes : {0}".format(current_nodes))
        toBeEjectedNodes = RebalanceHelper.pick_nodes(master, howmany=self.num_swap)
        optNodesIds = [node.id for node in toBeEjectedNodes]
        if self.swap_orchestrator:
            status, content = ClusterOperationHelper.find_orchestrator(master)
            self.assertTrue(status, msg="Unable to find orchestrator: {0}:{1}"
                            .format(status, content))
            # When swapping all the nodes
            if self.num_swap == len(current_nodes):
                optNodesIds.append(content)
            else:
                optNodesIds[0] = content

        for node in optNodesIds:
            self.log.info("removing node {0} and rebalance afterwards".format(node))

        new_swap_servers = self.servers[num_initial_servers:num_initial_servers + self.num_swap]
        for server in new_swap_servers:
            otpNode = rest.add_node(creds.rest_username, creds.rest_password, server.ip)
            msg = "unable to add node {0} to the cluster"
            self.assertTrue(otpNode, msg.format(server.ip))

        if self.swap_orchestrator:
            rest = RestConnection(new_swap_servers[0])
            master = new_swap_servers[0]

        self.log.info("DATA ACCESS PHASE")
        self.loaders = SwapRebalanceBase.start_access_phase(self, master)

        self.log.info("SWAP REBALANCE PHASE")
        rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
            ejectedNodes=optNodesIds)
        SwapRebalanceBase.sleep(self, 10, "Rebalance should start")
        self.log.info("FAIL SWAP REBALANCE PHASE @ {0}".format(self.percentage_progress))
        reached = RestHelper(rest).rebalance_reached(self.percentage_progress)
        if reached == 100 and not RestHelper(rest).is_cluster_rebalanced():
            # handle situation when rebalance failed at the beginning
            self.log.error('seems rebalance failed!')
            self.log.info("Latest logs from UI:")
            for line in rest.get_logs():
                self.log.error(line)
            self.fail("rebalance failed even before killing memcached")
        bucket = rest.get_buckets()[0].name
        pid = None
        if self.swap_orchestrator:
            # get PID via remote connection if master is a new node
            shell = RemoteMachineShellConnection(master)
            o, _ = shell.execute_command("ps -eo comm,pid | awk '$1 == \"memcached\" { print $2 }'")
            pid = o[0]
            shell.disconnect()
        else:
            for i in xrange(2):
                try:
                    _mc = MemcachedClientHelper.direct_client(master, bucket)
                    pid = _mc.stats()["pid"]
                    break
                except EOFError as e:
                    self.log.error("{0}.Retry in 2 sec".format(e))
                    SwapRebalanceBase.sleep(self, 2)
        if pid is None:
            self.fail("impossible to get a PID")
        command = "os:cmd(\"kill -9 {0} \")".format(pid)
        self.log.info(command)
        killed = rest.diag_eval(command)
        self.log.info("killed {0}:{1}??  {2} ".format(master.ip, master.port, killed))
        self.log.info("sleep for 10 sec after kill memcached")
        SwapRebalanceBase.sleep(self, 10)
        # we can't get stats for the new node when the rebalance fails
        if not self.swap_orchestrator:
            ClusterOperationHelper._wait_warmup_completed(self, [master], bucket, wait_time=600)
        # we expect the rebalance to fail
        try:
            rest.monitorRebalance()
        except RebalanceFailedException:
            # retry rebalance if it failed
            self.log.warn("Rebalance failed but it's expected")
            SwapRebalanceBase.sleep(self, 30)
            self.assertFalse(RestHelper(rest).is_cluster_rebalanced(), msg="cluster still needs a rebalance")
            knownNodes = rest.node_statuses()
            self.log.info("nodes are still in cluster: {0}".format([(node.ip, node.port) for node in knownNodes]))
            ejectedNodes = list(set(optNodesIds) & set([node.id for node in knownNodes]))
            rest.rebalance(otpNodes=[node.id for node in knownNodes], ejectedNodes=ejectedNodes)
            self.assertTrue(rest.monitorRebalance(),
                            msg="rebalance operation failed after adding node {0}".format(toBeEjectedNodes))
        else:
            self.log.info("rebalance completed successfully")
        SwapRebalanceBase.verification_phase(self, master)
Example no. 8
    def _add_back_failed_node(self, do_node_cleanup=False):
        master = self.servers[0]
        rest = RestConnection(master)
        creds = self.input.membase_settings

        self.log.info("CREATE BUCKET PHASE")
        SwapRebalanceBase.create_buckets(self)

        # Cluster all servers
        self.log.info("INITIAL REBALANCE PHASE")
        status, servers_rebalanced = RebalanceHelper.rebalance_in(self.servers, len(self.servers) - 1)
        self.assertTrue(status, msg="Rebalance failed")

        self.log.info("DATA LOAD PHASE")
        self.loaders = SwapRebalanceBase.start_load_phase(self, master)

        # Wait till load phase is over
        SwapRebalanceBase.stop_load(self.loaders, do_stop=False)
        self.log.info("DONE LOAD PHASE")

        # Start the swap rebalance
        current_nodes = RebalanceHelper.getOtpNodeIds(master)
        self.log.info("current nodes : {0}".format(current_nodes))
        toBeEjectedNodes = RebalanceHelper.pick_nodes(master, howmany=self.failover_factor)
        optNodesIds = [node.id for node in toBeEjectedNodes]

        # List of servers that will not be failed over
        not_failed_over = []
        for server in self.servers:
            if server.ip not in [node.ip for node in toBeEjectedNodes]:
                not_failed_over.append(server)
                self.log.info("Node %s not failed over" % server.ip)

        if self.fail_orchestrator:
            status, content = ClusterOperationHelper.find_orchestrator(master)
            self.assertTrue(status, msg="Unable to find orchestrator: {0}:{1}"
                            .format(status, content))
            # When swapping all the nodes
            if self.num_swap == len(current_nodes):
                optNodesIds.append(content)
            else:
                optNodesIds[0] = content
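            # The orchestrator (possibly the current master) is about to be
            # failed over, so promote a server that stays in the cluster.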
            master = not_failed_over[-1]

        self.log.info("DATA ACCESS PHASE")
        self.loaders = SwapRebalanceBase.start_access_phase(self, master)

        # Failover selected nodes
        for node in optNodesIds:
            self.log.info("failover node {0} and rebalance afterwards".format(node))
            rest.fail_over(node)

        rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()], \
            ejectedNodes=optNodesIds)

        self.assertTrue(rest.monitorRebalance(),
            msg="rebalance operation failed after adding node {0}".format(optNodesIds))

        # Add back the same failed over nodes

        # Cleanup the node, somehow
        # TODO: cluster_run?
        if do_node_cleanup:
            pass

        # Make rest connection with node part of cluster
        rest = RestConnection(master)

        # Given the optNode, find ip
        add_back_servers = []
        nodes = rest.get_nodes()
        for server in [node.ip for node in nodes]:
            if isinstance(server, unicode):
                add_back_servers.append(server)
        final_add_back_servers = []
        for server in self.servers:
            if server.ip not in add_back_servers:
                final_add_back_servers.append(server)
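        # final_add_back_servers now holds the failed-over servers that the
        # cluster no longer lists as members.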

        for server in final_add_back_servers:
            otpNode = rest.add_node(creds.rest_username, creds.rest_password, server.ip)
            msg = "unable to add node {0} to the cluster"
            self.assertTrue(otpNode, msg.format(server.ip))

        rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()], ejectedNodes=[])

        self.assertTrue(rest.monitorRebalance(),
            msg="rebalance operation failed after adding node {0}".format(add_back_servers))

        SwapRebalanceBase.verification_phase(self, master)
Example no. 9
    def test_rebalance_inout_with_durability_failure(self):
        """
        Perform irregular number of in_out nodes
        1. Swap-out 'self.nodes_out' nodes
        2. Add nodes using 'self.nodes_in' such that,
           replica_number > nodes_in_cluster
        3. Perform swap-rebalance
        4. Make sure durability is not broken due to swap-rebalance
        5. Add make a node and do CRUD on the bucket
        6. Verify durability works after node addition

        Note: This is a Negative case. i.e: Durability will be broken
        """
        master = self.cluster.master
        num_initial_servers = self.num_initial_servers
        creds = self.input.membase_settings
        def_bucket = self.bucket_util.buckets[0]

        # TODO: Enable verification
        """
        vbucket_info_dict = dict()

        # Cb stat object for verification purpose
        master_shell_conn = RemoteMachineShellConnection(master)
        master_node_cb_stat = Cbstats(master_shell_conn)

        # Update each vbucket's seq_no for latest value for verification
        for vb_num in range(0, self.vbuckets):
            vbucket_info_dict[vb_num] = master_node_cb_stat.vbucket_seqno(
                def_bucket.name, vb_num, "abs_high_seqno")
        """

        # Rest connection to add/rebalance/monitor nodes
        rest = RestConnection(master)

        # Start the swap rebalance
        current_nodes = RebalanceHelper.getOtpNodeIds(master)
        self.log.info("current nodes : {0}".format(current_nodes))
        toBeEjectedNodes = RebalanceHelper.pick_nodes(master,
                                                      howmany=self.nodes_out)
        optNodesIds = [node.id for node in toBeEjectedNodes]

        if self.swap_orchestrator:
            status, content = self.cluster_util.find_orchestrator(master)
            self.assertTrue(status,
                            msg="Unable to find orchestrator: {0}:{1}".format(
                                status, content))
            if self.nodes_out == len(current_nodes):
                optNodesIds.append(content)
            else:
                optNodesIds[0] = content

        for node in optNodesIds:
            self.log.info(
                "removing node {0} and rebalance afterwards".format(node))

        new_swap_servers = self.servers[
            num_initial_servers:num_initial_servers + self.nodes_in]
        for server in new_swap_servers:
            otpNode = rest.add_node(creds.rest_username, creds.rest_password,
                                    server.ip, server.port)
            msg = "unable to add node {0} to the cluster"
            self.assertTrue(otpNode, msg.format(server.ip))

        if self.swap_orchestrator:
            rest = RestConnection(new_swap_servers[0])
            master = new_swap_servers[0]

        if self.do_access:
            self.log.info("DATA ACCESS PHASE")
            self.loaders = self.start_access_phase()

        self.log.info("SWAP REBALANCE PHASE")
        rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                       ejectedNodes=optNodesIds)

        if self.do_stop_start:
            # Rebalance is stopped at 20%, 40% and 60% completion
            retry = 0
            for expected_progress in (20, 40, 60):
                self.log.info(
                    "STOP/START SWAP REBALANCE PHASE WITH PROGRESS {0}%".
                    format(expected_progress))
                while True:
                    progress = rest._rebalance_progress()
                    if progress < 0:
                        self.log.error(
                            "rebalance progress code : {0}".format(progress))
                        break
                    elif progress == 100:
                        self.log.warn("Rebalance has already reached 100%")
                        break
                    elif progress >= expected_progress:
                        self.log.info(
                            "Rebalance will be stopped with {0}%".format(
                                progress))
                        stopped = rest.stop_rebalance()
                        self.assertTrue(stopped,
                                        msg="unable to stop rebalance")
                        self.sleep(20)
                        rest.rebalance(otpNodes=[
                            node.id for node in rest.node_statuses()
                        ],
                                       ejectedNodes=optNodesIds)
                        break
                    elif retry > 100:
                        break
                    else:
                        retry += 1
                        self.sleep(1)
        self.assertTrue(
            rest.monitorRebalance(),
            msg="rebalance operation failed after adding node {0}".format(
                optNodesIds))
        # TODO: There will be failure in doc_count verification due to
        # swap_rebalance. Need to update verification steps accordingly to
        # satisfy this
        self.verification_phase()

        # Add back first ejected node back into the cluster
        self.task.rebalance(self.cluster.nodes_in_cluster,
                            [toBeEjectedNodes[0]], [])

        # Load doc into all vbuckets to verify durability
        gen_create = doc_generator('test_', 0, self.num_items)
        task = self.task.async_load_gen_docs_atomicity(
            self.cluster,
            def_bucket,
            gen_create,
            self.op_type,
            exp=0,
            batch_size=10,
            process_concurrency=8,
            replicate_to=self.replicate_to,
            persist_to=self.persist_to,
            timeout_secs=self.sdk_timeout,
            retries=self.sdk_retries,
            transaction_timeout=self.transaction_timeout,
            commit=self.transaction_commit)
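        # get_task_result below blocks until the transactional load completes;
        # durability errors, if any, surface through the task result.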
        self.task_manager.get_task_result(task)
Example no. 10
    def _common_test_body_swap_rebalance(self, do_stop_start=False):
        master = self.servers[0]
        rest = RestConnection(master)
        num_initial_servers = self.num_initial_servers
        creds = self.input.membase_settings
        initial_servers = self.servers[:num_initial_servers]

        self.log.info("CREATE BUCKET PHASE")
        SwapRebalanceBase.create_buckets(self)

        # Cluster the starting set of servers
        self.log.info("INITIAL REBALANCE PHASE")
        status, servers_rebalanced = RebalanceHelper.rebalance_in(initial_servers, len(initial_servers) - 1)
        self.assertTrue(status, msg="Rebalance failed")

        self.log.info("DATA LOAD PHASE")
        self.loaders = SwapRebalanceBase.start_load_phase(self, master)

        # Wait till load phase is over
        SwapRebalanceBase.stop_load(self.loaders, do_stop=False)
        self.log.info("DONE LOAD PHASE")

        # Start the swap rebalance
        current_nodes = RebalanceHelper.getOtpNodeIds(master)
        self.log.info("current nodes : {0}".format(current_nodes))
        toBeEjectedNodes = RebalanceHelper.pick_nodes(master, howmany=self.num_swap)
        optNodesIds = [node.id for node in toBeEjectedNodes]

        if self.swap_orchestrator:
            status, content = ClusterOperationHelper.find_orchestrator(master)
            self.assertTrue(status, msg="Unable to find orchestrator: {0}:{1}"
                            .format(status, content))
            if self.num_swap == len(current_nodes):
                optNodesIds.append(content)
            else:
                optNodesIds[0] = content

        for node in optNodesIds:
            self.log.info("removing node {0} and rebalance afterwards".format(node))

        new_swap_servers = self.servers[num_initial_servers:num_initial_servers + self.num_swap]
        for server in new_swap_servers:
            otpNode = rest.add_node(creds.rest_username, creds.rest_password, server.ip)
            msg = "unable to add node {0} to the cluster"
            self.assertTrue(otpNode, msg.format(server.ip))

        if self.swap_orchestrator:
            rest = RestConnection(new_swap_servers[0])
            master = new_swap_servers[0]

        if self.do_access:
            self.log.info("DATA ACCESS PHASE")
            self.loaders = SwapRebalanceBase.start_access_phase(self, master)

        self.log.info("SWAP REBALANCE PHASE")
        rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
            ejectedNodes=optNodesIds)

        if do_stop_start:
            # Rebalance is stopped at 20%, 40% and 60% completion
            retry = 0
            for expected_progress in (20, 40, 60):
                self.log.info("STOP/START SWAP REBALANCE PHASE WITH PROGRESS {0}%".
                              format(expected_progress))
                while True:
                    progress = rest._rebalance_progress()
                    if progress < 0:
                        self.log.error("rebalance progress code : {0}".format(progress))
                        break
                    elif progress == 100:
                        self.log.warn("Rebalance has already reached 100%")
                        break
                    elif progress >= expected_progress:
                        self.log.info("Rebalance will be stopped with {0}%".format(progress))
                        stopped = rest.stop_rebalance()
                        self.assertTrue(stopped, msg="unable to stop rebalance")
                        SwapRebalanceBase.sleep(self, 20)
                        rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                                       ejectedNodes=optNodesIds)
                        break
                    elif retry > 100:
                        break
                    else:
                        retry += 1
                        SwapRebalanceBase.sleep(self, 1)
        self.assertTrue(rest.monitorRebalance(),
            msg="rebalance operation failed after adding node {0}".format(optNodesIds))
        SwapRebalanceBase.verification_phase(self, master)
Example no. 11
    def _add_back_failed_node(self, do_node_cleanup=False):
        master = self.servers[0]
        rest = RestConnection(master)
        creds = self.input.membase_settings

        self.log.info("CREATE BUCKET PHASE")
        self.create_buckets()

        # Cluster all servers
        self.log.info("INITIAL REBALANCE PHASE")
        status, _ = RebalanceHelper.rebalance_in(self.servers,
                                                 len(self.servers) - 1)
        self.assertTrue(status, msg="Rebalance failed")

        self.log.info("DATA LOAD PHASE")
        self.loaders = self.start_load_phase()

        # Wait till load phase is over
        self.stop_load(self.loaders, do_stop=False)
        self.log.info("DONE LOAD PHASE")

        # Start the swap rebalance
        current_nodes = RebalanceHelper.getOtpNodeIds(master)
        self.log.info("current nodes : {0}".format(current_nodes))
        toBeEjectedNodes = RebalanceHelper.pick_nodes(
            master, howmany=self.failover_factor)
        optNodesIds = [node.id for node in toBeEjectedNodes]

        # List of servers that will not be failed over
        not_failed_over = []
        for server in self.servers:
            if self.cluster_run:
                if server.port not in [node.port for node in toBeEjectedNodes]:
                    not_failed_over.append(server)
                    self.log.info("Node {0}:{1} not failed over".format(
                        server.ip, server.port))
            else:
                if server.ip not in [node.ip for node in toBeEjectedNodes]:
                    not_failed_over.append(server)
                    self.log.info("Node {0}:{1} not failed over".format(
                        server.ip, server.port))

        if self.fail_orchestrator:
            status, content = self.cluster_util.find_orchestrator(master)
            self.assertTrue(status,
                            msg="Unable to find orchestrator: {0}:{1}".format(
                                status, content))
            # When swapping all the nodes
            if self.num_swap == len(current_nodes):
                optNodesIds.append(content)
            else:
                optNodesIds[0] = content
            master = not_failed_over[-1]

        self.log.info("DATA ACCESS PHASE")
        self.loaders = self.start_access_phase()

        # Failover selected nodes
        for node in optNodesIds:
            self.log.info(
                "failover node {0} and rebalance afterwards".format(node))
            rest.fail_over(node)

        rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                       ejectedNodes=optNodesIds)

        self.assertTrue(
            rest.monitorRebalance(),
            msg="rebalance operation failed after adding node {0}".format(
                optNodesIds))

        # Add back the same failed over nodes

        # Cleanup the node, somehow
        # TODO: cluster_run?
        if do_node_cleanup:
            pass

        # Make rest connection with node part of cluster
        rest = RestConnection(master)

        # Given the optNode, find ip
        add_back_servers = []
        nodes = rest.get_nodes()
        for server in nodes:
            if isinstance(server.ip, unicode):
                add_back_servers.append(server)
        final_add_back_servers = []
        for server in self.servers:
            if self.cluster_run:
                if server.port not in [serv.port for serv in add_back_servers]:
                    final_add_back_servers.append(server)
            else:
                if server.ip not in [serv.ip for serv in add_back_servers]:
                    final_add_back_servers.append(server)
        for server in final_add_back_servers:
            otpNode = rest.add_node(creds.rest_username, creds.rest_password,
                                    server.ip, server.port)
            msg = "unable to add node {0} to the cluster"
            self.assertTrue(otpNode, msg.format(server.ip))

        rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                       ejectedNodes=[])

        self.assertTrue(
            rest.monitorRebalance(),
            msg="rebalance operation failed after adding node {0}".format(
                add_back_servers))

        self.verification_phase()
Example no. 12
    def test_rebalance_inout_with_durability_check(self):
        """
        Perform irregular number of in_out nodes
        1. Swap-out 'self.nodes_out' nodes
        2. Add 'self.nodes_in' nodes into the cluster
        3. Perform swap-rebalance
        4. Make sure durability is not broken due to swap-rebalance

        Note: This is a Positive case. i.e: Durability should not be broken
        """
        master = self.cluster.master
        num_initial_servers = self.num_initial_servers
        creds = self.input.membase_settings
        def_bucket = self.bucket_util.buckets[0]

        # Update replica value before performing rebalance in/out
        if self.replica_to_update:
            bucket_helper = BucketHelper(self.cluster.master)

            # Recalculate replicate_to/persist_to as per new replica value
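            # (majority rule: floor(replicas / 2) + 1 nodes to replicate to,
            # and one more to persist to)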
            if self.durability_level is None:
                self.replicate_to = floor(self.replica_to_update / 2) + 1
                self.persist_to = floor(self.replica_to_update / 2) + 2

            # Update bucket replica to new value as given in conf file
            self.log.info("Updating replica count of bucket to {0}".format(
                self.replica_to_update))
            bucket_helper.change_bucket_props(
                def_bucket.name, replicaNumber=self.replica_to_update)

        # Rest connection to add/rebalance/monitor nodes
        rest = RestConnection(master)

        # Start the swap rebalance
        current_nodes = RebalanceHelper.getOtpNodeIds(master)
        self.log.info("current nodes : {0}".format(current_nodes))
        toBeEjectedNodes = RebalanceHelper.pick_nodes(master,
                                                      howmany=self.nodes_out)
        optNodesIds = [node.id for node in toBeEjectedNodes]

        if self.swap_orchestrator:
            status, content = self.cluster_util.find_orchestrator(master)
            self.assertTrue(status,
                            msg="Unable to find orchestrator: {0}:{1}".format(
                                status, content))
            if self.nodes_out == len(current_nodes):
                optNodesIds.append(content)
            else:
                optNodesIds[0] = content

        for node in optNodesIds:
            self.log.info(
                "removing node {0} and rebalance afterwards".format(node))

        new_swap_servers = self.servers[
            num_initial_servers:num_initial_servers + self.nodes_in]
        for server in new_swap_servers:
            otpNode = rest.add_node(creds.rest_username, creds.rest_password,
                                    server.ip, server.port)
            msg = "unable to add node {0} to the cluster"
            self.assertTrue(otpNode, msg.format(server.ip))

        if self.swap_orchestrator:
            rest = RestConnection(new_swap_servers[0])
            master = new_swap_servers[0]

        if self.do_access:
            self.log.info("DATA ACCESS PHASE")
            self.loaders = self.start_access_phase()

        self.log.info("SWAP REBALANCE PHASE")
        rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                       ejectedNodes=optNodesIds)

        if self.do_stop_start:
            # Rebalance is stopped at 20%, 40% and 60% completion
            retry = 0
            for expected_progress in (20, 40, 60):
                self.log.info(
                    "STOP/START SWAP REBALANCE PHASE WITH PROGRESS {0}%".
                    format(expected_progress))
                while True:
                    progress = rest._rebalance_progress()
                    if progress < 0:
                        self.log.error(
                            "rebalance progress code : {0}".format(progress))
                        break
                    elif progress == 100:
                        self.log.warn("Rebalance has already reached 100%")
                        break
                    elif progress >= expected_progress:
                        self.log.info(
                            "Rebalance will be stopped with {0}%".format(
                                progress))
                        stopped = rest.stop_rebalance()
                        self.assertTrue(stopped,
                                        msg="unable to stop rebalance")
                        self.sleep(20)
                        rest.rebalance(otpNodes=[
                            node.id for node in rest.node_statuses()
                        ],
                                       ejectedNodes=optNodesIds)
                        break
                    elif retry > 100:
                        break
                    else:
                        retry += 1
                        self.sleep(1)
        self.assertTrue(
            rest.monitorRebalance(),
            msg="rebalance operation failed after adding node {0}".format(
                optNodesIds))
        self.verification_phase()
Example no. 13
    def _common_test_body_swap_rebalance(self, do_stop_start=False):
        master = self.cluster.master
        rest = RestConnection(master)
        num_initial_servers = self.num_initial_servers
        creds = self.input.membase_settings
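        # Index 0 is skipped: that server is presumably already in the cluster
        # as the master, so only the remainder of the initial set is added.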
        initial_servers = self.servers[1:num_initial_servers]

        # Cluster the starting set of servers
        self.log.info("INITIAL REBALANCE PHASE")
        status = self.task.rebalance(self.cluster.servers[:self.nodes_init],
                                     initial_servers, [])
        self.assertTrue(status, msg="Rebalance failed")

        self.log.info("CREATE BUCKET PHASE")
        self.create_buckets()

        self.log.info("DATA LOAD PHASE")
        self.loaders = self.start_load_phase()

        # Wait till load phase is over
        self.stop_load(self.loaders, do_stop=False)
        self.log.info("DONE LOAD PHASE")

        # Start the swap rebalance
        current_nodes = RebalanceHelper.getOtpNodeIds(master)
        self.log.info("current nodes : {0}".format(current_nodes))
        toBeEjectedNodes = RebalanceHelper.pick_nodes(master,
                                                      howmany=self.num_swap)
        optNodesIds = [node.id for node in toBeEjectedNodes]

        if self.swap_orchestrator:
            status, content = self.cluster_util.find_orchestrator(master)
            self.assertTrue(status,
                            msg="Unable to find orchestrator: {0}:{1}".format(
                                status, content))
            if self.num_swap == len(current_nodes):
                optNodesIds.append(content)
            else:
                optNodesIds[0] = content

        for node in optNodesIds:
            self.log.info(
                "removing node {0} and rebalance afterwards".format(node))

        new_swap_servers = self.servers[
            num_initial_servers:num_initial_servers + self.num_swap]
        for server in new_swap_servers:
            otpNode = rest.add_node(creds.rest_username, creds.rest_password,
                                    server.ip, server.port)
            msg = "unable to add node {0} to the cluster"
            self.assertTrue(otpNode, msg.format(server.ip))

        if self.swap_orchestrator:
            rest = RestConnection(new_swap_servers[0])
            master = new_swap_servers[0]

        if self.do_access:
            self.log.info("DATA ACCESS PHASE")
            self.loaders = self.start_access_phase()

        self.log.info("SWAP REBALANCE PHASE")
        rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                       ejectedNodes=optNodesIds)

        if do_stop_start:
            # Rebalance is stopped at 20%, 40% and 60% completion
            retry = 0
            for expected_progress in (20, 40, 60):
                self.log.info(
                    "STOP/START SWAP REBALANCE PHASE WITH PROGRESS {0}%".
                    format(expected_progress))
                while True:
                    progress = rest._rebalance_progress()
                    if progress < 0:
                        self.log.error(
                            "rebalance progress code : {0}".format(progress))
                        break
                    elif progress == 100:
                        self.log.warn("Rebalance has already reached 100%")
                        break
                    elif progress >= expected_progress:
                        self.log.info(
                            "Rebalance will be stopped with {0}%".format(
                                progress))
                        stopped = rest.stop_rebalance()
                        self.assertTrue(stopped,
                                        msg="unable to stop rebalance")
                        self.sleep(20)
                        rest.rebalance(otpNodes=[
                            node.id for node in rest.node_statuses()
                        ],
                                       ejectedNodes=optNodesIds)
                        break
                    elif retry > 100:
                        break
                    else:
                        retry += 1
                        self.sleep(1)
        self.assertTrue(
            rest.monitorRebalance(),
            msg="rebalance operation failed after adding node {0}".format(
                optNodesIds))
        self.verification_phase()
Esempio n. 14
0
    def _common_test_body_failed_swap_rebalance(self):
        master = self.servers[0]
        rest = RestConnection(master)
        num_initial_servers = self.num_initial_servers
        creds = self.input.membase_settings
        initial_servers = self.servers[:num_initial_servers]

        self.log.info("CREATE BUCKET PHASE")
        self.create_buckets()

        # Cluster the starting set of servers
        self.log.info("INITIAL REBALANCE PHASE")
        status, _ = RebalanceHelper.rebalance_in(initial_servers,
                                                 len(initial_servers) - 1)
        self.assertTrue(status, msg="Rebalance failed")

        self.log.info("DATA LOAD PHASE")
        self.loaders = self.start_load_phase()

        # Wait till load phase is over
        self.stop_load(self.loaders, do_stop=False)
        self.log.info("DONE LOAD PHASE")

        # Start the swap rebalance
        current_nodes = RebalanceHelper.getOtpNodeIds(master)
        self.log.info("current nodes : {0}".format(current_nodes))
        toBeEjectedNodes = RebalanceHelper.pick_nodes(master,
                                                      howmany=self.num_swap)
        optNodesIds = [node.id for node in toBeEjectedNodes]
        if self.swap_orchestrator:
            status, content = self.cluster_util.find_orchestrator(master)
            self.assertTrue(status,
                            msg="Unable to find orchestrator: {0}:{1}".format(
                                status, content))
            # When swapping all the nodes
            if self.num_swap == len(current_nodes):
                optNodesIds.append(content)
            else:
                optNodesIds[0] = content

        for node in optNodesIds:
            self.log.info(
                "removing node {0} and rebalance afterwards".format(node))

        new_swap_servers = self.servers[
            num_initial_servers:num_initial_servers + self.num_swap]
        for server in new_swap_servers:
            otpNode = rest.add_node(creds.rest_username, creds.rest_password,
                                    server.ip, server.port)
            msg = "unable to add node {0} to the cluster"
            self.assertTrue(otpNode, msg.format(server.ip))

        if self.swap_orchestrator:
            rest = RestConnection(new_swap_servers[0])
            master = new_swap_servers[0]

        self.log.info("DATA ACCESS PHASE")
        self.loaders = self.start_access_phase()

        self.log.info("SWAP REBALANCE PHASE")
        rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                       ejectedNodes=optNodesIds)
        self.sleep(10, "Rebalance should start")
        self.log.info("FAIL SWAP REBALANCE PHASE @ {0}".format(
            self.percentage_progress))
        reached = RestHelper(rest).rebalance_reached(self.percentage_progress)
        if reached and RestHelper(rest).is_cluster_rebalanced():
            # handle situation when rebalance failed at the beginning
            self.log.error('seems rebalance failed!')
            rest.print_UI_logs()
            self.fail("rebalance failed even before killing memcached")
        bucket = self.bucket_util.buckets[0]
        pid = None
        if self.swap_orchestrator and not self.cluster_run:
            # get PID via remote connection if master is a new node
            shell = RemoteMachineShellConnection(master)
            pid = shell.get_memcache_pid()
            shell.disconnect()
        else:
            times = 2
            if self.cluster_run:
                times = 20
            for _ in xrange(times):
                try:
                    shell = RemoteMachineShellConnection(server)
                    pid = shell.get_memcache_pid()
                    shell.disconnect()
                    break
                except EOFError as e:
                    self.log.error("{0}.Retry in 2 sec".format(e))
                    self.sleep(2)
        if pid is None:
            self.fail("impossible to get a PID")
        command = "os:cmd(\"kill -9 {0} \")".format(pid)
        self.log.info(command)
        killed = rest.diag_eval(command)
        self.log.info("killed {0}:{1}??  {2} ".format(master.ip, master.port,
                                                      killed))
        self.log.info("sleep for 10 sec after kill memcached")
        self.sleep(10)
        # we can't get stats for the new node when the rebalance fails
        if not self.swap_orchestrator:
            self.bucket_util._wait_warmup_completed([master],
                                                    bucket,
                                                    wait_time=600)
        # we expect the rebalance to fail
        try:
            rest.monitorRebalance()
        except RebalanceFailedException:
            # retry rebalance if it failed
            self.log.warn("Rebalance failed but it's expected")
            self.sleep(30)
            self.assertFalse(RestHelper(rest).is_cluster_rebalanced(),
                             msg="cluster need rebalance")
            knownNodes = rest.node_statuses()
            self.log.info("nodes are still in cluster: {0}".format([
                (node.ip, node.port) for node in knownNodes
            ]))
            ejectedNodes = list(
                set(optNodesIds) & set([node.id for node in knownNodes]))
            rest.rebalance(otpNodes=[node.id for node in knownNodes],
                           ejectedNodes=ejectedNodes)
            self.assertTrue(
                rest.monitorRebalance(),
                msg="Rebalance failed after adding node {0}".format(
                    toBeEjectedNodes))
        else:
            self.log.info("rebalance completed successfully")
        self.verification_phase()
Example no. 15
    def test_incremental_rebalance_out_continuous_bidirectional_sets_deletes(self):
        cluster_ref_a = "cluster_ref_a"
        master_a = self._input.clusters.get(0)[0]
        rest_conn_a = RestConnection(master_a)

        cluster_ref_b = "cluster_ref_b"
        master_b = self._input.clusters.get(1)[0]
        rest_conn_b = RestConnection(master_b)

        # Setup bi-directional continuous replication
        replication_type = "continuous"

        rest_conn_a.add_remote_cluster(master_b.ip, master_b.port,
            master_b.rest_username,
            master_b.rest_password, cluster_ref_b)
        rest_conn_b.add_remote_cluster(master_a.ip, master_a.port,
            master_a.rest_username,
            master_a.rest_password, cluster_ref_a)
        (rep_database_a, rep_id_a) = rest_conn_a.start_replication(
            replication_type, self._buckets[0],
            cluster_ref_b)
        (rep_database_b, rep_id_b) = rest_conn_b.start_replication(
            replication_type, self._buckets[0],
            cluster_ref_a)
        self._state.append((rest_conn_a, cluster_ref_b, rep_database_a, rep_id_a))
        self._state.append((rest_conn_b, cluster_ref_a, rep_database_b, rep_id_b))

        load_thread_list = []
        # Start load
        kvstore = ClientKeyValueStore()

        self._params["ops"] = "set"
        task_def = RebalanceDataGenerator.create_loading_tasks(self._params)
        load_thread = RebalanceDataGenerator.start_load(rest_conn_a,
            self._buckets[0],
            task_def, kvstore)
        load_thread.start()
        load_thread.join()

        # Do some deletes
        self._params["ops"] = "delete"
        self._params["count"] = self._num_items/5
        task_def = RebalanceDataGenerator.create_loading_tasks(self._params)
        load_thread = RebalanceDataGenerator.start_load(rest_conn_a,
            self._buckets[0],
            task_def, kvstore)
        load_thread_list.append(load_thread)

        # Start all loads concurrently
        for lt in load_thread_list:
            lt.start()

        # Trigger rebalance on both source and destination clusters
        servers_a = self._input.clusters.get(0)
        servers_b = self._input.clusters.get(1)
        rebalanced_servers_a = []
        rebalanced_servers_b = []

        # Rebalance all the nodes together
        RebalanceHelper.rebalance_in(servers_a, len(servers_a)-1)

        RebalanceHelper.rebalance_in(servers_b, len(servers_b)-1)
        rebalanced_servers_a.extend(servers_a)
        rebalanced_servers_b.extend(servers_b)

        nodes_a = rest_conn_a.node_statuses()
        nodes_b = rest_conn_b.node_statuses()
        # Incremental rebalance out one node in cluster_a, then cluster_b
        while len(nodes_a) > 1:
            toBeEjectedNode = RebalanceHelper.pick_node(master_a)

            self.log.info("current nodes : {0}".format(RebalanceHelper.getOtpNodeIds(master_a)))
            self.log.info("removing node {0} and rebalance afterwards".format(toBeEjectedNode.id))
            rest_conn_a.rebalance(otpNodes=[node.id for node in rest_conn_a.node_statuses()], \
                ejectedNodes=[toBeEjectedNode.id])
            self.assertTrue(rest_conn_a.monitorRebalance(),
                msg="rebalance operation failed after removing node {0}".format(toBeEjectedNode.id))

            # remove only one node from cluster_b per outer iteration
            # (hence the break at the bottom of this loop)
            while len(nodes_b) > 1:
                toBeEjectedNode = RebalanceHelper.pick_node(master_b)
                self.log.info("current nodes : {0}".format(RebalanceHelper.getOtpNodeIds(master_b)))
                self.log.info("removing node {0} and rebalance afterwards".format(toBeEjectedNode.id))
                rest_conn_b.rebalance(otpNodes=[node.id for node in rest_conn_b.node_statuses()],\
                    ejectedNodes=[toBeEjectedNode.id])
                self.assertTrue(rest_conn_b.monitorRebalance(),
                    msg="rebalance operation failed after removing node {0}".format(toBeEjectedNode.id))
                break

            for node in nodes_b:
                for rebalanced_server in rebalanced_servers_b:
                    if rebalanced_server.ip.find(node.ip) != -1:
                        rebalanced_servers_b.remove(rebalanced_server)
                        break
            nodes_b = rest_conn_b.node_statuses()

            for node in nodes_a:
                for rebalanced_server in rebalanced_servers_a:
                    if rebalanced_server.ip.find(node.ip) != -1:
                        rebalanced_servers_a.remove(rebalanced_server)
                        break
            nodes_a = rest_conn_a.node_statuses()

        # Wait for loading threads to finish
        for lt in load_thread_list:
            lt.join()
        self.log.info("All loading threads finished")
        # Verify replication
        self.assertTrue(XDCRBaseTest.verify_replicated_data(rest_conn_b,
            self._buckets[0],
            kvstore,
            self._poll_sleep,
            self._poll_timeout),
            "Verification of replicated data failed")
        self.assertTrue(XDCRBaseTest.verify_replicated_revs(rest_conn_a,
            rest_conn_b,
            self._buckets[0],
            self._poll_sleep,
            self._poll_timeout),
            "Verification of replicated revisions failed")
Example 16
    def _common_test_body_failed_swap_rebalance(self):
        master = self.servers[0]
        rest = RestConnection(master)
        num_initial_servers = self.num_initial_servers
        creds = self.input.membase_settings
        initial_servers = self.servers[:num_initial_servers]

        # Cluster all starting set of servers
        self.log.info("INITIAL REBALANCE PHASE")
        RebalanceHelper.rebalance_in(initial_servers, len(initial_servers) - 1)

        self.log.info("CREATE BUCKET PHASE")
        SwapRebalanceBase.create_buckets(self)

        self.log.info("DATA LOAD PHASE")
        loaders = SwapRebalanceBase.start_load_phase(self, master)

        # Wait till load phase is over
        SwapRebalanceBase.stop_load(loaders, do_stop=False)
        self.log.info("DONE LOAD PHASE")

        # Start the swap rebalance
        current_nodes = RebalanceHelper.getOtpNodeIds(master)
        self.log.info("current nodes : {0}".format(current_nodes))
        toBeEjectedNodes = RebalanceHelper.pick_nodes(master, howmany=self.num_swap)
        optNodesIds = [node.id for node in toBeEjectedNodes]
        if self.swap_orchestrator:
            status, content = ClusterHelper.find_orchestrator(master)
            self.assertTrue(status, msg="Unable to find orchestrator: {0}:{1}"
                            .format(status, content))
            # When swapping all the nodes
            if self.num_swap == len(current_nodes):
                optNodesIds.append(content)
            else:
                optNodesIds[0] = content

        for node in optNodesIds:
            self.log.info("removing node {0} and rebalance afterwards".format(node))

        new_swap_servers = self.servers[num_initial_servers:num_initial_servers+self.num_swap]
        for server in new_swap_servers:
            otpNode = rest.add_node(creds.rest_username, creds.rest_password, server.ip)
            msg = "unable to add node {0} to the cluster"
            self.assertTrue(otpNode, msg.format(server.ip))

        if self.swap_orchestrator:
            rest = RestConnection(new_swap_servers[0])
            master = new_swap_servers[0]

        self.log.info("DATA ACCESS PHASE")
        loaders = SwapRebalanceBase.start_access_phase(self, master)

        self.log.info("SWAP REBALANCE PHASE")
        rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],\
            ejectedNodes=optNodesIds)

        # Fail the rebalance at 20%, 40% and 60% completion
        for i in [1, 2, 3]:
            expected_progress = 20 * i
            self.log.info("FAIL SWAP REBALANCE PHASE @ {0}".format(expected_progress))
            reached = RestHelper(rest).rebalance_reached(expected_progress)
            if not reached:
                self.log.warn("rebalance did not reach {0}%".format(expected_progress))
            command = "[erlang:exit(element(2, X), kill) || X <- supervisor:which_children(ns_port_sup)]."
            memcached_restarted = rest.diag_eval(command)
            self.assertTrue(memcached_restarted, "unable to restart memcached/moxi process through diag/eval")
            time.sleep(20)

            rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],\
                ejectedNodes=optNodesIds)

        # Stop loaders
        SwapRebalanceBase.stop_load(loaders)

        self.assertTrue(rest.monitorRebalance(),
            msg="rebalance operation failed after swapping out nodes {0}".format(toBeEjectedNodes))

        self.log.info("DONE DATA ACCESS PHASE")
        #for bucket in rest.get_buckets():
        #    SwapRebalanceBase.verify_data(new_swap_servers[0], bucket_data[bucket.name].get('inserted_keys'),\
        #        bucket.name, self)
        #    RebalanceHelper.wait_for_persistence(master, bucket.name)

        self.log.info("VERIFICATION PHASE")
        SwapRebalanceBase.items_verification(master, self)
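
The diag_eval calls in these tests push raw Erlang at ns_server's /diag/eval endpoint, which is how they kill memcached (or every ns_port_sup child) on a remote node without shell access. A minimal sketch, assuming default admin credentials and port 8091 (both assumptions):

import base64
import urllib2

def diag_eval(host, erlang_code, user="Administrator", password="password"):
    # POST raw Erlang source to /diag/eval and return the evaluated
    # result as text. Credentials and port are assumptions.
    req = urllib2.Request("http://{0}:8091/diag/eval".format(host), erlang_code)
    req.add_header("Authorization",
                   "Basic " + base64.b64encode("{0}:{1}".format(user, password)))
    return urllib2.urlopen(req).read()

# The two variants used in these tests:
# diag_eval(host, 'os:cmd("kill -9 {0} ")'.format(pid))
# diag_eval(host, '[erlang:exit(element(2, X), kill) '
#                 '|| X <- supervisor:which_children(ns_port_sup)].')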
Example 17
    def test_failover_continuous_bidirectional_sets_deletes(self):
        cluster_ref_a = "cluster_ref_a"
        master_a = self._input.clusters.get(0)[0]
        rest_conn_a = RestConnection(master_a)

        cluster_ref_b = "cluster_ref_b"
        master_b = self._input.clusters.get(1)[0]
        rest_conn_b = RestConnection(master_b)

        # Rebalance all the nodes together
        servers_a = self._input.clusters.get(0)
        servers_b = self._input.clusters.get(1)
        rebalanced_servers_a = []
        rebalanced_servers_b = []

        RebalanceHelper.rebalance_in(servers_a, len(servers_a)-1)
        RebalanceHelper.rebalance_in(servers_b, len(servers_b)-1)
        rebalanced_servers_a.extend(servers_a)
        rebalanced_servers_b.extend(servers_b)

        # Setup bi-directional continuous replication
        replication_type = "continuous"

        rest_conn_a.add_remote_cluster(master_b.ip, master_b.port,
            master_b.rest_username,
            master_b.rest_password, cluster_ref_b)
        rest_conn_b.add_remote_cluster(master_a.ip, master_a.port,
            master_a.rest_username,
            master_a.rest_password, cluster_ref_a)
        (rep_database_a, rep_id_a) = rest_conn_a.start_replication(
            replication_type, self._buckets[0],
            cluster_ref_b)
        (rep_database_b, rep_id_b) = rest_conn_b.start_replication(
            replication_type, self._buckets[0],
            cluster_ref_a)

        load_thread_list = []
        # Start load
        kvstore = ClientKeyValueStore()

        self._params["ops"] = "set"
        task_def = RebalanceDataGenerator.create_loading_tasks(self._params)
        load_thread = RebalanceDataGenerator.start_load(rest_conn_a,
            self._buckets[0],
            task_def, kvstore)
        load_thread.start()
        load_thread.join()
        RebalanceHelper.wait_for_persistence(master_a, self._buckets[0])

        # Do some deletes
        self._params["ops"] = "delete"
        self._params["count"] = self._num_items/5
        task_def = RebalanceDataGenerator.create_loading_tasks(self._params)
        load_thread = RebalanceDataGenerator.start_load(rest_conn_a,
            self._buckets[0],
            task_def, kvstore)
        load_thread_list.append(load_thread)

        # Start all loads concurrently
        for lt in load_thread_list:
            lt.start()

        # Do the failover of nodes on both clusters
        self.log.info("Failing over nodes")
        self.log.info("current nodes on cluster 1: {0}".format(RebalanceHelper.getOtpNodeIds(master_a)))
        self.log.info("current nodes on cluster 2: {0}".format(RebalanceHelper.getOtpNodeIds(master_b)))

        # Find nodes to be failed_over
        toBeEjectedNodes = RebalanceHelper.pick_nodes(master_a, howmany=self._failover_factor)
        optNodesIds_a = [node.id for node in toBeEjectedNodes]
        if self._fail_orchestrator_a:
            status, content = ClusterOperationHelper.find_orchestrator(master_a)
            self.assertTrue(status, msg="Unable to find orchestrator: {0}:{1}"
                            .format(status, content))
            optNodesIds_a[0] = content
            master_a = self._input.clusters.get(0)[-1]
            rest_conn_a = RestConnection(master_a)

        # Failover selected nodes
        for node in optNodesIds_a:
            self.log.info("failover node {0} and rebalance afterwards".format(node))
            rest_conn_a.fail_over(node)

        toBeEjectedNodes = RebalanceHelper.pick_nodes(master_b, howmany=self._failover_factor)
        optNodesIds_b = [node.id for node in toBeEjectedNodes]
        if self._fail_orchestrator_b:
            status, content = ClusterOperationHelper.find_orchestrator(master_b)
            self.assertTrue(status, msg="Unable to find orchestrator: {0}:{1}"
                            .format(status, content))
            optNodesIds_b[0] = content
            master_b = self._input.clusters.get(1)[-1]
            rest_conn_b = RestConnection(master_b)

        self._state.append((rest_conn_a, cluster_ref_b, rep_database_a, rep_id_a))
        self._state.append((rest_conn_b, cluster_ref_a, rep_database_b, rep_id_b))

        # Failover selected nodes
        for node in optNodesIds_b:
            self.log.info("failover node {0} and rebalance afterwards".format(node))
            rest_conn_b.fail_over(node)

        rest_conn_a.rebalance(otpNodes=[node.id for node in rest_conn_a.node_statuses()],\
            ejectedNodes=optNodesIds_a)
        rest_conn_b.rebalance(otpNodes=[node.id for node in rest_conn_b.node_statuses()],\
            ejectedNodes=optNodesIds_b)

        self.assertTrue(rest_conn_a.monitorRebalance(),
            msg="rebalance operation failed after failover on cluster 1")
        self.assertTrue(rest_conn_b.monitorRebalance(),
            msg="rebalance operation failed after failover on cluster 2")

        # Wait for loading threads to finish
        for lt in load_thread_list:
            lt.join()
        self.log.info("All loading threads finished")

        # Verify replication
        self.assertTrue(XDCRBaseTest.verify_replicated_data(rest_conn_b,
            self._buckets[0],
            kvstore,
            self._poll_sleep,
            self._poll_timeout),
            "Verification of replicated data failed")
        self.assertTrue(XDCRBaseTest.verify_replicated_revs(rest_conn_a,
            rest_conn_b,
            self._buckets[0],
            self._poll_sleep,
            self._poll_timeout),
            "Verification of replicated revisions failed")
Example 18
    def _common_test_body_swap_rebalance(self, do_stop_start=False):
        master = self.servers[0]
        rest = RestConnection(master)
        num_initial_servers = self.num_initial_servers
        creds = self.input.membase_settings
        initial_servers = self.servers[:num_initial_servers]

        self.log.info("CREATE BUCKET PHASE")
        SwapRebalanceBase.create_buckets(self)

        # Cluster all starting set of servers
        self.log.info("INITIAL REBALANCE PHASE")
        RebalanceHelper.rebalance_in(initial_servers, len(initial_servers) - 1)

        self.log.info("DATA LOAD PHASE")
        loaders = SwapRebalanceBase.start_load_phase(self, master)

        # Wait till load phase is over
        SwapRebalanceBase.stop_load(loaders, do_stop=False)
        self.log.info("DONE LOAD PHASE")

        # Start the swap rebalance
        current_nodes = RebalanceHelper.getOtpNodeIds(master)
        self.log.info("current nodes : {0}".format(current_nodes))
        toBeEjectedNodes = RebalanceHelper.pick_nodes(master,
                                                      howmany=self.num_swap)
        optNodesIds = [node.id for node in toBeEjectedNodes]

        if self.swap_orchestrator:
            status, content = ClusterHelper.find_orchestrator(master)
            self.assertTrue(status, msg="Unable to find orchestrator: {0}:{1}"
                            .format(status, content))
            if self.num_swap == len(current_nodes):
                optNodesIds.append(content)
            else:
                optNodesIds[0] = content

        for node in optNodesIds:
            self.log.info(
                "removing node {0} and rebalance afterwards".format(node))

        new_swap_servers = self.servers[
            num_initial_servers:num_initial_servers + self.num_swap]
        for server in new_swap_servers:
            otpNode = rest.add_node(creds.rest_username, creds.rest_password,
                                    server.ip)
            msg = "unable to add node {0} to the cluster"
            self.assertTrue(otpNode, msg.format(server.ip))

        if self.swap_orchestrator:
            rest = RestConnection(new_swap_servers[0])
            master = new_swap_servers[0]

        if self.do_access:
            self.log.info("DATA ACCESS PHASE")
            loaders = SwapRebalanceBase.start_access_phase(self, master)

        self.log.info("SWAP REBALANCE PHASE")
        rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                       ejectedNodes=optNodesIds)

        if do_stop_start:
            # Rebalance is stopped at 20%, 40% and 60% completion
            self.log.info("STOP/START SWAP REBALANCE PHASE")
            retry = 0
            for expected_progress in (20, 40, 60):
                while True:
                    progress = rest._rebalance_progress()
                    if progress < 0:
                        self.log.error(
                            "rebalance progress code : {0}".format(progress))
                        break
                    elif progress == 100:
                        self.log.warn("Rebalance is already reached")
                        break
                    elif progress >= expected_progress:
                        self.log.info(
                            "Rebalance will be stopped at {0}%".format(
                                progress))
                        stopped = rest.stop_rebalance()
                        self.assertTrue(stopped,
                                        msg="unable to stop rebalance")
                        time.sleep(20)
                        rest.rebalance(otpNodes=[
                            node.id for node in rest.node_statuses()
                        ],
                                       ejectedNodes=optNodesIds)
                        break
                    elif retry > 100:
                        break
                    else:
                        retry += 1
                        time.sleep(1)
                #self.assertTrue(reached, "rebalance failed or did not reach {0}%".format(expected_progress))
        self.assertTrue(
            rest.monitorRebalance(),
            msg="rebalance operation failed after swapping out nodes {0}".format(
                optNodesIds))

        # Stop loaders
        SwapRebalanceBase.stop_load(loaders)

        self.log.info("DONE DATA ACCESS PHASE")
        #for bucket in rest.get_buckets():
        #    SwapRebalanceBase.verify_data(new_swap_servers[0], bucket_data[bucket.name].get('inserted_keys'),\
        #        bucket.name, self)
        #RebalanceHelper.wait_for_persistence(master, bucket.name)

        self.log.info("VERIFICATION PHASE")
        SwapRebalanceBase.items_verification(master, self)
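
The stop/start loop above is easy to misread, so here is the control flow in isolation: poll progress until it crosses the next threshold, stop the rebalance, wait, then re-issue the same rebalance request. A sketch assuming a rest object with the same _rebalance_progress(), stop_rebalance(), rebalance() and node_statuses() semantics as the RestConnection used in these tests:

import time

def stop_and_restart_rebalance(rest, ejected_ids, thresholds=(20, 40, 60),
                               max_polls=100):
    for expected in thresholds:
        for _ in xrange(max_polls):
            progress = rest._rebalance_progress()
            if progress < 0 or progress == 100:
                break  # error code, or rebalance already finished
            if progress >= expected:
                assert rest.stop_rebalance(), "unable to stop rebalance"
                time.sleep(20)  # let ns_server settle before restarting
                rest.rebalance(
                    otpNodes=[node.id for node in rest.node_statuses()],
                    ejectedNodes=ejected_ids)
                break
            time.sleep(1)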
Example 19
    def _common_test_body(self):
        master = self.servers[0]
        rest = RestConnection(master)

        # start load, max_ops_per_second is the combined limit for all buckets
        buckets = rest.get_buckets()
        loaders = []
        self.log.info("max-ops-per-second per bucket: {0}".format(self.max_ops_per_second / len(buckets)))
        for bucket in buckets:
            loader = {}
            loader["mcsoda"] = LoadWithMcsoda(master, self.keys_count, prefix='', bucket=bucket.name,
                password=bucket.saslPassword, protocol='membase-binary')
            loader["mcsoda"].cfg["max-ops"] = 0
            loader["mcsoda"].cfg["max-ops-per-sec"] = self.max_ops_per_second / len(buckets)
            loader["mcsoda"].cfg["exit-after-creates"] = 0
            loader["mcsoda"].cfg["min-value-size"] = self.min_item_size
            loader["mcsoda"].cfg["json"] = 0
            loader["mcsoda"].cfg["batch"] = 100
            loader["thread"] = Thread(target=loader["mcsoda"].load_data, name='mcloader_' + bucket.name)
            loader["thread"].daemon = True
            loaders.append(loader)

        for loader in loaders:
            loader["thread"].start()

        for iteration in range(self.repeat):
            for server in self.servers[1:]:
                self.log.info("iteration {0}: ".format(iteration))
                self.log.info("current nodes : {0}".format(RebalanceHelper.getOtpNodeIds(master)))
                self.log.info("adding node {0} and rebalance afterwards".format(server.ip))

                rebalance_done = False
                rebalance_try = 0
                while not rebalance_done:
                    try:
                        ClusterOperationHelper.begin_rebalance_in(master, [server])
                        ClusterOperationHelper.end_rebalance(master)
                        rebalance_done = True
                    except AssertionError as e:
                        rebalance_try += 1
                        self.log.error(e)
                        time.sleep(5)
                        if rebalance_try > 5:
                            raise e

            for server in self.servers[1:]:
                self.log.info("current nodes : {0}".format(RebalanceHelper.getOtpNodeIds(master)))
                self.log.info("removing node {0} and rebalance afterwards".format(server.ip))

                rebalance_done = False
                rebalance_try = 0
                while not rebalance_done:
                    try:
                        ClusterOperationHelper.begin_rebalance_out(master, [server])
                        ClusterOperationHelper.end_rebalance(master)
                        rebalance_done = True
                    except AssertionError as e:
                        rebalance_try += 1
                        self.log.error(e)
                        time.sleep(5)
                        if rebalance_try > 5:
                            raise e

        # stop load
        for loader in loaders:
            loader["mcsoda"].load_stop()

        for loader in loaders:
            loader["thread"].join()
Example 20
    def _common_test_body_failed_swap_rebalance(self):
        master = self.servers[0]
        rest = RestConnection(master)
        num_initial_servers = self.num_initial_servers
        creds = self.input.membase_settings
        initial_servers = self.servers[:num_initial_servers]

        self.log.info("CREATE BUCKET PHASE")
        SwapRebalanceBase.create_buckets(self)

        # Cluster all starting set of servers
        self.log.info("INITIAL REBALANCE PHASE")
        RebalanceHelper.rebalance_in(initial_servers, len(initial_servers) - 1)

        self.log.info("DATA LOAD PHASE")
        loaders = SwapRebalanceBase.start_load_phase(self, master)

        # Wait till load phase is over
        SwapRebalanceBase.stop_load(loaders, do_stop=False)
        self.log.info("DONE LOAD PHASE")

        # Start the swap rebalance
        current_nodes = RebalanceHelper.getOtpNodeIds(master)
        self.log.info("current nodes : {0}".format(current_nodes))
        toBeEjectedNodes = RebalanceHelper.pick_nodes(master,
                                                      howmany=self.num_swap)
        optNodesIds = [node.id for node in toBeEjectedNodes]
        if self.swap_orchestrator:
            status, content = ClusterHelper.find_orchestrator(master)
            self.assertTrue(status, msg="Unable to find orchestrator: {0}:{1}"
                            .format(status, content))
            # When swapping all the nodes
            if self.num_swap == len(current_nodes):
                optNodesIds.append(content)
            else:
                optNodesIds[0] = content

        for node in optNodesIds:
            self.log.info(
                "removing node {0} and rebalance afterwards".format(node))

        new_swap_servers = self.servers[
            num_initial_servers:num_initial_servers + self.num_swap]
        for server in new_swap_servers:
            otpNode = rest.add_node(creds.rest_username, creds.rest_password,
                                    server.ip)
            msg = "unable to add node {0} to the cluster"
            self.assertTrue(otpNode, msg.format(server.ip))

        if self.swap_orchestrator:
            rest = RestConnection(new_swap_servers[0])
            master = new_swap_servers[0]

        self.log.info("DATA ACCESS PHASE")
        loaders = SwapRebalanceBase.start_access_phase(self, master)

        self.log.info("SWAP REBALANCE PHASE")
        rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                       ejectedNodes=optNodesIds)

        # Fail the rebalance at 20%, 40% and 60% completion
        for i in [1, 2, 3]:
            expected_progress = 20 * i
            self.log.info(
                "FAIL SWAP REBALANCE PHASE @ {0}".format(expected_progress))
            RestHelper(rest).rebalance_reached(expected_progress)
            bucket = rest.get_buckets()[0].name
            pid = StatsCommon.get_stats([master], bucket, "", "pid")[master]
            command = "os:cmd(\"kill -9 {0} \")".format(pid)
            self.log.info(command)
            killed = rest.diag_eval(command)
            self.log.info("killed {0}:{1}??  {2} ".format(
                master.ip, master.port, killed))
            BaseTestCase._wait_warmup_completed(self, [master],
                                                bucket,
                                                wait_time=600)
            time.sleep(5)

            rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                           ejectedNodes=optNodesIds)

        self.assertTrue(
            rest.monitorRebalance(),
            msg="rebalance operation failed after swapping out nodes {0}".format(
                toBeEjectedNodes))

        # Stop loaders
        SwapRebalanceBase.stop_load(loaders)

        self.log.info("DONE DATA ACCESS PHASE")
        #for bucket in rest.get_buckets():
        #    SwapRebalanceBase.verify_data(new_swap_servers[0], bucket_data[bucket.name].get('inserted_keys'),\
        #        bucket.name, self)
        #    RebalanceHelper.wait_for_persistence(master, bucket.name)

        self.log.info("VERIFICATION PHASE")
        SwapRebalanceBase.items_verification(master, self)
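
When the orchestrator itself is being swapped, the tests fall back to reading the memcached PID from ps output instead of memcached stats (see the next example). The parsing half of that is pure Python; a small sketch matching the "ps -eo comm,pid" pipeline used below:

def parse_memcached_pid(ps_lines):
    # Extract the PID from "ps -eo comm,pid" output: each line is
    # "<command> <pid>"; we want the line whose command is memcached.
    for line in ps_lines:
        parts = line.split()
        if len(parts) == 2 and parts[0] == "memcached" and parts[1].isdigit():
            return int(parts[1])
    return None

# parse_memcached_pid(["beam.smp 1234", "memcached 5678"]) -> 5678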
Example 21
    def _common_test_body_failed_swap_rebalance(self):
        master = self.servers[0]
        rest = RestConnection(master)
        num_initial_servers = self.num_initial_servers
        creds = self.input.membase_settings
        initial_servers = self.servers[:num_initial_servers]

        self.log.info("CREATE BUCKET PHASE")
        SwapRebalanceBase.create_buckets(self)

        # Cluster all starting set of servers
        self.log.info("INITIAL REBALANCE PHASE")
        RebalanceHelper.rebalance_in(initial_servers, len(initial_servers) - 1)

        self.log.info("DATA LOAD PHASE")
        self.loaders = SwapRebalanceBase.start_load_phase(self, master)

        # Wait till load phase is over
        SwapRebalanceBase.stop_load(self.loaders, do_stop=False)
        self.log.info("DONE LOAD PHASE")

        # Start the swap rebalance
        current_nodes = RebalanceHelper.getOtpNodeIds(master)
        self.log.info("current nodes : {0}".format(current_nodes))
        toBeEjectedNodes = RebalanceHelper.pick_nodes(master, howmany=self.num_swap)
        optNodesIds = [node.id for node in toBeEjectedNodes]
        if self.swap_orchestrator:
            status, content = ClusterOperationHelper.find_orchestrator(master)
            self.assertTrue(status, msg="Unable to find orchestrator: {0}:{1}"
                            .format(status, content))
            # When swapping all the nodes
            if self.num_swap == len(current_nodes):
                optNodesIds.append(content)
            else:
                optNodesIds[0] = content

        for node in optNodesIds:
            self.log.info("removing node {0} and rebalance afterwards".format(node))

        new_swap_servers = self.servers[num_initial_servers:num_initial_servers + self.num_swap]
        for server in new_swap_servers:
            otpNode = rest.add_node(creds.rest_username, creds.rest_password, server.ip)
            msg = "unable to add node {0} to the cluster"
            self.assertTrue(otpNode, msg.format(server.ip))

        if self.swap_orchestrator:
            rest = RestConnection(new_swap_servers[0])
            master = new_swap_servers[0]

        self.log.info("DATA ACCESS PHASE")
        self.loaders = SwapRebalanceBase.start_access_phase(self, master)

        self.log.info("SWAP REBALANCE PHASE")
        rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
            ejectedNodes=optNodesIds)

        # Fail the rebalance at 20%, 40% and 60% completion
        for i in [1, 2, 3]:
            expected_progress = 20 * i
            self.log.info("FAIL SWAP REBALANCE PHASE @ {0}".format(expected_progress))
            RestHelper(rest).rebalance_reached(expected_progress)
            bucket = rest.get_buckets()[0].name
            pid = None
            if self.swap_orchestrator:
                # get PID via remote connection if master is a new node
                shell = RemoteMachineShellConnection(master)
                o, _ = shell.execute_command("ps -eo comm,pid | awk '$1 == \"memcached\" { print $2 }'")
                pid = o[0]
                shell.disconnect()
            else:
                for _ in xrange(2):
                    try:
                        _mc = MemcachedClientHelper.direct_client(master, bucket)
                        pid = _mc.stats()["pid"]
                        break
                    except EOFError as e:
                        self.log.error("{0}.Retry in 2 sec".format(e))
                        time.sleep(1)
            if pid is None:
                self.fail("impossible to get a PID")
            command = "os:cmd(\"kill -9 {0} \")".format(pid)
            self.log.info(command)
            killed = rest.diag_eval(command)
            self.log.info("killed {0}:{1}??  {2} ".format(master.ip, master.port, killed))
            self.log.info("sleep for 10 sec after kill memcached")
            time.sleep(10)
            # we can't get stats from the new node when the rebalance fails
            if not self.swap_orchestrator:
                ClusterOperationHelper._wait_warmup_completed(self, [master], bucket, wait_time=600)
            poll = 0
            # we expect the rebalance to fail
            # (the counter is named poll so it does not shadow the loop variable i)
            while rest._rebalance_progress_status() == "running" and poll < 60:
                self.log.info("rebalance progress: {0}".format(rest._rebalance_progress()))
                time.sleep(1)
                poll += 1
            self.log.info("rebalance progress status: {0}".format(rest._rebalance_progress_status()))
            knownNodes = rest.node_statuses()
            self.log.info("nodes are still in cluster: {0}".format([(node.ip, node.port) for node in knownNodes]))
            ejectedNodes = list(set(optNodesIds) & set([node.id for node in knownNodes]))
            rest.rebalance(otpNodes=[node.id for node in knownNodes],
                ejectedNodes=ejectedNodes)

        self.assertTrue(rest.monitorRebalance(),
            msg="rebalance operation failed after swapping out nodes {0}".format(toBeEjectedNodes))

        SwapRebalanceBase.verification_phase(self, master)
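
One detail worth isolating from the retry above: after a failed rebalance, some nodes marked for ejection may already have left the cluster, so the retry must re-eject only the intersection of the planned set with what ns_server still reports. A minimal sketch; known_nodes is assumed to be a list of objects with an id attribute, as returned by node_statuses():

def nodes_still_to_eject(planned_ids, known_nodes):
    # Keep only the planned ejections that are still cluster members;
    # asking to eject an unknown otpNode would fail the rebalance request.
    known_ids = set(node.id for node in known_nodes)
    return sorted(set(planned_ids) & known_ids)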