Ejemplo n.º 1
0
    def _common_test_body_failed_swap_rebalance(self):
        master = self.servers[0]
        rest = RestConnection(master)
        num_initial_servers = self.num_initial_servers
        creds = self.input.membase_settings
        intial_severs = self.servers[:num_initial_servers]

        self.log.info("CREATE BUCKET PHASE")
        self.create_buckets()

        # Cluster all starting set of servers
        self.log.info("INITIAL REBALANCE PHASE")
        status, _ = RebalanceHelper.rebalance_in(intial_severs,
                                                 len(intial_severs) - 1)
        self.assertTrue(status, msg="Rebalance was failed")

        self.log.info("DATA LOAD PHASE")
        self.loaders = self.start_load_phase()

        # Wait till load phase is over
        self.stop_load(self.loaders, do_stop=False)
        self.log.info("DONE LOAD PHASE")

        # Start the swap rebalance
        current_nodes = RebalanceHelper.getOtpNodeIds(master)
        self.log.info("current nodes : {0}".format(current_nodes))
        toBeEjectedNodes = RebalanceHelper.pick_nodes(master,
                                                      howmany=self.num_swap)
        optNodesIds = [node.id for node in toBeEjectedNodes]
        if self.swap_orchestrator:
            status, content = self.cluster_util.find_orchestrator(master)
            self.assertTrue(status,
                            msg="Unable to find orchestrator: {0}:{1}".format(
                                status, content))
            # When swapping all the nodes
            if self.num_swap is len(current_nodes):
                optNodesIds.append(content)
            else:
                optNodesIds[0] = content

        for node in optNodesIds:
            self.log.info(
                "removing node {0} and rebalance afterwards".format(node))

        new_swap_servers = self.servers[
            num_initial_servers:num_initial_servers + self.num_swap]
        for server in new_swap_servers:
            otpNode = rest.add_node(creds.rest_username, creds.rest_password,
                                    server.ip, server.port)
            msg = "unable to add node {0} to the cluster"
            self.assertTrue(otpNode, msg.format(server.ip))

        if self.swap_orchestrator:
            rest = RestConnection(new_swap_servers[0])
            master = new_swap_servers[0]

        self.log.info("DATA ACCESS PHASE")
        self.loaders = self.start_access_phase()

        self.log.info("SWAP REBALANCE PHASE")
        rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                       ejectedNodes=optNodesIds)
        self.sleep(10, "Rebalance should start")
        self.log.info("FAIL SWAP REBALANCE PHASE @ {0}".format(
            self.percentage_progress))
        reached = RestHelper(rest).rebalance_reached(self.percentage_progress)
        if reached and RestHelper(rest).is_cluster_rebalanced():
            # handle situation when rebalance failed at the beginning
            self.log.error('seems rebalance failed!')
            rest.print_UI_logs()
            self.fail("rebalance failed even before killing memcached")
        bucket = self.bucket_util.buckets[0]
        pid = None
        if self.swap_orchestrator and not self.cluster_run:
            # get PID via remote connection if master is a new node
            shell = RemoteMachineShellConnection(master)
            pid = shell.get_memcache_pid()
            shell.disconnect()
        else:
            times = 2
            if self.cluster_run:
                times = 20
            for _ in xrange(times):
                try:
                    shell = RemoteMachineShellConnection(server)
                    pid = shell.get_memcache_pid()
                    shell.disconnect()
                    break
                except EOFError as e:
                    self.log.error("{0}.Retry in 2 sec".format(e))
                    self.sleep(2)
        if pid is None:
            self.fail("impossible to get a PID")
        command = "os:cmd(\"kill -9 {0} \")".format(pid)
        self.log.info(command)
        killed = rest.diag_eval(command)
        self.log.info("killed {0}:{1}??  {2} ".format(master.ip, master.port,
                                                      killed))
        self.log.info("sleep for 10 sec after kill memcached")
        self.sleep(10)
        # we can't get stats for new node when rebalance falls
        if not self.swap_orchestrator:
            self.bucket_util._wait_warmup_completed([master],
                                                    bucket,
                                                    wait_time=600)
        # we expect that rebalance will be failed
        try:
            rest.monitorRebalance()
        except RebalanceFailedException:
            # retry rebalance if it failed
            self.log.warn("Rebalance failed but it's expected")
            self.sleep(30)
            self.assertFalse(RestHelper(rest).is_cluster_rebalanced(),
                             msg="cluster need rebalance")
            knownNodes = rest.node_statuses()
            self.log.info("nodes are still in cluster: {0}".format([
                (node.ip, node.port) for node in knownNodes
            ]))
            ejectedNodes = list(
                set(optNodesIds) & set([node.id for node in knownNodes]))
            rest.rebalance(otpNodes=[node.id for node in knownNodes],
                           ejectedNodes=ejectedNodes)
            self.assertTrue(
                rest.monitorRebalance(),
                msg="Rebalance failed after adding node {0}".format(
                    toBeEjectedNodes))
        else:
            self.log.info("rebalance completed successfully")
        self.verification_phase()
Ejemplo n.º 2
0
    def _common_test_body_failed_swap_rebalance(self):
        master = self.servers[0]
        rest = RestConnection(master)
        num_initial_servers = self.num_initial_servers
        creds = self.input.membase_settings
        intial_severs = self.servers[:num_initial_servers]

        self.log.info("CREATE BUCKET PHASE")
        SwapRebalanceBase.create_buckets(self)

        # Cluster all starting set of servers
        self.log.info("INITIAL REBALANCE PHASE")
        status, servers_rebalanced = RebalanceHelper.rebalance_in(intial_severs, len(intial_severs) - 1)
        self.assertTrue(status, msg="Rebalance was failed")

        self.log.info("DATA LOAD PHASE")
        self.loaders = SwapRebalanceBase.start_load_phase(self, master)

        # Wait till load phase is over
        SwapRebalanceBase.stop_load(self.loaders, do_stop=False)
        self.log.info("DONE LOAD PHASE")

        # Start the swap rebalance
        current_nodes = RebalanceHelper.getOtpNodeIds(master)
        self.log.info("current nodes : {0}".format(current_nodes))
        toBeEjectedNodes = RebalanceHelper.pick_nodes(master, howmany=self.num_swap)
        optNodesIds = [node.id for node in toBeEjectedNodes]
        if self.swap_orchestrator:
            status, content = ClusterOperationHelper.find_orchestrator(master)
            self.assertTrue(status, msg="Unable to find orchestrator: {0}:{1}".format(status, content))
            # When swapping all the nodes
            if self.num_swap is len(current_nodes):
                optNodesIds.append(content)
            else:
                optNodesIds[0] = content

        for node in optNodesIds:
            self.log.info("removing node {0} and rebalance afterwards".format(node))

        new_swap_servers = self.servers[num_initial_servers : num_initial_servers + self.num_swap]
        for server in new_swap_servers:
            otpNode = rest.add_node(creds.rest_username, creds.rest_password, server.ip, server.port)
            msg = "unable to add node {0} to the cluster"
            self.assertTrue(otpNode, msg.format(server.ip))

        if self.swap_orchestrator:
            rest = RestConnection(new_swap_servers[0])
            master = new_swap_servers[0]

        self.log.info("DATA ACCESS PHASE")
        self.loaders = SwapRebalanceBase.start_access_phase(self, master)

        self.log.info("SWAP REBALANCE PHASE")
        rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()], ejectedNodes=optNodesIds)
        SwapRebalanceBase.sleep(self, 10, "Rebalance should start")
        self.log.info("FAIL SWAP REBALANCE PHASE @ {0}".format(self.percentage_progress))
        reached = RestHelper(rest).rebalance_reached(self.percentage_progress)
        if reached and RestHelper(rest).is_cluster_rebalanced():
            # handle situation when rebalance failed at the beginning
            self.log.error("seems rebalance failed!")
            rest.print_UI_logs()
            self.fail("rebalance failed even before killing memcached")
        bucket = rest.get_buckets()[0].name
        pid = None
        if self.swap_orchestrator and not self.cluster_run:
            # get PID via remote connection if master is a new node
            shell = RemoteMachineShellConnection(master)
            pid = shell.get_memcache_pid()
            shell.disconnect()
        else:
            times = 2
            if self.cluster_run:
                times = 20
            for i in xrange(times):
                try:
                    _mc = MemcachedClientHelper.direct_client(master, bucket)
                    pid = _mc.stats()["pid"]
                    break
                except EOFError as e:
                    self.log.error("{0}.Retry in 2 sec".format(e))
                    SwapRebalanceBase.sleep(self, 2)
        if pid is None:
            self.fail("impossible to get a PID")
        command = 'os:cmd("kill -9 {0} ")'.format(pid)
        self.log.info(command)
        killed = rest.diag_eval(command)
        self.log.info("killed {0}:{1}??  {2} ".format(master.ip, master.port, killed))
        self.log.info("sleep for 10 sec after kill memcached")
        SwapRebalanceBase.sleep(self, 10)
        # we can't get stats for new node when rebalance falls
        if not self.swap_orchestrator:
            ClusterOperationHelper._wait_warmup_completed(self, [master], bucket, wait_time=600)
        i = 0
        # we expect that rebalance will be failed
        try:
            rest.monitorRebalance()
        except RebalanceFailedException:
            # retry rebalance if it failed
            self.log.warn("Rebalance failed but it's expected")
            SwapRebalanceBase.sleep(self, 30)
            self.assertFalse(RestHelper(rest).is_cluster_rebalanced(), msg="cluster need rebalance")
            knownNodes = rest.node_statuses()
            self.log.info("nodes are still in cluster: {0}".format([(node.ip, node.port) for node in knownNodes]))
            ejectedNodes = list(set(optNodesIds) & set([node.id for node in knownNodes]))
            rest.rebalance(otpNodes=[node.id for node in knownNodes], ejectedNodes=ejectedNodes)
            self.assertTrue(
                rest.monitorRebalance(), msg="rebalance operation failed after adding node {0}".format(toBeEjectedNodes)
            )
        else:
            self.log.info("rebalance completed successfully")
        SwapRebalanceBase.verification_phase(self, master)