def rebalance_in_with_failover_full_addback_recovery(self):
     gen_update = BlobGenerator('mike', 'mike-', self.value_size, end=self.num_items)
     tasks = []
     tasks += self._async_load_all_buckets(self.master, gen_update, "update", 0)
     for task in tasks:
         task.result()
     servs_in = [self.servers[i + self.nodes_init] for i in range(self.nodes_in)]
     self._verify_stats_all_buckets(self.servers[:self.nodes_init], timeout=120)
     self._wait_for_stats_all_buckets(self.servers[:self.nodes_init])
     self.sleep(20)
     prev_failover_stats = self.get_failovers_logs(self.servers[:self.nodes_init], self.buckets)
     prev_vbucket_stats = self.get_vbucket_seqnos(self.servers[:self.nodes_init], self.buckets)
     disk_replica_dataset, disk_active_dataset = self.get_and_compare_active_replica_data_set_all(self.servers[:self.nodes_init], self.buckets, path=None)
     self.rest = RestConnection(self.master)
     self.nodes = self.get_nodes(self.master)
     chosen = RebalanceHelper.pick_nodes(self.master, howmany=1)
     # Mark Node for failover
     success_failed_over = self.rest.fail_over(chosen[0].id, graceful=False)
     # Mark Node for full recovery
     if success_failed_over:
         self.rest.set_recovery_type(otpNode=chosen[0].id, recoveryType="full")
     rebalance = self.cluster.async_rebalance(self.servers[:self.nodes_init], servs_in, [])
     rebalance.result()
     self._verify_stats_all_buckets(self.servers[:self.nodes_in + self.nodes_init], timeout=120)
     self.verify_cluster_stats(self.servers[:self.nodes_in + self.nodes_init], check_ep_items_remaining=True)
     self.compare_failovers_logs(prev_failover_stats, self.servers[:self.nodes_in + self.nodes_init], self.buckets)
     self.sleep(30)
     self.data_analysis_active_replica_all(disk_active_dataset, disk_replica_dataset, self.servers[:self.nodes_in + self.nodes_init], self.buckets, path=None)
     self.verify_unacked_bytes_all_buckets()
     nodes = self.get_nodes_in_cluster(self.master)
     self.vb_distribution_analysis(servers=nodes, buckets=self.buckets, std=1.0, total_vbuckets=self.total_vbuckets)
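
# A minimal standalone sketch (not part of the test suite) of the hard-failover /
# full-recovery add-back sequence that rebalance_in_with_failover_full_addback_recovery
# drives through RestConnection above, expressed against the raw Couchbase REST
# endpoints those helpers wrap. The host, credentials and otpNode name are placeholders.
import requests

BASE = "http://10.0.0.1:8091"          # placeholder cluster entry point
AUTH = ("Administrator", "password")   # placeholder REST credentials
OTP_NODE = "ns_1@10.0.0.2"             # placeholder node to fail over

# hard failover of the chosen node (graceful=False in the test above)
requests.post(BASE + "/controller/failOver",
              auth=AUTH, data={"otpNode": OTP_NODE}).raise_for_status()

# mark the failed-over node for full (rather than delta) recovery
requests.post(BASE + "/controller/setRecoveryType", auth=AUTH,
              data={"otpNode": OTP_NODE, "recoveryType": "full"}).raise_for_status()

# rebalance with every known node kept; the recovered node is added back
known = [n["otpNode"] for n in
         requests.get(BASE + "/pools/default", auth=AUTH).json()["nodes"]]
requests.post(BASE + "/controller/rebalance", auth=AUTH,
              data={"knownNodes": ",".join(known), "ejectedNodes": ""}).raise_for_status()
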
 def rebalance_in_out_with_failover(self):
     fail_over = self.input.param("fail_over", False)
     gen = BlobGenerator('mike', 'mike-', self.value_size, end=self.num_items)
     self._load_all_buckets(self.master, gen, "create", 0)
     tasks = self._async_load_all_buckets(self.master, gen, "update", 0)
     servs_in = self.servers[self.nodes_init:self.nodes_init + 1]
     servs_out = self.servers[self.nodes_init - 1:self.nodes_init]
     for task in tasks:
         task.result(self.wait_timeout * 20)
     self._verify_stats_all_buckets(self.servers[:self.nodes_init], timeout=120)
     self._wait_for_stats_all_buckets(self.servers[:self.nodes_init])
     self.sleep(20)
     ejectedNode = self.find_node_info(self.master,self.servers[self.nodes_init-1])
     prev_vbucket_stats = self.get_vbucket_seqnos(self.servers[:self.nodes_init], self.buckets)
     prev_failover_stats = self.get_failovers_logs(self.servers[:self.nodes_init], self.buckets)
     disk_replica_dataset, disk_active_dataset = self.get_and_compare_active_replica_data_set_all(self.servers[:self.nodes_init], self.buckets, path=None)
     self.compare_vbucketseq_failoverlogs(prev_vbucket_stats, prev_failover_stats)
     self.rest = RestConnection(self.master)
     chosen = RebalanceHelper.pick_nodes(self.master, howmany=1)
     result_nodes = self.add_remove_servers(self.servers, self.servers[:self.nodes_init], [self.servers[self.nodes_init - 1], chosen[0]], [self.servers[self.nodes_init]])
     self.rest.add_node(self.master.rest_username, self.master.rest_password, self.servers[self.nodes_init].ip, self.servers[self.nodes_init].port)
     # Mark Node for failover
     success_failed_over = self.rest.fail_over(chosen[0].id, graceful=fail_over)
     self.nodes = self.rest.node_statuses()
     self.rest.rebalance(otpNodes=[node.id for node in self.nodes],
                            ejectedNodes=[chosen[0].id,ejectedNode.id])
     self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True), msg="Rebalance failed")
     self._wait_for_stats_all_buckets(result_nodes)
     self.sleep(10)
     self.verify_cluster_stats(result_nodes)
     self.compare_failovers_logs(prev_failover_stats, result_nodes, self.buckets)
     self.data_analysis_active_replica_all(disk_active_dataset, disk_replica_dataset, result_nodes, self.buckets, path=None)
     self.verify_unacked_bytes_all_buckets()
     nodes = self.get_nodes_in_cluster(self.master)
     self.vb_distribution_analysis(servers=nodes, std=1.0, total_vbuckets=self.total_vbuckets)
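
# A minimal sketch of the add-plus-eject rebalance that rebalance_in_out_with_failover
# performs above: one new node is server-added, then a single rebalance ejects both the
# failed-over node and the node being removed. Raw REST endpoints; hosts, credentials
# and otpNode names are placeholders.
import requests

BASE = "http://10.0.0.1:8091"
AUTH = ("Administrator", "password")

# server-add the incoming node (joined but not yet rebalanced in)
requests.post(BASE + "/controller/addNode", auth=AUTH,
              data={"hostname": "10.0.0.5:8091",
                    "user": AUTH[0], "password": AUTH[1]}).raise_for_status()

# rebalance: knownNodes lists every node the cluster knows about,
# ejectedNodes lists the failed-over node plus the node being rebalanced out
known = [n["otpNode"] for n in
         requests.get(BASE + "/pools/default", auth=AUTH).json()["nodes"]]
ejected = ["ns_1@10.0.0.3", "ns_1@10.0.0.4"]    # placeholder otpNode names
requests.post(BASE + "/controller/rebalance", auth=AUTH,
              data={"knownNodes": ",".join(known),
                    "ejectedNodes": ",".join(ejected)}).raise_for_status()
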
Example 3
    def _failover_swap_rebalance(self):
        master = self.servers[0]
        rest = RestConnection(master)
        creds = self.input.membase_settings
        num_initial_servers = self.num_initial_servers
        initial_servers = self.servers[:num_initial_servers]

        self.log.info("CREATE BUCKET PHASE")
        SwapRebalanceBase.create_buckets(self)

        # Cluster all starting set of servers
        self.log.info("INITIAL REBALANCE PHASE")
        status, servers_rebalanced = RebalanceHelper.rebalance_in(initial_servers, len(initial_servers) - 1)
        self.assertTrue(status, msg="Rebalance failed")

        self.log.info("DATA LOAD PHASE")
        self.loaders = SwapRebalanceBase.start_load_phase(self, master)

        # Wait till load phase is over
        SwapRebalanceBase.stop_load(self.loaders, do_stop=False)
        self.log.info("DONE LOAD PHASE")

        # Start the swap rebalance
        self.log.info("current nodes : {0}".format(RebalanceHelper.getOtpNodeIds(master)))
        toBeEjectedNodes = RebalanceHelper.pick_nodes(master, howmany=self.failover_factor)
        optNodesIds = [node.id for node in toBeEjectedNodes]
        if self.fail_orchestrator:
            status, content = ClusterOperationHelper.find_orchestrator(master)
            self.assertTrue(status, msg="Unable to find orchestrator: {0}:{1}".\
            format(status, content))
            optNodesIds[0] = content

        self.log.info("FAILOVER PHASE")
        # Failover selected nodes
        for node in optNodesIds:
            self.log.info("failover node {0} and rebalance afterwards".format(node))
            rest.fail_over(node)

        new_swap_servers = self.servers[num_initial_servers:num_initial_servers + self.failover_factor]
        for server in new_swap_servers:
            otpNode = rest.add_node(creds.rest_username, creds.rest_password, server.ip)
            msg = "unable to add node {0} to the cluster"
            self.assertTrue(otpNode, msg.format(server.ip))

        if self.fail_orchestrator:
            rest = RestConnection(new_swap_servers[0])
            master = new_swap_servers[0]

        self.log.info("DATA ACCESS PHASE")
        self.loaders = SwapRebalanceBase.start_access_phase(self, master)

        rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()], \
            ejectedNodes=optNodesIds)

        self.assertTrue(rest.monitorRebalance(),
            msg="rebalance operation failed after adding node {0}".format(new_swap_servers))

        SwapRebalanceBase.verification_phase(self, master)
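
# A minimal sketch of what rest.monitorRebalance() in the example above polls under the
# hood: /pools/default/rebalanceProgress until the status returns to "none". Host and
# credentials are placeholders, and the failure checks the real helper performs are
# deliberately omitted.
import time
import requests

BASE = "http://10.0.0.1:8091"
AUTH = ("Administrator", "password")

def wait_for_rebalance(poll_interval=5, timeout=1200):
    deadline = time.time() + timeout
    while time.time() < deadline:
        body = requests.get(BASE + "/pools/default/rebalanceProgress", auth=AUTH).json()
        if body.get("status") == "none":
            return True        # no rebalance running any more
        time.sleep(poll_interval)
    return False
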
Example 4
    def test_rebalance_in_out_with_failover_addback_recovery(self):
        """
        Rebalances nodes out and in with failover and full/delta recovery add back of a node
        Use different nodes_in and nodes_out params to have uneven add and deletion. Use 'zone'
        param to have nodes divided into server groups by having zone > 1.

        This test begins by loading a given number of items into the cluster. It then
        removes one node, rebalances that node out the cluster, and then rebalances it back
        in. During the rebalancing we update all of the items in the cluster. Once the
        node has been removed and added back we  wait for the disk queues to drain, and
        then verify that there has been no data loss, sum(curr_items) match the curr_items_total.
        We then remove and add back two nodes at a time and so on until we have reached the point
        where we are adding back and removing at least half of the nodes.
        """
        recovery_type = self.input.param("recoveryType", "full")
        gen = BlobGenerator('mike', 'mike-', self.value_size, end=self.num_items)
        self._load_all_buckets(self.master, gen, "create", 0)
        tasks = self._async_load_all_buckets(self.master, gen, "update", 0)
        servs_in = self.servers[self.nodes_init:self.nodes_init + self.nodes_in]
        servs_out = self.servers[self.nodes_init - self.nodes_out:self.nodes_init]
        for task in tasks:
            task.result(self.wait_timeout * 20)
        self._verify_stats_all_buckets(self.servers[:self.nodes_init], timeout=120)
        self._wait_for_stats_all_buckets(self.servers[:self.nodes_init])
        self.sleep(20)
        prev_vbucket_stats = self.get_vbucket_seqnos(self.servers[:self.nodes_init], self.buckets)
        prev_failover_stats = self.get_failovers_logs(self.servers[:self.nodes_init], self.buckets)
        disk_replica_dataset, disk_active_dataset = self.get_and_compare_active_replica_data_set_all(
            self.servers[:self.nodes_init], self.buckets, path=None)
        self.compare_vbucketseq_failoverlogs(prev_vbucket_stats, prev_failover_stats)
        self.rest = RestConnection(self.master)
        self.nodes = self.get_nodes(self.master)
        result_nodes = list(set(self.servers[:self.nodes_init] + servs_in) - set(servs_out))
        for node in servs_in:
            self.rest.add_node(self.master.rest_username, self.master.rest_password, node.ip, node.port)
        chosen = RebalanceHelper.pick_nodes(self.master, howmany=1)
        # Mark Node for failover
        self.sleep(30)
        success_failed_over = self.rest.fail_over(chosen[0].id, graceful=False)
        # Mark Node for full recovery
        if success_failed_over:
            self.rest.set_recovery_type(otpNode=chosen[0].id, recoveryType=recovery_type)
        self.sleep(30)
        self.shuffle_nodes_between_zones_and_rebalance(servs_out)
        self._verify_stats_all_buckets(result_nodes, timeout=120)
        self.verify_cluster_stats(result_nodes, check_ep_items_remaining=True)
        self.compare_failovers_logs(prev_failover_stats, result_nodes, self.buckets)
        self.sleep(30)
        self.data_analysis_active_replica_all(disk_active_dataset, disk_replica_dataset, result_nodes, self.buckets,
                                              path=None)
        self.verify_unacked_bytes_all_buckets()
        nodes = self.get_nodes_in_cluster(self.master)
        self.vb_distribution_analysis(servers=nodes, std=1.0, total_vbuckets=self.total_vbuckets)
    def test_start_stop_rebalance_after_failover(self):
        """
            Rebalances nodes out and in with failover
            Use different nodes_in and nodes_out params to have uneven add and deletion. Use 'zone'
            param to have nodes divided into server groups by having zone > 1.

            The test begin with loading the bucket with given number of items. It then fails over a node. We then
            rebalance the cluster, while adding or removing given number of nodes. Once the rebalance reaches 50%,
            we stop the rebalance and validate the cluster stats. We then restart the rebalance and validate rebalance
            was completed successfully.
            """
        fail_over = self.input.param("fail_over", False)
        gen = BlobGenerator('mike', 'mike-', self.value_size, end=self.num_items)
        self._load_all_buckets(self.master, gen, "create", 0)
        tasks = self._async_load_all_buckets(self.master, gen, "update", 0)
        for task in tasks:
            task.result(self.wait_timeout * 20)
        self._verify_stats_all_buckets(self.servers[:self.nodes_init], timeout=120)
        self._wait_for_stats_all_buckets(self.servers[:self.nodes_init])
        self.sleep(20)
        prev_vbucket_stats = self.get_vbucket_seqnos(self.servers[:self.nodes_init], self.buckets)
        prev_failover_stats = self.get_failovers_logs(self.servers[:self.nodes_init], self.buckets)
        disk_replica_dataset, disk_active_dataset = self.get_and_compare_active_replica_data_set_all(
            self.servers[:self.nodes_init], self.buckets, path=None)
        self.compare_vbucketseq_failoverlogs(prev_vbucket_stats, prev_failover_stats)
        self.rest = RestConnection(self.master)
        chosen = RebalanceHelper.pick_nodes(self.master, howmany=1)
        result_nodes = list(set(self.servers[:self.nodes_init] + self.servs_in) - set(self.servs_out))
        for node in self.servs_in:
            self.rest.add_node(self.master.rest_username, self.master.rest_password, node.ip, node.port)
        # Mark Node for failover
        self.rest.fail_over(chosen[0].id, graceful=fail_over)
        rebalance = self.cluster.async_rebalance(self.servers[:self.nodes_init], self.servs_in, self.servs_out)
        expected_progress = 50
        rest = RestConnection(self.master)
        reached = RestHelper(rest).rebalance_reached(expected_progress)
        self.assertTrue(reached, "Rebalance failed or did not reach {0}%".format(expected_progress))
        if not RestHelper(rest).is_cluster_rebalanced():
            self.log.info("Stop the rebalance")
            stopped = rest.stop_rebalance(wait_timeout=self.wait_timeout / 3)
            self.assertTrue(stopped, msg="Unable to stop rebalance")
            self._verify_all_buckets(self.master, timeout=None, max_verify=self.max_verify, batch_size=1)
        self.shuffle_nodes_between_zones_and_rebalance()
        self.verify_cluster_stats(result_nodes, check_ep_items_remaining=True, check_bucket_stats=False)
        self.sleep(30)
        self.data_analysis_active_replica_all(disk_active_dataset, disk_replica_dataset, result_nodes, self.buckets,
                                              path=None)
        self.verify_unacked_bytes_all_buckets()
        nodes = self.get_nodes_in_cluster(self.master)
        self.vb_distribution_analysis(servers=nodes, std=1.0, total_vbuckets=self.total_vbuckets)
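
# A minimal sketch of the stop-then-resume flow exercised by
# test_start_stop_rebalance_after_failover above: wait until the running rebalance
# reaches a target percentage, stop it, then re-issue the same rebalance. Raw REST
# calls; host, credentials and the 50% threshold are placeholders mirroring the test.
import time
import requests

BASE = "http://10.0.0.1:8091"
AUTH = ("Administrator", "password")

def overall_progress():
    body = requests.get(BASE + "/pools/default/rebalanceProgress", auth=AUTH).json()
    if body.get("status") == "none":
        return 100.0           # nothing running (finished, failed or never started)
    per_node = [v["progress"] for v in body.values() if isinstance(v, dict)]
    return 100.0 * sum(per_node) / len(per_node) if per_node else 0.0

while overall_progress() < 50:
    time.sleep(2)

requests.post(BASE + "/controller/stopRebalance", auth=AUTH).raise_for_status()

# resume by re-issuing the rebalance with the same known/ejected node sets
known = [n["otpNode"] for n in
         requests.get(BASE + "/pools/default", auth=AUTH).json()["nodes"]]
requests.post(BASE + "/controller/rebalance", auth=AUTH,
              data={"knownNodes": ",".join(known), "ejectedNodes": ""}).raise_for_status()
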
Example 6
 def rebalance_out_with_failover(self):
     fail_over = self.input.param("fail_over", False)
     self.rest = RestConnection(self.master)
     gen_delete = BlobGenerator("mike", "mike-", self.value_size, start=self.num_items / 2, end=self.num_items)
     gen_create = BlobGenerator(
         "mike", "mike-", self.value_size, start=self.num_items + 1, end=self.num_items * 3 / 2
     )
     # define which doc's ops will be performed during rebalancing
     # allows multiple of them but one by one
     tasks = []
     if self.doc_ops is not None:
         if "update" in self.doc_ops:
             tasks += self._async_load_all_buckets(self.master, self.gen_update, "update", 0)
         if "create" in self.doc_ops:
             tasks += self._async_load_all_buckets(self.master, gen_create, "create", 0)
         if "delete" in self.doc_ops:
             tasks += self._async_load_all_buckets(self.master, gen_delete, "delete", 0)
         for task in tasks:
             task.result()
     ejectedNode = self.find_node_info(self.master, self.servers[self.nodes_init - 1])
     self._verify_stats_all_buckets(self.servers[: self.num_servers], timeout=120)
     self._wait_for_stats_all_buckets(self.servers[: self.num_servers])
     self.sleep(20)
     prev_failover_stats = self.get_failovers_logs(self.servers[: self.nodes_init], self.buckets)
     prev_vbucket_stats = self.get_vbucket_seqnos(self.servers[: self.nodes_init], self.buckets)
     record_data_set = self.get_data_set_all(self.servers[: self.nodes_init], self.buckets)
     self.compare_vbucketseq_failoverlogs(prev_vbucket_stats, prev_failover_stats)
     self.rest = RestConnection(self.master)
     chosen = RebalanceHelper.pick_nodes(self.master, howmany=1)
     new_server_list = self.add_remove_servers(
         self.servers, self.servers[: self.nodes_init], [self.servers[self.nodes_init - 1], chosen[0]], []
     )
     # Mark Node for failover
     success_failed_over = self.rest.fail_over(chosen[0].id, graceful=fail_over)
     self.nodes = self.rest.node_statuses()
     self.sleep(20)
     self.rest.rebalance(otpNodes=[node.id for node in self.nodes], ejectedNodes=[chosen[0].id, ejectedNode.id])
     self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True), msg="Rebalance failed")
     self._wait_for_stats_all_buckets(new_server_list)
     self.verify_cluster_stats(new_server_list)
     self.data_analysis_all(record_data_set, new_server_list, self.buckets)
     self.verify_unacked_bytes_all_buckets()
     nodes = self.get_nodes_in_cluster(self.master)
     self.vb_distribution_analysis(servers=nodes, buckets=self.buckets, std=1.0, total_vbuckets=self.total_vbuckets)
Example 7
 def rebalance_out_with_failover_full_addback_recovery(self):
     gen_delete = BlobGenerator("mike", "mike-", self.value_size, start=self.num_items / 2, end=self.num_items)
     gen_create = BlobGenerator(
         "mike", "mike-", self.value_size, start=self.num_items + 1, end=self.num_items * 3 / 2
     )
     # define which doc's ops will be performed during rebalancing
     # allows multiple of them but one by one
     tasks = []
     if self.doc_ops is not None:
         if "update" in self.doc_ops:
             tasks += self._async_load_all_buckets(self.master, self.gen_update, "update", 0)
         if "create" in self.doc_ops:
             tasks += self._async_load_all_buckets(self.master, gen_create, "create", 0)
         if "delete" in self.doc_ops:
             tasks += self._async_load_all_buckets(self.master, gen_delete, "delete", 0)
         for task in tasks:
             task.result()
     servs_out = [self.servers[self.num_servers - i - 1] for i in range(self.nodes_out)]
     self._verify_stats_all_buckets(self.servers[: self.num_servers], timeout=120)
     self._wait_for_stats_all_buckets(self.servers[: self.num_servers])
     self.rest = RestConnection(self.master)
     chosen = RebalanceHelper.pick_nodes(self.master, howmany=1)
     self.sleep(20)
     prev_failover_stats = self.get_failovers_logs(self.servers[: self.num_servers], self.buckets)
     prev_vbucket_stats = self.get_vbucket_seqnos(self.servers[: self.num_servers], self.buckets)
     record_data_set = self.get_data_set_all(self.servers[: self.num_servers], self.buckets)
     self.compare_vbucketseq_failoverlogs(prev_vbucket_stats, prev_failover_stats)
     # Mark Node for failover
     success_failed_over = self.rest.fail_over(chosen[0].id, graceful=False)
     # Mark Node for full recovery
     if success_failed_over:
         self.rest.set_recovery_type(otpNode=chosen[0].id, recoveryType="full")
     rebalance = self.cluster.async_rebalance(self.servers[:1], [], servs_out)
     rebalance.result()
     self._wait_for_stats_all_buckets(self.servers[: self.num_servers - self.nodes_out])
     self.verify_cluster_stats(self.servers[: self.num_servers - self.nodes_out])
     self.compare_failovers_logs(
         prev_failover_stats, self.servers[: self.num_servers - self.nodes_out], self.buckets
     )
     self.data_analysis_all(record_data_set, self.servers[: self.num_servers - self.nodes_out], self.buckets)
     self.verify_unacked_bytes_all_buckets()
     nodes = self.get_nodes_in_cluster(self.master)
     self.vb_distribution_analysis(servers=nodes, buckets=self.buckets, std=1.0, total_vbuckets=self.total_vbuckets)
Example 8
    def rebalance_in_with_failover(self):
        fail_over = self.input.param("fail_over", False)
        gen_update = BlobGenerator('mike', 'mike-', self.value_size, end=self.num_items)
        tasks = []
        tasks += self._async_load_all_buckets(self.master, gen_update, "update", 0)
        for task in tasks:
            task.result()
        servs_in = [self.servers[i + self.nodes_init] for i in range(self.nodes_in)]
        self._verify_stats_all_buckets(self.servers[:self.nodes_init], timeout=120)
        self._wait_for_stats_all_buckets(self.servers[:self.nodes_init])
        self.sleep(20)
        prev_failover_stats = self.get_failovers_logs(self.servers[:self.nodes_init], self.buckets)
        prev_vbucket_stats = self.get_vbucket_seqnos(self.servers[:self.nodes_init], self.buckets)
        disk_replica_dataset, disk_active_dataset = self.get_and_compare_active_replica_data_set_all(self.servers[:self.nodes_init], self.buckets, path=None)
        self.rest = RestConnection(self.master)
        self.nodes = self.get_nodes(self.master)
        chosen = RebalanceHelper.pick_nodes(self.master, howmany=1)
        self.rest = RestConnection(self.master)
        self.rest.add_node(self.master.rest_username, self.master.rest_password, self.servers[self.nodes_init].ip, self.servers[self.nodes_init].port)
        # Mark Node for failover
        self.rest.fail_over(chosen[0].id, graceful=fail_over)
        if fail_over:
            self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True), msg="Graceful Failover Failed")
        self.nodes = self.rest.node_statuses()
        self.rest.rebalance(otpNodes=[node.id for node in self.nodes], ejectedNodes=[chosen[0].id])
        self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True), msg="Rebalance Failed")

        # Verification
        new_server_list = self.add_remove_servers(self.servers, self.servers[:self.nodes_init], [chosen[0]], [self.servers[self.nodes_init]])
        self._wait_for_stats_all_buckets(new_server_list)
        self._verify_stats_all_buckets(new_server_list, timeout=120)
        self.verify_cluster_stats(new_server_list)
        self.compare_failovers_logs(prev_failover_stats, new_server_list, self.buckets)
        self.sleep(30)
        self.data_analysis_active_replica_all(disk_active_dataset, disk_replica_dataset, new_server_list, self.buckets, path=None)
        self.verify_unacked_bytes_all_buckets()
        nodes = self.get_nodes_in_cluster(self.master)
        self.vb_distribution_analysis(servers=nodes, buckets=self.buckets, std=1.0, total_vbuckets=self.total_vbuckets)
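
# A minimal sketch of the graceful-failover path that rebalance_in_with_failover takes
# above when fail_over=True: graceful failover is started on its own endpoint and then
# runs like a small rebalance, so its completion can be tracked with the same
# rebalanceProgress polling shown earlier. Host, credentials and otpNode are placeholders.
import requests

BASE = "http://10.0.0.1:8091"
AUTH = ("Administrator", "password")
OTP_NODE = "ns_1@10.0.0.2"

requests.post(BASE + "/controller/startGracefulFailover",
              auth=AUTH, data={"otpNode": OTP_NODE}).raise_for_status()
# ...then poll /pools/default/rebalanceProgress until status is "none",
# exactly as monitorRebalance() does for a normal rebalance.
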
Example 9
    def _common_test_body_failed_swap_rebalance(self):
        master = self.servers[0]
        rest = RestConnection(master)
        num_initial_servers = self.num_initial_servers
        creds = self.input.membase_settings
        initial_servers = self.servers[:num_initial_servers]

        # Cluster all starting set of servers
        self.log.info("INITIAL REBALANCE PHASE")
        RebalanceHelper.rebalance_in(initial_servers, len(initial_servers) - 1)

        self.log.info("CREATE BUCKET PHASE")
        SwapRebalanceBase.create_buckets(self)

        self.log.info("DATA LOAD PHASE")
        loaders = SwapRebalanceBase.start_load_phase(self, master)

        # Wait till load phase is over
        SwapRebalanceBase.stop_load(loaders, do_stop=False)
        self.log.info("DONE LOAD PHASE")

        # Start the swap rebalance
        current_nodes = RebalanceHelper.getOtpNodeIds(master)
        self.log.info("current nodes : {0}".format(current_nodes))
        toBeEjectedNodes = RebalanceHelper.pick_nodes(master, howmany=self.num_swap)
        optNodesIds = [node.id for node in toBeEjectedNodes]
        if self.swap_orchestrator:
            status, content = ClusterHelper.find_orchestrator(master)
            self.assertTrue(status, msg="Unable to find orchestrator: {0}:{1}".\
            format(status, content))
            # When swapping all the nodes
            if self.num_swap == len(current_nodes):
                optNodesIds.append(content)
            else:
                optNodesIds[0] = content

        for node in optNodesIds:
            self.log.info("removing node {0} and rebalance afterwards".format(node))

        new_swap_servers = self.servers[num_initial_servers:num_initial_servers+self.num_swap]
        for server in new_swap_servers:
            otpNode = rest.add_node(creds.rest_username, creds.rest_password, server.ip)
            msg = "unable to add node {0} to the cluster"
            self.assertTrue(otpNode, msg.format(server.ip))

        if self.swap_orchestrator:
            rest = RestConnection(new_swap_servers[0])
            master = new_swap_servers[0]

        self.log.info("DATA ACCESS PHASE")
        loaders = SwapRebalanceBase.start_access_phase(self, master)

        self.log.info("SWAP REBALANCE PHASE")
        rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],\
            ejectedNodes=optNodesIds)

        # Rebalance is failed at 20%, 40% and 60% completion
        for i in [1, 2, 3]:
            expected_progress = 20*i
            self.log.info("FAIL SWAP REBALANCE PHASE @ {0}".format(expected_progress))
            reached = RestHelper(rest).rebalance_reached(expected_progress)
            command = "[erlang:exit(element(2, X), kill) || X <- supervisor:which_children(ns_port_sup)]."
            memcached_restarted = rest.diag_eval(command)
            self.assertTrue(memcached_restarted, "unable to restart memcached/moxi process through diag/eval")
            time.sleep(20)

            rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],\
                ejectedNodes=optNodesIds)

        # Stop loaders
        SwapRebalanceBase.stop_load(loaders)

        self.assertTrue(rest.monitorRebalance(),
            msg="rebalance operation failed after adding node {0}".format(toBeEjectedNodes))

        self.log.info("DONE DATA ACCESS PHASE")
        #for bucket in rest.get_buckets():
        #    SwapRebalanceBase.verify_data(new_swap_servers[0], bucket_data[bucket.name].get('inserted_keys'),\
        #        bucket.name, self)
        #    RebalanceHelper.wait_for_persistence(master, bucket.name)

        self.log.info("VERIFICATION PHASE")
        SwapRebalanceBase.items_verification(master, self)
Example 10
    def common_test_body(self, keys_count, failover_reason):
        log = logger.Logger.get_logger()
        log.info("keys_count : {0}".format(keys_count))
        log.info("replicas : {0}".format(self.num_replicas))
        log.info("failover_reason : {0}".format(failover_reason))
        log.info('picking server : {0} as the master'.format(self.master))

        self._load_all_buckets(self.master, self.gen_create, "create", 0,
                               batch_size=10000, pause_secs=5, timeout_secs=180)
        self._wait_for_stats_all_buckets(self.servers)

        _servers_ = self.servers
        rest = RestConnection(self.master)
        nodes = rest.node_statuses()

        RebalanceHelper.wait_for_replication(self.servers, self.cluster)
        chosen = RebalanceHelper.pick_nodes(self.master, howmany=self.num_replicas)
        for node in chosen:
            # let's do op
            if failover_reason == 'stop_server':
                self.stop_server(node)
                log.info("10 seconds delay to wait for membase-server to shutdown")
                # wait for 5 minutes until node is down
                self.assertTrue(RestHelper(rest).wait_for_node_status(node, "unhealthy", 300),
                                    msg="node status is not unhealthy even after waiting for 5 minutes")
            elif failover_reason == "firewall":
                server = [srv for srv in self.servers if node.ip == srv.ip][0]
                RemoteUtilHelper.enable_firewall(server, bidirectional=self.bidirectional)
                status = RestHelper(rest).wait_for_node_status(node, "unhealthy", 300)
                if status:
                    log.info("node {0}:{1} is 'unhealthy' as expected".format(node.ip, node.port))
                else:
                    # verify iptables on the node if something wrong
                    for server in self.servers:
                        if server.ip == node.ip:
                            shell = RemoteMachineShellConnection(server)
                            info = shell.extract_remote_info()
                            if info.type.lower() == "windows":
                                o, r = shell.execute_command("netsh advfirewall show allprofiles")
                            else:
                                o, r = shell.execute_command("/sbin/iptables --list")
                            shell.log_command_output(o, r)
                            shell.disconnect()
                    for i in rest.get_logs(): self.log.error(i)
                    api = rest.baseUrl + 'nodeStatuses'
                    status, content, header = rest._http_request(api)
                    json_parsed = json.loads(content)
                    self.log.info("nodeStatuses: {0}".format(json_parsed))
                    self.fail("node status is not unhealthy even after waiting for 5 minutes")

            failed_over = rest.fail_over(node.id)
            if not failed_over:
                self.log.info("unable to failover the node the first time. try again in  60 seconds..")
                # try again in 75 seconds
                time.sleep(75)
                failed_over = rest.fail_over(node.id)
            self.assertTrue(failed_over, "unable to failover node after {0}".format(failover_reason))
            log.info("failed over node : {0}".format(node.id))
            self._failed_nodes.append(node)

        if self.add_back_flag:
            for node in self._failed_nodes:
                rest.add_back_node(node.id)
                time.sleep(5)
            log.info("10 seconds sleep after failover before invoking rebalance...")
            time.sleep(10)
            rest.rebalance(otpNodes=[node.id for node in nodes],
                               ejectedNodes=[])
            msg = "rebalance failed while removing failover nodes {0}".format(chosen)
            self.assertTrue(rest.monitorRebalance(stop_if_loop=True), msg=msg)
        else:
            # Need a delay > min because MB-7168
            log.info("60 seconds sleep after failover before invoking rebalance...")
            time.sleep(60)
            rest.rebalance(otpNodes=[node.id for node in nodes],
                               ejectedNodes=[node.id for node in chosen])
            if self.during_ops:
                self.sleep(5, "Wait for some progress in rebalance")
                if self.during_ops == "change_password":
                    old_pass = self.master.rest_password
                    self.change_password(new_password=self.input.param("new_password", "new_pass"))
                    rest = RestConnection(self.master)
                elif self.during_ops == "change_port":
                    self.change_port(new_port=self.input.param("new_port", "9090"))
                    rest = RestConnection(self.master)
            try:
                msg = "rebalance failed while removing failover nodes {0}".format(chosen)
                self.assertTrue(rest.monitorRebalance(stop_if_loop=True), msg=msg)
                for failed in chosen:
                    for server in _servers_:
                        if server.ip == failed.ip:
                             _servers_.remove(server)
                             self._cleanup_nodes.append(server)

                log.info("Begin VERIFICATION ...")
                RebalanceHelper.wait_for_replication(_servers_, self.cluster)
                self.verify_cluster_stats(_servers_, self.master)
            finally:
                if self.during_ops:
                     if self.during_ops == "change_password":
                         self.change_password(new_password=old_pass)
                     elif self.during_ops == "change_port":
                         self.change_port(new_port='8091',
                                          current_port=self.input.param("new_port", "9090"))
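
# A minimal sketch of the add-back branch above (self.add_back_flag): rest.add_back_node()
# is assumed to wrap the reAddNode endpoint, which marks a failed-over node to rejoin on
# the next rebalance with its data files intact. Raw REST calls; host, credentials and
# otpNode are placeholders.
import requests

BASE = "http://10.0.0.1:8091"
AUTH = ("Administrator", "password")
OTP_NODE = "ns_1@10.0.0.2"            # a node that was previously failed over

requests.post(BASE + "/controller/reAddNode",
              auth=AUTH, data={"otpNode": OTP_NODE}).raise_for_status()

# rebalance with no ejected nodes to bring the re-added node back into the cluster
known = [n["otpNode"] for n in
         requests.get(BASE + "/pools/default", auth=AUTH).json()["nodes"]]
requests.post(BASE + "/controller/rebalance", auth=AUTH,
              data={"knownNodes": ",".join(known), "ejectedNodes": ""}).raise_for_status()
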
    def _failover_swap_rebalance(self):
        master = self.servers[0]
        rest = RestConnection(master)
        creds = self.input.membase_settings
        num_initial_servers = self.num_initial_servers
        initial_servers = self.servers[:num_initial_servers]

        self.log.info("CREATE BUCKET PHASE")
        SwapRebalanceBase.create_buckets(self)

        # Cluster all starting set of servers
        self.log.info("INITIAL REBALANCE PHASE")
        status, servers_rebalanced = RebalanceHelper.rebalance_in(
            initial_servers,
            len(initial_servers) - 1)
        self.assertTrue(status, msg="Rebalance failed")

        self.log.info("DATA LOAD PHASE")
        self.loaders = SwapRebalanceBase.start_load_phase(self, master)

        # Wait till load phase is over
        SwapRebalanceBase.stop_load(self.loaders, do_stop=False)
        self.log.info("DONE LOAD PHASE")

        # Start the swap rebalance
        self.log.info("current nodes : {0}".format(
            RebalanceHelper.getOtpNodeIds(master)))
        toBeEjectedNodes = RebalanceHelper.pick_nodes(
            master, howmany=self.failover_factor)
        optNodesIds = [node.id for node in toBeEjectedNodes]
        if self.fail_orchestrator:
            status, content = ClusterOperationHelper.find_orchestrator(master)
            self.assertTrue(status, msg="Unable to find orchestrator: {0}:{1}".\
            format(status, content))
            optNodesIds[0] = content

        self.log.info("FAILOVER PHASE")
        # Failover selected nodes
        for node in optNodesIds:
            self.log.info(
                "failover node {0} and rebalance afterwards".format(node))
            rest.fail_over(node)

        new_swap_servers = self.servers[
            num_initial_servers:num_initial_servers + self.failover_factor]
        for server in new_swap_servers:
            otpNode = rest.add_node(creds.rest_username, creds.rest_password,
                                    server.ip, server.port)
            msg = "unable to add node {0} to the cluster"
            self.assertTrue(otpNode, msg.format(server.ip))

        if self.fail_orchestrator:
            rest = RestConnection(new_swap_servers[0])
            master = new_swap_servers[0]

        self.log.info("DATA ACCESS PHASE")
        self.loaders = SwapRebalanceBase.start_access_phase(self, master)

        rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()], \
            ejectedNodes=optNodesIds)

        self.assertTrue(
            rest.monitorRebalance(),
            msg="rebalance operation failed after adding node {0}".format(
                new_swap_servers))

        SwapRebalanceBase.verification_phase(self, master)
Example 12
    def _common_test_body_swap_rebalance(self, do_stop_start=False):
        master = self.servers[0]
        rest = RestConnection(master)
        num_initial_servers = self.num_initial_servers
        creds = self.input.membase_settings
        initial_servers = self.servers[:num_initial_servers]

        self.log.info("CREATE BUCKET PHASE")
        SwapRebalanceBase.create_buckets(self)

        # Cluster all starting set of servers
        self.log.info("INITIAL REBALANCE PHASE")
        status, servers_rebalanced = RebalanceHelper.rebalance_in(
            initial_servers,
            len(initial_servers) - 1)
        self.assertTrue(status, msg="Rebalance failed")

        self.log.info("DATA LOAD PHASE")
        self.loaders = SwapRebalanceBase.start_load_phase(self, master)

        # Wait till load phase is over
        SwapRebalanceBase.stop_load(self.loaders, do_stop=False)
        self.log.info("DONE LOAD PHASE")

        # Start the swap rebalance
        current_nodes = RebalanceHelper.getOtpNodeIds(master)
        self.log.info("current nodes : {0}".format(current_nodes))
        toBeEjectedNodes = RebalanceHelper.pick_nodes(master,
                                                      howmany=self.num_swap)
        optNodesIds = [node.id for node in toBeEjectedNodes]

        if self.swap_orchestrator:
            status, content = ClusterOperationHelper.find_orchestrator(master)
            self.assertTrue(status, msg="Unable to find orchestrator: {0}:{1}".\
                format(status, content))
            if self.num_swap == len(current_nodes):
                optNodesIds.append(content)
            else:
                optNodesIds[0] = content

        for node in optNodesIds:
            self.log.info(
                "removing node {0} and rebalance afterwards".format(node))

        new_swap_servers = self.servers[
            num_initial_servers:num_initial_servers + self.num_swap]
        for server in new_swap_servers:
            otpNode = rest.add_node(creds.rest_username, creds.rest_password,
                                    server.ip, server.port)
            msg = "unable to add node {0} to the cluster"
            self.assertTrue(otpNode, msg.format(server.ip))

        if self.swap_orchestrator:
            rest = RestConnection(new_swap_servers[0])
            master = new_swap_servers[0]

        if self.do_access:
            self.log.info("DATA ACCESS PHASE")
            self.loaders = SwapRebalanceBase.start_access_phase(self, master)

        self.log.info("SWAP REBALANCE PHASE")
        rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                       ejectedNodes=optNodesIds)

        if do_stop_start:
            # Rebalance is stopped at 20%, 40% and 60% completion
            retry = 0
            for expected_progress in (20, 40, 60):
                self.log.info(
                    "STOP/START SWAP REBALANCE PHASE WITH PROGRESS {0}%".
                    format(expected_progress))
                while True:
                    progress = rest._rebalance_progress()
                    if progress < 0:
                        self.log.error(
                            "rebalance progress code : {0}".format(progress))
                        break
                    elif progress == 100:
                        self.log.warn("Rebalance has already reached 100%")
                        break
                    elif progress >= expected_progress:
                        self.log.info(
                            "Rebalance will be stopped with {0}%".format(
                                progress))
                        stopped = rest.stop_rebalance()
                        self.assertTrue(stopped,
                                        msg="unable to stop rebalance")
                        SwapRebalanceBase.sleep(self, 20)
                        rest.rebalance(otpNodes=[
                            node.id for node in rest.node_statuses()
                        ],
                                       ejectedNodes=optNodesIds)
                        break
                    elif retry > 100:
                        break
                    else:
                        retry += 1
                        SwapRebalanceBase.sleep(self, 1)
        self.assertTrue(
            rest.monitorRebalance(),
            msg="rebalance operation failed after adding node {0}".format(
                optNodesIds))
        SwapRebalanceBase.verification_phase(self, master)
Example 13
    def _add_back_failed_node(self, do_node_cleanup=False):
        master = self.servers[0]
        rest = RestConnection(master)
        creds = self.input.membase_settings

        self.log.info("CREATE BUCKET PHASE")
        SwapRebalanceBase.create_buckets(self)

        # Cluster all servers
        self.log.info("INITIAL REBALANCE PHASE")
        status, servers_rebalanced = RebalanceHelper.rebalance_in(self.servers, len(self.servers) - 1)
        self.assertTrue(status, msg="Rebalance was failed")

        self.log.info("DATA LOAD PHASE")
        self.loaders = SwapRebalanceBase.start_load_phase(self, master)

        # Wait till load phase is over
        SwapRebalanceBase.stop_load(self.loaders, do_stop=False)
        self.log.info("DONE LOAD PHASE")

        # Start the swap rebalance
        current_nodes = RebalanceHelper.getOtpNodeIds(master)
        self.log.info("current nodes : {0}".format(current_nodes))
        toBeEjectedNodes = RebalanceHelper.pick_nodes(master, howmany=self.failover_factor)
        optNodesIds = [node.id for node in toBeEjectedNodes]

        # List of servers that will not be failed over
        not_failed_over = []
        for server in self.servers:
            if server.ip not in [node.ip for node in toBeEjectedNodes]:
                not_failed_over.append(server)
                self.log.info("Node %s not failed over" % server.ip)

        if self.fail_orchestrator:
            status, content = ClusterOperationHelper.find_orchestrator(master)
            self.assertTrue(status, msg="Unable to find orchestrator: {0}:{1}".\
                format(status, content))
            # When swapping all the nodes
            if self.num_swap == len(current_nodes):
                optNodesIds.append(content)
            else:
                optNodesIds[0] = content
            master = not_failed_over[-1]

        self.log.info("DATA ACCESS PHASE")
        self.loaders = SwapRebalanceBase.start_access_phase(self, master)

        # Failover selected nodes
        for node in optNodesIds:
            self.log.info("failover node {0} and rebalance afterwards".format(node))
            rest.fail_over(node)

        rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()], \
            ejectedNodes=optNodesIds)

        self.assertTrue(rest.monitorRebalance(),
            msg="rebalance operation failed after adding node {0}".format(optNodesIds))

        # Add back the same failed over nodes

        # Cleanup the node, somehow
        # TODO: cluster_run?
        if do_node_cleanup:
            pass

        # Make rest connection with node part of cluster
        rest = RestConnection(master)

        # Given the optNode, find ip
        add_back_servers = []
        nodes = rest.get_nodes()
        for server in [node.ip for node in nodes]:
            if isinstance(server, unicode):
                add_back_servers.append(server)
        final_add_back_servers = []
        for server in self.servers:
            if server.ip not in add_back_servers:
                final_add_back_servers.append(server)

        for server in final_add_back_servers:
            otpNode = rest.add_node(creds.rest_username, creds.rest_password, server.ip)
            msg = "unable to add node {0} to the cluster"
            self.assertTrue(otpNode, msg.format(server.ip))

        rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()], ejectedNodes=[])

        self.assertTrue(rest.monitorRebalance(),
            msg="rebalance operation failed after adding node {0}".format(add_back_servers))

        SwapRebalanceBase.verification_phase(self, master)
Example 14
    def _common_test_body_swap_rebalance(self, do_stop_start=False):
        master = self.servers[0]
        rest = RestConnection(master)
        num_initial_servers = self.num_initial_servers
        creds = self.input.membase_settings
        initial_servers = self.servers[:num_initial_servers]

        self.log.info("CREATE BUCKET PHASE")
        SwapRebalanceBase.create_buckets(self)

        # Cluster all starting set of servers
        self.log.info("INITIAL REBALANCE PHASE")
        status, servers_rebalanced = RebalanceHelper.rebalance_in(initial_servers, len(initial_servers) - 1)
        self.assertTrue(status, msg="Rebalance failed")

        self.log.info("DATA LOAD PHASE")
        self.loaders = SwapRebalanceBase.start_load_phase(self, master)

        # Wait till load phase is over
        SwapRebalanceBase.stop_load(self.loaders, do_stop=False)
        self.log.info("DONE LOAD PHASE")

        # Start the swap rebalance
        current_nodes = RebalanceHelper.getOtpNodeIds(master)
        self.log.info("current nodes : {0}".format(current_nodes))
        toBeEjectedNodes = RebalanceHelper.pick_nodes(master, howmany=self.num_swap)
        optNodesIds = [node.id for node in toBeEjectedNodes]

        if self.swap_orchestrator:
            status, content = ClusterOperationHelper.find_orchestrator(master)
            self.assertTrue(status, msg="Unable to find orchestrator: {0}:{1}".\
                format(status, content))
            if self.num_swap == len(current_nodes):
                optNodesIds.append(content)
            else:
                optNodesIds[0] = content

        for node in optNodesIds:
            self.log.info("removing node {0} and rebalance afterwards".format(node))

        new_swap_servers = self.servers[num_initial_servers:num_initial_servers + self.num_swap]
        for server in new_swap_servers:
            otpNode = rest.add_node(creds.rest_username, creds.rest_password, server.ip)
            msg = "unable to add node {0} to the cluster"
            self.assertTrue(otpNode, msg.format(server.ip))

        if self.swap_orchestrator:
            rest = RestConnection(new_swap_servers[0])
            master = new_swap_servers[0]

        if self.do_access:
            self.log.info("DATA ACCESS PHASE")
            self.loaders = SwapRebalanceBase.start_access_phase(self, master)

        self.log.info("SWAP REBALANCE PHASE")
        rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
            ejectedNodes=optNodesIds)

        if do_stop_start:
            # Rebalance is stopped at 20%, 40% and 60% completion
            retry = 0
            for expected_progress in (20, 40, 60):
                self.log.info("STOP/START SWAP REBALANCE PHASE WITH PROGRESS {0}%".
                              format(expected_progress))
                while True:
                    progress = rest._rebalance_progress()
                    if progress < 0:
                        self.log.error("rebalance progress code : {0}".format(progress))
                        break
                    elif progress == 100:
                        self.log.warn("Rebalance has already reached 100%")
                        break
                    elif progress >= expected_progress:
                        self.log.info("Rebalance will be stopped with {0}%".format(progress))
                        stopped = rest.stop_rebalance()
                        self.assertTrue(stopped, msg="unable to stop rebalance")
                        SwapRebalanceBase.sleep(self, 20)
                        rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                                       ejectedNodes=optNodesIds)
                        break
                    elif retry > 100:
                        break
                    else:
                        retry += 1
                        SwapRebalanceBase.sleep(self, 1)
        self.assertTrue(rest.monitorRebalance(),
            msg="rebalance operation failed after adding node {0}".format(optNodesIds))
        SwapRebalanceBase.verification_phase(self, master)
Example 15
    def test_failover_continuous_bidirectional_sets_deletes(self):
        cluster_ref_a = "cluster_ref_a"
        master_a = self._input.clusters.get(0)[0]
        rest_conn_a = RestConnection(master_a)

        cluster_ref_b = "cluster_ref_b"
        master_b = self._input.clusters.get(1)[0]
        rest_conn_b = RestConnection(master_b)

        # Rebalance all the nodes together
        servers_a = self._input.clusters.get(0)
        servers_b = self._input.clusters.get(1)
        rebalanced_servers_a = []
        rebalanced_servers_b = []

        RebalanceHelper.rebalance_in(servers_a, len(servers_a)-1)
        RebalanceHelper.rebalance_in(servers_b, len(servers_b)-1)
        rebalanced_servers_a.extend(servers_a)
        rebalanced_servers_b.extend(servers_b)

        # Setup bi-directional continuous replication
        replication_type = "continuous"

        rest_conn_a.add_remote_cluster(master_b.ip, master_b.port,
            master_b.rest_username,
            master_b.rest_password, cluster_ref_b)
        rest_conn_b.add_remote_cluster(master_a.ip, master_a.port,
            master_a.rest_username,
            master_a.rest_password, cluster_ref_a)
        (rep_database_a, rep_id_a) = rest_conn_a.start_replication(
            replication_type, self._buckets[0],
            cluster_ref_b)
        (rep_database_b, rep_id_b) = rest_conn_b.start_replication(
            replication_type, self._buckets[0],
            cluster_ref_a)

        load_thread_list = []
        # Start load
        kvstore = ClientKeyValueStore()

        self._params["ops"] = "set"
        task_def = RebalanceDataGenerator.create_loading_tasks(self._params)
        load_thread = RebalanceDataGenerator.start_load(rest_conn_a,
            self._buckets[0],
            task_def, kvstore)
        load_thread.start()
        load_thread.join()
        RebalanceHelper.wait_for_persistence(master_a, self._buckets[0])

        # Do some deletes
        self._params["ops"] = "delete"
        self._params["count"] = self._num_items/5
        task_def = RebalanceDataGenerator.create_loading_tasks(self._params)
        load_thread = RebalanceDataGenerator.start_load(rest_conn_a,
            self._buckets[0],
            task_def, kvstore)
        load_thread_list.append(load_thread)

        # Start all loads concurrently
        for lt in load_thread_list:
            lt.start()

        # Do the failover of nodes on both clusters
        self.log.info("Failing over nodes")
        self.log.info("current nodes on cluster 1: {0}".format(RebalanceHelper.getOtpNodeIds(master_a)))
        self.log.info("current nodes on cluster 2: {0}".format(RebalanceHelper.getOtpNodeIds(master_b)))

        # Find nodes to be failed_over
        toBeEjectedNodes = RebalanceHelper.pick_nodes(master_a, howmany=self._failover_factor)
        optNodesIds_a = [node.id for node in toBeEjectedNodes]
        if self._fail_orchestrator_a:
            status, content = ClusterOperationHelper.find_orchestrator(master_a)
            self.assertTrue(status, msg="Unable to find orchestrator: {0}:{1}".\
            format(status, content))
            optNodesIds_a[0] = content
            master_a = self._input.clusters.get(0)[-1]
            rest_conn_a = RestConnection(master_a)

        #Failover selected nodes
        for node in optNodesIds_a:
            self.log.info("failover node {0} and rebalance afterwards".format(node))
            rest_conn_a.fail_over(node)

        toBeEjectedNodes = RebalanceHelper.pick_nodes(master_b, howmany=self._failover_factor)
        optNodesIds_b = [node.id for node in toBeEjectedNodes]
        if self._fail_orchestrator_b:
            status, content = ClusterOperationHelper.find_orchestrator(master_b)
            self.assertTrue(status, msg="Unable to find orchestrator: {0}:{1}".\
            format(status, content))
            optNodesIds_b[0] = content
            master_b = self._input.clusters.get(1)[-1]
            rest_conn_b = RestConnection(master_b)

        self._state.append((rest_conn_a, cluster_ref_b, rep_database_a, rep_id_a))
        self._state.append((rest_conn_b, cluster_ref_a, rep_database_b, rep_id_b))

        #Failover selected nodes
        for node in optNodesIds_b:
            self.log.info("failover node {0} and rebalance afterwards".format(node))
            rest_conn_b.fail_over(node)

        rest_conn_a.rebalance(otpNodes=[node.id for node in rest_conn_a.node_statuses()],\
            ejectedNodes=optNodesIds_a)
        rest_conn_b.rebalance(otpNodes=[node.id for node in rest_conn_b.node_statuses()],\
            ejectedNodes=optNodesIds_b)

        self.assertTrue(rest_conn_a.monitorRebalance(),
            msg="rebalance operation failed after adding node on cluster 1")
        self.assertTrue(rest_conn_b.monitorRebalance(),
            msg="rebalance operation failed after adding node on cluster 2")

        # Wait for loading threads to finish
        for lt in load_thread_list:
            lt.join()
        self.log.info("All loading threads finished")

        # Verify replication
        self.assertTrue(XDCRBaseTest.verify_replicated_data(rest_conn_b,
            self._buckets[0],
            kvstore,
            self._poll_sleep,
            self._poll_timeout),
            "Verification of replicated data failed")
        self.assertTrue(XDCRBaseTest.verify_replicated_revs(rest_conn_a,
            rest_conn_b,
            self._buckets[0],
            self._poll_sleep,
            self._poll_timeout),
            "Verification of replicated revisions failed")
Example 16
    def common_test_body(self, keys_count, failover_reason):
        log = logger.Logger.get_logger()
        log.info("keys_count : {0}".format(keys_count))
        log.info("replicas : {0}".format(self.num_replicas))
        log.info("failover_reason : {0}".format(failover_reason))
        log.info('picking server : {0} as the master'.format(self.master))

        self._load_all_buckets(self.master, self.gen_create, "create", 0,
                               batch_size=10000, pause_secs=5, timeout_secs=180)
        self._wait_for_stats_all_buckets(self.servers)

        _servers_ = self.servers
        rest = RestConnection(self.master)
        nodes = rest.node_statuses()

        RebalanceHelper.wait_for_replication(self.servers, self.cluster)
        chosen = RebalanceHelper.pick_nodes(self.master, howmany=self.num_replicas)
        for node in chosen:
            #let's do op
            if failover_reason == 'stop_server':
                self.stop_server(node)
                log.info("10 seconds delay to wait for membase-server to shutdown")
                #wait for 5 minutes until node is down
                self.assertTrue(RestHelper(rest).wait_for_node_status(node, "unhealthy", 300),
                                    msg="node status is not unhealthy even after waiting for 5 minutes")
            elif failover_reason == "firewall":
                RemoteUtilHelper.enable_firewall(self.servers, node, bidirectional=self.bidirectional)
                status = RestHelper(rest).wait_for_node_status(node, "unhealthy", 300)
                if status:
                    log.info("node {0}:{1} is 'unhealthy' as expected".format(node.ip, node.port))
                else:
                    #verify iptables on the node if something wrong
                    for server in self.servers:
                        if server.ip == node.ip:
                            shell = RemoteMachineShellConnection(server)
                            o, r = shell.execute_command("/sbin/iptables --list")
                            shell.log_command_output(o, r)
                            shell.disconnect()
                    for i in rest.get_logs(): self.log.error(i)
                    self.fail("node status is not unhealthy even after waiting for 5 minutes")

            failed_over = rest.fail_over(node.id)
            if not failed_over:
                self.log.info("unable to failover the node the first time. try again in  60 seconds..")
                #try again in 75 seconds
                time.sleep(75)
                failed_over = rest.fail_over(node.id)
            self.assertTrue(failed_over, "unable to failover node after {0}".format(failover_reason))
            log.info("failed over node : {0}".format(node.id))
            self._failed_nodes.append(node)

        if self.add_back_flag:
            for node in self._failed_nodes:
                rest.add_back_node(node.id)
                time.sleep(5)
            log.info("10 seconds sleep after failover before invoking rebalance...")
            time.sleep(10)
            rest.rebalance(otpNodes=[node.id for node in nodes],
                               ejectedNodes=[])
            msg = "rebalance failed while removing failover nodes {0}".format(chosen)
            self.assertTrue(rest.monitorRebalance(stop_if_loop=True), msg=msg)
        else:
            # Need a delay > min because MB-7168
            log.info("30 seconds sleep after failover before invoking rebalance...")
            time.sleep(30)
            rest.rebalance(otpNodes=[node.id for node in nodes],
                               ejectedNodes=[node.id for node in chosen])
            msg = "rebalance failed while removing failover nodes {0}".format(chosen)
            self.assertTrue(rest.monitorRebalance(stop_if_loop=True), msg=msg)
            for failed in chosen:
                for server in _servers_:
                    if server.ip == failed.ip:
                         _servers_.remove(server)
                         self._cleanup_nodes.append(server)

        log.info("Begin VERIFICATION ...")
        RebalanceHelper.wait_for_replication(_servers_, self.cluster)
        self.verify_cluster_stats(_servers_, self.master)
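
# A minimal sketch of the health polling assumed to sit behind
# RestHelper.wait_for_node_status() in the method above: after a node is stopped or
# firewalled, poll the nodeStatuses endpoint until the target node reports "unhealthy".
# Host, credentials and node IP are placeholders.
import time
import requests

BASE = "http://10.0.0.1:8091"
AUTH = ("Administrator", "password")

def wait_for_unhealthy(node_ip, timeout=300, poll_interval=5):
    deadline = time.time() + timeout
    while time.time() < deadline:
        statuses = requests.get(BASE + "/nodeStatuses", auth=AUTH).json()
        for hostname, info in statuses.items():
            if hostname.startswith(node_ip) and info.get("status") == "unhealthy":
                return True
        time.sleep(poll_interval)
    return False
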
Example 17
    def _add_back_failed_node(self, do_node_cleanup=False):
        master = self.servers[0]
        rest = RestConnection(master)
        creds = self.input.membase_settings

        self.log.info("CREATE BUCKET PHASE")
        SwapRebalanceBase.create_buckets(self)

        # Cluster all servers
        self.log.info("INITIAL REBALANCE PHASE")
        status, servers_rebalanced = RebalanceHelper.rebalance_in(
            self.servers,
            len(self.servers) - 1)
        self.assertTrue(status, msg="Rebalance was failed")

        self.log.info("DATA LOAD PHASE")
        self.loaders = SwapRebalanceBase.start_load_phase(self, master)

        # Wait till load phase is over
        SwapRebalanceBase.stop_load(self.loaders, do_stop=False)
        self.log.info("DONE LOAD PHASE")

        # Start the swap rebalance
        current_nodes = RebalanceHelper.getOtpNodeIds(master)
        self.log.info("current nodes : {0}".format(current_nodes))
        toBeEjectedNodes = RebalanceHelper.pick_nodes(
            master, howmany=self.failover_factor)
        optNodesIds = [node.id for node in toBeEjectedNodes]

        # List of servers that will not be failed over
        not_failed_over = []
        for server in self.servers:
            if self.cluster_run:
                if server.port not in [node.port for node in toBeEjectedNodes]:
                    not_failed_over.append(server)
                    self.log.info("Node {0}:{1} not failed over".format(
                        server.ip, server.port))
            else:
                if server.ip not in [node.ip for node in toBeEjectedNodes]:
                    not_failed_over.append(server)
                    self.log.info("Node {0}:{1} not failed over".format(
                        server.ip, server.port))

        if self.fail_orchestrator:
            status, content = ClusterOperationHelper.find_orchestrator(master)
            self.assertTrue(status, msg="Unable to find orchestrator: {0}:{1}".\
                format(status, content))
            # When swapping all the nodes
            if self.num_swap == len(current_nodes):
                optNodesIds.append(content)
            else:
                optNodesIds[0] = content
            master = not_failed_over[-1]

        self.log.info("DATA ACCESS PHASE")
        self.loaders = SwapRebalanceBase.start_access_phase(self, master)

        # Failover selected nodes
        for node in optNodesIds:
            self.log.info(
                "failover node {0} and rebalance afterwards".format(node))
            rest.fail_over(node)

        rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()], \
            ejectedNodes=optNodesIds)

        self.assertTrue(
            rest.monitorRebalance(),
            msg="rebalance operation failed after adding node {0}".format(
                optNodesIds))

        # Add back the same failed over nodes

        # Cleanup the node, somehow
        # TODO: cluster_run?
        if do_node_cleanup:
            pass

        # Make rest connection with node part of cluster
        rest = RestConnection(master)

        # Work out which servers are currently in the cluster so the missing ones can be added back
        add_back_servers = []
        nodes = rest.get_nodes()
        for server in nodes:
            if isinstance(server.ip, unicode):
                add_back_servers.append(server)
        final_add_back_servers = []
        for server in self.servers:
            if self.cluster_run:
                if server.port not in [serv.port for serv in add_back_servers]:
                    final_add_back_servers.append(server)
            else:
                if server.ip not in [serv.ip for serv in add_back_servers]:
                    final_add_back_servers.append(server)
        for server in final_add_back_servers:
            otpNode = rest.add_node(creds.rest_username, creds.rest_password,
                                    server.ip, server.port)
            msg = "unable to add node {0} to the cluster"
            self.assertTrue(otpNode, msg.format(server.ip))

        rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                       ejectedNodes=[])

        self.assertTrue(
            rest.monitorRebalance(),
            msg="rebalance operation failed after adding node {0}".format(
                add_back_servers))

        SwapRebalanceBase.verification_phase(self, master)
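
When the failed-over nodes are still known to the cluster by their otp ids, the add-back can also be done directly instead of re-adding servers by ip/port. A rough sketch only, reusing the rest and optNodesIds names from this example:

# Sketch: add failed-over nodes back by otp id, then rebalance with no ejections.
for otp_id in optNodesIds:
    rest.add_back_node(otp_id)
rest.rebalance(otpNodes=[n.id for n in rest.node_statuses()], ejectedNodes=[])
assert rest.monitorRebalance(), "rebalance after add-back failed"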
Example 18
    def _common_test_body_failed_swap_rebalance(self):
        master = self.servers[0]
        rest = RestConnection(master)
        num_initial_servers = self.num_initial_servers
        creds = self.input.membase_settings
        initial_servers = self.servers[:num_initial_servers]

        self.log.info("CREATE BUCKET PHASE")
        SwapRebalanceBase.create_buckets(self)

        # Cluster all starting set of servers
        self.log.info("INITIAL REBALANCE PHASE")
        status, servers_rebalanced = RebalanceHelper.rebalance_in(
            initial_servers,
            len(initial_servers) - 1)
        self.assertTrue(status, msg="Rebalance failed")

        self.log.info("DATA LOAD PHASE")
        self.loaders = SwapRebalanceBase.start_load_phase(self, master)

        # Wait till load phase is over
        SwapRebalanceBase.stop_load(self.loaders, do_stop=False)
        self.log.info("DONE LOAD PHASE")

        # Start the swap rebalance
        current_nodes = RebalanceHelper.getOtpNodeIds(master)
        self.log.info("current nodes : {0}".format(current_nodes))
        toBeEjectedNodes = RebalanceHelper.pick_nodes(master,
                                                      howmany=self.num_swap)
        optNodesIds = [node.id for node in toBeEjectedNodes]
        if self.swap_orchestrator:
            status, content = ClusterOperationHelper.find_orchestrator(master)
            self.assertTrue(status, msg="Unable to find orchestrator: {0}:{1}".\
            format(status, content))
            # When swapping all the nodes
            if self.num_swap == len(current_nodes):
                optNodesIds.append(content)
            else:
                optNodesIds[0] = content

        for node in optNodesIds:
            self.log.info(
                "removing node {0} and rebalance afterwards".format(node))

        new_swap_servers = self.servers[
            num_initial_servers:num_initial_servers + self.num_swap]
        for server in new_swap_servers:
            otpNode = rest.add_node(creds.rest_username, creds.rest_password,
                                    server.ip, server.port)
            msg = "unable to add node {0} to the cluster"
            self.assertTrue(otpNode, msg.format(server.ip))

        if self.swap_orchestrator:
            rest = RestConnection(new_swap_servers[0])
            master = new_swap_servers[0]

        self.log.info("DATA ACCESS PHASE")
        self.loaders = SwapRebalanceBase.start_access_phase(self, master)

        self.log.info("SWAP REBALANCE PHASE")
        rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                       ejectedNodes=optNodesIds)
        SwapRebalanceBase.sleep(self, 10, "Rebalance should start")
        self.log.info("FAIL SWAP REBALANCE PHASE @ {0}".format(
            self.percentage_progress))
        reached = RestHelper(rest).rebalance_reached(self.percentage_progress)
        if reached and RestHelper(rest).is_cluster_rebalanced():
            # handle situation when rebalance failed at the beginning
            self.log.error('seems rebalance failed!')
            rest.print_UI_logs()
            self.fail("rebalance failed even before killing memcached")
        bucket = rest.get_buckets()[0].name
        pid = None
        if self.swap_orchestrator and not self.cluster_run:
            # get PID via remote connection if master is a new node
            shell = RemoteMachineShellConnection(master)
            o, _ = shell.execute_command(
                "ps -eo comm,pid | awk '$1 == \"memcached\" { print $2 }'")
            pid = o[0]
            shell.disconnect()
        else:
            times = 2
            if self.cluster_run:
                times = 20
            for i in xrange(times):
                try:
                    _mc = MemcachedClientHelper.direct_client(master, bucket)
                    pid = _mc.stats()["pid"]
                    break
                except EOFError as e:
                    self.log.error("{0}.Retry in 2 sec".format(e))
                    SwapRebalanceBase.sleep(self, 2)
        if pid is None:
            self.fail("impossible to get a PID")
        command = "os:cmd(\"kill -9 {0} \")".format(pid)
        self.log.info(command)
        killed = rest.diag_eval(command)
        self.log.info("killed {0}:{1}??  {2} ".format(master.ip, master.port,
                                                      killed))
        self.log.info("sleep for 10 sec after kill memcached")
        SwapRebalanceBase.sleep(self, 10)
        # we can't get stats for the new node when the rebalance fails
        if not self.swap_orchestrator:
            ClusterOperationHelper._wait_warmup_completed(self, [master],
                                                          bucket,
                                                          wait_time=600)
        i = 0
        # we expect the rebalance to fail
        try:
            rest.monitorRebalance()
        except RebalanceFailedException:
            # retry rebalance if it failed
            self.log.warn("Rebalance failed but it's expected")
            SwapRebalanceBase.sleep(self, 30)
            self.assertFalse(RestHelper(rest).is_cluster_rebalanced(),
                             msg="cluster need rebalance")
            knownNodes = rest.node_statuses()
            self.log.info("nodes are still in cluster: {0}".format([
                (node.ip, node.port) for node in knownNodes
            ]))
            ejectedNodes = list(
                set(optNodesIds) & set([node.id for node in knownNodes]))
            rest.rebalance(otpNodes=[node.id for node in knownNodes],
                           ejectedNodes=ejectedNodes)
            self.assertTrue(
                rest.monitorRebalance(),
                msg="rebalance operation failed after adding node {0}".format(
                    toBeEjectedNodes))
        else:
            self.log.info("rebalance completed successfully")
        SwapRebalanceBase.verification_phase(self, master)
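
The failure-injection step in this example boils down to finding the memcached pid and killing it through diag/eval. A condensed sketch, assuming rest, master and bucket as in the code above:

# Sketch: obtain the memcached pid via the stats interface, then kill it
# through ns_server's diag/eval endpoint to force the rebalance to fail.
_mc = MemcachedClientHelper.direct_client(master, bucket)
pid = _mc.stats()["pid"]
rest.diag_eval("os:cmd(\"kill -9 {0} \")".format(pid))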
Example 19
 def rebalance_out_with_failover(self):
     fail_over = self.input.param("fail_over", False)
     self.rest = RestConnection(self.master)
     gen_delete = BlobGenerator('mike',
                                'mike-',
                                self.value_size,
                                start=self.num_items / 2,
                                end=self.num_items)
     gen_create = BlobGenerator('mike',
                                'mike-',
                                self.value_size,
                                start=self.num_items + 1,
                                end=self.num_items * 3 / 2)
     # define which doc's ops will be performed during rebalancing
     # allows multiple of them but one by one
     tasks = []
     if (self.doc_ops is not None):
         if ("update" in self.doc_ops):
             tasks += self._async_load_all_buckets(self.master,
                                                   self.gen_update,
                                                   "update", 0)
         if ("create" in self.doc_ops):
             tasks += self._async_load_all_buckets(self.master, gen_create,
                                                   "create", 0)
         if ("delete" in self.doc_ops):
             tasks += self._async_load_all_buckets(self.master, gen_delete,
                                                   "delete", 0)
         for task in tasks:
             task.result()
     ejectedNode = self.find_node_info(self.master,
                                       self.servers[self.nodes_init - 1])
     self._verify_stats_all_buckets(self.servers[:self.num_servers],
                                    timeout=120)
     self._wait_for_stats_all_buckets(self.servers[:self.num_servers])
     self.sleep(20)
     prev_failover_stats = self.get_failovers_logs(
         self.servers[:self.nodes_init], self.buckets)
     prev_vbucket_stats = self.get_vbucket_seqnos(
         self.servers[:self.nodes_init], self.buckets)
     record_data_set = self.get_data_set_all(self.servers[:self.nodes_init],
                                             self.buckets)
     self.compare_vbucketseq_failoverlogs(prev_vbucket_stats,
                                          prev_failover_stats)
     self.rest = RestConnection(self.master)
     chosen = RebalanceHelper.pick_nodes(self.master, howmany=1)
     new_server_list = self.add_remove_servers(
         self.servers, self.servers[:self.nodes_init],
         [self.servers[self.nodes_init - 1], chosen[0]], [])
     # Mark Node for failover
     success_failed_over = self.rest.fail_over(chosen[0].id,
                                               graceful=fail_over)
     self.nodes = self.rest.node_statuses()
     self.rest.rebalance(otpNodes=[node.id for node in self.nodes],
                         ejectedNodes=[chosen[0].id, ejectedNode.id])
     self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True),
                     msg="Rebalance failed")
     self.verify_cluster_stats(new_server_list,
                               check_ep_items_remaining=True)
     self.sleep(30)
     self.data_analysis_all(record_data_set, new_server_list, self.buckets)
     self.verify_unacked_bytes_all_buckets()
     nodes = self.get_nodes_in_cluster(self.master)
     self.vb_distribution_analysis(servers=nodes,
                                   buckets=self.buckets,
                                   std=1.0,
                                   total_vbuckets=self.total_vbuckets)
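
The core of this test is that a failed-over node and a regular outgoing node can be ejected in the same rebalance. A sketch of just that sequence, with self.rest, chosen, ejectedNode and fail_over assumed as set up above:

# Sketch: fail over one node (optionally gracefully) and eject it together
# with another node in a single rebalance call.
self.rest.fail_over(chosen[0].id, graceful=fail_over)
nodes = self.rest.node_statuses()
self.rest.rebalance(otpNodes=[n.id for n in nodes],
                    ejectedNodes=[chosen[0].id, ejectedNode.id])
assert self.rest.monitorRebalance(stop_if_loop=True), "rebalance failed"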
Example 20
    def common_test_body(self, failover_reason):
        """
            Main Test body which contains the flow of the failover basic steps
            1. Starts Operations if programmed into the test case (before/after)
            2. Start View and Index Building operations
            3. Failover K out of N nodes (failover can be HARDFAILOVER/GRACEFUL)
            4.1 Rebalance the cluster after failover of K nodes
            4.2 Run Add-Back operation with recoveryType = (full/delta) with rebalance
            5. Verify all expected operations completed by checking stats, replication, views, data correctness
        """
        # Pick the reference node for communication
        # We pick a node in the cluster which will NOT be failed over
        self.filter_list = []
        if self.failoverMaster:
            self.master = self.servers[1]
        self.log.info(" Picking node {0} as reference node for test case".format(self.master.ip))
        self.print_test_params(failover_reason)
        self.rest = RestConnection(self.master)
        self.nodes = self.rest.node_statuses()
        # Set the data path for the cluster
        self.data_path = self.rest.get_data_path()

        # Check whether every node in the cluster runs 3.0.0 or later
        versions = self.rest.get_nodes_versions()
        self.version_greater_than_2_5 = True
        for version in versions:
            if "3" > version:
                self.version_greater_than_2_5 = False

        # Do not run this test if the graceful category is used against a pre-3.0 cluster
        if not self.version_greater_than_2_5 and (self.graceful or (self.recoveryType is not None)):
            self.log.error("Graceful failover can't be applied to nodes with version less than 3.*")
            self.log.error("Please check configuration parameters: SKIPPING TEST.")
            return

        # Find nodes that will undergo failover
        if self.failoverMaster:
            self.chosen = RebalanceHelper.pick_nodes(self.master, howmany=1, target_node = self.servers[0])
        else:
            self.chosen = RebalanceHelper.pick_nodes(self.master, howmany=self.num_failed_nodes)

        # Perform operations - Create/Update/Delete
        # self.withMutationOps = True => Run Operations in parallel to failover
        # self.withMutationOps = False => Run Operations Before failover
        self.load_initial_data()
        if not self.withMutationOps:
            self.run_mutation_operations()
        # Perform View Creation Tasks and check for completion if required before failover
        if self.withViewsOps:
            self.run_view_creation_operations(self.servers)
            if not self.createIndexesDuringFailover:
                self.query_and_monitor_view_tasks(self.servers)

        # Take a snapshot of the data set used for validation
        record_static_data_set = {}
        prev_vbucket_stats = {}
        prev_failover_stats = {}
        if not self.withMutationOps:
            record_static_data_set = self.get_data_set_all(self.servers, self.buckets, path = None)

        # Capture vbucket and failover stats if test version >= 2.5.*
        if self.version_greater_than_2_5 and self.upr_check:
            prev_vbucket_stats = self.get_vbucket_seqnos(self.servers, self.buckets)
            prev_failover_stats = self.get_failovers_logs(self.servers, self.buckets)

        # Perform operations related to failover
        if self.withMutationOps or self.withViewsOps or self.compact:
            self.run_failover_operations_with_ops(self.chosen, failover_reason)
        else:
            self.run_failover_operations(self.chosen, failover_reason)

        # Perform Add-Back operation with rebalance, or only rebalance, with verifications
        if not self.gracefulFailoverFail and self.runRebalanceAfterFailover:
            if self.add_back_flag:
                self.run_add_back_operation_and_verify(self.chosen, prev_vbucket_stats, record_static_data_set, prev_failover_stats)
            else:
                self.run_rebalance_after_failover_and_verify(self.chosen, prev_vbucket_stats, record_static_data_set, prev_failover_stats)
        else:
            return

        if self.during_ops is None:
            self.verify_unacked_bytes_all_buckets(filter_list = self.filter_list, master_node = self.master)
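
The add-back path exercised by this test follows a fixed REST pattern. Below is a hedged sketch of that pattern; names such as rest, chosen and recovery_type follow the conventions of the tests above and are assumptions here:

# Sketch: graceful failover, mark the node for delta or full recovery,
# then rebalance with no ejections to add it back.
if rest.fail_over(chosen[0].id, graceful=True):
    rest.set_recovery_type(otpNode=chosen[0].id, recoveryType=recovery_type)
rest.rebalance(otpNodes=[n.id for n in rest.node_statuses()], ejectedNodes=[])
assert rest.monitorRebalance(stop_if_loop=True), "add-back rebalance failed"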
Example 21
    def common_test_body(self, keys_count, replica, load_ratio, failover_reason):
        log = logger.Logger.get_logger()
        log.info("keys_count : {0}".format(keys_count))
        log.info("replica : {0}".format(replica))
        log.info("load_ratio : {0}".format(load_ratio))
        log.info("failover_reason : {0}".format(failover_reason))
        master = self._servers[0]
        log.info('picking server : {0} as the master'.format(master))
        rest = RestConnection(master)
        info = rest.get_nodes_self()
        rest.init_cluster(username=master.rest_username,
                          password=master.rest_password)
        rest.init_cluster_memoryQuota(memoryQuota=info.mcdMemoryReserved)
        bucket_ram = info.memoryQuota * 2 / 3
        bucket = 'default'
        rest.create_bucket(bucket=bucket,
                           ramQuotaMB=bucket_ram,
                           replicaNumber=replica,
                           proxyPort=info.moxi)
        ready = BucketOperationHelper.wait_for_memcached(master, bucket)
        self.assertTrue(ready, "wait_for_memcached_failed")
        credentials = self._input.membase_settings

        ClusterOperationHelper.add_all_nodes_or_assert(master, self._servers, credentials, self)
        nodes = rest.node_statuses()
        rest.rebalance(otpNodes=[node.id for node in nodes], ejectedNodes=[])
        msg = "rebalance failed after adding these nodes {0}".format(nodes)
        self.assertTrue(rest.monitorRebalance(), msg=msg)

        inserted_keys = FailoverBaseTest.load_data(master, bucket, keys_count, load_ratio)
        inserted_count = len(inserted_keys)
        log.info('inserted {0} keys'.format(inserted_count))

        nodes = rest.node_statuses()
        while (len(nodes) - replica) > 1:
            final_replication_state = RestHelper(rest).wait_for_replication(900)
            msg = "replication state after waiting for up to 15 minutes : {0}"
            self.log.info(msg.format(final_replication_state))
            chosen = RebalanceHelper.pick_nodes(master, howmany=replica)
            for node in chosen:
                #let's do op
                if failover_reason == 'stop_server':
                    self.stop_server(node)
                    log.info("10 seconds delay to wait for membase-server to shutdown")
                    #wait for 5 minutes until node is down
                    self.assertTrue(RestHelper(rest).wait_for_node_status(node, "unhealthy", 300),
                                    msg="node status is not unhealthy even after waiting for 5 minutes")
                elif failover_reason == "firewall":
                    RemoteUtilHelper.enable_firewall(self._servers, node, bidirectional=self.bidirectional)
                    self.assertTrue(RestHelper(rest).wait_for_node_status(node, "unhealthy", 300),
                                    msg="node status is not unhealthy even after waiting for 5 minutes")

                failed_over = rest.fail_over(node.id)
                if not failed_over:
                    self.log.info("unable to failover the node the first time. try again in  60 seconds..")
                    #try again in 60 seconds
                    time.sleep(75)
                    failed_over = rest.fail_over(node.id)
                self.assertTrue(failed_over, "unable to failover node after {0}".format(failover_reason))
                log.info("failed over node : {0}".format(node.id))
            #REMOVEME -
            log.info("10 seconds sleep after failover before invoking rebalance...")
            time.sleep(10)
            rest.rebalance(otpNodes=[node.id for node in nodes],
                           ejectedNodes=[node.id for node in chosen])
            msg = "rebalance failed while removing failover nodes {0}".format(chosen)
            self.assertTrue(rest.monitorRebalance(), msg=msg)
            FailoverBaseTest.replication_verification(master, bucket, replica, inserted_count, self)

            nodes = rest.node_statuses()
        FailoverBaseTest.verify_data(master, inserted_keys, bucket, self)
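
The firewall branch of this test can be summarised in three steps. An illustrative sketch only; servers, node and rest stand in for the corresponding objects in the test:

# Sketch: cut the node off with iptables, wait for ns_server to mark it
# unhealthy, then fail it over.
RemoteUtilHelper.enable_firewall(servers, node, bidirectional=False)
assert RestHelper(rest).wait_for_node_status(node, "unhealthy", 300), \
    "node was not marked unhealthy within 5 minutes"
assert rest.fail_over(node.id), "failover did not succeed"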
Example 22
    def _common_test_body_failed_swap_rebalance(self):
        master = self.servers[0]
        rest = RestConnection(master)
        num_initial_servers = self.num_initial_servers
        creds = self.input.membase_settings
        initial_servers = self.servers[:num_initial_servers]

        self.log.info("CREATE BUCKET PHASE")
        SwapRebalanceBase.create_buckets(self)

        # Cluster all starting set of servers
        self.log.info("INITIAL REBALANCE PHASE")
        status, servers_rebalanced = RebalanceHelper.rebalance_in(initial_servers, len(initial_servers) - 1)
        self.assertTrue(status, msg="Rebalance failed")

        self.log.info("DATA LOAD PHASE")
        self.loaders = SwapRebalanceBase.start_load_phase(self, master)

        # Wait till load phase is over
        SwapRebalanceBase.stop_load(self.loaders, do_stop=False)
        self.log.info("DONE LOAD PHASE")

        # Start the swap rebalance
        current_nodes = RebalanceHelper.getOtpNodeIds(master)
        self.log.info("current nodes : {0}".format(current_nodes))
        toBeEjectedNodes = RebalanceHelper.pick_nodes(master, howmany=self.num_swap)
        optNodesIds = [node.id for node in toBeEjectedNodes]
        if self.swap_orchestrator:
            status, content = ClusterOperationHelper.find_orchestrator(master)
            self.assertTrue(status, msg="Unable to find orchestrator: {0}:{1}".\
            format(status, content))
            # When swapping all the nodes
            if self.num_swap == len(current_nodes):
                optNodesIds.append(content)
            else:
                optNodesIds[0] = content

        for node in optNodesIds:
            self.log.info("removing node {0} and rebalance afterwards".format(node))

        new_swap_servers = self.servers[num_initial_servers:num_initial_servers + self.num_swap]
        for server in new_swap_servers:
            otpNode = rest.add_node(creds.rest_username, creds.rest_password, server.ip)
            msg = "unable to add node {0} to the cluster"
            self.assertTrue(otpNode, msg.format(server.ip))

        if self.swap_orchestrator:
            rest = RestConnection(new_swap_servers[0])
            master = new_swap_servers[0]

        self.log.info("DATA ACCESS PHASE")
        self.loaders = SwapRebalanceBase.start_access_phase(self, master)

        self.log.info("SWAP REBALANCE PHASE")
        rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
            ejectedNodes=optNodesIds)
        SwapRebalanceBase.sleep(self, 10, "Rebalance should start")
        self.log.info("FAIL SWAP REBALANCE PHASE @ {0}".format(self.percentage_progress))
        reached = RestHelper(rest).rebalance_reached(self.percentage_progress)
        if reached and RestHelper(rest).is_cluster_rebalanced():
            # handle situation when rebalance failed at the beginning
            self.log.error('seems rebalance failed!')
            self.log.info("Latest logs from UI:")
            for i in rest.get_logs(): self.log.error(i)
            self.fail("rebalance failed even before killing memcached")
        bucket = rest.get_buckets()[0].name
        pid = None
        if self.swap_orchestrator:
            # get PID via remote connection if master is a new node
            shell = RemoteMachineShellConnection(master)
            o, _ = shell.execute_command("ps -eo comm,pid | awk '$1 == \"memcached\" { print $2 }'")
            pid = o[0]
            shell.disconnect()
        else:
            for i in xrange(2):
                try:
                    _mc = MemcachedClientHelper.direct_client(master, bucket)
                    pid = _mc.stats()["pid"]
                    break
                except EOFError as e:
                    self.log.error("{0}.Retry in 2 sec".format(e))
                    SwapRebalanceBase.sleep(self, 1)
        if pid is None:
            self.fail("impossible to get a PID")
        command = "os:cmd(\"kill -9 {0} \")".format(pid)
        self.log.info(command)
        killed = rest.diag_eval(command)
        self.log.info("killed {0}:{1}??  {2} ".format(master.ip, master.port, killed))
        self.log.info("sleep for 10 sec after kill memcached")
        SwapRebalanceBase.sleep(self, 10)
        # we can't get stats for the new node when the rebalance fails
        if not self.swap_orchestrator:
            ClusterOperationHelper._wait_warmup_completed(self, [master], bucket, wait_time=600)
        i = 0
        # we expect the rebalance to fail
        try:
            rest.monitorRebalance()
        except RebalanceFailedException:
            # retry rebalance if it failed
            self.log.warn("Rebalance failed but it's expected")
            SwapRebalanceBase.sleep(self, 30)
            self.assertFalse(RestHelper(rest).is_cluster_rebalanced(), msg="cluster need rebalance")
            knownNodes = rest.node_statuses()
            self.log.info("nodes are still in cluster: {0}".format([(node.ip, node.port) for node in knownNodes]))
            ejectedNodes = list(set(optNodesIds) & set([node.id for node in knownNodes]))
            rest.rebalance(otpNodes=[node.id for node in knownNodes], ejectedNodes=ejectedNodes)
            self.assertTrue(rest.monitorRebalance(),
                            msg="rebalance operation failed after adding node {0}".format(toBeEjectedNodes))
        else:
            self.log.info("rebalance completed successfully")
        SwapRebalanceBase.verification_phase(self, master)
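
The recovery logic at the end of this example is the generic pattern of retrying a failed rebalance once. A condensed sketch, assuming rest and optNodesIds as above:

# Sketch: if the first rebalance fails as expected, re-issue it against the
# nodes that are still in the cluster and require the retry to succeed.
try:
    rest.monitorRebalance()
except RebalanceFailedException:
    known = rest.node_statuses()
    still_present = list(set(optNodesIds) & set(n.id for n in known))
    rest.rebalance(otpNodes=[n.id for n in known], ejectedNodes=still_present)
    assert rest.monitorRebalance(), "retried rebalance failed"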
Example 23
    def test_rebalance_in_out_with_failover(self):
        """
        Rebalances nodes out and in with failover
        Use different nodes_in and nodes_out params to have uneven add and deletion. Use 'zone'
        param to have nodes divided into server groups by having zone > 1.

        This test begins by loading a given number of items into the cluster. It then
        removes one node, rebalances that node out of the cluster, and then rebalances it back
        in. During the rebalancing we update all of the items in the cluster. Once the
        node has been removed and added back we wait for the disk queues to drain, and
        then verify that there has been no data loss, i.e. sum(curr_items) matches curr_items_total.
        We then remove and add back two nodes at a time and so on until we have reached the point
        where we are adding back and removing at least half of the nodes.
        """
        fail_over = self.input.param("fail_over", False)
        gen = BlobGenerator('mike',
                            'mike-',
                            self.value_size,
                            end=self.num_items)
        self._load_all_buckets(self.master, gen, "create", 0)
        tasks = self._async_load_all_buckets(self.master, gen, "update", 0)
        servs_in = self.servers[self.nodes_init:self.nodes_init +
                                self.nodes_in]
        servs_out = self.servers[self.nodes_init -
                                 self.nodes_out:self.nodes_init]
        for task in tasks:
            task.result(self.wait_timeout * 20)
        if self.flusher_total_batch_limit is not None:
            # Validate seq_no snap_start/stop values after initial doc_load
            self.check_snap_start_corruption()

        self._verify_stats_all_buckets(self.servers[:self.nodes_init],
                                       timeout=120)
        self._wait_for_stats_all_buckets(self.servers[:self.nodes_init])
        self.sleep(20)
        prev_vbucket_stats = self.get_vbucket_seqnos(
            self.servers[:self.nodes_init], self.buckets)
        prev_failover_stats = self.get_failovers_logs(
            self.servers[:self.nodes_init], self.buckets)
        disk_replica_dataset, disk_active_dataset = self.get_and_compare_active_replica_data_set_all(
            self.servers[:self.nodes_init], self.buckets, path=None)
        if self.flusher_total_batch_limit is None:
            self.compare_vbucketseq_failoverlogs(prev_vbucket_stats,
                                                 prev_failover_stats)
        self.rest = RestConnection(self.master)
        chosen = RebalanceHelper.pick_nodes(self.master, howmany=1)
        result_nodes = list(
            set(self.servers[:self.nodes_init] + servs_in) - set(servs_out))
        for node in servs_in:
            self.rest.add_node(self.master.rest_username,
                               self.master.rest_password, node.ip, node.port)
        if self.flusher_total_batch_limit is not None:
            # Load data after add-node
            self._load_all_buckets(self.master, gen, "update", 0)
            # Validate seq_no snap_start/stop values
            self.check_snap_start_corruption()

        # Mark Node for failover
        self.rest.fail_over(chosen[0].id, graceful=fail_over)
        self.wait_for_failover_or_assert(expected_failover_count=1)
        # Load data after failover
        self._load_all_buckets(self.master, gen, "update", 0)
        # Validate seq_no snap_start/stop values
        self.check_snap_start_corruption()

        # No need to pass self.sleep_before_rebalance,
        # since prev ops are synchronous call
        self.shuffle_nodes_between_zones_and_rebalance(servs_out)
        if self.flusher_total_batch_limit is not None:
            # Validate seq_no snap_start/stop values after rebalance
            self.check_snap_start_corruption()

        self.verify_cluster_stats(result_nodes, check_ep_items_remaining=True)
        if self.flusher_total_batch_limit is None:
            self.compare_failovers_logs(prev_failover_stats, result_nodes,
                                        self.buckets)
            self.sleep(30)
        self.data_analysis_active_replica_all(disk_active_dataset,
                                              disk_replica_dataset,
                                              result_nodes,
                                              self.buckets,
                                              path=None)
        self.verify_unacked_bytes_all_buckets()
        nodes = self.get_nodes_in_cluster(self.master)
        self.vb_distribution_analysis(servers=nodes,
                                      buckets=self.buckets,
                                      std=1.0,
                                      total_vbuckets=self.total_vbuckets)
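
The ordering that this test depends on, distilled into a sketch (reusing self.rest, self.master, servs_in and chosen as defined above):

# Sketch: add the incoming nodes first, then hard-fail one existing node and
# wait until the cluster reports exactly one failed-over node before rebalancing.
for node in servs_in:
    self.rest.add_node(self.master.rest_username, self.master.rest_password,
                       node.ip, node.port)
self.rest.fail_over(chosen[0].id, graceful=False)
self.wait_for_failover_or_assert(expected_failover_count=1)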
Example 24
 def rebalance_out_with_failover_full_addback_recovery(self):
     gen_delete = BlobGenerator('mike',
                                'mike-',
                                self.value_size,
                                start=self.num_items / 2,
                                end=self.num_items)
     gen_create = BlobGenerator('mike',
                                'mike-',
                                self.value_size,
                                start=self.num_items + 1,
                                end=self.num_items * 3 / 2)
     # define which doc's ops will be performed during rebalancing
     # allows multiple of them but one by one
     tasks = []
     if (self.doc_ops is not None):
         if ("update" in self.doc_ops):
             tasks += self._async_load_all_buckets(self.master,
                                                   self.gen_update,
                                                   "update", 0)
         if ("create" in self.doc_ops):
             tasks += self._async_load_all_buckets(self.master, gen_create,
                                                   "create", 0)
         if ("delete" in self.doc_ops):
             tasks += self._async_load_all_buckets(self.master, gen_delete,
                                                   "delete", 0)
         for task in tasks:
             task.result()
     servs_out = [
         self.servers[self.num_servers - i - 1]
         for i in range(self.nodes_out)
     ]
     self._verify_stats_all_buckets(self.servers[:self.num_servers],
                                    timeout=120)
     self._wait_for_stats_all_buckets(self.servers[:self.num_servers])
     self.rest = RestConnection(self.master)
     chosen = RebalanceHelper.pick_nodes(self.master, howmany=1)
     self.sleep(20)
     prev_failover_stats = self.get_failovers_logs(
         self.servers[:self.num_servers], self.buckets)
     prev_vbucket_stats = self.get_vbucket_seqnos(
         self.servers[:self.num_servers], self.buckets)
     record_data_set = self.get_data_set_all(
         self.servers[:self.num_servers], self.buckets)
     self.compare_vbucketseq_failoverlogs(prev_vbucket_stats,
                                          prev_failover_stats)
     # Mark Node for failover
     success_failed_over = self.rest.fail_over(chosen[0].id, graceful=False)
     # Mark Node for full recovery
     if success_failed_over:
         self.rest.set_recovery_type(otpNode=chosen[0].id,
                                     recoveryType="full")
     rebalance = self.cluster.async_rebalance(self.servers[:1], [],
                                              servs_out)
     rebalance.result()
     self.verify_cluster_stats(self.servers[:self.num_servers -
                                            self.nodes_out],
                               check_ep_items_remaining=True)
     self.compare_failovers_logs(
         prev_failover_stats,
         self.servers[:self.num_servers - self.nodes_out], self.buckets)
     self.sleep(30)
     self.data_analysis_all(
         record_data_set, self.servers[:self.num_servers - self.nodes_out],
         self.buckets)
     self.verify_unacked_bytes_all_buckets()
     nodes = self.get_nodes_in_cluster(self.master)
     self.vb_distribution_analysis(servers=nodes,
                                   buckets=self.buckets,
                                   std=1.0,
                                   total_vbuckets=self.total_vbuckets)
Example 25
    def test_rebalance_in_out_with_failover_addback_recovery(self):
        """
        Rebalances nodes out and in with failover and full/delta recovery add back of a node
        Use different nodes_in and nodes_out params to have uneven add and deletion. Use 'zone'
        param to have nodes divided into server groups by having zone > 1.

        This test begins by loading a given number of items into the cluster. It then
        removes one node, rebalances that node out of the cluster, and then rebalances it back
        in. During the rebalancing we update all of the items in the cluster. Once the
        node has been removed and added back we wait for the disk queues to drain, and
        then verify that there has been no data loss, i.e. sum(curr_items) matches curr_items_total.
        We then remove and add back two nodes at a time and so on until we have reached the point
        where we are adding back and removing at least half of the nodes.
        """
        recovery_type = self.input.param("recoveryType", "full")
        gen = BlobGenerator('mike',
                            'mike-',
                            self.value_size,
                            end=self.num_items)
        self._load_all_buckets(self.master, gen, "create", 0)
        tasks = self._async_load_all_buckets(self.master, gen, "update", 0)
        servs_in = self.servers[self.nodes_init:self.nodes_init +
                                self.nodes_in]
        servs_out = self.servers[self.nodes_init -
                                 self.nodes_out:self.nodes_init]
        for task in tasks:
            task.result(self.wait_timeout * 20)
        self._verify_stats_all_buckets(self.servers[:self.nodes_init],
                                       timeout=120)
        self._wait_for_stats_all_buckets(self.servers[:self.nodes_init])
        # self.sleep(20)
        prev_vbucket_stats = self.get_vbucket_seqnos(
            self.servers[:self.nodes_init], self.buckets)
        prev_failover_stats = self.get_failovers_logs(
            self.servers[:self.nodes_init], self.buckets)
        disk_replica_dataset, disk_active_dataset = self.get_and_compare_active_replica_data_set_all(
            self.servers[:self.nodes_init], self.buckets, path=None)
        self.compare_vbucketseq_failoverlogs(prev_vbucket_stats,
                                             prev_failover_stats)
        self.rest = RestConnection(self.master)
        self.nodes = self.get_nodes(self.master)
        result_nodes = list(
            set(self.servers[:self.nodes_init] + servs_in) - set(servs_out))
        for node in servs_in:
            self.rest.add_node(self.master.rest_username,
                               self.master.rest_password, node.ip, node.port)
        chosen = RebalanceHelper.pick_nodes(self.master, howmany=1)
        # Mark Node for failover
        success_failed_over = self.rest.fail_over(chosen[0].id, graceful=False)
        self.wait_for_failover_or_assert(expected_failover_count=1)
        # Mark Node for full recovery
        if success_failed_over:
            self.rest.set_recovery_type(otpNode=chosen[0].id,
                                        recoveryType=recovery_type)
        # self.sleep(30)
        try:
            self.shuffle_nodes_between_zones_and_rebalance(servs_out)
        except Exception as e:
            if "deltaRecoveryNotPossible" not in e.__str__():
                self.fail(
                    "Rebalance did not fail. Rebalance has to fail since no delta recovery should be possible"
                    " while adding nodes too")