Example #1
    def test_add_remove_graceful_add_back_node_with_cert(self,recovery_type=None):
        recovery_type = self.input.param('recovery_type')
        rest = RestConnection(self.master)
        known_nodes = ['ns_1@'+self.master.ip]
        progress = None
        count = 0
        servs_inout = self.servers[1:]
        serv_out = 'ns_1@' + servs_inout[1].ip

        rest.create_bucket(bucket='default', ramQuotaMB=100)

        x509main(self.master).setup_master()
        x509main().setup_cluster_nodes_ssl(servs_inout)
        for server in servs_inout:
            rest.add_node('Administrator','password',server.ip)
            known_nodes.append('ns_1@' + server.ip)

        rest.rebalance(known_nodes)
        self.assertTrue(self.check_rebalance_complete(rest),"Issue with rebalance")

        for server in servs_inout:
            status = x509main(server)._validate_ssl_login()
            self.assertEqual(status,200,"Not able to login via SSL code")

        rest.fail_over(serv_out,graceful=True)
        self.assertTrue(self.check_rebalance_complete(rest),"Issue with rebalance")
        rest.set_recovery_type(serv_out,recovery_type)
        rest.add_back_node(serv_out)
        rest.rebalance(known_nodes)
        self.assertTrue(self.check_rebalance_complete(rest),"Issue with rebalance")

        for server in servs_inout:
            status = x509main(server)._validate_ssl_login()
            self.assertEqual(status,200,"Not able to login via SSL code")
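Most of the examples on this page are variations of a single flow: gracefully fail a node over, pick a recovery type, add the node back, and rebalance. As a minimal sketch, assuming the testrunner helpers behave exactly as used in Example #1 (RestConnection's fail_over, set_recovery_type, add_back_node, rebalance and monitorRebalance), the core sequence reduces to:

def graceful_failover_and_add_back(rest, known_nodes, otp_node, recovery_type="delta"):
    """Gracefully fail over otp_node, then recover it with the given recovery type.

    rest          -- RestConnection to a node that stays in the cluster
    known_nodes   -- otpNode ids ('ns_1@<ip>') of every node currently in the cluster
    otp_node      -- otpNode id of the node being failed over and recovered
    recovery_type -- "delta" reuses data already on disk, "full" wipes and re-streams it
    """
    rest.fail_over(otp_node, graceful=True)          # graceful failover runs as a rebalance
    rest.monitorRebalance()                          # wait for the failover to finish
    rest.set_recovery_type(otp_node, recovery_type)  # must be set while the node is failed over
    rest.add_back_node(otp_node)                     # flag the failed-over node for add-back
    rest.rebalance(known_nodes)                      # the next rebalance brings it back in
    return rest.monitorRebalance()

Example #1 polls with its own check_rebalance_complete helper instead of monitorRebalance; either approach should work, as long as the rebalance started by the graceful failover is allowed to finish before the recovery type is set.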
Example #2
 def test_failover_add_back(self):
     try:
         rest = RestConnection(self.master)
         recoveryType = self.input.param("recoveryType", "full")
         servr_out = self.nodes_out_list
         self._run_initial_index_tasks()
         failover_task =self.cluster.async_failover([self.master],
                 failover_nodes = servr_out, graceful=self.graceful)
         failover_task.result()
         kvOps_tasks = self._run_kvops_tasks()
         before_index_ops = self._run_before_index_tasks()
         nodes_all = rest.node_statuses()
         nodes = []
         if servr_out[0].ip == "127.0.0.1":
             for failover_node in servr_out:
                 nodes.extend([node for node in nodes_all
                     if (str(node.port) == failover_node.port)])
         else:
             for failover_node in servr_out:
                 nodes.extend([node for node in nodes_all
                     if node.ip == failover_node.ip])
         for node in nodes:
             self.log.info(node)
             rest.add_back_node(node.id)
             rest.set_recovery_type(otpNode=node.id, recoveryType=recoveryType)
         rebalance = self.cluster.async_rebalance(self.servers[:self.nodes_init], [], [])
         in_between_index_ops = self._run_in_between_tasks()
         rebalance.result()
         self.sleep(120)
         self._run_tasks([kvOps_tasks, before_index_ops, in_between_index_ops])
         self._run_after_index_tasks()
     except Exception as ex:
         raise
Example #3
 def test_failover_add_back(self):
     recoveryType = self.input.param("recoveryType", "full")
     servr_out = self.nodes_out_list
     nodes_all = RestConnection(self.master).node_statuses()
     self.check_and_run_operations(buckets=self.buckets, before=True)
     failover_task = self.cluster.async_failover([self.master],
                                                 failover_nodes=servr_out,
                                                 graceful=self.graceful)
     self.check_and_run_operations(buckets=self.buckets, in_between=True)
     failover_task.result()
     self.log.info(servr_out)
     rest = RestConnection(self.master)
     nodes_all = rest.node_statuses()
     nodes = []
     for failover_node in servr_out:
         nodes.extend([
             node for node in nodes_all if node.ip == failover_node.ip or (
                 node.ip == "127.0.0.1"
                 and str(node.port) != failover_node.port)
         ])
     for node in nodes:
         rest.add_back_node(node.id)
         rest.set_recovery_type(otpNode=node.id, recoveryType=recoveryType)
     rebalance = self.cluster.async_rebalance(
         self.servers[:self.nodes_init], [], [])
     self._run_aync_taks()
     rebalance.result()
     self.check_and_run_operations(buckets=self.buckets, after=True)
Example #4
    def test_capi_with_failover(self):
        repl_id = self._start_es_replication()

        rest_conn = RestConnection(self.src_master)
        rest_conn.pause_resume_repl_by_id(repl_id, REPL_PARAM.PAUSE_REQUESTED, 'true')

        gen = DocumentGenerator('es', '{{"key":"value","mutated":0}}',  xrange(100), start=0, end=self._num_items)
        self.src_cluster.load_all_buckets_from_generator(gen)

        rest_conn.pause_resume_repl_by_id(repl_id, REPL_PARAM.PAUSE_REQUESTED, 'false')

        graceful = self._input.param("graceful", False)
        self.recoveryType = self._input.param("recoveryType", None)
        self.src_cluster.failover(graceful=graceful)

        self.sleep(30)

        if self.recoveryType:
            server_nodes = rest_conn.node_statuses()
            for node in server_nodes:
                if node.ip == self._input.servers[1].ip:
                    rest_conn.set_recovery_type(otpNode=node.id, recoveryType=self.recoveryType)
                    self.sleep(30)
                    rest_conn.add_back_node(otpNode=node.id)
            rebalance = self.cluster.async_rebalance(self.src_cluster.get_nodes(), [], [])
            rebalance.result()

        self._verify_es_results()
Example #5
 def test_failover_add_back(self):
     try:
         rest = RestConnection(self.master)
         recoveryType = self.input.param("recoveryType", "full")
         servr_out = self.nodes_out_list
         nodes_all = rest.node_statuses()
         tasks = self.async_run_operations(buckets=self.buckets, phase="before")
         for task in tasks:
             task.result()
         failover_task = self.cluster.async_failover([self.master],
                 failover_nodes=servr_out, graceful=self.graceful)
         failover_task.result()
         nodes_all = rest.node_statuses()
         nodes = []
         if servr_out[0].ip == "127.0.0.1":
             for failover_node in servr_out:
                 nodes.extend([node for node in nodes_all
                     if (str(node.port) == failover_node.port)])
         else:
             for failover_node in servr_out:
                 nodes.extend([node for node in nodes_all
                     if node.ip == failover_node.ip])
         for node in nodes:
             self.log.info(node)
             rest.add_back_node(node.id)
             rest.set_recovery_type(otpNode=node.id, recoveryType=recoveryType)
         rebalance = self.cluster.async_rebalance(self.servers[:self.nodes_init], [], [])
         self._run_aync_tasks()
         rebalance.result()
         self.run_after_operations()
     except Exception as ex:
         raise
Example #6
 def test_failover_add_back(self):
     gen_load = BlobGenerator('buckettest', 'buckettest-', self.value_size, start=0, end=self.num_items)
     self._load_all_buckets(self.master, gen_load, "create", 0)
     try:
         for servers in self.servers:
             self.secretmgmt_base_obj.setup_pass_node(servers, self.password)
         self.sleep(30)
         rest = RestConnection(self.master)
         self.graceful = self.input.param('graceful', False)
         recoveryType = self.input.param("recoveryType", "full")
         self.find_nodes_in_list()
         self.generate_map_nodes_out_dist()
         servr_out = self.nodes_out_list
         nodes_all = rest.node_statuses()
         failover_task = self.cluster.async_failover([self.master],
                                                     failover_nodes=servr_out, graceful=self.graceful)
         failover_task.result()
         nodes_all = rest.node_statuses()
         nodes = []
         if servr_out[0].ip == "127.0.0.1":
             for failover_node in servr_out:
                 nodes.extend([node for node in nodes_all
                               if (str(node.port) == failover_node.port)])
         else:
             for failover_node in servr_out:
                 nodes.extend([node for node in nodes_all
                               if node.ip == failover_node.ip])
         for node in nodes:
             self.log.info(node)
             rest.add_back_node(node.id)
             rest.set_recovery_type(otpNode=node.id, recoveryType=recoveryType)
         rebalance = self.cluster.async_rebalance(self.servers[:self.nodes_init], [], [])
         self.assertTrue(rebalance.result(), "Failover with different servers")
     except Exception as ex:
         raise
Example #7
 def test_failover_add_back(self):
     try:
         self.run_async_data()
         rest = RestConnection(self.master)
         recoveryType = self.input.param("recoveryType", "full")
         servr_out = self.nodes_out_list
         nodes_all = rest.node_statuses()
         failover_task =self.cluster.async_failover([self.master],
                 failover_nodes = servr_out, graceful=self.graceful)
         failover_task.result()
         nodes_all = rest.node_statuses()
         nodes = []
         if servr_out[0].ip == "127.0.0.1":
             for failover_node in servr_out:
                 nodes.extend([node for node in nodes_all
                     if (str(node.port) == failover_node.port)])
         else:
             for failover_node in servr_out:
                 nodes.extend([node for node in nodes_all
                     if node.ip == failover_node.ip])
         for node in nodes:
             self.log.info(node)
             rest.add_back_node(node.id)
             rest.set_recovery_type(otpNode=node.id, recoveryType=recoveryType)
         rebalance = self.cluster.async_rebalance(self.servers[:self.nodes_init], [], [])
         self.run_mutation_operations_for_situational_tests()
         self.sleep(120, "Wait for rebalance")
         for t in self.load_thread_list:
             if t.is_alive():
                 if t != None:
                     t.signal = False
     except Exception as ex:
         raise
Example #8
 def test_add_remove_add_back_node_with_cert(self,rebalance=None):
     rebalance = self.input.param('rebalance')
     rest = RestConnection(self.master)
     servs_inout = self.servers[1:3]
     serv_out = 'ns_1@' + servs_inout[1].ip
     known_nodes = ['ns_1@'+self.master.ip]
     x509main(self.master).setup_master()
     x509main().setup_cluster_nodes_ssl(servs_inout)
     for server in servs_inout:
         rest.add_node('Administrator','password',server.ip)
         known_nodes.append('ns_1@' + server.ip)
     rest.rebalance(known_nodes)
     self.assertTrue(self.check_rebalance_complete(rest),"Issue with rebalance")
     for server in servs_inout:
         status = x509main(server)._validate_ssl_login()
         self.assertEqual(status,200,"Not able to login via SSL code")
     rest.fail_over(serv_out,graceful=False)
     if (rebalance):
         rest.rebalance(known_nodes,[serv_out])
         self.assertTrue(self.check_rebalance_complete(rest),"Issue with rebalance")
         rest.add_node('Administrator','password',servs_inout[1].ip)
     else:
         rest.add_back_node(serv_out)
     rest.rebalance(known_nodes)
     self.assertTrue(self.check_rebalance_complete(rest),"Issue with rebalance")
     for server in servs_inout:
         status = x509main(server)._validate_ssl_login()
         self.assertEqual(status, 200, "Not able to login via SSL code")
Example #9
    def test_add_remove_graceful_add_back_node_with_cert(self,recovery_type=None):
        recovery_type = self.input.param('recovery_type')
        rest = RestConnection(self.master)
        known_nodes = ['ns_1@'+self.master.ip]
        progress = None
        count = 0
        servs_inout = self.servers[1:]
        serv_out = 'ns_1@' + servs_inout[1].ip

        rest.create_bucket(bucket='default', ramQuotaMB=100)

        x509main(self.master).setup_master()
        x509main().setup_cluster_nodes_ssl(servs_inout)
        for server in servs_inout:
            rest.add_node('Administrator','password',server.ip)
            known_nodes.append('ns_1@' + server.ip)

        rest.rebalance(known_nodes)
        self.assertTrue(self.check_rebalance_complete(rest),"Issue with rebalance")

        for server in servs_inout:
            status = x509main(server)._validate_ssl_login()
            self.assertEqual(status,200,"Not able to login via SSL code")

        rest.fail_over(serv_out,graceful=True)
        self.assertTrue(self.check_rebalance_complete(rest),"Issue with rebalance")
        rest.set_recovery_type(serv_out,recovery_type)
        rest.add_back_node(serv_out)
        rest.rebalance(known_nodes)
        self.assertTrue(self.check_rebalance_complete(rest),"Issue with rebalance")

        for server in servs_inout:
            status = x509main(server)._validate_ssl_login()
            self.assertEqual(status,200,"Not able to login via SSL code")
Example #10
 def test_failover_add_back(self):
     try:
         rest = RestConnection(self.master)
         recoveryType = self.input.param("recoveryType", "full")
         servr_out = self.nodes_out_list
         nodes_all = rest.node_statuses()
         tasks = self.async_check_and_run_operations(buckets=self.buckets, before=True)
         for task in tasks:
             task.result()
         failover_task = self.cluster.async_failover([self.master], failover_nodes=servr_out, graceful=self.graceful)
         failover_task.result()
         nodes_all = rest.node_statuses()
         nodes = []
         if servr_out[0].ip == "127.0.0.1":
             for failover_node in servr_out:
                 nodes.extend([node for node in nodes_all if (str(node.port) == failover_node.port)])
         else:
             for failover_node in servr_out:
                 nodes.extend([node for node in nodes_all if node.ip == failover_node.ip])
         for node in nodes:
             self.log.info(node)
             rest.add_back_node(node.id)
             rest.set_recovery_type(otpNode=node.id, recoveryType=recoveryType)
         rebalance = self.cluster.async_rebalance(self.servers[: self.nodes_init], [], [])
         self._run_aync_tasks()
         rebalance.result()
         self.run_after_operations()
     except Exception as ex:
         raise
Example #11
 def test_add_remove_add_back_node_with_cert(self,rebalance=None):
     rebalance = self.input.param('rebalance')
     rest = RestConnection(self.master)
     servs_inout = self.servers[1:3]
     serv_out = 'ns_1@' + servs_inout[1].ip
     known_nodes = ['ns_1@'+self.master.ip]
     x509main(self.master).setup_master()
     x509main().setup_cluster_nodes_ssl(servs_inout)
     for server in servs_inout:
         rest.add_node('Administrator','password',server.ip)
         known_nodes.append('ns_1@' + server.ip)
     rest.rebalance(known_nodes)
     self.assertTrue(self.check_rebalance_complete(rest),"Issue with rebalance")
     for server in servs_inout:
         status = x509main(server)._validate_ssl_login()
         self.assertEqual(status,200,"Not able to login via SSL code")
     rest.fail_over(serv_out,graceful=False)
     if (rebalance):
         rest.rebalance(known_nodes,[serv_out])
         self.assertTrue(self.check_rebalance_complete(rest),"Issue with rebalance")
         rest.add_node('Administrator','password',servs_inout[1].ip)
     else:
         rest.add_back_node(serv_out)
     rest.rebalance(known_nodes)
     self.assertTrue(self.check_rebalance_complete(rest),"Issue with rebalance")
     for server in servs_inout:
         status = x509main(server)._validate_ssl_login()
         self.assertEqual(status, 200, "Not able to login via SSL code")
Example #12
 def test_rename_failover_add_back(self):
     if len(self.servers) < 2:
         self.fail("test require more than 1 node")
     failover_factor = self.input.param("failover-factor", 1)
     failover_nodes = self.servers[self.nodes_in:self.nodes_in +
                                   failover_factor + 1]
     hostnames = self.rename_nodes(self.servers[:self.nodes_in +
                                                failover_factor + 1])
     self._set_hostames_to_servers_objs(hostnames)
     self.verify_referenced_by_names(
         self.servers[:self.nodes_in + failover_factor + 1], hostnames)
     self.cluster.rebalance(self.servers[:self.nodes_init],
                            self.servers[self.nodes_init:self.nodes_in +
                                         failover_factor + 1], [],
                            use_hostnames=True)
     rest = RestConnection(self.master)
     nodes_all = rest.node_statuses()
     nodes = []
     for failover_node in failover_nodes:
         nodes.extend([
             node for node in nodes_all if node.ip == failover_node.hostname
             and str(node.port) == failover_node.port
         ])
     self.cluster.failover(self.servers, failover_nodes, use_hostnames=True)
     self.verify_referenced_by_names(
         self.servers[:self.nodes_in + failover_factor + 1], hostnames)
     for node in nodes:
         rest.add_back_node(node.id)
     self.cluster.rebalance(self.servers[:self.nodes_in + failover_factor +
                                         1], [], [],
                            use_hostnames=True)
     self.verify_referenced_by_names(
         self.servers[:self.nodes_in + failover_factor + 1], hostnames)
 def test_failover_indexer_add_back(self):
     """
     Indexer add back scenarios
     :return:
     """
     self._calculate_scan_vector()
     rest = RestConnection(self.master)
     recoveryType = self.input.param("recoveryType", "full")
     indexer_out = int(self.input.param("nodes_out", 0))
     nodes = self.get_nodes_from_services_map(service_type="index",
                                              get_all_nodes=True)
     self.assertGreaterEqual(len(nodes), indexer_out,
                             "Existing Indexer Nodes less than Indexer out nodes")
     pre_recovery_tasks = self.async_run_operations(phase="before")
     self._run_tasks([pre_recovery_tasks])
     self._start_disk_writes_for_plasma()
     kvOps_tasks = self._run_kvops_tasks()
     try:
         self.use_replica = False
         self._create_replica_indexes()
         servr_out = nodes[:indexer_out]
         failover_task =self.cluster.async_failover(
             [self.master], failover_nodes=servr_out,
             graceful=self.graceful)
         failover_task.result()
         nodes_all = rest.node_statuses()
         nodes = []
         if servr_out[0].ip == "127.0.0.1":
             for failover_node in servr_out:
                 nodes.extend([node for node in nodes_all
                     if (str(node.port) == failover_node.port)])
         else:
             for failover_node in servr_out:
                 nodes.extend([node for node in nodes_all
                     if node.ip == failover_node.ip])
         for node in nodes:
             log.info("Adding back {0} with recovery type {1}...".format(
                 node.ip, recoveryType))
             rest.add_back_node(node.id)
             rest.set_recovery_type(otpNode=node.id,
                                    recoveryType=recoveryType)
         log.info("Rebalancing nodes in...")
         mid_recovery_tasks = self.async_run_operations(phase="in_between")
         rebalance = self.cluster.async_rebalance(
             self.servers[:self.nodes_init], [], [])
         rebalance.result()
         self._run_tasks([mid_recovery_tasks, kvOps_tasks])
         #check if the nodes in cluster are healthy
         msg = "Cluster not in Healthy state"
         self.assertTrue(self.wait_until_cluster_is_healthy(), msg)
         log.info("==== Cluster in healthy state ====")
         self._check_all_bucket_items_indexed()
         post_recovery_tasks = self.async_run_operations(phase="after")
         self._run_tasks([post_recovery_tasks])
     except Exception as ex:
         log.info(str(ex))
         raise
Example #14
 def test_online_upgrade_with_failover(self):
     upgrade_nodes = self.servers[:self.nodes_init]
     if self.disable_plasma_upgrade:
         self._install(self.nodes_in_list, version=self.upgrade_to)
         rebalance = self.cluster.async_rebalance(
             self.servers[:self.nodes_init], [self.nodes_in_list[0]], [],
             services=["index"])
         rebalance.result()
         self.disable_upgrade_to_plasma(self.nodes_in_list[0])
     for node in upgrade_nodes:
         node_rest = RestConnection(node)
         node_info = "{0}:{1}".format(node.ip, node.port)
         node_services_list = node_rest.get_nodes_services()[node_info]
         if "index" in node_services_list:
             self._create_equivalent_indexes(node)
         failover_task = self.cluster.async_failover([self.master],
                                                     failover_nodes=[node],
                                                     graceful=False)
         failover_task.result()
         log.info("Node Failed over...")
         upgrade_th = self._async_update(self.upgrade_to, [node])
         for th in upgrade_th:
             th.join()
         log.info("==== Upgrade Complete ====")
         self.sleep(120)
         rest = RestConnection(self.master)
         nodes_all = rest.node_statuses()
         for cluster_node in nodes_all:
             if cluster_node.ip == node.ip:
                 log.info("Adding Back: {0}".format(node))
                 rest.add_back_node(cluster_node.id)
                 rest.set_recovery_type(otpNode=cluster_node.id,
                                        recoveryType="full")
         log.info("Adding node back to cluster...")
         active_nodes = [
             srvr for srvr in self.servers if srvr.ip != node.ip
         ]
         rebalance = self.cluster.async_rebalance(active_nodes, [], [])
         rebalance.result()
         self._remove_equivalent_indexes(node)
         self.sleep(60)
     msg = "Cluster is not healthy after upgrade"
     self.assertTrue(self.wait_until_cluster_is_healthy(), msg)
     log.info("Cluster is healthy")
     if self.initial_version.split("-")[0] in UPGRADE_VERS:
         self.multi_drop_index()
         self.sleep(100)
         self._create_indexes()
         self.sleep(100)
     self.assertTrue(self.wait_until_indexes_online(),
                     "Some indexes are not online")
     log.info("All indexes are online")
     self._query_index("post_upgrade")
     self._verify_post_upgrade_results()
     self._update_int64_dataset()
     self._query_for_long_num()
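Example #14 wraps the same add-back calls in a per-node upgrade loop: hard-fail the node, upgrade it while it is out of the cluster, then re-add it with full recovery and rebalance from a node that stayed in. A condensed sketch of one iteration, assuming the same cluster task helper and RestConnection used above, with upgrade_node standing in as a hypothetical replacement for the _async_update/th.join() pair:

def upgrade_one_node_via_failover(cluster, master, node, all_servers, upgrade_node):
    """Hard-fail `node`, upgrade it while it is out, then add it back with full recovery."""
    failover_task = cluster.async_failover([master], failover_nodes=[node], graceful=False)
    failover_task.result()
    upgrade_node(node)                                # hypothetical: install the new build and wait
    rest = RestConnection(master)
    for cluster_node in rest.node_statuses():
        if cluster_node.ip == node.ip:                # match the failed-over node by IP
            rest.add_back_node(cluster_node.id)       # re-add it under its otpNode id
            rest.set_recovery_type(otpNode=cluster_node.id, recoveryType="full")
    active_nodes = [srvr for srvr in all_servers if srvr.ip != node.ip]
    cluster.async_rebalance(active_nodes, [], []).result()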
Example #15
 def test_failover_indexer_add_back(self):
     """
     Indexer add back scenarios
     :return:
     """
     rest = RestConnection(self.master)
     recoveryType = self.input.param("recoveryType", "full")
     indexer_out = int(self.input.param("nodes_out", 0))
     nodes = self.get_nodes_from_services_map(service_type="index",
                                              get_all_nodes=True)
     self.assertGreaterEqual(len(nodes), indexer_out,
                             "Existing Indexer Nodes less than Indexer out nodes")
     pre_recovery_tasks = self.async_run_operations(phase="before")
     self._run_tasks([pre_recovery_tasks])
     self.get_dgm_for_plasma()
     kvOps_tasks = self._run_kvops_tasks()
     try:
         self.use_replica = False
         self._create_replica_indexes()
         servr_out = nodes[:indexer_out]
         failover_task =self.cluster.async_failover(
             [self.master], failover_nodes=servr_out,
             graceful=self.graceful)
         failover_task.result()
         nodes_all = rest.node_statuses()
         nodes = []
         if servr_out[0].ip == "127.0.0.1":
             for failover_node in servr_out:
                 nodes.extend([node for node in nodes_all
                     if (str(node.port) == failover_node.port)])
         else:
             for failover_node in servr_out:
                 nodes.extend([node for node in nodes_all
                     if node.ip == failover_node.ip])
         for node in nodes:
             log.info("Adding back {0} with recovery type {1}...".format(
                 node.ip, recoveryType))
             rest.add_back_node(node.id)
             rest.set_recovery_type(otpNode=node.id,
                                    recoveryType=recoveryType)
         log.info("Rebalancing nodes in...")
         mid_recovery_tasks = self.async_run_operations(phase="in_between")
         rebalance = self.cluster.async_rebalance(
             self.servers[:self.nodes_init], [], [])
         rebalance.result()
         self._run_tasks([mid_recovery_tasks, kvOps_tasks])
         #check if the nodes in cluster are healthy
         msg = "Cluster not in Healthy state"
         self.assertTrue(self.wait_until_cluster_is_healthy(), msg)
         log.info("==== Cluster in healthy state ====")
         self._check_all_bucket_items_indexed()
         post_recovery_tasks = self.async_run_operations(phase="after")
         self._run_tasks([post_recovery_tasks])
     except Exception as ex:
         log.info(str(ex))
         raise
 def test_online_upgrade_with_failover(self):
     upgrade_nodes = self.servers[:self.nodes_init]
     if self.disable_plasma_upgrade:
         self._install(self.nodes_in_list, version=self.upgrade_to)
         rebalance = self.cluster.async_rebalance(self.servers[:self.nodes_init],
                                                  [self.nodes_in_list[0]], [],
                                                  services=["index"])
         rebalance.result()
         self.sleep(100)
         self.disable_upgrade_to_plasma(self.nodes_in_list[0])
     for node in upgrade_nodes:
         node_rest = RestConnection(node)
         node_info = "{0}:{1}".format(node.ip, node.port)
         node_services_list = node_rest.get_nodes_services()[node_info]
         if "index" in node_services_list:
             self._create_equivalent_indexes(node)
         failover_task = self.cluster.async_failover([self.master], failover_nodes=[node], graceful=False)
         failover_task.result()
         self.sleep(100)
         log.info("Node Failed over...")
         upgrade_th = self._async_update(self.upgrade_to, [node])
         for th in upgrade_th:
             th.join()
         log.info("==== Upgrade Complete ====")
         self.sleep(120)
         rest = RestConnection(self.master)
         nodes_all = rest.node_statuses()
         for cluster_node in nodes_all:
             if cluster_node.ip == node.ip:
                 log.info("Adding Back: {0}".format(node))
                 rest.add_back_node(cluster_node.id)
                 rest.set_recovery_type(otpNode=cluster_node.id, recoveryType="full")
         log.info("Adding node back to cluster...")
         active_nodes = [srvr for srvr in self.servers if srvr.ip != node.ip]
         rebalance = self.cluster.async_rebalance(active_nodes, [], [])
         rebalance.result()
         self.sleep(100)
         self._remove_equivalent_indexes(node)
         self.sleep(60)
     msg = "Cluster is not healthy after upgrade"
     self.assertTrue(self.wait_until_cluster_is_healthy(), msg)
     log.info("Cluster is healthy")
     self.add_built_in_server_user()
     self.sleep(20)
     if self.initial_version.split("-")[0] in UPGRADE_VERS:
         self.multi_drop_index()
         self.sleep(100)
         self._create_indexes()
         self.sleep(100)
     self.assertTrue(self.wait_until_indexes_online(), "Some indexes are not online")
     log.info("All indexes are online")
     self._query_index("post_upgrade")
     self._verify_post_upgrade_results()
     self._update_int64_dataset()
     self._query_for_long_num()
 def test_failover_indexer_add_back(self):
     """
     Indexer add back scenarios
     :return:
     """
     self._calculate_scan_vector()
     rest = RestConnection(self.master)
     recoveryType = self.input.param("recoveryType", "full")
     indexer_out = int(self.input.param("nodes_out", 0))
     nodes = self.get_nodes_from_services_map(service_type="index",
                                              get_all_nodes=True)
     self.assertGreaterEqual(
         len(nodes), indexer_out,
         "Existing Indexer Nodes less than Indexer out nodes")
     log.info("Running kv Mutations...")
     kvOps_tasks = self.kv_mutations()
     servr_out = nodes[:indexer_out]
     failover_task = self.cluster.async_failover([self.master],
                                                 failover_nodes=servr_out,
                                                 graceful=self.graceful)
     self._run_tasks([[failover_task], kvOps_tasks])
     before_index_ops = self._run_before_index_tasks()
     nodes_all = rest.node_statuses()
     nodes = []
     if servr_out[0].ip == "127.0.0.1":
         for failover_node in servr_out:
             nodes.extend([
                 node for node in nodes_all
                 if (str(node.port) == failover_node.port)
             ])
     else:
         for failover_node in servr_out:
             nodes.extend([
                 node for node in nodes_all if node.ip == failover_node.ip
             ])
     for node in nodes:
         log.info("Adding back {0} with recovery type {1}...".format(
             node.ip, recoveryType))
         rest.add_back_node(node.id)
         rest.set_recovery_type(otpNode=node.id,
                                recoveryType=recoveryType)
     log.info("Rebalancing nodes in...")
     rebalance = self.cluster.async_rebalance(
         self.servers[:self.nodes_init], [], [])
     log.info("Running KV mutations...")
     kvOps_tasks = self.kv_mutations()
     self._run_tasks([[rebalance], kvOps_tasks])
     self.sleep(100)
     self._verify_bucket_count_with_index_count(self.load_query_definitions)
     self.multi_query_using_index(
         buckets=self.buckets,
         query_definitions=self.load_query_definitions)
Example #18
 def test_indexer_failover_add_back(self):
     rest = RestConnection(self.master)
     self.generate_map_nodes_out_dist()
     index_names_defn = self._create_array_index_definitions()
     try:
         failover_task = self.cluster.async_failover(
             [self.master],
             failover_nodes=self.nodes_out_list,
             graceful=self.graceful)
         failover_task.result()
         nodes_all = rest.node_statuses()
         nodes = []
         if self.nodes_out_list[0].ip == "127.0.0.1":
             for failover_node in self.nodes_out_list:
                 nodes.extend([
                     node for node in nodes_all
                     if (str(node.port) == failover_node.port)
                 ])
         else:
             for failover_node in self.nodes_out_list:
                 nodes.extend([
                     node for node in nodes_all
                     if node.ip == failover_node.ip
                 ])
         for node in nodes:
             log.info(
                 "Adding back {0} with recovery type Full...".format(
                     node.ip))
             rest.add_back_node(node.id)
             rest.set_recovery_type(otpNode=node.id,
                                    recoveryType="full")
         log.info("Rebalancing nodes in...")
         rebalance = self.cluster.async_rebalance(
             self.servers[:self.nodes_init], [], [])
         mid_recovery_tasks = threading.Thread(
             target=self._aggregate_query_using_index,
             args=(index_names_defn, ))
         mid_recovery_tasks.start()
         rebalance.result()
         mid_recovery_tasks.join()
         #check if the nodes in cluster are healthy
         msg = "Cluster not in Healthy state"
         self.assertTrue(self.wait_until_cluster_is_healthy(), msg)
         log.info("==== Cluster in healthy state ====")
         self.sleep(60)
     except Exception as ex:
         log.info(str(ex))
         raise
Example #19
 def test_failover_add_back(self):
     try:
         rest = RestConnection(self.master)
         recoveryType = self.input.param("recoveryType", "full")
         servr_out = self.nodes_out_list
         failover_task = self.cluster.async_failover(
             [self.master],
             failover_nodes=servr_out,
             graceful=self.graceful)
         failover_task.result()
         pre_recovery_tasks = self.async_run_operations(phase="before")
         self._run_tasks([pre_recovery_tasks])
         self.get_dgm_for_plasma()
         kvOps_tasks = self._run_kvops_tasks()
         nodes_all = rest.node_statuses()
         nodes = []
         if servr_out[0].ip == "127.0.0.1":
             for failover_node in servr_out:
                 nodes.extend([
                     node for node in nodes_all
                     if (str(node.port) == failover_node.port)
                 ])
         else:
             for failover_node in servr_out:
                 nodes.extend([
                     node for node in nodes_all
                     if node.ip == failover_node.ip
                 ])
         for node in nodes:
             log.info("Adding Back: {0}".format(node))
             rest.add_back_node(node.id)
             rest.set_recovery_type(otpNode=node.id,
                                    recoveryType=recoveryType)
         rebalance = self.cluster.async_rebalance(
             self.servers[:self.nodes_init], [], [])
         mid_recovery_tasks = self.async_run_operations(phase="in_between")
         rebalance.result()
         self._run_tasks([kvOps_tasks, mid_recovery_tasks])
         #check if the nodes in cluster are healthy
         msg = "Cluster not in Healthy state"
         self.assertTrue(self.wait_until_cluster_is_healthy(), msg)
         log.info("==== Cluster in healthy state ====")
         self._check_all_bucket_items_indexed()
         post_recovery_tasks = self.async_run_operations(phase="after")
         self._run_tasks([post_recovery_tasks])
     except Exception as ex:
         log.info(str(ex))
         raise
Example #20
 def test_failover_indexer_add_back(self):
     """
     Indexer add back scenarios
     :return:
     """
     self._calculate_scan_vector()
     rest = RestConnection(self.master)
     recoveryType = self.input.param("recoveryType", "full")
     indexer_out = int(self.input.param("nodes_out", 0))
     nodes = self.get_nodes_from_services_map(service_type="index", get_all_nodes=True)
     self.assertGreaterEqual(len(nodes), indexer_out,
                             "Existing Indexer Nodes less than Indexer out nodes")
     log.info("Running kv Mutations...")
     kvOps_tasks = self.kv_mutations()
     servr_out = nodes[:indexer_out]
     failover_task =self.cluster.async_failover([self.master],
                 failover_nodes = servr_out, graceful=self.graceful)
     self._run_tasks([[failover_task], kvOps_tasks])
     before_index_ops = self._run_before_index_tasks()
     nodes_all = rest.node_statuses()
     nodes = []
     if servr_out[0].ip == "127.0.0.1":
         for failover_node in servr_out:
             nodes.extend([node for node in nodes_all
                 if (str(node.port) == failover_node.port)])
     else:
         for failover_node in servr_out:
             nodes.extend([node for node in nodes_all
                 if node.ip == failover_node.ip])
     for node in nodes:
         log.info("Adding back {0} with recovery type {1}...".format(node.ip, recoveryType))
         rest.add_back_node(node.id)
         rest.set_recovery_type(otpNode=node.id, recoveryType=recoveryType)
     log.info("Rebalancing nodes in...")
     rebalance = self.cluster.async_rebalance(self.servers[:self.nodes_init], [], [])
     log.info("Running KV mutations...")
     kvOps_tasks = self.kv_mutations()
     self._run_tasks([[rebalance], kvOps_tasks])
     self.sleep(100)
     self._verify_bucket_count_with_index_count(self.load_query_definitions)
     self.multi_query_using_index(buckets=self.buckets,
             query_definitions=self.load_query_definitions)
Example #21
 def test_failover_add_back(self):
     try:
         rest = RestConnection(self.master)
         recoveryType = self.input.param("recoveryType", "full")
         servr_out = self.nodes_out_list
         failover_task =self.cluster.async_failover([self.master],
                 failover_nodes=servr_out, graceful=self.graceful)
         failover_task.result()
         pre_recovery_tasks = self.async_run_operations(phase="before")
         self._run_tasks([pre_recovery_tasks])
         self.get_dgm_for_plasma()
         kvOps_tasks = self._run_kvops_tasks()
         nodes_all = rest.node_statuses()
         nodes = []
         if servr_out[0].ip == "127.0.0.1":
             for failover_node in servr_out:
                 nodes.extend([node for node in nodes_all
                     if (str(node.port) == failover_node.port)])
         else:
             for failover_node in servr_out:
                 nodes.extend([node for node in nodes_all
                     if node.ip == failover_node.ip])
         for node in nodes:
             log.info("Adding Back: {0}".format(node))
             rest.add_back_node(node.id)
             rest.set_recovery_type(otpNode=node.id,
                                    recoveryType=recoveryType)
         rebalance = self.cluster.async_rebalance(
             self.servers[:self.nodes_init], [], [])
         mid_recovery_tasks = self.async_run_operations(phase="in_between")
         rebalance.result()
         self._run_tasks([kvOps_tasks, mid_recovery_tasks])
         #check if the nodes in cluster are healthy
         msg = "Cluster not in Healthy state"
         self.assertTrue(self.wait_until_cluster_is_healthy(), msg)
         log.info("==== Cluster in healthy state ====")
         self._check_all_bucket_items_indexed()
         post_recovery_tasks = self.async_run_operations(phase="after")
         self._run_tasks([post_recovery_tasks])
     except Exception as ex:
         log.info(str(ex))
         raise
 def test_failover_add_back(self):
     try:
         rest = RestConnection(self.master)
         recoveryType = self.input.param("recoveryType", "full")
         servr_out = self.nodes_out_list
         self._run_initial_index_tasks()
         failover_task = self.cluster.async_failover(
             [self.master],
             failover_nodes=servr_out,
             graceful=self.graceful)
         failover_task.result()
         kvOps_tasks = self._run_kvops_tasks()
         before_index_ops = self._run_before_index_tasks()
         nodes_all = rest.node_statuses()
         nodes = []
         if servr_out[0].ip == "127.0.0.1":
             for failover_node in servr_out:
                 nodes.extend([
                     node for node in nodes_all
                     if (str(node.port) == failover_node.port)
                 ])
         else:
             for failover_node in servr_out:
                 nodes.extend([
                     node for node in nodes_all
                     if node.ip == failover_node.ip
                 ])
         for node in nodes:
             self.log.info(node)
             rest.add_back_node(node.id)
             rest.set_recovery_type(otpNode=node.id,
                                    recoveryType=recoveryType)
         rebalance = self.cluster.async_rebalance(
             self.servers[:self.nodes_init], [], [])
         in_between_index_ops = self._run_in_between_tasks()
         rebalance.result()
         self.sleep(120)
         self._run_tasks(
             [kvOps_tasks, before_index_ops, in_between_index_ops])
         self._run_after_index_tasks()
     except Exception as ex:
         raise
Example #23
 def online_upgrade_with_failover(self, upgrade_servers):
     self.log.info("online upgrade servers: {0}".format(str(upgrade_servers)))
     for server in upgrade_servers:
         self.log.info("upgrading: {0}".format(str(server)))
         participating_servers = [s for s in self.servers]
         failover_task = self.cluster.async_failover([self.master], failover_nodes=[server], graceful=False)
         failover_task.result()
         upgrade_th = self._async_update(self.upgrade_versions[0], [server])
         for th in upgrade_th:
             th.join()
         rest = RestConnection(self.master)
         nodes_all = rest.node_statuses()
         for cluster_node in nodes_all:
             if cluster_node.ip == server.ip:
                 rest.add_back_node(cluster_node.id)
                 rest.set_recovery_type(otpNode=cluster_node.id, recoveryType="full")
         participating_servers.remove(server)
         self.log.info("participating servers: {0}".format(str(participating_servers)))
         rebalance = self.cluster.async_rebalance(participating_servers, [], [])
         rebalance.result()
Example #24
 def perform_failover(self):
     rest = RestConnection(self.master)
     nodes = rest.node_statuses()
     failover_servers = self.servers[:self.nodes_init][-self.failover_factor:]
     failover_nodes = []
     for server in failover_servers:
         for node in nodes:
             if node.ip == server.ip and str(node.port) == server.port:
                 failover_nodes.append(node)
     for node in failover_nodes:
         rest.fail_over(node.id)
         self.sleep(5)
     if self.failover == GetrTests.FAILOVER_REBALANCE:
         self.cluster.rebalance(self.servers[:self.nodes_init], [],
                                failover_servers)
     if self.failover == GetrTests.FAILOVER_ADD_BACK:
         for node in failover_nodes:
             rest.add_back_node(node.id)
         self.cluster.rebalance(self.servers[:self.nodes_init], [], [])
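Example #24 (repeated as #26 below) shows the two ways a failed-over node can be resolved: rebalance it out, or add it back and rebalance. Only the arguments to cluster.rebalance(servers, to_add, to_remove) differ; a compact sketch of that branch, assuming the same helpers used above:

def resolve_failed_over_nodes(cluster, rest, active_servers, failover_servers,
                              failover_nodes, add_back=False):
    """Either re-add the failed-over nodes or eject them, then rebalance."""
    if add_back:
        for node in failover_nodes:                    # otpNode objects from rest.node_statuses()
            rest.add_back_node(node.id)                # keep the nodes in the cluster
        cluster.rebalance(active_servers, [], [])      # the rebalance recovers them
    else:
        cluster.rebalance(active_servers, [], failover_servers)  # eject the failed-over servers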
Example #25
 def online_upgrade_with_failover(self, upgrade_servers):
     self.log.info("online upgrade servers: {0}".format(str(upgrade_servers)))
     for server in upgrade_servers:
         self.log.info("upgrading: {0}".format(str(server)))
         participating_servers = [s for s in self.servers]
         failover_task = self.cluster.async_failover([self.master], failover_nodes=[server], graceful=False)
         failover_task.result()
         upgrade_th = self._async_update(self.upgrade_versions[0], [server])
         for th in upgrade_th:
             th.join()
         rest = RestConnection(self.master)
         nodes_all = rest.node_statuses()
         for cluster_node in nodes_all:
             if cluster_node.ip == server.ip:
                 rest.add_back_node(cluster_node.id)
                 rest.set_recovery_type(otpNode=cluster_node.id, recoveryType="full")
         participating_servers.remove(server)
         self.log.info("participating servers: {0}".format(str(participating_servers)))
         rebalance = self.cluster.async_rebalance(participating_servers, [], [])
         rebalance.result()
Example #26
 def perform_failover(self):
     rest = RestConnection(self.master)
     nodes = rest.node_statuses()
     failover_servers = self.servers[:self.nodes_init][-self.failover_factor:]
     failover_nodes = []
     for server in failover_servers:
         for node in nodes:
             if node.ip == server.ip and str(node.port) == server.port:
                 failover_nodes.append(node)
     for node in failover_nodes:
         rest.fail_over(node.id)
         self.sleep(5)
     if self.failover == GetrTests.FAILOVER_REBALANCE:
         self.cluster.rebalance(self.servers[:self.nodes_init],
                            [], failover_servers)
     if self.failover == GetrTests.FAILOVER_ADD_BACK:
         for node in failover_nodes:
             rest.add_back_node(node.id)
         self.cluster.rebalance(self.servers[:self.nodes_init],
                                [], [])
Example #27
    def test_backwards_compatability_indexes(self):
        create_index_query = "CREATE INDEX idx_name ON {0}(name)".format(
            self.bucket_name)
        self.n1ql_helper.run_cbq_query(query=create_index_query,
                                       server=self.n1ql_node)
        self.wait_until_indexes_online()

        result = self.n1ql_helper.run_cbq_query(
            query='SELECT * FROM {0} where name = "employee-9"'.format(
                self.bucket_name))
        self.assertEqual(result['metrics']['resultCount'], 72)

        upgrade_nodes = self.servers[:self.nodes_init]

        for server in upgrade_nodes:
            remote = RemoteMachineShellConnection(server)
            remote.stop_server()
            remote.disconnect()
            upgrade_threads = self._async_update(self.upgrade_to, [server])
            for upgrade_thread in upgrade_threads:
                upgrade_thread.join()
            self.upgrade_servers.append(server)
        self.sleep(180)
        msg = "Cluster is not healthy after upgrade"
        self.assertTrue(self.wait_until_cluster_is_healthy(), msg)
        self.log.info("Cluster is healthy")
        rest = RestConnection(self.master)
        nodes_all = rest.node_statuses()
        try:
            for cluster_node in nodes_all:
                if cluster_node.ip == self.master.ip:
                    self.log.info("Adding Back: {0}".format(self.master.ip))
                    rest.add_back_node(cluster_node.id)
                    rest.set_recovery_type(otpNode=cluster_node.id,
                                           recoveryType="full")
        except Exception as e:
            self.log.error(str(e))
        self.log.info("Adding node back to cluster...")
        rebalance = self.cluster.async_rebalance(
            self.servers[:self.nodes_init], [], [])
        rebalance.result()
        self.assertTrue(self.wait_until_indexes_online(),
                        "Some indexes are not online")
        self.log.info("All indexes are online")
        self.add_built_in_server_user()
        self.sleep(20)

        try:
            create_index_query = "CREATE INDEX idx_name ON {0}(name)".format(
                self.bucket_name)
        except Exception as e:
            self.log.info("indexes already exist")
        try:
            self.n1ql_helper.run_cbq_query(query=create_index_query,
                                           server=self.n1ql_node)
            create_index_query = "CREATE INDEX idx_day ON {0}(join_day)".format(
                self.bucket_name)
            self.n1ql_helper.run_cbq_query(query=create_index_query,
                                           server=self.n1ql_node)
            self.wait_until_indexes_online()
        except Exception as e:
            self.log.info("indexes already exist")

        result = self.n1ql_helper.run_cbq_query(
            query='SELECT * FROM {0} where name = "employee-9"'.format(
                self.bucket_name))
        self.assertEqual(result['metrics']['resultCount'], 72)

        result2 = self.n1ql_helper.run_cbq_query(
            query='SELECT * FROM {0} where join_day = 9'.format(
                self.bucket_name))
        self.assertEqual(result2['metrics']['resultCount'], 72)

        self.n1ql_helper.create_scope(server=self.master,
                                      bucket_name=self.bucket_name,
                                      scope_name="test")
        self.n1ql_helper.create_collection(server=self.master,
                                           bucket_name=self.bucket_name,
                                           scope_name="test",
                                           collection_name="test1")
        self.n1ql_helper.create_collection(server=self.master,
                                           bucket_name=self.bucket_name,
                                           scope_name="test",
                                           collection_name="test2")

        self.n1ql_helper.run_cbq_query(query=(
            'INSERT INTO default:{0}.test.test1'.format(self.bucket_name) +
            '(KEY, VALUE) VALUES ("key2", { "type" : "hotel", "name" : "new hotel" })'
        ))
        self.n1ql_helper.run_cbq_query(query=(
            'INSERT INTO default:{0}.test.test1'.format(self.bucket_name) +
            '(KEY, VALUE) VALUES ("key1", { "type" : "hotel", "name" : "old hotel" })'
        ))
        self.n1ql_helper.run_cbq_query(query=(
            'INSERT INTO default:{0}.test.test1'.format(self.bucket_name) +
            ' (KEY, VALUE) VALUES ("key3", { "nested" : {"fields": "fake"}, "name" : "old hotel" })'
        ))
        self.n1ql_helper.run_cbq_query(query=(
            'INSERT INTO default:{0}.test.test1'.format(self.bucket_name) +
            ' (KEY, VALUE) VALUES ("key4", { "numbers": [1,2,3,4] , "name" : "old hotel" })'
        ))
        time.sleep(20)

        self.n1ql_helper.run_cbq_query(
            query="CREATE INDEX idx1 on default:{0}.test.test1(name) ".format(
                self.bucket_name))
        self.n1ql_helper.run_cbq_query(
            query="CREATE INDEX idx2 on default:{0}.test.test1(name) ".format(
                self.bucket_name))
        self.n1ql_helper.run_cbq_query(
            query="CREATE INDEX idx3 on default:{0}.test.test1(nested)".format(
                self.bucket_name))
        self.n1ql_helper.run_cbq_query(
            query="CREATE INDEX idx4 on default:{0}.test.test1(ALL numbers)".
            format(self.bucket_name))
Example #28
class FailoverTests(FailoverBaseTest):
    def setUp(self):
        super(FailoverTests, self).setUp()

    def tearDown(self):
        super(FailoverTests, self).tearDown()

    def test_failover_firewall(self):
        self.common_test_body('firewall')

    def test_failover_normal(self):
        self.common_test_body('normal')

    def test_failover_stop_server(self):
        self.common_test_body('stop_server')

    def test_failover_then_add_back(self):
        self.add_back_flag = True
        self.common_test_body('normal')

    def common_test_body(self, failover_reason):
        """
            Main test body containing the basic failover flow:
            1. Start operations if programmed into the test case (before/after)
            2. Start view and index building operations
            3. Failover K out of N nodes (failover can be hard or graceful)
            4.1 Rebalance the cluster after failover of the K nodes
            4.2 Run the add-back operation with recoveryType = (full/delta) followed by a rebalance
            5. Verify all expected operations completed by checking stats, replication, views and data correctness
        """
        # Pick the reference node for communication
        # We pick a node in the cluster which will NOT be failed over
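        # The numbered steps in the docstring map onto plain REST helpers further down:
        #   failover  -> run_failover_operations() / run_failover_operations_with_ops()
        #   add-back  -> self.rest.add_back_node(node.id) followed by
        #                self.rest.set_recovery_type(otpNode=..., recoveryType="delta"/"full")
        #   rebalance -> self.rest.rebalance(otpNodes=..., ejectedNodes=...,
        #                deltaRecoveryBuckets=...), monitored via self.rest.monitorRebalance()
        # (see run_add_back_operation_and_verify and run_rebalance_after_failover_and_verify below)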
        self.filter_list = []
        if self.failoverMaster:
            self.master = self.servers[1]
        self.log.info(" Picking node {0} as reference node for test case".format(self.master.ip))
        self.print_test_params(failover_reason)
        self.rest = RestConnection(self.master)
        self.nodes = self.rest.node_statuses()
        # Set the data path for the cluster
        self.data_path = self.rest.get_data_path()

        # Check if the test case has to be run for 3.0.0
        versions = self.rest.get_nodes_versions()
        self.version_greater_than_2_5 = True
        for version in versions:
            if "3" > version:
                self.version_greater_than_2_5 = False

        # Do not run this test if the graceful category is being used
        if not self.version_greater_than_2_5 and (self.graceful or (self.recoveryType != None)):
            self.log.error("Graceful failover can't be applied to nodes with version less than 3.*")
            self.log.error("Please check configuration parameters: SKIPPING TEST.")
            return

        # Find nodes that will under go failover
        if self.failoverMaster:
            self.chosen = RebalanceHelper.pick_nodes(self.master, howmany=1, target_node = self.servers[0])
        else:
            self.chosen = RebalanceHelper.pick_nodes(self.master, howmany=self.num_failed_nodes)

        # Perform operations - Create/Update/Delete
        # self.withMutationOps = True => Run Operations in parallel to failover
        # self.withMutationOps = False => Run Operations Before failover
        self.load_initial_data()
        if not self.withMutationOps:
            self.run_mutation_operations()
        # Perform View Creation Tasks and check for completion if required before failover
        if self.withViewsOps:
            self.run_view_creation_operations(self.servers)
            if not self.createIndexesDuringFailover:
                self.query_and_monitor_view_tasks(self.servers)

        # Take a snapshot of the data set used for validation
        record_static_data_set = {}
        prev_vbucket_stats = {}
        prev_failover_stats = {}
        if not self.withMutationOps:
            record_static_data_set = self.get_data_set_all(self.servers, self.buckets, path = None)

        # Capture  vbucket and failover stats if test version >= 2.5.*
        if self.version_greater_than_2_5 and self.upr_check:
            prev_vbucket_stats = self.get_vbucket_seqnos(self.servers, self.buckets)
            prev_failover_stats = self.get_failovers_logs(self.servers, self.buckets)

        # Perform operations related to failover
        if self.withMutationOps or self.withViewsOps or self.compact:
            self.run_failover_operations_with_ops(self.chosen, failover_reason)
        else:
            self.run_failover_operations(self.chosen, failover_reason)

        # Perform the add-back operation with rebalance, or rebalance only, with verifications
        if not self.gracefulFailoverFail and self.runRebalanceAfterFailover:
            if self.add_back_flag:
                self.run_add_back_operation_and_verify(self.chosen, prev_vbucket_stats, record_static_data_set, prev_failover_stats)
            else:
                self.run_rebalance_after_failover_and_verify(self.chosen, prev_vbucket_stats, record_static_data_set, prev_failover_stats)
        else:
            return

        if self.during_ops == None:
            self.verify_unacked_bytes_all_buckets(filter_list = self.filter_list, master_node = self.master)

    def run_rebalance_after_failover_and_verify(self, chosen, prev_vbucket_stats, record_static_data_set, prev_failover_stats):
        """ Method to run rebalance after failover and verify """
        # Need a delay > min because MB-7168
        _servers_ = self.filter_servers(self.servers, chosen)
        self._wait_for_stats_all_buckets(_servers_, check_ep_items_remaining = True)
        self.sleep(5, "after failover before invoking rebalance...")
        # Rebalance after Failover operation
        self.rest.rebalance(otpNodes=[node.id for node in self.nodes],ejectedNodes=[node.id for node in chosen])
        if self.during_ops:
            self.sleep(5, "Wait for some progress in rebalance")
            if self.during_ops == "change_password":
                old_pass = self.master.rest_password
                self.change_password(new_password=self.input.param("new_password", "new_pass"))
                self.rest = RestConnection(self.master)
            elif self.during_ops == "change_port":
                self.change_port(new_port=self.input.param("new_port", "9090"))
                self.rest = RestConnection(self.master)
        # Perform Compaction
        if self.compact:
            for bucket in self.buckets:
                self.cluster.compact_bucket(self.master,bucket)
        # Perform View Validation if Supported
        nodes = self.filter_servers(self.servers,chosen)
        if self.withViewsOps:
            self.query_and_monitor_view_tasks(nodes)

        # Run operations if required during rebalance after failover
        if self.withMutationOps:
            self.run_mutation_operations_after_failover()

        # Kill or restart operations
        if self.killNodes or self.stopNodes or self.firewallOnNodes:
            self.victim_node_operations(node = chosen[0])
            self.log.info(" Start Rebalance Again !")
            self.rest.rebalance(otpNodes=[node.id for node in self.nodes],ejectedNodes=[node.id for node in chosen])

        # Rebalance Monitoring
        msg = "rebalance failed while removing failover nodes {0}".format([node.id for node in chosen])
        self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True), msg=msg)

        # Reset password or port
        if self.during_ops:
            if self.during_ops == "change_password":
                self.change_password(new_password=old_pass)
            elif self.during_ops == "change_port":
                self.change_port(new_port='8091',
                                 current_port=self.input.param("new_port", "9090"))
            return

        #  Drain Queue and make sure intra-cluster replication is complete
        self.log.info("Begin VERIFICATION for Rebalance after Failover Only")
        self.verify_cluster_stats(_servers_, self.master, check_bucket_stats = True, check_ep_items_remaining = True)
        # Verify the full data set against the metadata recorded before the failover
        if not self.withMutationOps:
            self.sleep(60)
            self.data_analysis_all(record_static_data_set, _servers_, self.buckets, path = None, addedItems = None)

        # Check Cluster Stats and Data as well if max_verify > 0
        # Check Failover logs :: Not sure about this logic, currently not checking, will update code once confirmed
        # Currently, only  for checking case where we  have graceful failover
        if self.version_greater_than_2_5 and self.graceful and self.upr_check:
            new_failover_stats = self.compare_failovers_logs(prev_failover_stats, _servers_, self.buckets)
            new_vbucket_stats =  self.compare_vbucket_seqnos(prev_vbucket_stats, _servers_, self.buckets)
            self.compare_vbucketseq_failoverlogs(new_vbucket_stats, new_failover_stats)
        # Verify Active and Replica Bucket Count
        if self.num_replicas > 0:
            nodes = self.get_nodes_in_cluster(self.master)
            self.vb_distribution_analysis(servers = nodes, buckets = self.buckets, std = 20.0 , total_vbuckets = self.total_vbuckets)
        self.log.info("End VERIFICATION for Rebalance after Failover Only")

    def run_add_back_operation_and_verify(self, chosen, prev_vbucket_stats, record_static_data_set, prev_failover_stats):
        """
            Method to run add-back operation with recovery type = (delta/full)
            It also verifies if the operations are correct with data verification steps
        """
        _servers_ = self.filter_servers(self.servers, chosen)
        self._wait_for_stats_all_buckets(_servers_, check_ep_items_remaining = True)
        serverMap =  self.get_server_map(self.servers)
        recoveryTypeMap = self.define_maps_during_failover(self.recoveryType)
        fileMapsForVerification = self.create_file(chosen, self.buckets, serverMap)
        index = 0
        for node in chosen:
            self.rest.add_back_node(node.id)
            self.sleep(5)
            if self.recoveryType:
                # set the recovery type (delta/full) for the node before rebalancing
                self.rest.set_recovery_type(otpNode=node.id, recoveryType=self.recoveryType[index])
                index += 1
        self.sleep(20, "After failover before invoking rebalance...")
        self.rest.rebalance(otpNodes=[node.id for node in self.nodes],ejectedNodes=[],deltaRecoveryBuckets = self.deltaRecoveryBuckets)

        # Perform Compaction
        if self.compact:
            for bucket in self.buckets:
                self.cluster.compact_bucket(self.master,bucket)

        # Perform View Validation if Supported
        nodes = self.filter_servers(self.servers,chosen)
        if self.withViewsOps:
            self.query_and_monitor_view_tasks(nodes)

        # Run operations if required during rebalance after failover
        if self.withMutationOps:
            self.run_mutation_operations_after_failover()

        # Kill or restart operations
        if self.killNodes or self.stopNodes or self.firewallOnNodes:
            self.victim_node_operations(node = chosen[0])
            self.log.info(" Start Rebalance Again !")
            self.rest.rebalance(otpNodes=[node.id for node in self.nodes],ejectedNodes=[],deltaRecoveryBuckets = self.deltaRecoveryBuckets)

        # Check if node has to be killed or restarted during rebalance
        # Monitor Rebalance
        msg = "rebalance failed while removing failover nodes {0}".format(chosen)
        self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True), msg=msg)

        #  Drain ep_queue and make sure that intra-cluster replication is complete
        self._wait_for_stats_all_buckets(self.servers, check_ep_items_remaining = True)

        self.log.info("Begin VERIFICATION for Add-back and rebalance")

        # Verify Stats of cluster and Data if max_verify > 0
        self.verify_cluster_stats(self.servers, self.master, check_bucket_stats = True, check_ep_items_remaining = True)

        # Verify recovery Type succeeded if we added-back nodes
        self.verify_for_recovery_type(chosen, serverMap, self.buckets,recoveryTypeMap, fileMapsForVerification, self.deltaRecoveryBuckets)

        # Comparison of all data if required
        if not self.withMutationOps:
            self.sleep(60)
            self.data_analysis_all(record_static_data_set,self.servers, self.buckets,  path = None, addedItems = None)

        # Verify if vbucket sequence numbers and failover logs are as expected
        # We will check only for version  > 2.5.* and if the failover is graceful
        if self.version_greater_than_2_5 and self.graceful and self.upr_check:
            new_vbucket_stats = self.compare_vbucket_seqnos(prev_vbucket_stats, self.servers, self.buckets,perNode= False)
            new_failover_stats = self.compare_failovers_logs(prev_failover_stats,self.servers,self.buckets)
            self.compare_vbucketseq_failoverlogs(new_vbucket_stats, new_failover_stats)

        # Verify Active and Replica Bucket Count
        if self.num_replicas > 0:
            nodes = self.get_nodes_in_cluster(self.master)
            self.vb_distribution_analysis(servers = nodes, buckets = self.buckets, std = 20.0 , total_vbuckets = self.total_vbuckets)

        self.log.info("End VERIFICATION for Add-back and rebalance")

    def print_test_params(self, failover_reason):
        """ Method to print test parameters """
        self.log.info("num_replicas : {0}".format(self.num_replicas))
        self.log.info("recoveryType : {0}".format(self.recoveryType))
        self.log.info("failover_reason : {0}".format(failover_reason))
        self.log.info("num_failed_nodes : {0}".format(self.num_failed_nodes))
        self.log.info('picking server : {0} as the master'.format(self.master))

    def run_failover_operations(self, chosen, failover_reason):
        """ Method to run fail over operations used in the test scenario based on failover reason """
        # Perform Operations related to failover
        graceful_count = 0
        graceful_failover = True
        failed_over = True
        for node in chosen:
            unreachable = False
            if failover_reason == 'stop_server':
                unreachable=True
                self.stop_server(node)
                self.log.info("10 seconds delay to wait for membase-server to shutdown")
                # wait for 5 minutes until node is down
                self.assertTrue(RestHelper(self.rest).wait_for_node_status(node, "unhealthy", self.wait_timeout * 10),
                                    msg="node status is not unhealthy even after waiting for 5 minutes")
            elif failover_reason == "firewall":
                unreachable=True
                self.filter_list.append (node.ip)
                server = [srv for srv in self.servers if node.ip == srv.ip][0]
                RemoteUtilHelper.enable_firewall(server, bidirectional=self.bidirectional)
                status = RestHelper(self.rest).wait_for_node_status(node, "unhealthy", self.wait_timeout * 10)
                if status:
                    self.log.info("node {0}:{1} is 'unhealthy' as expected".format(node.ip, node.port))
                else:
                    # verify iptables on the node if something wrong
                    for server in self.servers:
                        if server.ip == node.ip:
                            shell = RemoteMachineShellConnection(server)
                            info = shell.extract_remote_info()
                            if info.type.lower() == "windows":
                                o, r = shell.execute_command("netsh advfirewall show allprofiles")
                                shell.log_command_output(o, r)
                            else:
                                o, r = shell.execute_command("/sbin/iptables --list")
                                shell.log_command_output(o, r)
                            shell.disconnect()
                    self.rest.print_UI_logs()
                    api = self.rest.baseUrl + 'nodeStatuses'
                    status, content, header = self.rest._http_request(api)
                    json_parsed = json.loads(content)
                    self.log.info("nodeStatuses: {0}".format(json_parsed))
                    self.fail("node status is not unhealthy even after waiting for 5 minutes")
            # verify the failover type
            if self.check_verify_failover_type:
                graceful_count, graceful_failover = self.verify_failover_type(node, graceful_count, self.num_replicas, unreachable)
            # define precondition check for failover
            success_failed_over = self.rest.fail_over(node.id, graceful=(self.graceful and graceful_failover))
            if self.graceful and graceful_failover:
                if self.stopGracefulFailover or self.killNodes or self.stopNodes or self.firewallOnNodes:
                    self.victim_node_operations(node)
                    # Start Graceful Again
                    self.log.info(" Start Graceful Failover Again !")
                    success_failed_over = self.rest.fail_over(node.id, graceful=(self.graceful and graceful_failover))
                    msg = "graceful failover failed for nodes {0}".format(node.id)
                    self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True), msg=msg)
                else:
                    msg = "rebalance failed while removing failover nodes {0}".format(node.id)
                    self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True), msg=msg)
            failed_over = failed_over and success_failed_over

        # Check for negative cases
        if self.graceful and (failover_reason in ['stop_server', 'firewall']):
            if failed_over:
                # MB-10479
                self.rest.print_UI_logs()
            self.assertFalse(failed_over, "Graceful Failover was started for unhealthy node!!!")
            return
        elif self.gracefulFailoverFail and not failed_over:
            """ Check if the fail_over fails as expected """
            self.assertFalse(failed_over,""" Graceful failover should fail due to not enough replicas """)
            return

        # Check if failover happened as expected or re-try one more time
        if not failed_over:
            self.log.info("unable to failover the node the first time. try again in  60 seconds..")
            # try again in 75 seconds
            self.sleep(75)
            failed_over = self.rest.fail_over(node.id, graceful=(self.graceful and graceful_failover))
        if self.graceful and (failover_reason not in ['stop_server', 'firewall']):
            reached = RestHelper(self.rest).rebalance_reached()
            self.assertTrue(reached, "rebalance failed for Graceful Failover, stuck or did not complete")

        # Verify Active and Replica Bucket Count
        if self.num_replicas > 0:
            nodes = self.filter_servers(self.servers,chosen)
            self.vb_distribution_analysis(servers = nodes, buckets = self.buckets, std = 20.0 , total_vbuckets = self.total_vbuckets, type = "failover", graceful = (self.graceful and graceful_failover) )
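
The reachability check above decides between two controller endpoints: graceful failover for a node that is still healthy, hard failover for one that has been stopped or firewalled. A minimal sketch of that choice, assuming the endpoint names from the map in Example #34 and placeholder host/credentials:

import requests

def failover_node(host, user, password, otp_node, node_healthy, port=8091):
    base = "http://{0}:{1}".format(host, port)
    auth = (user, password)
    if node_healthy:
        # graceful failover only makes sense while the node can still hand off data
        path = "/controller/startGracefulFailover"
    else:
        # unreachable nodes get a hard failover
        path = "/controller/failOver"
    resp = requests.post(base + path, data={"otpNode": otp_node}, auth=auth)
    return resp.status_code == 200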

    def run_failover_operations_with_ops(self, chosen, failover_reason):
        """ Method to run fail over operations used in the test scenario based on failover reason """
        # Perform Operations related to failover
        failed_over = True
        for node in chosen:
            unreachable = False
            if failover_reason == 'stop_server':
                unreachable=True
                self.stop_server(node)
                self.log.info("10 seconds delay to wait for membase-server to shutdown")
                # wait for 5 minutes until node is down
                self.assertTrue(RestHelper(self.rest).wait_for_node_status(node, "unhealthy", 300),
                                    msg="node status is not unhealthy even after waiting for 5 minutes")
            elif failover_reason == "firewall":
                unreachable=True
                self.filter_list.append (node.ip)
                server = [srv for srv in self.servers if node.ip == srv.ip][0]
                RemoteUtilHelper.enable_firewall(server, bidirectional=self.bidirectional)
                status = RestHelper(self.rest).wait_for_node_status(node, "unhealthy", 300)
                if status:
                    self.log.info("node {0}:{1} is 'unhealthy' as expected".format(node.ip, node.port))
                else:
                    # verify iptables on the node if something wrong
                    for server in self.servers:
                        if server.ip == node.ip:
                            shell = RemoteMachineShellConnection(server)
                            info = shell.extract_remote_info()
                            if info.type.lower() == "windows":
                                o, r = shell.execute_command("netsh advfirewall show allprofiles")
                                shell.log_command_output(o, r)
                            else:
                                o, r = shell.execute_command("/sbin/iptables --list")
                                shell.log_command_output(o, r)
                            shell.disconnect()
                    self.rest.print_UI_logs()
                    api = self.rest.baseUrl + 'nodeStatuses'
                    status, content, header = self.rest._http_request(api)
                    json_parsed = json.loads(content)
                    self.log.info("nodeStatuses: {0}".format(json_parsed))
                    self.fail("node status is not unhealthy even after waiting for 5 minutes")
        nodes = self.filter_servers(self.servers,chosen)
        failed_over = self.cluster.async_failover([self.master], failover_nodes = chosen, graceful=self.graceful)
        # Perform Compaction
        compact_tasks = []
        if self.compact:
            for bucket in self.buckets:
                compact_tasks.append(self.cluster.async_compact_bucket(self.master,bucket))
        # Run View Operations
        if self.withViewsOps:
            self.query_and_monitor_view_tasks(nodes)
        # Run mutation operations
        if self.withMutationOps:
            self.run_mutation_operations()
        failed_over.result()
        for task in compact_tasks:
            task.result()
        msg = "rebalance failed while removing failover nodes {0}".format(node.id)
        self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True), msg=msg)


    def load_initial_data(self):
        """ Method to run operations Update/Delete/Create """
        # Load All Buckets if num_items > 0
        tasks = []
        tasks += self._async_load_all_buckets(self.master, self.gen_initial_create, "create", 0, flag = 2, batch_size=20000)
        for task in tasks:
            task.result()
        self._wait_for_stats_all_buckets(self.servers, check_ep_items_remaining = True)
        self._verify_stats_all_buckets(self.servers,timeout = 120)

    def run_mutation_operations(self):
        mutation_ops_tasks = []
        if "create" in self.doc_ops:
            mutation_ops_tasks += self._async_load_all_buckets(self.master, self.gen_create, "create", 0)
        if "update" in self.doc_ops:
            mutation_ops_tasks += self._async_load_all_buckets(self.master, self.gen_update, "update", 0)
        if "delete" in self.doc_ops:
            mutation_ops_tasks += self._async_load_all_buckets(self.master, self.gen_delete, "delete", 0)
        try:
            for task in mutation_ops_tasks:
                task.result()
        except Exception as ex:
            self.log.info(ex)
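
The task objects above come from the framework's _async_load_all_buckets helper. As a generic, self-contained stand-in for the dispatch pattern it follows (one worker per requested doc operation, then wait on all of them), with a placeholder load function:

from threading import Thread

def run_doc_ops(doc_ops, load):
    # start one worker per requested operation
    workers = [Thread(target=load, args=(op,))
               for op in ("create", "update", "delete") if op in doc_ops]
    for w in workers:
        w.start()
    # wait for every mutation worker to finish
    for w in workers:
        w.join()

# run_doc_ops(["create", "update"], my_load_function)  # my_load_function is hypothetical
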
Example #29
class RebalanceProgressTests(RebalanceBaseTest):

    def setUp(self):
        super(RebalanceProgressTests, self).setUp()
        self.rest = RestConnection(self.master)
        self.num_views = self.input.param("num_views", 3)
        if self.num_views:
            self._create_indexes()

    def tearDown(self):
        super(RebalanceProgressTests, self).tearDown()

    def test_progress_rebalance_in(self):
        servers_in = self.servers[self.nodes_init : self.nodes_init + self.nodes_in]
        servers_init = self.servers[:self.nodes_init]

        rebalance = self.cluster.async_rebalance(servers_init, servers_in, [])
        self.sleep(5, "wait for rebalance start")
        previous_stats = self._get_detailed_progress()

        while rebalance.state != "FINISHED":
            new_stats = self._get_detailed_progress()
            if new_stats == {}:
                self.log.info("Got empty progress")
                break
            #vbuckets left should go decreasing
            #docsTotal and docsTransferred should be 0 in added nodes
            #docsTotal should not change
            #docsTransferred should go increasing
            self._check_stats(servers_in, previous_stats, new_stats, "outgoing",
                              docs_total=0, docs_transf=0)
            self._check_stats(servers_in, previous_stats, new_stats, "ingoing")
            self._check_stats(servers_init, previous_stats, new_stats, "ingoing",
                              docs_total=0, docs_transf=0)
            self._check_stats(servers_init, previous_stats, new_stats, "outgoing")
            #sum of sending and receiving vbuckets should coincide
            self._check_vb_sums(servers_init, servers_in, new_stats)
            previous_stats = copy.deepcopy(new_stats)
            time.sleep(1)
        rebalance.result()

    def test_progress_rebalance_out(self):
        with_failover = self.input.param("with_failover", False)
        servers_init = self.servers[:self.nodes_init]
        servers_out = self.servers[(self.nodes_init - self.nodes_out) : self.nodes_init]

        if with_failover:
            self.cluster.failover(servers_init, servers_out)
        rebalance = self.cluster.async_rebalance(servers_init, [], servers_out)
        self.sleep(5, "wait for rebalance start")
        previous_stats = self._get_detailed_progress()
        while rebalance.state != "FINISHED":
            new_stats = self._get_detailed_progress()
            if new_stats == {}:
                self.log.info("Got empty progress")
                break
            #vbuckets left should go decreasing
            #docsTotal should not change
            #docsTransferred should go increasing
            self._check_stats(servers_init, previous_stats, new_stats, "ingoing")
            self._check_stats(servers_init, previous_stats, new_stats, "outgoing")
            previous_stats = copy.deepcopy(new_stats)
            time.sleep(1)
        rebalance.result()

    def test_progress_rebalance_swap(self):
        if self.nodes_in != self.nodes_out:
            self.fail("nodes_in != nodes_out. Not a swap rebalance")
        if len(self.servers) < (self.nodes_init + self.nodes_in):
            self.log.error("Not enough VMs!")
            return
        servers_in = self.servers[self.nodes_init : self.nodes_init + self.nodes_in]
        servers_init = self.servers[:self.nodes_init]
        servers_unchanged = self.servers[:(self.nodes_init - self.nodes_out)]
        servers_out = self.servers[(self.nodes_init - self.nodes_out) : self.nodes_init]

        rebalance = self.cluster.async_rebalance(servers_init, servers_in, servers_out)
        self.sleep(5, "wait for rebalance start")
        previous_stats = self._get_detailed_progress()
        while rebalance.state != "FINISHED":
            new_stats = self._get_detailed_progress()
            if new_stats == {}:
                self.log.info("Got empty progress")
                break
            #vbuckets left should go decreasing
            #docsTotal and docsTransferred should be 0 in added nodes
            #no vbuckets moving for unchanged nodes
            #docsTotal should not change
            #docsTransferred should go increasing
            self._check_stats(servers_in, previous_stats, new_stats, "outgoing",
                              docs_total=0, docs_transf=0)
            self._check_stats(servers_in, previous_stats, new_stats, "ingoing")
            self._check_stats(servers_unchanged, previous_stats, new_stats, "ingoing",
                              active_vb=0, replica_vb=0)
            self._check_stats(servers_unchanged, previous_stats, new_stats, "outgoing",
                              active_vb=0, replica_vb=0)
            self._check_stats(servers_out, previous_stats, new_stats, "outgoing")

            #sum of sending and receiving vbuckets should coincide
            self._check_vb_sums(servers_in, servers_out, new_stats)
            previous_stats = copy.deepcopy(new_stats)
            time.sleep(1)
        rebalance.result()

    def test_progress_add_back_after_failover(self):
        servers_init = self.servers[:self.nodes_init]
        servers_failover = self.servers[(self.nodes_init - self.nodes_out) : self.nodes_init]
        servers_unchanged = self.servers[:(self.nodes_init - self.nodes_out)]
        nodes_all = self.rest.node_statuses()

        failover_nodes = []
        for failover_server in servers_failover:
            failover_nodes.extend(filter(lambda node: node.ip == failover_server.ip and \
                                         str(node.port) == failover_server.port, nodes_all))
        self.cluster.failover(servers_init, servers_failover)
        for node in failover_nodes:
            self.rest.add_back_node(node.id)

        rebalance = self.cluster.async_rebalance(servers_init, [], [])
        self.sleep(5, "wait for rebalance start")
        previous_stats = self._get_detailed_progress()
        while rebalance.state != "FINISHED":
            new_stats = self._get_detailed_progress()
            if new_stats == {}:
                self.log.info("Got empty progress")
                break
            #vbuckets left should go decreasing
            #docsTotal should not change
            #docsTransferred should go increasing
            self._check_stats(servers_unchanged, previous_stats, new_stats, "outgoing")
            self._check_stats(servers_failover, previous_stats, new_stats, "ingoing")
            previous_stats = copy.deepcopy(new_stats)
            time.sleep(1)
        rebalance.result()

    def _check_vb_sums(self, servers_ingoing, servers_outgoing, new_stats):
        active_vb_sum_1 = sum([new_stats[server.ip]["ingoing"]['activeVBucketsLeft'] for server in servers_ingoing])
        active_vb_sum_2 = sum([new_stats[server.ip]["outgoing"]['activeVBucketsLeft'] for server in servers_outgoing])
        self.assertTrue(active_vb_sum_1 == active_vb_sum_2,
                        "Active vbuckets left should be equal in servers_in and init. %s" % new_stats)

    def _check_stats(self, servers, previous_stats, new_stats, type,
                     docs_total=None, docs_transf=None,
                     active_vb=None, replica_vb=None):
        self.assertTrue(new_stats["buckets_count"] == len(self.buckets),
                        "Expected buckets %s. Actual stat %s" %(
                                len(self.buckets), new_stats))
        for server in servers:
            current_stat = new_stats[server.ip][type]
            previous_stat =  previous_stats[server.ip][type]
            if new_stats["bucket"] != previous_stats["bucket"]:
                self.assertTrue(current_stat['activeVBucketsLeft'] >= previous_stat['activeVBucketsLeft'],
                                "activeVBucketsLeft for node %s decreased! Previous stat %s. Actual: %s" %(
                                      server.ip, current_stat, previous_stat))
                self.assertTrue(current_stat['replicaVBucketsLeft'] >= previous_stat['replicaVBucketsLeft'],
                                "replicaVBucketsLeft for node %s decreased! Previous stat %s. Actual: %s" %(
                                      server.ip, current_stat, previous_stat))
            else:
                self.assertTrue(current_stat['activeVBucketsLeft'] <= previous_stat['activeVBucketsLeft'],
                                "activeVBucketsLeft for node %s increased! Previous stat %s. Actual: %s" %(
                                      server.ip, current_stat, previous_stat))
                self.assertTrue(current_stat['replicaVBucketsLeft'] <= previous_stat['replicaVBucketsLeft'],
                                "replicaVBucketsLeft for node %s increased! Previous stat %s. Actual: %s" %(
                                      server.ip, current_stat, previous_stat))
                try:
                    if current_stat['docsTotal'] != previous_stat['docsTotal']:
                        self.log.warn("docsTotal for node %s changed! Previous stat %s. Actual: %s" %(
                                          server.ip, current_stat, previous_stat))
                except Exception as ex:
                    if previous_stat['docsTotal'] != 0 and current_stat['docsTotal'] == 0:
                        command = "sys:get_status({global, ns_rebalance_observer})."
                        self.log.info("posting: %s" % command)
                        self.rest.diag_eval(command)
                    raise ex
                self.assertTrue(current_stat['docsTransferred'] >= previous_stat['docsTransferred'],
                                "docsTransferred for node %s decreased! Previous stat %s. Actual: %s" %(
                                      server.ip, current_stat, previous_stat))
            if docs_total is not None:
                self.assertTrue(current_stat['docsTotal'] == docs_total,
                                "DocTotal for %s is %s, but should be %s. Stat %s" % (
                                        server.ip, current_stat['docsTotal'], docs_total, current_stat))
            if docs_transf is not None:
                self.assertTrue(current_stat['docsTransferred'] == docs_transf,
                                "docsTransferred for %s is %s, but should be %s. Stat %s" % (
                                        server.ip, current_stat['docsTransferred'], docs_transf, current_stat))
            if active_vb is not None:
                self.assertTrue(current_stat['activeVBucketsLeft'] == active_vb,
                                "activeVBucketsLeft for %s is %s, but should be %s. Stat %s" % (
                                        server.ip, current_stat['activeVBucketsLeft'], active_vb, current_stat))
            if replica_vb is not None:
                self.assertTrue(current_stat['replicaVBucketsLeft'] == replica_vb,
                                "replicaVBucketsLeft for %s is %s, but should be %s. Stat %s" % (
                                        server.ip, current_stat['replicaVBucketsLeft'], replica_vb, current_stat))
            self.log.info("Checked stat: %s" % new_stats)
Example #30
    def test_volume_with_rebalance(self):
        self.src_bucket = RestConnection(self.master).get_buckets()
        rest = RestConnection(self.master)
        bucket = rest.get_buckets()
        # for bk in bucket:
        #     rest.flush_bucket(bk)
        #self.sleep(30)
        #load initial documents
        self.create_ddocs_and_views()
        load_thread=[]
        import Queue
        queue = Queue.Queue()
        for b in bucket:
            load_thread.append(Thread(target=lambda  q,args1,args2,args3: q.put(self.load(args1, args2, args3)), args=(queue, self.master, self.num_items, b)))
            load_thread.append(Thread(target=self.load, args=(self.master, self.num_items,b)))
        for t in load_thread:
            t.start()
        servers_init = self.servers[:self.nodes_init]
        new_server_list=self.servers[0:self.nodes_init]
        for t in load_thread:
            t.join()
        self.sleep(30)
        #Reload more data for mutations
        load_thread=[]
        for b in bucket:
            load_thread.append(Thread(target=self.load, args=(self.master, self.num_items,b,self.num_items)))
        for t in load_thread:
            t.start()
        # Rebalance in 1 node
        self.log.info("==========rebalance in 1 node=========")
        servers_in=self.servers[self.nodes_init:self.nodes_init + 1]
        rebalance = self.cluster.async_rebalance(servers_init,
                                                 servers_in, [])

        rebalance.result()
        for t in load_thread:
            t.join()
        for b in bucket:
            self.check_dataloss(self.master, b, self.num_items*2)
        # load more document
        load_thread = []
        for b in bucket:
            load_thread.append(Thread(target=self.load, args=(self.master, self.num_items, b,self.num_items*2)))
        for t in load_thread:
            t.start()
        #rebalance out 1 node
        new_server_list = self.servers[0:self.nodes_init]+ servers_in
        self.log.info("==========rebalance out 1 node=========")
        servers_out=[self.servers[self.nodes_init]]
        rebalance = self.cluster.async_rebalance(servers_init,[],
                                                 servers_out)
        rebalance.result()
        for t in load_thread:
            t.join()
        for b in bucket:
            self.check_dataloss(self.master, b, self.num_items*3)
        self.sleep(30)
        # load more document
        load_thread = []
        for b in bucket:
            load_thread.append(Thread(target=self.load, args=(self.master, self.num_items, b, self.num_items*3)))
        for t in load_thread:
            t.start()
        new_server_list=list(set(new_server_list)- set(servers_out))
        #swap rebalance 1 node
        self.log.info("==========swap rebalance 1 node=========")
        servers_in = self.servers[self.nodes_init : self.nodes_init + 1]
        servers_init = self.servers[:self.nodes_init]
        servers_out = self.servers[(self.nodes_init - 1) : self.nodes_init]

        rebalance = self.cluster.async_rebalance(servers_init, servers_in, servers_out)
        rebalance.result()
        for t in load_thread:
            t.join()
        self.sleep(30)
        for b in bucket:
            self.check_dataloss(self.master, b, self.num_items*4)
        # load more document
        load_thread = []
        for b in bucket:
            load_thread.append(Thread(target=self.load, args=(self.master, self.num_items, b, self.num_items*4)))
        for t in load_thread:
            t.start()
        new_server_list=list(set(new_server_list + servers_in) - set(servers_out))
        self.log.info("==========Rebalance out of 2 nodes and Rebalance In 1 node=========")
        # Rebalance out of 2 nodes and Rebalance In 1 node
        servers_in = [list(set(self.servers) - set(new_server_list))[0]]
        servers_out = list(set(new_server_list) - set([self.master]))[-2:]
        rebalance = self.cluster.async_rebalance(servers_init, servers_in, servers_out)
        rebalance.result()
        for t in load_thread:
            t.join()
        for b in bucket:
            self.check_dataloss(self.master, b, self.num_items*5)
        self.sleep(30)
        # load more document
        load_thread = []
        for b in bucket:
            load_thread.append(Thread(target=self.load, args=(self.master, self.num_items, b, self.num_items*5)))
        for t in load_thread:
            t.start()
        new_server_list=list(set(new_server_list + servers_in) - set(servers_out))
        self.log.info("==========Rebalance out of 1 nodes and Rebalance In 2 nodes=========")
        #Rebalance out of 1 nodes and Rebalance In 2 nodes
        servers_in = list(set(self.servers) - set(new_server_list))[0:2]
        servers_out = list(set(new_server_list) - set([self.master]))[0:1]
        rebalance = self.cluster.async_rebalance(servers_init, servers_in, servers_out)
        rebalance.result()
        for t in load_thread:
            t.join()
        for b in bucket:
            self.check_dataloss(self.master, b, self.num_items*6)
        self.sleep(30)
        # load more document
        load_thread = []
        for b in bucket:
            load_thread.append(Thread(target=self.load, args=(self.master, self.num_items, b, self.num_items*6)))
        for t in load_thread:
            t.start()
        new_server_list=list(set(new_server_list + servers_in) - set(servers_out))
        self.log.info("==========Rebalance in 4 nodes =========")
        #Rebalance in 4 nodes
        servers_in = list(set(self.servers) - set(new_server_list))[0:4]
        rebalance = self.cluster.async_rebalance(servers_init, servers_in, [])
        rebalance.result()
        for t in load_thread:
            t.join()
        for b in bucket:
            self.check_dataloss(self.master, b, self.num_items*7)
        self.sleep(30)
        # load more document
        load_thread = []
        for b in bucket:
            load_thread.append(Thread(target=self.load, args=(self.master, self.num_items, b, self.num_items*7)))
        for t in load_thread:
            t.start()
        new_server_list=list(set(new_server_list + servers_in))
        self.log.info("==========Rebalance out 4 nodes =========")
        #Rebalance out 4 nodes
        servers_out = list(set(new_server_list) - set([self.master]))[0:4]
        rebalance = self.cluster.async_rebalance(servers_init, [], servers_out)
        rebalance.result()
        for t in load_thread:
            t.join()
        for b in bucket:
            self.check_dataloss(self.master, b, self.num_items*8)
        self.sleep(30)
        # load more document
        load_thread = []
        for b in bucket:
            load_thread.append(Thread(target=self.load, args=(self.master, self.num_items, b, self.num_items*8)))
        for t in load_thread:
            t.start()
        new_server_list = list(set(new_server_list) - set(servers_out))
        self.log.info("======Rebalance in 4 nodes (8 nodes) wait for rebalance to finish and move between server groups=========")
        #Rebalance in 4 nodes (8 nodes) wait for rebalance to finish and move between server groups
        servers_in = list(set(self.servers) - set(new_server_list))[0:4]
        rebalance = self.cluster.async_rebalance(servers_init, servers_in, [])
        rebalance.result()
        for t in load_thread:
            t.join()
        for b in bucket:
            self.check_dataloss(self.master, b, self.num_items*9)
        self.sleep(30)
        load_thread = []
        for b in bucket:
            load_thread.append(Thread(target=self.load, args=(self.master, self.num_items, b, self.num_items * 9)))
        for t in load_thread:
            t.start()
        self.shuffle_nodes_between_zones_and_rebalance()
        for t in load_thread:
            t.join()
        for b in bucket:
            self.check_dataloss(self.master, b, self.num_items*10)
        self.sleep(30)
        load_thread = []
        for b in bucket:
            load_thread.append(Thread(target=self.load, args=(self.master, self.num_items, b, self.num_items * 10)))
        for t in load_thread:
            t.start()
        self.log.info("======Graceful failover 1 KV node and add back(Delta and Full)=========")
        #Graceful failover 1 KV node and add back(Delta and Full)
        kv_server = self.get_nodes_from_services_map(service_type="kv", get_all_nodes=False)
        fail_over_task = self.cluster.async_failover([self.master], failover_nodes=[kv_server], graceful=True)
        fail_over_task.result()
        self.sleep(120)
        # do a recovery and rebalance
        rest.set_recovery_type('ns_1@' + kv_server.ip, recoveryType=self.recoveryType)
        rest.add_back_node('ns_1@' + kv_server.ip)
        rebalance = self.cluster.async_rebalance(self.servers[:self.nodes_init], [], [])
        rebalance.result()
        for t in load_thread:
            t.join()
        for b in bucket:
            self.check_dataloss(self.master, b, self.num_items*11)
        self.sleep(30)
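
The volume test above repeats one rhythm: reload num_items more documents starting at the previous offset, rebalance, then assert the cumulative count with check_dataloss. A toy, self-contained version of that offset bookkeeping (the in-memory store and loader are placeholders for the real bucket load):

from threading import Thread

def load_round(store, num_items, start):
    # write num_items documents beginning at the given offset
    for i in range(start, start + num_items):
        store["doc_%d" % i] = {"id": i}

def run_rounds(rounds, num_items):
    store = {}
    for r in range(rounds):
        t = Thread(target=load_round, args=(store, num_items, r * num_items))
        t.start()
        t.join()
        # after round r the expected total is num_items * (r + 1)
        assert len(store) == num_items * (r + 1)
    return store

# run_rounds(3, 1000) ends with 3000 documents, verified after every round
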
Example #31
class GSIAutofailover(AutoFailoverBaseTest, BaseSecondaryIndexingTests):
    def setUp(self):
        super(GSIAutofailover, self).setUp()
        self.log.info(
            "==============  GSIAutofailover setup has started ==============")
        self.rest.delete_all_buckets()
        self.index_field_set = powerset([
            'age', 'city', 'country', 'title', 'firstName', 'lastName',
            'streetAddress', 'suffix', 'filler1', 'phone', 'zipcode'
        ])
        if self.failover_orchestrator:
            self.master = self.servers[1]
            self.rest = RestConnection(self.master)
        self.log.info(
            "==============  GSIAutofailover setup has completed =============="
        )

    def tearDown(self):
        self.log.info(
            "==============  GSIAutofailover tearDown has started =============="
        )
        super(GSIAutofailover, self).tearDown()
        self.log.info(
            "==============  GSIAutofailover tearDown has completed =============="
        )

    def suite_tearDown(self):
        pass

    def suite_setUp(self):
        pass

    def _create_indexes(self):
        n1ql_node = self.get_nodes_from_services_map(service_type="n1ql",
                                                     get_all_nodes=False)
        for collection_namespace in self.namespaces:
            for item, index_field in zip(range(self.initial_index_num),
                                         self.index_field_set):
                idx = f'idx_{item}'
                index_gen = QueryDefinition(index_name=idx,
                                            index_fields=index_field)
                query = index_gen.generate_index_create_query(
                    namespace=collection_namespace,
                    num_replica=self.num_index_replicas)
                self.run_cbq_query(query=query, server=n1ql_node)

    def is_failover_expected(self, failure_node_number):
        failover_not_expected = (
            self.max_count == 1 and failure_node_number > 1
            and self.pause_between_failover_action < self.timeout
            or self.num_index_replicas < 1)
        failover_not_expected = failover_not_expected or (
            1 < self.max_count < failure_node_number
            and self.pause_between_failover_action < self.timeout
            or self.num_index_replicas < failure_node_number)
        return not failover_not_expected
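
Pulled out of the class, the predicate reads as a plain function, which makes it easier to sanity-check parameter combinations by hand; the sample values below are made up:

def failover_expected(failure_node_number, max_count, pause, timeout, num_index_replicas):
    not_expected = (max_count == 1 and failure_node_number > 1 and pause < timeout
                    or num_index_replicas < 1)
    not_expected = not_expected or (1 < max_count < failure_node_number and pause < timeout
                                    or num_index_replicas < failure_node_number)
    return not not_expected

# a single failure with one index replica and max_count=1 should be failed over
assert failover_expected(1, 1, pause=0, timeout=120, num_index_replicas=1)
# with no index replicas the failover is not expected
assert not failover_expected(1, 1, pause=0, timeout=120, num_index_replicas=0)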

    def gsi_multi_node_failover(self):
        servers_to_fail = self.server_to_fail
        for i in range(self.max_count):
            self.server_to_fail = [servers_to_fail[i]]
            self.failover_expected = self.is_failover_expected(i + 1)
            self.failover_actions[self.failover_action](self)

    def test_gsi_auto_failover(self):
        self.bucket_params = self._create_bucket_params(
            server=self.master,
            size=self.bucket_size,
            replicas=self.num_replicas,
            bucket_type=self.bucket_type,
            enable_replica_index=self.enable_replica_index,
            eviction_policy=self.eviction_policy,
            lww=self.lww)
        self.cluster.create_standard_bucket(name=self.test_bucket,
                                            port=11222,
                                            bucket_params=self.bucket_params)
        self.buckets = self.rest.get_buckets()
        self.prepare_collection_for_indexing(num_of_docs_per_collection=10**5)
        self._create_indexes()
        self.enable_autofailover_and_validate()
        self.sleep(5)
        if self.max_count > 1:
            self.gsi_multi_node_failover()
        else:
            self.failover_actions[self.failover_action](self)
        try:
            self.disable_autofailover_and_validate()
        except Exception as err:
            pass

    def test_failed_rebalance_with_gsi_autofailover(self):
        self.bucket_params = self._create_bucket_params(
            server=self.master,
            size=self.bucket_size,
            replicas=self.num_replicas,
            bucket_type=self.bucket_type,
            enable_replica_index=self.enable_replica_index,
            eviction_policy=self.eviction_policy,
            lww=self.lww)
        self.cluster.create_standard_bucket(name=self.test_bucket,
                                            port=11222,
                                            bucket_params=self.bucket_params)
        self.buckets = self.rest.get_buckets()
        self.prepare_collection_for_indexing(num_of_docs_per_collection=10**5)
        self._create_indexes()
        # enable auto failover
        self.enable_autofailover_and_validate()
        # Start rebalance in
        rebalance_task = self.cluster.async_rebalance(
            servers=self.servers,
            to_add=self.servers_to_add,
            to_remove=self.servers_to_remove,
            services=['kv', 'index'])
        self.sleep(20)
        reached = RestHelper(self.rest).rebalance_reached(percentage=20)
        self.assertTrue(reached,
                        "Rebalance failed or did not reach {0}%".format(20))
        # Do a fail over action - reboot, hang, kill. This is defined in the conf file. Test sometimes fail
        # because the rebalance action is completed fast and there's no way to induce a failure.
        self.failover_actions[self.failover_action](self)
        try:
            rebalance_task.result()
        except Exception as err:
            self.log.info("Rebalance failed with : {0}".format(str(err)))
            if "Rebalance failed. See logs for detailed reason. You can try again" in str(
                    err):
                self.log.info(
                    "Rebalance failed even before auto-failover had a chance to stop it self.server_to_fail.ip: {0}"
                    .format(str(err)))
            elif not RestHelper(self.rest).is_cluster_rebalanced():
                if self._auto_failover_message_present_in_logs(
                        self.server_to_fail[0].ip):
                    self.log.info(
                        "Rebalance interrupted due to auto-failover of nodes - message was seen in logs"
                    )
                else:
                    self.fail(
                        "Rebalance interrupted message was not seen in logs")
            else:
                self.fail("Rebalance was not aborted by auto fail-over")
        self.disable_autofailover_and_validate()

    def test_autofailover_and_addback_of_node(self):
        """
        Test autofailover of nodes and then addback of the node after failover
        1. Enable autofailover and validate
        2. Fail a node and validate if node is failed over if required
        3. Addback node and validate that the addback was successful.
        4. Failover the same node again.
        :return: Nothing
        """
        self.bucket_params = self._create_bucket_params(
            server=self.master,
            size=self.bucket_size,
            replicas=self.num_replicas,
            bucket_type=self.bucket_type,
            enable_replica_index=self.enable_replica_index,
            eviction_policy=self.eviction_policy,
            lww=self.lww)
        self.cluster.create_standard_bucket(name=self.test_bucket,
                                            port=11222,
                                            bucket_params=self.bucket_params)
        self.buckets = self.rest.get_buckets()
        self.prepare_collection_for_indexing(
            num_of_docs_per_collection=self.num_of_docs_per_collection)
        self._create_indexes()
        self.enable_autofailover_and_validate()
        self.sleep(5)
        self.failover_actions[self.failover_action](self)
        self.bring_back_failed_nodes_up()
        self.sleep(30)
        self.log.info(self.server_to_fail[0])
        self.nodes = self.rest.node_statuses()
        self.log.info(self.nodes[0].id)
        self.rest.add_back_node("ns_1@{}".format(self.server_to_fail[0].ip))
        self.rest.set_recovery_type(
            "ns_1@{}".format(self.server_to_fail[0].ip),
            self.recovery_strategy)
        self.rest.rebalance(otpNodes=[node.id for node in self.nodes])
        msg = "rebalance failed while recovering failover nodes {0}".format(
            self.server_to_fail[0])
        self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True), msg)
        self.failover_actions[self.failover_action](self)
        try:
            self.disable_autofailover_and_validate()
        except Exception as err:
            pass
Example #32
    def adding_back_a_node(self, master, server):
        rest = RestConnection(master)
        nodes = rest.node_statuses()
        for node in nodes:
            if server.ip == node.ip and int(server.port) == int(node.port):
                rest.add_back_node(node.id)
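
A rough requests-based equivalent of this helper: resolve the node's otpNode id from the nodeStatuses endpoint (already used elsewhere in these examples) and re-add it. The host, credentials, and exact JSON field names here are assumptions, not the framework's RestConnection:

import requests

def add_back_by_ip(host, user, password, failed_ip, port=8091):
    base = "http://{0}:{1}".format(host, port)
    auth = (user, password)
    # nodeStatuses is assumed to map "host:port" keys to per-node info dicts
    statuses = requests.get(base + "/nodeStatuses", auth=auth).json()
    for hostport, info in statuses.items():
        if hostport.split(":")[0] == failed_ip:
            requests.post(base + "/controller/reAddNode",
                          data={"otpNode": info["otpNode"]}, auth=auth)
            return info["otpNode"]
    return None
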
Example #33
class RebalanceProgressTests(RebalanceBaseTest):
    def setUp(self):
        super(RebalanceProgressTests, self).setUp()
        self.rest = RestConnection(self.master)
        self.num_views = self.input.param("num_views", 3)
        if self.num_views:
            self._create_indexes()

    def tearDown(self):
        super(RebalanceProgressTests, self).tearDown()

    def test_progress_rebalance_in(self):
        servers_in = self.servers[self.nodes_init:self.nodes_init +
                                  self.nodes_in]
        servers_init = self.servers[:self.nodes_init]

        rebalance = self.cluster.async_rebalance(servers_init, servers_in, [])
        self.sleep(5, "wait for rebalance start")
        previous_stats = self._get_detailed_progress()

        while rebalance.state != "FINISHED":
            new_stats = self._get_detailed_progress()
            if new_stats == {}:
                self.log.info("Got empty progress")
                break
            #vbuckets left should go decreasing
            #docsTotal and docsTransferred should be 0 in added nodes
            #docsTotal should not change
            #docsTransferred should go increasing
            self._check_stats(servers_in,
                              previous_stats,
                              new_stats,
                              "outgoing",
                              docs_total=0,
                              docs_transf=0)
            self._check_stats(servers_in, previous_stats, new_stats, "ingoing")
            self._check_stats(servers_init,
                              previous_stats,
                              new_stats,
                              "ingoing",
                              docs_total=0,
                              docs_transf=0)
            self._check_stats(servers_init, previous_stats, new_stats,
                              "outgoing")
            #sum of sending and receiving vbuckets should coincide
            self._check_vb_sums(servers_init, servers_in, new_stats)
            previous_stats = copy.deepcopy(new_stats)
            time.sleep(10)
        rebalance.result()

    def test_progress_rebalance_out(self):
        with_failover = self.input.param("with_failover", False)
        servers_init = self.servers[:self.nodes_init]
        servers_out = self.servers[(self.nodes_init -
                                    self.nodes_out):self.nodes_init]

        if with_failover:
            self.cluster.failover(servers_init, servers_out)
        rebalance = self.cluster.async_rebalance(servers_init, [], servers_out)
        self.sleep(5, "wait for rebalance start")
        previous_stats = self._get_detailed_progress()
        while rebalance.state != "FINISHED":
            new_stats = self._get_detailed_progress()
            if new_stats == {}:
                self.log.info("Got empty progress")
                break
            #vbuckets left should go decreasing
            #docsTotal should not change
            #docsTransferred should go increasing
            self._check_stats(servers_init, previous_stats, new_stats,
                              "ingoing")
            self._check_stats(servers_init, previous_stats, new_stats,
                              "outgoing")
            previous_stats = copy.deepcopy(new_stats)
            time.sleep(1)
        rebalance.result()

    def test_progress_rebalance_swap(self):
        if self.nodes_in != self.nodes_out:
            self.fail("nodes_in != nodes_out. Not a swap rebalance")
        if len(self.servers) < (self.nodes_init + self.nodes_in):
            self.log.error("Not enough VMs!")
            return
        servers_in = self.servers[self.nodes_init:self.nodes_init +
                                  self.nodes_in]
        servers_init = self.servers[:self.nodes_init]
        servers_unchanged = self.servers[:(self.nodes_init - self.nodes_out)]
        servers_out = self.servers[(self.nodes_init -
                                    self.nodes_out):self.nodes_init]

        rebalance = self.cluster.async_rebalance(servers_init, servers_in,
                                                 servers_out)
        self.sleep(5, "wait for rebalance start")
        previous_stats = self._get_detailed_progress()
        while rebalance.state != "FINISHED":
            new_stats = self._get_detailed_progress()
            if new_stats == {}:
                self.log.info("Got empty progress")
                break
            #vbuckets left should go decreasing
            #docsTotal and docsTransferred should be 0 in added nodes
            #no vbuckets moving for unchanged nodes
            #docsTotal should not change
            #docsTransferred should go increasing
            self._check_stats(servers_in,
                              previous_stats,
                              new_stats,
                              "outgoing",
                              docs_total=0,
                              docs_transf=0)
            self._check_stats(servers_in, previous_stats, new_stats, "ingoing")
            self._check_stats(servers_unchanged,
                              previous_stats,
                              new_stats,
                              "ingoing",
                              active_vb=0,
                              replica_vb=0)
            self._check_stats(servers_unchanged,
                              previous_stats,
                              new_stats,
                              "outgoing",
                              active_vb=0,
                              replica_vb=0)
            self._check_stats(servers_out, previous_stats, new_stats,
                              "outgoing")

            #sum of sending and receiving vbuckets should coincide
            self._check_vb_sums(servers_in, servers_out, new_stats)
            previous_stats = copy.deepcopy(new_stats)
            time.sleep(1)
        rebalance.result()

    def test_progress_add_back_after_failover(self):
        servers_init = self.servers[:self.nodes_init]
        servers_failover = self.servers[(self.nodes_init -
                                         self.nodes_out):self.nodes_init]
        servers_unchanged = self.servers[:(self.nodes_init - self.nodes_out)]
        nodes_all = self.rest.node_statuses()

        failover_nodes = []
        for failover_server in servers_failover:
            failover_nodes.extend(filter(lambda node: node.ip == failover_server.ip and \
                                         str(node.port) == failover_server.port, nodes_all))
        self.cluster.failover(servers_init, servers_failover)
        self.sleep(30)
        for node in failover_nodes:
            self.rest.add_back_node(node.id)

        rebalance = self.cluster.async_rebalance(servers_init, [], [])
        self.sleep(5, "wait for rebalance start")
        previous_stats = self._get_detailed_progress()
        while rebalance.state != "FINISHED":
            new_stats = self._get_detailed_progress()
            if new_stats == {}:
                self.log.info("Got empty progress")
                break
            #vbuckets left should go decreasing
            #docsTotal should not change
            #docsTransferred should go increasing
            self._check_stats(servers_unchanged, previous_stats, new_stats,
                              "outgoing")
            self._check_stats(servers_failover, previous_stats, new_stats,
                              "ingoing")
            previous_stats = copy.deepcopy(new_stats)
            time.sleep(1)
        rebalance.result()

    def _check_vb_sums(self, servers_ingoing, servers_outgoing, new_stats):
        active_vb_sum_1 = sum([
            new_stats[server.ip]["ingoing"]['activeVBucketsLeft']
            for server in servers_ingoing
        ])
        active_vb_sum_2 = sum([
            new_stats[server.ip]["outgoing"]['activeVBucketsLeft']
            for server in servers_outgoing
        ])
        self.assertTrue(
            active_vb_sum_1 == active_vb_sum_2,
            "Active vbuckets left should be equal in servers_in and init. %s" %
            new_stats)

    def _check_stats(self,
                     servers,
                     previous_stats,
                     new_stats,
                     type,
                     docs_total=None,
                     docs_transf=None,
                     active_vb=None,
                     replica_vb=None):
        self.assertTrue(
            new_stats["buckets_count"] == len(self.buckets),
            "Expected buckets %s. Actual stat %s" %
            (len(self.buckets), new_stats))
        for server in servers:
            current_stat = new_stats[server.ip][type]
            previous_stat = previous_stats[server.ip][type]
            if new_stats["bucket"] != previous_stats["bucket"]:
                self.assertTrue(
                    current_stat['activeVBucketsLeft'] >=
                    previous_stat['activeVBucketsLeft'],
                    "activeVBucketsLeft for node %s decreased! Previous stat %s. Actual: %s"
                    % (server.ip, current_stat, previous_stat))
                self.assertTrue(
                    current_stat['replicaVBucketsLeft'] >=
                    previous_stat['replicaVBucketsLeft'],
                    "replicaVBucketsLeft for node %s decreased! Previous stat %s. Actual: %s"
                    % (server.ip, current_stat, previous_stat))
            else:
                self.assertTrue(
                    current_stat['activeVBucketsLeft'] <=
                    previous_stat['activeVBucketsLeft'],
                    "activeVBucketsLeft for node %s increased! Previous stat %s. Actual: %s"
                    % (server.ip, current_stat, previous_stat))
                self.assertTrue(
                    current_stat['replicaVBucketsLeft'] <=
                    previous_stat['replicaVBucketsLeft'],
                    "replicaVBucketsLeft for node %s increased! Previous stat %s. Actual: %s"
                    % (server.ip, current_stat, previous_stat))
                try:
                    if current_stat['docsTotal'] != previous_stat['docsTotal']:
                        self.log.warn(
                            "docsTotal for node %s changed! Previous stat %s. Actual: %s"
                            % (server.ip, current_stat, previous_stat))
                except Exception as ex:
                    if previous_stat['docsTotal'] != 0 and current_stat[
                            'docsTotal'] == 0:
                        command = "sys:get_status({global, ns_rebalance_observer})."
                        self.log.info("posting: %s" % command)
                        self.rest.diag_eval(command)
                    raise ex
                self.assertTrue(
                    current_stat['docsTransferred'] >=
                    previous_stat['docsTransferred'],
                    "docsTransferred for node %s decreased! Previous stat %s. Actual: %s"
                    % (server.ip, current_stat, previous_stat))
            if docs_total is not None:
                self.assertTrue(
                    current_stat['docsTotal'] == docs_total,
                    "DocTotal for %s is %s, but should be %s. Stat %s" %
                    (server.ip, current_stat['docsTotal'], docs_total,
                     current_stat))
            if docs_transf is not None:
                self.assertTrue(
                    current_stat['docsTransferred'] == docs_transf,
                    "docsTransferred for %s is %s, but should be %s. Stat %s" %
                    (server.ip, current_stat['docsTransferred'], docs_transf,
                     current_stat))
            if active_vb is not None:
                self.assertTrue(
                    current_stat['activeVBucketsLeft'] == active_vb,
                    "activeVBucketsLeft for %s is %s, but should be %s. Stat %s" %
                    (server.ip, current_stat['activeVBucketsLeft'], active_vb,
                     current_stat))
            if replica_vb is not None:
                self.assertTrue(
                    current_stat['replicaVBucketsLeft'] == replica_vb,
                    "replicaVBucketsLeft for %s is %s, but should be %s. Stat %s" %
                    (server.ip, current_stat['replicaVBucketsLeft'], replica_vb,
                     current_stat))
            self.log.info("Checked stat: %s" % new_stats)
Ejemplo n.º 34
0
    def cluster_nodes_write(self,username,password,host,port=8091, servers=None,cluster=None,httpCode=None,user_role=None):
        try:
            _cluster_nodes_write = {
                "ejectNode":"/controller/ejectNode;POST",
                #"addNode":"/controller/addNode;POST",
                #"addNodeV2":"/controller/addNodeV2;POST",
                #"uuidAddNode":"pools/default/serverGroups/<uuid>/addNode;POST",
                #"uiidAddNodev1":"/pools/default/serverGroups/<uuid>/addNodeV2;POST",
                #"failover":"/controller/failOver;POST",
                #"graceFullFailover":"/controller/startGracefulFailover;POST",
                #"rebalance":"/controller/rebalance;POST",
                #"reAddNode":"/controller/reAddNode;POST",
                #"reFailover":"/controller/reFailOver;POST",
                #"stopRebalance":"/controller/stopRebalance;POST",
                #"setRecoveryType":"/controller/setRecoveryType;POST"
            }

            rest = RestConnection(servers[0])
            known_nodes = []


            #Add Node
            params = {'hostname': servers[1].ip,'user': '******','password': '******'}
            add_node = {"addNode":"controller/addNode;POST;" + str(params)}
            result = self._return_http_code(add_node,username,password,host=host,port=port, httpCode=httpCode, user_role=user_role)

            #cluster.rebalance(servers,servers[1],[])
            rest.eject_node("Administrator","password",'ns_1@'+servers[1].ip)

            #cluster.rebalance(servers,[],servers[1:])

            #time.sleep(30)
            #params = {'hostname': servers[1].ip,'user': '******','password': '******'}
            #add_node = {"addNode":"controller/addNodeV2;POST;" + str(params)}
            #result = self._return_http_code(add_node,username,password,host=host,port=port, httpCode=httpCode, user_role=user_role)

            #cluster.rebalance(servers,[],servers[1:])

            time.sleep(30)
            cluster.rebalance(servers,servers[1:],[])
            params = {'otpNode': "ns_1@"+servers[1].ip}
            failover_node = {"failover":"controller/failOver;POST;"+str(params)}
            result = self._return_http_code(failover_node,username,password,host=host,port=port, httpCode=httpCode, user_role=user_role)
            time.sleep(30)
            cluster.rebalance(servers,[],servers[1:])

            time.sleep(15)
            cluster.rebalance(servers,servers[1:],[])
            time.sleep(15)
            params = {'otpNode': "ns_1@"+servers[1].ip}
            grace_failover = {"grace_failover":"controller/startGracefulFailover;POST;"+str(params)}
            result = self._return_http_code(grace_failover,username,password,host=host,port=port, httpCode=httpCode, user_role=user_role)
            time.sleep(60)
            rest.set_recovery_type("ns_1@"+servers[1].ip,'delta')
            time.sleep(30)

            rest.add_back_node("ns_1@"+servers[1].ip)

            time.sleep(30)
            serv_out = 'ns_1@' + servers[2].ip
            rest.fail_over(serv_out,graceful=False)
            time.sleep(15)
            params = {'otpNode': "ns_1@"+servers[2].ip}
            radd_node = {"reAddNode":"controller/reAddNode;POST;"+ str(params)}
            result = self._return_http_code(radd_node,username,password,host=host,port=port, httpCode=httpCode, user_role=user_role)


            time.sleep(30)

            #serv_out = 'ns_1@' + servers[3].ip
            #rest.fail_over(serv_out,graceful=False)
            #params = {'otpNode': "ns_1@"+servers[3].ip}
            #radd_node = {"reFailOver":"controller/reFailOver;POST;"+ str(params)}
            #result = self._return_http_code(radd_node,username,password,host=host,port=port, httpCode=httpCode, user_role=user_role)

            cluster.rebalance(servers,[],servers[1:])

            time.sleep(30)
            cluster.rebalance(servers,servers[1:],[])
            time.sleep(30)
            serv_out = 'ns_1@' + servers[1].ip
            rest.fail_over(serv_out,graceful=True)
            time.sleep(60)
            params = {'otpNode': 'ns_1@'+servers[1].ip,'recoveryType': 'delta'}
            recovery_type = {"setRecoveryType":"controller/setRecoveryType;POST;"+ str(params)}
            result = self._return_http_code(recovery_type,username,password,host=host,port=port, httpCode=httpCode, user_role=user_role)
            cluster.rebalance(servers)
        except:
            log.info ("Issue with rebalance, going to next test case")
            cluster.rebalance(servers,[],servers[1:])
            for server in servers:
                rest = RestConnection(server)
                rest.init_cluster(username='******', password='******')
                rest.init_cluster_memoryQuota(memoryQuota=rest.get_nodes_self().mcdMemoryReserved)
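
The descriptors built in cluster_nodes_write above encode each REST call as "path;METHOD;params", e.g. "controller/failOver;POST;" followed by a str()-ed dict, before handing it to self._return_http_code. A hedged sketch of how such a descriptor can be unpacked (parse_endpoint_descriptor is hypothetical; the real helper presumably does something similar before issuing the request):

import ast

def parse_endpoint_descriptor(descriptor):
    # Split "path;METHOD;params" as produced in cluster_nodes_write above.
    parts = descriptor.split(";", 2)
    path, method = parts[0], parts[1]
    # params were serialized with str(dict), so literal_eval restores them safely
    params = ast.literal_eval(parts[2]) if len(parts) > 2 else {}
    return path, method, params

print parse_endpoint_descriptor("controller/failOver;POST;" + str({'otpNode': 'ns_1@10.1.2.3'}))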
Ejemplo n.º 35
0
    def adding_back_a_node(self, master, server):
        rest = RestConnection(master)
        nodes = rest.node_statuses()
        for node in nodes:
            if server.ip == node.ip and int(server.port) == int(node.port):
                rest.add_back_node(node.id)
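
A hedged usage sketch for the helper above: after a failover, look up the node's otpNode id, add it back, request a recovery type, and rebalance. Every REST call mirrors one used in the surrounding examples; the standalone function, its arguments, and the import path are assumptions.

from membase.api.rest_client import RestConnection  # assumed import path

def add_back_and_rebalance(master, failed_server, recovery_type='delta'):
    # master: any healthy cluster node; failed_server: the node that was failed over
    rest = RestConnection(master)
    for node in rest.node_statuses():
        if node.ip == failed_server.ip and int(node.port) == int(failed_server.port):
            rest.add_back_node(node.id)
            rest.set_recovery_type(otpNode=node.id, recoveryType=recovery_type)
    rest.rebalance(otpNodes=[n.id for n in rest.node_statuses()],
                   ejectedNodes=[])
    return rest.monitorRebalance(stop_if_loop=True)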
Ejemplo n.º 36
0
    def common_test_body(self, keys_count, replica, failover_reason):
        log = logger.Logger.get_logger()
        log.info("keys_count : {0}".format(keys_count))
        log.info("replicas : {0}".format(replica))
        log.info("failover_reason : {0}".format(failover_reason))
        log.info('picking server : {0} as the master'.format(self.master))

        self._load_all_buckets(self.master,
                               self.gen_create,
                               "create",
                               0,
                               batch_size=10000,
                               pause_secs=5,
                               timeout_secs=180)
        self._wait_for_stats_all_buckets(self._servers)

        _servers_ = self._servers
        rest = RestConnection(self.master)
        nodes = rest.node_statuses()

        self._wait_for_replication(self._servers, timeout=600)
        chosen = RebalanceHelper.pick_nodes(self.master, howmany=replica)
        for node in chosen:
            #let's do op
            if failover_reason == 'stop_server':
                self.stop_server(node)
                log.info(
                    "waiting for membase-server to shut down")
                #wait up to 5 minutes until the node is reported unhealthy
                self.assertTrue(
                    RestHelper(rest).wait_for_node_status(
                        node, "unhealthy", 300),
                    msg=
                    "node status is not unhealthy even after waiting for 5 minutes"
                )
            elif failover_reason == "firewall":
                RemoteUtilHelper.enable_firewall(
                    self._servers, node, bidirectional=self.bidirectional)
                status = RestHelper(rest).wait_for_node_status(
                    node, "unhealthy", 300)
                if status:
                    log.info("node {0}:{1} is 'unhealthy' as expected".format(
                        node.ip, node.port))
                else:
                    #verify iptables on the node if something wrong
                    for server in self._servers:
                        if server.ip == node.ip:
                            shell = RemoteMachineShellConnection(server)
                            o, r = shell.execute_command(
                                "/sbin/iptables --list")
                            shell.log_command_output(o, r)
                            shell.disconnect()
                    self.assertTrue(
                        status,
                        msg=
                        "node status is not unhealthy even after waiting for 5 minutes"
                    )

            failed_over = rest.fail_over(node.id)
            if not failed_over:
                self.log.info(
                    "unable to failover the node the first time. Retrying in 75 seconds..."
                )
                #retry after 75 seconds
                time.sleep(75)
                failed_over = rest.fail_over(node.id)
            self.assertTrue(
                failed_over,
                "unable to failover node after {0}".format(failover_reason))
            log.info("failed over node : {0}".format(node.id))
            self._failed_nodes.append(node)

        if self.add_back_flag:
            for node in self._failed_nodes:
                rest.add_back_node(node.id)
                time.sleep(5)
            log.info(
                "10 seconds sleep after failover before invoking rebalance...")
            time.sleep(10)
            rest.rebalance(otpNodes=[node.id for node in nodes],
                           ejectedNodes=[])
            msg = "rebalance failed while removing failover nodes {0}".format(
                chosen)
            self.assertTrue(rest.monitorRebalance(stop_if_loop=True), msg=msg)
        else:
            log.info(
                "10 seconds sleep after failover before invoking rebalance...")
            time.sleep(10)
            rest.rebalance(otpNodes=[node.id for node in nodes],
                           ejectedNodes=[node.id for node in chosen])
            msg = "rebalance failed while removing failover nodes {0}".format(
                chosen)
            self.assertTrue(rest.monitorRebalance(stop_if_loop=True), msg=msg)
            for failed in chosen:
                for server in _servers_:
                    if server.ip == failed.ip:
                        _servers_.remove(server)
                        self._cleanup_nodes.append(server)

        log.info("Begin VERIFICATION ...")
        self._wait_for_stats_all_buckets(_servers_)
        self._wait_for_replication(self._servers, timeout=600)
        self._verify_stats_all_buckets(_servers_)
        self._verify_all_buckets(self.master)
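
One pattern worth isolating from common_test_body above is the failover retry: ns_server can reject the first fail_over request while the node's state is still settling, so the test waits 75 seconds and retries once. A minimal sketch under the same assumptions (a RestConnection as used above; the standalone function is hypothetical):

import time

def failover_with_retry(rest, otp_node_id, graceful=False, wait_secs=75):
    # rest: a RestConnection to a healthy node; otp_node_id: e.g. "ns_1@10.1.2.3"
    failed_over = rest.fail_over(otp_node_id, graceful=graceful)
    if not failed_over:
        time.sleep(wait_secs)  # give ns_server time before the single retry
        failed_over = rest.fail_over(otp_node_id, graceful=graceful)
    return failed_over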
Ejemplo n.º 37
0
    def replicate_correct_data_after_rollback(self):

        NUMBER_OF_DOCS = 10000

        # populate the kvs, they will look like ...
        """
        key: keyname-x
        value:
          {
          "mutated": 0,
            "_id": "keyname-x",
             "val-field-name": "serial-vals-100"
            }
        """
        vals = ['serial-vals-' + str(i) for i in xrange(NUMBER_OF_DOCS)]
        template = '{{ "val-field-name": "{0}"  }}'
        gen_load = DocumentGenerator('keyname',
                                     template,
                                     vals,
                                     start=0,
                                     end=NUMBER_OF_DOCS)

        rc = self.cluster.load_gen_docs(self.servers[0],
                                        self.buckets[0].name,
                                        gen_load,
                                        self.buckets[0].kvs[1],
                                        "create",
                                        exp=0,
                                        flag=0,
                                        batch_size=1000)

        # store the KVs which were modified and active on node 1
        modified_kvs_active_on_node1 = {}
        vbucket_client = VBucketAwareMemcached(RestConnection(self.master),
                                               'default')
        client = MemcachedClientHelper.direct_client(self.servers[0],
                                                     'default')
        for i in range(NUMBER_OF_DOCS / 100):
            keyname = 'keyname-' + str(i)
            vbId = ((zlib.crc32(keyname) >> 16) & 0x7fff) & (self.vbuckets - 1)
            if vbucket_client.vBucketMap[vbId].split(
                    ':')[0] == self.servers[0].ip:
                rc = client.get(keyname)
                modified_kvs_active_on_node1[keyname] = rc[2]

        # stop persistence
        for bucket in self.buckets:
            for s in self.servers[:self.nodes_init]:
                client = MemcachedClientHelper.direct_client(s, bucket)
                try:
                    client.stop_persistence()
                except MemcachedError as e:
                    if self.bucket_type == 'ephemeral':
                        self.assertTrue(
                            "Memcached error #4 'Invalid':  Flusher not running. for vbucket :0 to mc "
                            in e.message)
                        return
                    else:
                        raise

        # modify less than 1/2 of the keys
        vals = [
            'modified-serial-vals-' + str(i)
            for i in xrange(NUMBER_OF_DOCS / 100)
        ]
        template = '{{ "val-field-name": "{0}"  }}'
        gen_load = DocumentGenerator('keyname',
                                     template,
                                     vals,
                                     start=0,
                                     end=NUMBER_OF_DOCS / 100)
        rc = self.cluster.load_gen_docs(self.servers[0],
                                        self.buckets[0].name,
                                        gen_load,
                                        self.buckets[0].kvs[1],
                                        "create",
                                        exp=0,
                                        flag=0,
                                        batch_size=1000)

        # kill memcached, when it comes back because persistence is disabled it will have lost the second set of mutations
        shell = RemoteMachineShellConnection(self.servers[0])
        shell.kill_memcached()
        time.sleep(10)

        # start persistence on the second node
        client = MemcachedClientHelper.direct_client(self.servers[1],
                                                     'default')
        client.start_persistence()

        time.sleep(5)

        # failover to the second node
        rc = self.cluster.failover(self.servers,
                                   self.servers[1:2],
                                   graceful=True)
        time.sleep(30)  # give time for the failover to complete

        # check the values, they should be what they were prior to the second update
        client = MemcachedClientHelper.direct_client(self.servers[0],
                                                     'default')
        for k, v in modified_kvs_active_on_node1.iteritems():
            rc = client.get(k)
            self.assertTrue(v == rc[2],
                            'Expected {0}, actual {1}'.format(v, rc[2]))

        # need to rebalance the node back into the cluster
        # def rebalance(self, servers, to_add, to_remove, timeout=None, use_hostnames=False, services = None):
        rest_obj = RestConnection(self.servers[0])
        node_id_for_recovery = "ns_1@" + self.servers[1].ip
        status = rest_obj.add_back_node(node_id_for_recovery)
        if status:
            rest_obj.set_recovery_type(node_id_for_recovery,
                                       recoveryType='delta')
        rc = self.cluster.rebalance(self.servers[:self.nodes_init], [], [])
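
The key-to-vBucket mapping used above (to decide which keys are active on node 1) is a CRC32-based hash. A standalone sketch of the same formula, assuming the default 1024 vBuckets:

import zlib

def vbucket_for_key(key, num_vbuckets=1024):
    # Same expression as in replicate_correct_data_after_rollback above
    return ((zlib.crc32(key) >> 16) & 0x7fff) & (num_vbuckets - 1)

print vbucket_for_key('keyname-0')  # compare against vbucket_client.vBucketMap to find the owning node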
Ejemplo n.º 38
0
    def replicate_correct_data_after_rollback(self):
        '''
        @attention: This test case has some issue with docker runs. It
        passes without any issue on VMs.
        '''

        NUMBER_OF_DOCS = 10000


        # populate the kvs, they will look like ...
        """
        key: keyname-x
        value:
          {
          "mutated": 0,
            "_id": "keyname-x",
             "val-field-name": "serial-vals-100"
            }
        """
        vals = ['serial-vals-' + str(i) for i in xrange(NUMBER_OF_DOCS)]
        template = '{{ "val-field-name": "{0}"  }}'
        gen_load = DocumentGenerator('keyname', template, vals, start=0,
                                     end=NUMBER_OF_DOCS)

        rc = self.cluster.load_gen_docs(self.servers[0], self.buckets[0].name, gen_load,
                                   self.buckets[0].kvs[1], "create", exp=0, flag=0, batch_size=1000)

        # store the KVs which were modified and active on node 1
        modified_kvs_active_on_node1 = {}
        vbucket_client = VBucketAwareMemcached(RestConnection(self.master), 'default')
        client = MemcachedClientHelper.direct_client(self.servers[0], 'default')
        for i in range(NUMBER_OF_DOCS/100):
            keyname = 'keyname-' + str(i)
            vbId = ((zlib.crc32(keyname) >> 16) & 0x7fff) & (self.vbuckets- 1)
            if vbucket_client.vBucketMap[vbId].split(':')[0] == self.servers[0].ip:
                rc = client.get( keyname )
                modified_kvs_active_on_node1[ keyname ] = rc[2]

        # stop persistence
        for bucket in self.buckets:
            for s in self.servers[:self.nodes_init]:
                client = MemcachedClientHelper.direct_client(s, bucket)
                try:
                    client.stop_persistence()
                except MemcachedError as e:
                    if self.bucket_type == 'ephemeral':
                        self.assertTrue(
                            "Memcached error #4 'Invalid':  Flusher not running. for vbucket :0 to mc " in e.message)
                        return
                    else:
                        raise

        # modify less than 1/2 of the keys
        vals = ['modified-serial-vals-' + str(i) for i in xrange(NUMBER_OF_DOCS/100)]
        template = '{{ "val-field-name": "{0}"  }}'
        gen_load = DocumentGenerator('keyname', template, vals, start=0,
                                     end=NUMBER_OF_DOCS/100)
        rc = self.cluster.load_gen_docs(self.servers[0], self.buckets[0].name, gen_load,
                                   self.buckets[0].kvs[1], "create", exp=0, flag=0, batch_size=1000)

        # kill memcached, when it comes back because persistence is disabled it will have lost the second set of mutations
        shell = RemoteMachineShellConnection(self.servers[0])
        shell.kill_memcached()
        time.sleep(10)

        # start persistence on the second node
        client = MemcachedClientHelper.direct_client(self.servers[1], 'default')
        client.start_persistence()

        time.sleep(5)

        # failover to the second node
        rc = self.cluster.failover(self.servers, self.servers[1:2], graceful=True)
        time.sleep(30)     # give time for the failover to complete

        # check the values, they should be what they were prior to the second update
        client = MemcachedClientHelper.direct_client(self.servers[0], 'default')
        for k,v  in modified_kvs_active_on_node1.iteritems():
            rc = client.get( k )
            self.assertTrue( v == rc[2], 'Expected {0}, actual {1}'.format(v, rc[2]))

        # need to rebalance the node back into the cluster
        # def rebalance(self, servers, to_add, to_remove, timeout=None, use_hostnames=False, services = None):

        rest_obj = RestConnection(self.servers[0])
        nodes_all = rest_obj.node_statuses()
        for node in nodes_all:
            if node.ip == self.servers[1].ip:
                break

        node_id_for_recovery = node.id
        status = rest_obj.add_back_node(node_id_for_recovery)
        if status:
            rest_obj.set_recovery_type(node_id_for_recovery,
                                       recoveryType='delta')
        rc = self.cluster.rebalance(self.servers[:self.nodes_init], [],[])
Ejemplo n.º 39
0
    def test_backwards_compatability(self):
        create_index_query = "CREATE INDEX idx_name ON {0}(name)".format(
            self.bucket_name)
        self.n1ql_helper.run_cbq_query(query=create_index_query,
                                       server=self.n1ql_node)
        create_index_query = "CREATE INDEX idx_day ON {0}(join_day)".format(
            self.bucket_name)
        self.n1ql_helper.run_cbq_query(query=create_index_query,
                                       server=self.n1ql_node)
        self.wait_until_indexes_online()

        result = self.n1ql_helper.run_cbq_query(
            query='SELECT * FROM {0} where name = "employee-9"'.format(
                self.bucket_name))
        self.assertEqual(result['metrics']['resultCount'], 72)

        result2 = self.n1ql_helper.run_cbq_query(
            query='SELECT * FROM {0} where join_day = 9'.format(
                self.bucket_name))
        self.assertEqual(result2['metrics']['resultCount'], 72)

        upgrade_nodes = self.servers[:self.nodes_init]

        for server in upgrade_nodes:
            remote = RemoteMachineShellConnection(server)
            remote.stop_server()
            remote.disconnect()
            upgrade_threads = self._async_update(self.upgrade_to, [server])
            for upgrade_thread in upgrade_threads:
                upgrade_thread.join()
            self.upgrade_servers.append(server)
        self.sleep(180)
        msg = "Cluster is not healthy after upgrade"
        self.assertTrue(self.wait_until_cluster_is_healthy(), msg)
        self.log.info("Cluster is healthy")
        rest = RestConnection(self.master)
        nodes_all = rest.node_statuses()
        try:
            for cluster_node in nodes_all:
                if cluster_node.ip == self.master.ip:
                    self.log.info("Adding Back: {0}".format(self.master.ip))
                    rest.add_back_node(cluster_node.id)
                    rest.set_recovery_type(otpNode=cluster_node.id,
                                           recoveryType="full")
        except Exception as e:
            self.log.error(str(e))
        self.log.info("Adding node back to cluster...")
        rebalance = self.cluster.async_rebalance(
            self.servers[:self.nodes_init], [], [])
        rebalance.result()
        self.assertTrue(self.wait_until_indexes_online(),
                        "Some indexes are not online")
        self.log.info("All indexes are online")
        self.add_built_in_server_user()
        self.sleep(20)

        try:
            create_index_query = "CREATE INDEX idx_name ON {0}(name)".format(
                self.bucket_name)
            self.n1ql_helper.run_cbq_query(query=create_index_query,
                                           server=self.n1ql_node)
        except Exception as e:
            self.log.info("indexes already exist")
        try:
            create_index_query = "CREATE INDEX idx_day ON {0}(join_day)".format(
                self.bucket_name)
            self.n1ql_helper.run_cbq_query(query=create_index_query,
                                           server=self.n1ql_node)
            self.wait_until_indexes_online()
        except Exception as e:
            self.log.info("indexes already exist")

        result = self.n1ql_helper.run_cbq_query(
            query='SELECT * FROM {0} where name = "employee-9"'.format(
                self.bucket_name))
        self.assertEqual(result['metrics']['resultCount'], 72)

        result2 = self.n1ql_helper.run_cbq_query(
            query='SELECT * FROM {0} where join_day = 9'.format(
                self.bucket_name))
        self.assertEqual(result2['metrics']['resultCount'], 72)

        result = self.n1ql_helper.run_cbq_query(
            query=
            'SELECT * FROM default:{0}._default._default where name = "employee-9"'
            .format(self.bucket_name))
        self.assertEqual(result['metrics']['resultCount'], 72)

        result2 = self.n1ql_helper.run_cbq_query(
            query=
            'SELECT * FROM default:{0}._default._default where join_day = 9'.
            format(self.bucket_name))
        self.assertEqual(result2['metrics']['resultCount'], 72)

        result = self.n1ql_helper.run_cbq_query(
            query='SELECT * FROM _default where name = "employee-9"',
            query_context='default:{0}._default'.format(self.bucket_name))
        self.assertEqual(result['metrics']['resultCount'], 72)

        result2 = self.n1ql_helper.run_cbq_query(
            query='SELECT * FROM _default where join_day = 9',
            query_context='default:{0}._default'.format(self.bucket_name))
        self.assertEqual(result2['metrics']['resultCount'], 72)
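
The last two queries in the test above show that a fully qualified keyspace and a short keyspace plus query_context address the same collection. A hedged sketch of that equivalence as a standalone check, assuming the same n1ql_helper used above (the wrapper function itself is an assumption):

def assert_query_context_equivalent(n1ql_helper, bucket_name):
    # Fully qualified keyspace vs. short keyspace with query_context, as above
    full = n1ql_helper.run_cbq_query(
        query='SELECT * FROM default:{0}._default._default where join_day = 9'.format(bucket_name))
    scoped = n1ql_helper.run_cbq_query(
        query='SELECT * FROM _default where join_day = 9',
        query_context='default:{0}._default'.format(bucket_name))
    assert full['metrics']['resultCount'] == scoped['metrics']['resultCount']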
Ejemplo n.º 40
0
class FailoverTests(FailoverBaseTest):
    def setUp(self):
        super(FailoverTests, self).setUp(self)

    def tearDown(self):
        super(FailoverTests, self).tearDown(self)

    def test_failover_firewall(self):
        self.common_test_body('firewall')

    def test_failover_normal(self):
        self.common_test_body('normal')

    def test_failover_stop_server(self):
        self.common_test_body('stop_server')

    def test_failover_then_add_back(self):
        self.add_back_flag = True
        self.common_test_body('normal')

    def common_test_body(self, failover_reason):
        """
            Main Test body which contains the flow of the failover basic steps
            1. Starts Operations if programmed into the test case (before/after)
            2. Start View and Index Building operations
            3. Failover K out of N nodes (failover can be HARDFAILOVER/GRACEFUL)
            4.1 Rebalance the cluster is failover of K nodeStatuses
            4.2 Run Add-Back operation with recoveryType = (full/delta) with rebalance
            5. Verify all expected operations completed by checking stats, replicaiton, views, data correctness
        """
        # Pick the reference node for communication
        # We pick a node in the cluster which will NOT be failed over
        self.referenceNode = self.master
        if self.failoverMaster:
            self.referenceNode = self.servers[1]
        self.log.info(" Picking node {0} as reference node for test case".format(self.referenceNode.ip))
        self.print_test_params(failover_reason)
        self.rest = RestConnection(self.referenceNode)
        self.nodes = self.rest.node_statuses()

        # Set the data path for the cluster
        self.data_path = self.rest.get_data_path()

        # Check if the test case has to be run for 3.0.0
        versions = self.rest.get_nodes_versions()
        self.version_greater_than_2_5 = True
        for version in versions:
            if "3" > version:
                self.version_greater_than_2_5 = False

        # Do not run this this test if graceful category is being used
        if not self.version_greater_than_2_5 and (self.graceful or (self.recoveryType != None)):
            self.log.error("Graceful failover can't be applied to nodes with version less then 3.*")
            self.log.error("Please check configuration parameters: SKIPPING TEST.")
            return

        # Find nodes that will undergo failover
        self.chosen = RebalanceHelper.pick_nodes(self.referenceNode, howmany=self.num_failed_nodes)

        # Perform operations - Create/Update/Delete
        # self.withOps = True => Run Operations in parallel to failover
        # self.withOps = False => Run Operations Before failover
        self.ops_tasks = self.run_operation_tasks()

        # Perform View Creation Tasks and check for completion if required before failover
        if self.runViews:
            self.run_view_creation_operations(self.servers)
            if not self.runViewsDuringFailover:
                self.run_view_creation_operations(self.servers)
                self.monitor_view_tasks(self.servers)

        # Take a snapshot of the data set used for validation
        record_static_data_set = self.get_data_set_all(self.servers, self.buckets, path = None)
        prev_vbucket_stats = {}
        prev_failover_stats = {}

        # Capture  vbucket and failover stats if test version >= 2.5.*
        if self.version_greater_than_2_5 and self.upr_check:
            prev_vbucket_stats = self.get_vbucket_seqnos(self.servers, self.buckets)
            prev_failover_stats = self.get_failovers_logs(self.servers, self.buckets)

        # Perform operations related to failover
        self.run_failover_operations(self.chosen, failover_reason)

        # Perform the add-back operation with rebalance, or only rebalance with verifications
        if not self.gracefulFailoverFail:
            if self.add_back_flag:
                self.run_add_back_operation_and_verify(self.chosen, prev_vbucket_stats, record_static_data_set, prev_failover_stats)
            else:
                self.run_rebalance_after_failover_and_verify(self.chosen, prev_vbucket_stats, record_static_data_set, prev_failover_stats)

    def run_rebalance_after_failover_and_verify(self, chosen, prev_vbucket_stats, record_static_data_set, prev_failover_stats):
        """ Method to run rebalance after failover and verify """
        # Need a delay of at least a minute because of MB-7168
        self.sleep(60, "after failover before invoking rebalance...")
        _servers_ = self.filter_servers(self.servers, chosen)
        # Rebalance after Failover operation
        self.rest.rebalance(otpNodes=[node.id for node in self.nodes],
                               ejectedNodes=[node.id for node in chosen])
        if self.during_ops:
            self.sleep(5, "Wait for some progress in rebalance")
            if self.during_ops == "change_password":
                old_pass = self.referenceNode.rest_password
                self.change_password(new_password=self.input.param("new_password", "new_pass"))
                self.rest = RestConnection(self.referenceNode)
            elif self.during_ops == "change_port":
                self.change_port(new_port=self.input.param("new_port", "9090"))
                self.rest = RestConnection(self.referenceNode)
        try:
            # Run operations if required during rebalance after failover
            if self.withOps:
                for task in self.ops_tasks:
                    task.result()
            msg = "rebalance failed while removing failover nodes {0}".format([node.id for node in chosen])
            self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True), msg=msg)

            #  Drain Queue and make sure intra-cluster replication is complete
            self._verify_stats_all_buckets(_servers_,timeout = 120)
            self._wait_for_stats_all_buckets(_servers_)

            self.log.info("Begin VERIFICATION for Rebalance after Failover Only")
            # Verify the full data set with metadata when no ops ran in parallel
            if not self.withOps:
                self.data_analysis_all(record_static_data_set, _servers_, self.buckets, path = None)
            # Check Cluster Stats and Data as well if max_verify > 0
            self.verify_cluster_stats(_servers_, self.referenceNode)
            # If views were created they can be verified
            if self.runViews:
                if self.runViewsDuringFailover:
                    self.monitor_view_tasks(_servers_)
                self.verify_query_task()
            # Check Failover logs :: Not sure about this logic, currently not checking, will update code once confirmed
            # Currently, only  for checking case where we  have graceful failover
            if self.version_greater_than_2_5 and self.graceful and self.upr_check:
                new_failover_stats = self.compare_failovers_logs(prev_failover_stats, _servers_, self.buckets)
                new_vbucket_stats =  self.compare_vbucket_seqnos(prev_vbucket_stats, _servers_, self.buckets)
                self.compare_vbucketseq_failoverlogs(new_vbucket_stats, new_failover_stats)
            self.log.info("End VERIFICATION for Rebalance after Failover Only")
        finally:
            if self.during_ops:
                if self.during_ops == "change_password":
                    self.change_password(new_password=old_pass)
                elif self.during_ops == "change_port":
                    self.change_port(new_port='8091',
                    current_port=self.input.param("new_port", "9090"))

    def run_add_back_operation_and_verify(self, chosen, prev_vbucket_stats, record_static_data_set, prev_failover_stats):
        """
            Method to run add-back operation with recovery type = (delta/full)
            It also verifies that the operations are correct with data verification steps
        """
        serverMap =  self.get_server_map(self.servers)
        recoveryTypeMap = self.define_maps_during_failover(self.recoveryType)
        fileMapsForVerification = self.create_file(chosen, self.buckets, serverMap)
        index = 0
        for node in chosen:
            self.rest.add_back_node(node.id)
            self.sleep(5)
            if self.recoveryType:
                # define precondition for recoverytype
                self.rest.set_recovery_type(otpNode=node.id, recoveryType=self.recoveryType[index])
                index += 1
        self.sleep(20, "After failover before invoking rebalance...")
        self.rest.rebalance(otpNodes=[node.id for node in self.nodes],
                               ejectedNodes=[])
        msg = "rebalance failed while removing failover nodes {0}".format(chosen)
        # Run operations if required during rebalance after failover
        if self.withOps:
            for task in self.ops_tasks:
                task.result()
        self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True), msg=msg)

        #  Drain ep_queue and make sure that intra-cluster replication is complete
        self._verify_stats_all_buckets(self.servers,timeout = 120)
        self._wait_for_stats_all_buckets(self.servers)

        self.log.info("Begin VERIFICATION for Add-back and rebalance")

        # Verify recovery Type succeeded if we added-back nodes
        self.verify_for_recovery_type(chosen, serverMap, self.buckets,
                recoveryTypeMap, fileMapsForVerification)

        # Comparison of all data if required
        if not self.withOps:
            self.data_analysis_all(record_static_data_set,self.servers, self.buckets,  path = None)

        # Verify cluster stats and data if max_verify > 0
        self.verify_cluster_stats(self.servers, self.referenceNode)

        # Verify if vbucket sequence numbers and failover logs are as expected
        # We will check only for version  > 2.5.* and if the failover is graceful
        if self.version_greater_than_2_5 and self.graceful and self.upr_check:
            new_vbucket_stats = self.compare_vbucket_seqnos(prev_vbucket_stats, self.servers, self.buckets,perNode= False)
            new_failover_stats = self.compare_failovers_logs(prev_failover_stats,self.servers,self.buckets)
            self.compare_vbucketseq_failoverlogs(new_vbucket_stats, new_failover_stats)

        # Perform View Validation if Supported
        if self.runViews:
            if self.runViewsDuringFailover:
                self.monitor_view_tasks(self.servers)
            self.verify_query_task()
        self.log.info("End VERIFICATION for Add-back and rebalance")

    def print_test_params(self, failover_reason):
        """ Method to print test parameters """
        self.log.info("num_replicas : {0}".format(self.num_replicas))
        self.log.info("recoveryType : {0}".format(self.recoveryType))
        self.log.info("failover_reason : {0}".format(failover_reason))
        self.log.info("num_failed_nodes : {0}".format(self.num_failed_nodes))
        self.log.info('picking server : {0} as the master'.format(self.referenceNode))

    def run_failover_operations(self, chosen, failover_reason):
        """ Method to run fail over operations used in the test scenario based on failover reason """
        # Perform operations related to failover
        for node in chosen:
            if failover_reason == 'stop_server':
                self.stop_server(node)
                self.log.info("10 seconds delay to wait for membase-server to shutdown")
                # wait for 5 minutes until node is down
                self.assertTrue(RestHelper(self.rest).wait_for_node_status(node, "unhealthy", 300),
                                    msg="node status is not unhealthy even after waiting for 5 minutes")
            elif failover_reason == "firewall":
                server = [srv for srv in self.servers if node.ip == srv.ip][0]
                RemoteUtilHelper.enable_firewall(server, bidirectional=self.bidirectional)
                status = RestHelper(self.rest).wait_for_node_status(node, "unhealthy", 300)
                if status:
                    self.log.info("node {0}:{1} is 'unhealthy' as expected".format(node.ip, node.port))
                else:
                    # verify iptables on the node if something wrong
                    for server in self.servers:
                        if server.ip == node.ip:
                            shell = RemoteMachineShellConnection(server)
                            info = shell.extract_remote_info()
                            if info.type.lower() == "windows":
                                o, r = shell.execute_command("netsh advfirewall show allprofiles")
                                shell.log_command_output(o, r)
                            else:
                                o, r = shell.execute_command("/sbin/iptables --list")
                                shell.log_command_output(o, r)
                            shell.disconnect()
                    self.rest.print_UI_logs()
                    api = self.rest.baseUrl + 'nodeStatuses'
                    status, content, header = self.rest._http_request(api)
                    json_parsed = json.loads(content)
                    self.log.info("nodeStatuses: {0}".format(json_parsed))
                    self.fail("node status is not unhealthy even after waiting for 5 minutes")

        # define precondition check for failover
        failed_over = self.rest.fail_over(node.id, graceful=self.graceful)

        # Check for negative cases
        if self.graceful and (failover_reason in ['stop_server', 'firewall']):
            if failed_over:
                # MB-10479
                self.rest.print_UI_logs()
            self.assertFalse(failed_over, "Graceful Falover was started for unhealthy node!!! ")
            return
        elif self.gracefulFailoverFail and failed_over:
            """ Check if the fail_over fails as expected """
            self.assertTrue(not failed_over,""" Graceful failover should fail due to not enough replicas """)
            return

        # Check if failover happened as expected or re-try one more time
        if not failed_over:
            self.log.info("unable to failover the node the first time. try again in  60 seconds..")
            # try again in 75 seconds
            self.sleep(75)
            failed_over = self.rest.fail_over(node.id, graceful=self.graceful)
        if self.graceful and (failover_reason not in ['stop_server', 'firewall']):
            reached = RestHelper(self.rest).rebalance_reached()
            self.assertTrue(reached, "rebalance failed for Graceful Failover, stuck or did not completed")

    def run_operation_tasks(self):
        """ Method to run operations Update/Delete/Create """
        # Load All Buckets if num_items > 0
        tasks =  []
        tasks += self._async_load_all_buckets(self.referenceNode, self.gen_initial_create, "create", 0)
        for task in tasks:
            task.result()
        self._verify_stats_all_buckets(self.servers,timeout = 120)
        self._wait_for_stats_all_buckets(self.servers)
        # Update or Delete buckets if items > 0 and options are passed in tests
        # These can run in parallel (withOps = True) or before the failover (withOps = False)
        ops_tasks = []
        if("create" in self.doc_ops):
            ops_tasks += self._async_load_all_buckets(self.referenceNode, self.gen_update, "create", 0)
        if("update" in self.doc_ops):
            ops_tasks += self._async_load_all_buckets(self.referenceNode, self.gen_update, "update", 0)
        if("delete" in self.doc_ops):
            ops_tasks += self._async_load_all_buckets(self.referenceNode, self.gen_delete, "delete", 0)
        if not self.withOps:
            for task in ops_tasks:
                task.result()
            self._wait_for_stats_all_buckets(self.servers)
            self._verify_stats_all_buckets(self.servers,timeout = 120)
        return ops_tasks

    def define_maps_during_failover(self, recoveryType = []):
        """ Method to define nope ip, recovery type map """
        recoveryTypeMap={}
        index=0
        for server in self.chosen:
            if recoveryType:
                recoveryTypeMap[server.ip] = recoveryType[index]
            index += 1
        return recoveryTypeMap

    def filter_servers(self, original_servers, filter_servers):
        """ Filter servers that have not failed over """
        _servers_ = copy.deepcopy(original_servers)
        for failed in filter_servers:
            for server in _servers_:
                if server.ip == failed.ip:
                    _servers_.remove(server)
                    self._cleanup_nodes.append(server)
        return _servers_

    def verify_for_recovery_type(self, chosen = [], serverMap = {}, buckets = [], recoveryTypeMap = {}, fileMap = {}):
        """ Verify recovery type is delta or full """
        logic = True
        summary = ""
        for server in self.chosen:
            shell = RemoteMachineShellConnection(serverMap[server.ip])
            for bucket in buckets:
                path = fileMap[server.ip][bucket.name]
                exists = shell.file_exists(path,"check.txt")
                if recoveryTypeMap[server.ip] == "delta" and not exists:
                    logic = False
                    summary += "\n Failed Condition :: node {0}, bucket {1} :: Expected Delta, Actual Full".format(server.ip,bucket.name)
                elif recoveryTypeMap[server.ip] == "full" and exists:
                    logic = False
                    summary += "\n Failed Condition :: node {0}, bucket {1}  :: Expected Full, Actual Delta".format(server.ip,bucket.name)
            shell.disconnect()
        self.assertTrue(logic, summary)

    def run_view_creation_operations(self, servers):
        """" Run view Creation and indexing building tasks on servers """
        num_views = self.input.param("num_views", 5)
        is_dev_ddoc = self.input.param("is_dev_ddoc", True)
        num_tries = self.input.param("num_tries", 10)
        ddoc_name = "ddoc1"
        prefix = ("", "dev_")[is_dev_ddoc]

        query = {}
        query["connectionTimeout"] = 60000;
        query["full_set"] = "true"

        views = []
        tasks = []
        for bucket in self.buckets:
            temp = self.make_default_views(self.default_view_name, num_views,
                                           is_dev_ddoc, different_map= False)
            temp_tasks = self.async_create_views(self.master, ddoc_name, temp, bucket)
            views += temp
            tasks += temp_tasks

        timeout = max(self.wait_timeout * 4, len(self.buckets) * self.wait_timeout * self.num_items / 50000)

        for task in tasks:
            task.result(self.wait_timeout * 20)

        for bucket in self.buckets:
            for view in views:
                # run queries to create indexes
                self.cluster.query_view(self.master, prefix + ddoc_name, view.name, query)
        self.verify_query_task()
        active_tasks = self.cluster.async_monitor_active_task(servers, "indexer", "_design/" + prefix + ddoc_name, wait_task=False)
        for active_task in active_tasks:
            result = active_task.result()
            self.assertTrue(result)

    def monitor_view_tasks(self, servers):
        """ Monitor Query Tasks for their completion """
        num_views = self.input.param("num_views", 5)
        is_dev_ddoc = self.input.param("is_dev_ddoc", True)
        ddoc_name = "ddoc1"
        prefix = ("", "dev_")[is_dev_ddoc]

        active_tasks = self.cluster.async_monitor_active_task(servers, "indexer", "_design/" + prefix + ddoc_name, wait_task=False)
        for active_task in active_tasks:
            result = active_task.result()
            self.assertTrue(result)

    def verify_query_task(self):
        """ Verify Query Results """
        num_views = self.input.param("num_views", 5)
        is_dev_ddoc = self.input.param("is_dev_ddoc", True)
        ddoc_name = "ddoc1"
        prefix = ("", "dev_")[is_dev_ddoc]
        query = {}
        query["connectionTimeout"] = 60000;
        query["full_set"] = "true"
        expected_rows = None
        if self.max_verify:
            expected_rows = self.max_verify
            query["limit"] = expected_rows
        query["stale"] = "false"
        for bucket in self.buckets:
            self.perform_verify_queries(num_views, prefix, ddoc_name, query, bucket=bucket, wait_time=2400, expected_rows=expected_rows)

    def create_file(self,chosen,buckets,serverMap):
        """ Created files in data paths for checking if delta/full recovery occured """
        fileMap={}
        for server in self.chosen:
            shell = RemoteMachineShellConnection(serverMap[server.ip])
            map = {}
            for bucket in buckets:
                bucket_data_path=self.data_path+"/"+bucket.name+"/"+"check.txt"
                full_path=self.data_path+"/"+bucket.name+"/"
                map[bucket.name] = full_path
                shell.create_file(bucket_data_path,"check")
            fileMap[server.ip] = map
            shell.disconnect()
        return fileMap

    def get_server_map(self,node):
        """ Map of ips and server information """
        map = {}
        for server in self.servers:
            map[server.ip] = server
        return map

    def stop_server(self, node):
        """ Method to stop a server which is subject to failover """
        for server in self.servers:
            if server.ip == node.ip:
                shell = RemoteMachineShellConnection(server)
                if shell.is_couchbase_installed():
                    shell.stop_couchbase()
                    self.log.info("Couchbase stopped")
                else:
                    shell.stop_membase()
                    self.log.info("Membase stopped")
                shell.disconnect()
                break
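
The delta-versus-full verification in the class above relies on a marker file: create_file drops check.txt into each bucket's data path before add-back, and verify_for_recovery_type checks whether it survived, since delta recovery keeps the existing data directory while full recovery wipes it. A condensed sketch of that check, reusing the shell helper calls from above (the standalone function is an assumption):

def recovery_type_observed(shell, data_path, bucket_name):
    # shell: a RemoteMachineShellConnection to the recovered node, as above
    marker_dir = data_path + "/" + bucket_name + "/"
    # check.txt survives a delta recovery but is wiped by a full recovery
    return "delta" if shell.file_exists(marker_dir, "check.txt") else "full"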
Ejemplo n.º 41
0
    def test_backwards_compatability_prepared(self):
        create_index_query = "CREATE INDEX idx_name ON {0}(name)".format(
            self.bucket_name)
        self.n1ql_helper.run_cbq_query(query=create_index_query,
                                       server=self.n1ql_node)
        create_index_query = "CREATE INDEX idx_day ON {0}(join_day)".format(
            self.bucket_name)
        self.n1ql_helper.run_cbq_query(query=create_index_query,
                                       server=self.n1ql_node)
        self.wait_until_indexes_online()

        self.n1ql_helper.run_cbq_query(
            query='PREPARE p1 as SELECT * FROM {0} where name = "employee-9"'.
            format(self.bucket_name))
        result = self.n1ql_helper.run_cbq_query(query='EXECUTE p1')
        self.assertEqual(result['metrics']['resultCount'], 72)

        self.n1ql_helper.run_cbq_query(
            query='PREPARE p2 as SELECT * FROM {0} where join_day = 9'.format(
                self.bucket_name))
        result2 = self.n1ql_helper.run_cbq_query(query='EXECUTE p2')
        self.assertEqual(result2['metrics']['resultCount'], 72)

        upgrade_nodes = self.servers[:self.nodes_init]

        for server in upgrade_nodes:
            remote = RemoteMachineShellConnection(server)
            remote.stop_server()
            remote.disconnect()
            upgrade_threads = self._async_update(self.upgrade_to, [server])
            for upgrade_thread in upgrade_threads:
                upgrade_thread.join()
            self.upgrade_servers.append(server)
        self.sleep(180)
        msg = "Cluster is not healthy after upgrade"
        self.assertTrue(self.wait_until_cluster_is_healthy(), msg)
        self.log.info("Cluster is healthy")
        rest = RestConnection(self.master)
        nodes_all = rest.node_statuses()
        try:
            for cluster_node in nodes_all:
                if cluster_node.ip == self.master.ip:
                    self.log.info("Adding Back: {0}".format(self.master.ip))
                    rest.add_back_node(cluster_node.id)
                    rest.set_recovery_type(otpNode=cluster_node.id,
                                           recoveryType="full")
        except Exception as e:
            self.log.error(str(e))
        self.log.info("Adding node back to cluster...")
        rebalance = self.cluster.async_rebalance(
            self.servers[:self.nodes_init], [], [])
        rebalance.result()
        self.assertTrue(self.wait_until_indexes_online(),
                        "Some indexes are not online")
        self.log.info("All indexes are online")
        self.add_built_in_server_user()
        self.sleep(20)

        try:
            create_index_query = "CREATE INDEX idx_name ON {0}(name)".format(
                self.bucket_name)
            self.n1ql_helper.run_cbq_query(query=create_index_query,
                                           server=self.n1ql_node)
        except Exception as e:
            self.log.info("indexes already exist")
        try:
            create_index_query = "CREATE INDEX idx_day ON {0}(join_day)".format(
                self.bucket_name)
            self.n1ql_helper.run_cbq_query(query=create_index_query,
                                           server=self.n1ql_node)
            self.wait_until_indexes_online()
        except Exception as e:
            self.log.info("indexes already exist")

        # Make sure we are able to create prepared statements after the upgrade on default bucket
        try:
            self.n1ql_helper.run_cbq_query(
                query=
                'PREPARE p3 as SELECT * FROM {0}.`_default`.`_default` where name = "employee-9"'
                .format(self.bucket_name))
        except Exception as e:
            self.log.info(
                "Retrying prepare in case the scope could not be found on the first try"
            )
            self.n1ql_helper.run_cbq_query(
                query=
                'PREPARE p3 as SELECT * FROM {0}.`_default`.`_default` where name = "employee-9"'
                .format(self.bucket_name))
        try:
            self.n1ql_helper.run_cbq_query(
                query=
                'PREPARE p4 as SELECT * FROM {0}.`_default`.`_default` where join_day = 9'
                .format(self.bucket_name))
        except Exception as e:
            self.log.info(
                "Retrying prepare in case the scope could not be found on the first try"
            )
            self.n1ql_helper.run_cbq_query(
                query=
                'PREPARE p4 as SELECT * FROM {0}.`_default`.`_default` where join_day = 9'
                .format(self.bucket_name))

        result = self.n1ql_helper.run_cbq_query(query='EXECUTE p1')
        self.assertEqual(result['metrics']['resultCount'], 72)

        result2 = self.n1ql_helper.run_cbq_query(query='EXECUTE p2')
        self.assertEqual(result2['metrics']['resultCount'], 72)

        result = self.n1ql_helper.run_cbq_query(query='EXECUTE p3')
        self.assertEqual(result['metrics']['resultCount'], 72)

        result2 = self.n1ql_helper.run_cbq_query(query='EXECUTE p4')
        self.assertEqual(result2['metrics']['resultCount'], 72)

        self.n1ql_helper.create_scope(server=self.master,
                                      bucket_name=self.bucket_name,
                                      scope_name="test")
        self.n1ql_helper.create_collection(server=self.master,
                                           bucket_name=self.bucket_name,
                                           scope_name="test",
                                           collection_name="test1")
        self.n1ql_helper.create_collection(server=self.master,
                                           bucket_name=self.bucket_name,
                                           scope_name="test",
                                           collection_name="test2")

        self.n1ql_helper.run_cbq_query(query=(
            'INSERT INTO default:{0}.test.test1'.format(self.bucket_name) +
            '(KEY, VALUE) VALUES ("key2", { "type" : "hotel", "name" : "new hotel" })'
        ))
        self.n1ql_helper.run_cbq_query(query=(
            'INSERT INTO default:{0}.test.test1'.format(self.bucket_name) +
            '(KEY, VALUE) VALUES ("key1", { "type" : "hotel", "name" : "old hotel" })'
        ))
        time.sleep(20)

        self.n1ql_helper.run_cbq_query(
            query="CREATE INDEX idx1 on default:{0}.test.test1(name) ".format(
                self.bucket_name))
        time.sleep(20)

        #Create a prepared statement on a collection and make sure this works post upgrade
        self.n1ql_helper.run_cbq_query(
            query=
            'PREPARE p5 as SELECT * FROM {0}.test.test1 where name = "new hotel"'
            .format(self.bucket_name))
        result2 = self.n1ql_helper.run_cbq_query(query='EXECUTE p5')
        self.assertEqual(result2['metrics']['resultCount'], 1)
Ejemplo n.º 42
0
    def test_volume_with_rebalance(self):
        self.src_bucket = RestConnection(self.master).get_buckets()
        rest = RestConnection(self.master)
        bucket = rest.get_buckets()
        # for bk in bucket:
        #     rest.flush_bucket(bk)
        #self.sleep(30)
        #load initial documents
        self.create_ddocs_and_views()
        load_thread = []
        import Queue
        queue = Queue.Queue()
        for b in bucket:
            load_thread.append(
                Thread(target=lambda q, args1, args2, args3: q.put(
                    self.load(args1, args2, args3)),
                       args=(queue, self.master, self.num_items, b)))
            load_thread.append(
                Thread(target=self.load,
                       args=(self.master, self.num_items, b)))
        for t in load_thread:
            t.start()
        servers_init = self.servers[:self.nodes_init]
        new_server_list = self.servers[0:self.nodes_init]
        for t in load_thread:
            t.join()
        self.sleep(30)
        #Reload more data for mutations
        load_thread = []
        for b in bucket:
            load_thread.append(
                Thread(target=self.load,
                       args=(self.master, self.num_items, b, self.num_items)))
        for t in load_thread:
            t.start()
        # #Rebalance in 1 node
        self.log.info("==========rebalance in 1 node=========")
        servers_in = self.servers[self.nodes_init:self.nodes_init + 1]
        rebalance = self.cluster.async_rebalance(servers_init, servers_in, [])

        rebalance.result()
        for t in load_thread:
            t.join()
        for b in bucket:
            self.check_dataloss(self.master, b, self.num_items * 2)
        # load more document
        load_thread = []
        for b in bucket:
            load_thread.append(
                Thread(target=self.load,
                       args=(self.master, self.num_items, b,
                             self.num_items * 2)))
        for t in load_thread:
            t.start()
        #rebalance out 1 node
        new_server_list = self.servers[0:self.nodes_init] + servers_in
        self.log.info("==========rebalance out 1 node=========")
        servers_out = [self.servers[self.nodes_init]]
        rebalance = self.cluster.async_rebalance(servers_init, [], servers_out)
        rebalance.result()
        for t in load_thread:
            t.join()
        for b in bucket:
            self.check_dataloss(self.master, b, self.num_items * 3)
        self.sleep(30)
        # load more document
        load_thread = []
        for b in bucket:
            load_thread.append(
                Thread(target=self.load,
                       args=(self.master, self.num_items, b,
                             self.num_items * 3)))
        for t in load_thread:
            t.start()
        new_server_list = list(set(new_server_list) - set(servers_out))
        #swap rebalance 1 node
        self.log.info("==========swap rebalance 1 node=========")
        servers_in = self.servers[self.nodes_init:self.nodes_init + 1]
        servers_init = self.servers[:self.nodes_init]
        servers_out = self.servers[(self.nodes_init - 1):self.nodes_init]

        rebalance = self.cluster.async_rebalance(servers_init, servers_in,
                                                 servers_out)
        rebalance.result()
        for t in load_thread:
            t.join()
        self.sleep(30)
        for b in bucket:
            self.check_dataloss(self.master, b, self.num_items * 4)
        # load more documents
        load_thread = []
        for b in bucket:
            load_thread.append(
                Thread(target=self.load,
                       args=(self.master, self.num_items, b,
                             self.num_items * 4)))
        for t in load_thread:
            t.start()
        new_server_list = list(
            set(new_server_list + servers_in) - set(servers_out))
        self.log.info(
            "==========Rebalance out of 2 nodes and Rebalance In 1 node========="
        )
        # Rebalance out of 2 nodes and Rebalance In 1 node
        servers_in = [list(set(self.servers) - set(new_server_list))[0]]
        servers_out = list(set(new_server_list) - set([self.master]))[-2:]
        rebalance = self.cluster.async_rebalance(servers_init, servers_in,
                                                 servers_out)
        rebalance.result()
        for t in load_thread:
            t.join()
        for b in bucket:
            self.check_dataloss(self.master, b, self.num_items * 5)
        self.sleep(30)
        # load more documents
        load_thread = []
        for b in bucket:
            load_thread.append(
                Thread(target=self.load,
                       args=(self.master, self.num_items, b,
                             self.num_items * 5)))
        for t in load_thread:
            t.start()
        new_server_list = list(
            set(new_server_list + servers_in) - set(servers_out))
        self.log.info(
            "==========Rebalance out of 1 nodes and Rebalance In 2 nodes========="
        )
        #Rebalance out of 1 nodes and Rebalance In 2 nodes
        servers_in = list(set(self.servers) - set(new_server_list))[0:2]
        servers_out = list(set(new_server_list) - set([self.master]))[0:1]
        rebalance = self.cluster.async_rebalance(servers_init, servers_in,
                                                 servers_out)
        rebalance.result()
        for t in load_thread:
            t.join()
        for b in bucket:
            self.check_dataloss(self.master, b, self.num_items * 6)
        self.sleep(30)
        # load more documents
        load_thread = []
        for b in bucket:
            load_thread.append(
                Thread(target=self.load,
                       args=(self.master, self.num_items, b,
                             self.num_items * 6)))
        for t in load_thread:
            t.start()
        new_server_list = list(
            set(new_server_list + servers_in) - set(servers_out))
        self.log.info("==========Rebalance in 4 nodes =========")
        #Rebalance in 4 nodes
        servers_in = list(set(self.servers) - set(new_server_list))[0:4]
        rebalance = self.cluster.async_rebalance(servers_init, servers_in, [])
        rebalance.result()
        for t in load_thread:
            t.join()
        for b in bucket:
            self.check_dataloss(self.master, b, self.num_items * 7)
        self.sleep(30)
        # load more documents
        load_thread = []
        for b in bucket:
            load_thread.append(
                Thread(target=self.load,
                       args=(self.master, self.num_items, b,
                             self.num_items * 7)))
        for t in load_thread:
            t.start()
        new_server_list = list(set(new_server_list + servers_in))
        self.log.info("==========Rebalance out 4 nodes =========")
        #Rebalance out 4 nodes
        servers_out = list(set(new_server_list) - set([self.master]))[0:4]
        rebalance = self.cluster.async_rebalance(servers_init, [], servers_out)
        rebalance.result()
        for t in load_thread:
            t.join()
        for b in bucket:
            self.check_dataloss(self.master, b, self.num_items * 8)
        self.sleep(30)
        # load more documents
        load_thread = []
        for b in bucket:
            load_thread.append(
                Thread(target=self.load,
                       args=(self.master, self.num_items, b,
                             self.num_items * 8)))
        for t in load_thread:
            t.start()
        new_server_list = list(set(new_server_list) - set(servers_out))
        self.log.info(
            "======Rebalance in 4 nodes (8 nodes) wait for rebalance to finish and move between server groups========="
        )
        #Rebalance in 4 nodes (8 nodes) wait for rebalance to finish and move between server groups
        servers_in = list(set(self.servers) - set(new_server_list))[0:4]
        rebalance = self.cluster.async_rebalance(servers_init, servers_in, [])
        rebalance.result()
        for t in load_thread:
            t.join()
        for b in bucket:
            self.check_dataloss(self.master, b, self.num_items * 9)
        self.sleep(30)
        load_thread = []
        for b in bucket:
            load_thread.append(
                Thread(target=self.load,
                       args=(self.master, self.num_items, b,
                             self.num_items * 9)))
        for t in load_thread:
            t.start()
        self.shuffle_nodes_between_zones_and_rebalance()
        for t in load_thread:
            t.join()
        for b in bucket:
            self.check_dataloss(self.master, b, self.num_items * 10)
        self.sleep(30)
        load_thread = []
        for b in bucket:
            load_thread.append(
                Thread(target=self.load,
                       args=(self.master, self.num_items, b,
                             self.num_items * 10)))
        for t in load_thread:
            t.start()
        self.log.info(
            "======Graceful failover 1 KV node and add back(Delta and Full)========="
        )
        #Graceful failover 1 KV node and add back(Delta and Full)
        kv_server = self.get_nodes_from_services_map(service_type="kv",
                                                     get_all_nodes=False)
        fail_over_task = self.cluster.async_failover(
            [self.master], failover_nodes=[kv_server], graceful=True)
        fail_over_task.result()
        self.sleep(120)
        # do a recovery and rebalance
        rest.set_recovery_type('ns_1@' + kv_server.ip,
                               recoveryType=self.recoveryType)
        rest.add_back_node('ns_1@' + kv_server.ip)
        rebalance = self.cluster.async_rebalance(
            self.servers[:self.nodes_init], [], [])
        rebalance.result()
        for t in load_thread:
            t.join()
        for b in bucket:
            self.check_dataloss(self.master, b, self.num_items * 11)
        self.sleep(30)
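
The tail of this volume test reduces to one REST sequence: graceful failover of a KV node, pick a recovery type, add the node back, rebalance. A minimal sketch of that sequence using the same RestConnection and cluster helpers as above follows; the standalone function and its parameter names are illustrative and not part of the original test.

def graceful_failover_and_add_back(rest, cluster, orchestrator, cluster_servers,
                                   kv_server, recovery_type="delta"):
    # rest            : RestConnection to the orchestrator node
    # cluster         : the framework's cluster helper (async_failover / async_rebalance)
    # orchestrator    : the node used to drive the operations (the test's master)
    # cluster_servers : current cluster members, passed to async_rebalance
    # kv_server       : the KV node to fail over gracefully and add back
    otp_node = 'ns_1@' + kv_server.ip

    # Graceful failover drains the node before removing it from the vbucket map
    task = cluster.async_failover([orchestrator], failover_nodes=[kv_server],
                                  graceful=True)
    task.result()

    # Choose the recovery type (delta or full) and add the node back
    rest.set_recovery_type(otp_node, recoveryType=recovery_type)
    rest.add_back_node(otp_node)

    # The rebalance is what actually re-integrates the added-back node
    rebalance = cluster.async_rebalance(cluster_servers, [], [])
    rebalance.result()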
Ejemplo n.º 43
0
    def common_test_body(self, keys_count, failover_reason):
        log = logger.Logger.get_logger()
        log.info("keys_count : {0}".format(keys_count))
        log.info("replicas : {0}".format(self.num_replicas))
        log.info("failover_reason : {0}".format(failover_reason))
        log.info('picking server : {0} as the master'.format(self.master))

        self._load_all_buckets(self.master, self.gen_create, "create", 0,
                               batch_size=10000, pause_secs=5, timeout_secs=180)
        self._wait_for_stats_all_buckets(self.servers)

        _servers_ = self.servers
        rest = RestConnection(self.master)
        nodes = rest.node_statuses()

        RebalanceHelper.wait_for_replication(self.servers, self.cluster)
        chosen = RebalanceHelper.pick_nodes(self.master, howmany=self.num_replicas)
        for node in chosen:
            #let's do op
            if failover_reason == 'stop_server':
                self.stop_server(node)
                log.info("10 seconds delay to wait for membase-server to shutdown")
                #wait for 5 minutes until node is down
                self.assertTrue(RestHelper(rest).wait_for_node_status(node, "unhealthy", 300),
                                    msg="node status is not unhealthy even after waiting for 5 minutes")
            elif failover_reason == "firewall":
                RemoteUtilHelper.enable_firewall(self.servers, node, bidirectional=self.bidirectional)
                status = RestHelper(rest).wait_for_node_status(node, "unhealthy", 300)
                if status:
                    log.info("node {0}:{1} is 'unhealthy' as expected".format(node.ip, node.port))
                else:
                    #verify iptables on the node if something wrong
                    for server in self.servers:
                        if server.ip == node.ip:
                            shell = RemoteMachineShellConnection(server)
                            o, r = shell.execute_command("/sbin/iptables --list")
                            shell.log_command_output(o, r)
                            shell.disconnect()
                    for i in rest.get_logs(): self.log.error(i)
                    self.fail("node status is not unhealthy even after waiting for 5 minutes")

            failed_over = rest.fail_over(node.id)
            if not failed_over:
                self.log.info("unable to failover the node the first time. try again in  60 seconds..")
                #try again in 75 seconds
                time.sleep(75)
                failed_over = rest.fail_over(node.id)
            self.assertTrue(failed_over, "unable to failover node after {0}".format(failover_reason))
            log.info("failed over node : {0}".format(node.id))
            self._failed_nodes.append(node)

        if self.add_back_flag:
            for node in self._failed_nodes:
                rest.add_back_node(node.id)
                time.sleep(5)
            log.info("10 seconds sleep after failover before invoking rebalance...")
            time.sleep(10)
            rest.rebalance(otpNodes=[node.id for node in nodes],
                               ejectedNodes=[])
            msg = "rebalance failed while removing failover nodes {0}".format(chosen)
            self.assertTrue(rest.monitorRebalance(stop_if_loop=True), msg=msg)
        else:
            # Need a delay > min because MB-7168
            log.info("30 seconds sleep after failover before invoking rebalance...")
            time.sleep(30)
            rest.rebalance(otpNodes=[node.id for node in nodes],
                               ejectedNodes=[node.id for node in chosen])
            msg = "rebalance failed while removing failover nodes {0}".format(chosen)
            self.assertTrue(rest.monitorRebalance(stop_if_loop=True), msg=msg)
            for failed in chosen:
                for server in _servers_:
                    if server.ip == failed.ip:
                        _servers_.remove(server)
                        self._cleanup_nodes.append(server)

        log.info("Begin VERIFICATION ...")
        RebalanceHelper.wait_for_replication(_servers_, self.cluster)
        self.verify_cluster_stats(_servers_, self.master)
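
Both hard-failover examples here follow the same skeleton: fail each chosen node over (retrying once if the first attempt is rejected), then either add the nodes back and rebalance, or eject them during the rebalance. A condensed sketch of that skeleton is below, assuming a RestConnection instance like the one used above; the helper function itself is illustrative.

import time

def failover_then_rebalance(rest, chosen, add_back=False):
    # rest   : RestConnection to a surviving node
    # chosen : nodes (from rest.node_statuses()) that should be failed over
    nodes = rest.node_statuses()
    for node in chosen:
        failed_over = rest.fail_over(node.id)
        if not failed_over:
            time.sleep(75)                      # retry once, as the tests above do
            failed_over = rest.fail_over(node.id)
        assert failed_over, "unable to failover node {0}".format(node.id)

    if add_back:
        # Add-back path: keep the nodes and rebalance with no ejections
        for node in chosen:
            rest.add_back_node(node.id)
        rest.rebalance(otpNodes=[n.id for n in nodes], ejectedNodes=[])
    else:
        # Removal path: eject the failed-over nodes during the rebalance
        rest.rebalance(otpNodes=[n.id for n in nodes],
                       ejectedNodes=[n.id for n in chosen])
    assert rest.monitorRebalance(stop_if_loop=True), "rebalance did not complete"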
Ejemplo n.º 44
0
    def common_test_body(self, keys_count, failover_reason):
        log = logger.Logger.get_logger()
        log.info("keys_count : {0}".format(keys_count))
        log.info("replicas : {0}".format(self.num_replicas))
        log.info("failover_reason : {0}".format(failover_reason))
        log.info('picking server : {0} as the master'.format(self.master))

        self._load_all_buckets(self.master, self.gen_create, "create", 0,
                               batch_size=10000, pause_secs=5, timeout_secs=180)
        self._wait_for_stats_all_buckets(self.servers)

        _servers_ = self.servers
        rest = RestConnection(self.master)
        nodes = rest.node_statuses()

        RebalanceHelper.wait_for_replication(self.servers, self.cluster)
        chosen = RebalanceHelper.pick_nodes(self.master, howmany=self.num_replicas)
        for node in chosen:
            # let's do op
            if failover_reason == 'stop_server':
                self.stop_server(node)
                log.info("10 seconds delay to wait for membase-server to shutdown")
                # wait for 5 minutes until node is down
                self.assertTrue(RestHelper(rest).wait_for_node_status(node, "unhealthy", 300),
                                    msg="node status is not unhealthy even after waiting for 5 minutes")
            elif failover_reason == "firewall":
                server = [srv for srv in self.servers if node.ip == srv.ip][0]
                RemoteUtilHelper.enable_firewall(server, bidirectional=self.bidirectional)
                status = RestHelper(rest).wait_for_node_status(node, "unhealthy", 300)
                if status:
                    log.info("node {0}:{1} is 'unhealthy' as expected".format(node.ip, node.port))
                else:
                    # verify iptables on the node if something wrong
                    for server in self.servers:
                        if server.ip == node.ip:
                            shell = RemoteMachineShellConnection(server)
                            info = shell.extract_remote_info()
                            if info.type.lower() == "windows":
                                o, r = shell.execute_command("netsh advfirewall show allprofiles")
                            else:
                                o, r = shell.execute_command("/sbin/iptables --list")
                            shell.log_command_output(o, r)
                            shell.disconnect()
                    for i in rest.get_logs(): self.log.error(i)
                    api = rest.baseUrl + 'nodeStatuses'
                    status, content, header = rest._http_request(api)
                    json_parsed = json.loads(content)
                    self.log.info("nodeStatuses: {0}".format(json_parsed))
                    self.fail("node status is not unhealthy even after waiting for 5 minutes")

            failed_over = rest.fail_over(node.id)
            if not failed_over:
                self.log.info("unable to failover the node the first time. try again in  60 seconds..")
                # try again in 75 seconds
                time.sleep(75)
                failed_over = rest.fail_over(node.id)
            self.assertTrue(failed_over, "unable to failover node after {0}".format(failover_reason))
            log.info("failed over node : {0}".format(node.id))
            self._failed_nodes.append(node)

        if self.add_back_flag:
            for node in self._failed_nodes:
                rest.add_back_node(node.id)
                time.sleep(5)
            log.info("10 seconds sleep after failover before invoking rebalance...")
            time.sleep(10)
            rest.rebalance(otpNodes=[node.id for node in nodes],
                               ejectedNodes=[])
            msg = "rebalance failed while removing failover nodes {0}".format(chosen)
            self.assertTrue(rest.monitorRebalance(stop_if_loop=True), msg=msg)
        else:
            # Need a delay > min because MB-7168
            log.info("60 seconds sleep after failover before invoking rebalance...")
            time.sleep(60)
            rest.rebalance(otpNodes=[node.id for node in nodes],
                               ejectedNodes=[node.id for node in chosen])
            if self.during_ops:
                self.sleep(5, "Wait for some progress in rebalance")
                if self.during_ops == "change_password":
                    old_pass = self.master.rest_password
                    self.change_password(new_password=self.input.param("new_password", "new_pass"))
                    rest = RestConnection(self.master)
                elif self.during_ops == "change_port":
                    self.change_port(new_port=self.input.param("new_port", "9090"))
                    rest = RestConnection(self.master)
            try:
                msg = "rebalance failed while removing failover nodes {0}".format(chosen)
                self.assertTrue(rest.monitorRebalance(stop_if_loop=True), msg=msg)
                for failed in chosen:
                    for server in _servers_:
                        if server.ip == failed.ip:
                            _servers_.remove(server)
                            self._cleanup_nodes.append(server)

                log.info("Begin VERIFICATION ...")
                RebalanceHelper.wait_for_replication(_servers_, self.cluster)
                self.verify_cluster_stats(_servers_, self.master)
            finally:
                if self.during_ops:
                     if self.during_ops == "change_password":
                         self.change_password(new_password=old_pass)
                     elif self.during_ops == "change_port":
                         self.change_port(new_port='8091',
                                          current_port=self.input.param("new_port", "9090"))
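
The firewall branch in both variants is the same recipe: block traffic on the target node, then poll the cluster until it reports the node as unhealthy, at which point the hard failover can be triggered. A small sketch of that recipe, reusing the RemoteUtilHelper and RestHelper utilities the tests above already rely on; the wrapper function is an assumption added for illustration.

def make_node_unhealthy_via_firewall(rest, servers, node,
                                     bidirectional=False, timeout=300):
    # Block traffic on the target node, then wait for the cluster to mark it
    # "unhealthy" so a hard failover can be issued against it.
    server = [srv for srv in servers if srv.ip == node.ip][0]
    RemoteUtilHelper.enable_firewall(server, bidirectional=bidirectional)
    if not RestHelper(rest).wait_for_node_status(node, "unhealthy", timeout):
        raise AssertionError("node {0} did not become unhealthy within {1}s"
                             .format(node.ip, timeout))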
Ejemplo n.º 45
0
    def replicate_correct_data_after_rollback(self):
        '''
        @attention:
          This test case has some issues when run under Docker.
          It passes without any issue on VMs.
        '''

        bucket = self.bucket_util.buckets[0]
        cluster = self.cluster

        gen_load = doc_generator(self.key, 0, self.num_items)
        for bucket in self.bucket_util.buckets:
            task = self.task.async_load_gen_docs(
                self.cluster, bucket, gen_load, "create", 0,
                batch_size=10, process_concurrency=8,
                replicate_to=self.replicate_to, persist_to=self.persist_to,
                timeout_secs=self.sdk_timeout, retries=self.sdk_retries)
            self.task.jython_task_manager.get_task_result(task)

        # store the KVs which were modified and active on node 1
        modified_kvs_active_on_node1 = dict()
        vbucket_client = VBucketAwareMemcached(
            RestConnection(cluster.master), bucket.name)
        client = MemcachedClientHelper.direct_client(cluster.servers[0],
                                                     bucket.name)
        for i in range(self.num_items/100):
            keyname = 'keyname-' + str(i)
            vbId = self.bucket_util.get_vbucket_num_for_key(keyname,
                                                            self.vbuckets)
            if vbucket_client.vBucketMap[vbId].split(':')[0] == cluster.servers[0].ip:
                rc = client.get(keyname)
                modified_kvs_active_on_node1[keyname] = rc[2]

        # Stop persistence
        for server in cluster.servers[:self.nodes_init]:
            # Create cbepctl command object
            node_shell_conn = RemoteMachineShellConnection(server)
            cbepctl_obj = Cbepctl(node_shell_conn)

            for bucket in self.bucket_util.buckets:
                cbepctl_obj.persistence(bucket.name, "stop")

            # Disconnect the shell_connection
            node_shell_conn.disconnect()

        # modify a small subset (num_items/100) of the keys
        gen_load = doc_generator(self.key, 0, self.num_items/100)
        rc = self.cluster.load_gen_docs(
            cluster.servers[0], bucket.name, gen_load,
            bucket.kvs[1], "create", exp=0, flag=0, batch_size=10,
            compression=self.sdk_compression)

        # kill memcached; because persistence is disabled, the second set of
        # mutations will be lost when it comes back
        shell = RemoteMachineShellConnection(cluster.servers[0])
        shell.kill_memcached()
        self.sleep(10, "Sleep after kill memcached")

        # Start persistence on the second node
        # Create cbepctl command object
        node_shell_conn = RemoteMachineShellConnection(cluster.servers[1])
        cbepctl_obj = Cbepctl(node_shell_conn)

        for bucket in self.bucket_util.buckets:
            cbepctl_obj.persistence(bucket.name, "start")

        # Disconnect the shell_connection
        node_shell_conn.disconnect()

        self.sleep(10, "Sleep after start persistence")

        # failover to the second node
        rc = self.cluster.failover(cluster.servers, cluster.servers[1:2],
                                   graceful=True)
        self.sleep(30, "Sleep after node failover triggered")

        # Values should be what they were prior to the second update
        client = MemcachedClientHelper.direct_client(
            cluster.servers[0], bucket.name)
        for k, v in modified_kvs_active_on_node1.iteritems():
            rc = client.get(k)
            self.assertTrue(v == rc[2], 'Expected {0}, actual {1}'
                                        .format(v, rc[2]))

        # need to rebalance the node back into the cluster
        # def rebalance(self, servers, to_add, to_remove, timeout=None,
        #               use_hostnames=False, services = None):

        rest_obj = RestConnection(cluster.servers[0])
        nodes_all = rest_obj.node_statuses()
        for node in nodes_all:
            if node.ip == cluster.servers[1].ip:
                break

        node_id_for_recovery = node.id
        status = rest_obj.add_back_node(node_id_for_recovery)
        if status:
            rest_obj.set_recovery_type(node_id_for_recovery,
                                       recoveryType='delta')
        rc = self.cluster.rebalance(cluster.servers[:self.nodes_init], [], [])
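
The rollback setup above hinges on toggling disk persistence per node with cbepctl: mutations made while persistence is stopped live only in memory, so they disappear when memcached is killed. A compact sketch of that step using the same Cbepctl and RemoteMachineShellConnection wrappers as the example; the helper function is illustrative.

def toggle_persistence(servers, buckets, action="stop"):
    # action is "stop" or "start"; stopping persistence on every node keeps
    # subsequent mutations in memory only, which is what makes them roll back
    # after memcached is killed.
    for server in servers:
        shell = RemoteMachineShellConnection(server)
        cbepctl = Cbepctl(shell)
        for bucket in buckets:
            cbepctl.persistence(bucket.name, action)
        shell.disconnect()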
Ejemplo n.º 46
0
class FailoverTests(FailoverBaseTest):
    def setUp(self):
        super(FailoverTests, self).setUp()
        self.server_map = self.get_server_map(self.servers)

    def tearDown(self):
        super(FailoverTests, self).tearDown()

    def test_failover_firewall(self):
        self.common_test_body('firewall')

    def test_failover_normal(self):
        self.common_test_body('normal')

    def test_failover_stop_server(self):
        self.common_test_body('stop_server')

    def test_failover_then_add_back(self):
        self.add_back_flag = True
        self.common_test_body('normal')

    def common_test_body(self, failover_reason):
        """
            Main Test body which contains the flow of the failover basic steps
            1. Starts Operations if programmed into the test case(before/after)
            2. Start View and Index Building operations
            3. Failover K out of N nodes (failover can be HARD/GRACEFUL)
            4.1 Rebalance the cluster after failover of K nodes
            4.2 Run Add-Back operation with recoveryType = (full/delta) with rebalance
            5. Verify all expected operations completed by checking stats,
               replication, views, data correctness
        """
        # Pick the reference node for communication
        # We pick a node in the cluster which will NOT be failed over
        self.filter_list = []
        if self.failoverMaster:
            self.master = self.servers[1]
        self.log.info(
            "Picking node {0} as reference node for test case".format(
                self.master.ip))
        self.print_test_params(failover_reason)
        self.rest = RestConnection(self.master)
        self.nodes = self.rest.node_statuses()
        # Set the data path for the cluster
        self.data_path = self.rest.get_data_path()

        # Check if the test case has to be run for 3.0.0
        versions = self.rest.get_nodes_versions()
        self.version_greater_than_2_5 = True
        for version in versions:
            if "3" > version:
                self.version_greater_than_2_5 = False

        # Do not run this test if the graceful category is being used
        if not self.version_greater_than_2_5 and (self.graceful or
                                                  (self.recoveryType != None)):
            self.log.error(
                "Graceful failover can't be applied to nodes with version less than 3.*"
            )
            self.log.error(
                "Please check configuration parameters: SKIPPING TEST.")
            return

        # Find nodes that will undergo failover
        if self.failoverMaster:
            self.chosen = RebalanceHelper.pick_nodes(
                self.master, howmany=1, target_node=self.servers[0])
        else:
            self.chosen = RebalanceHelper.pick_nodes(
                self.master, howmany=self.num_failed_nodes)

        # Perform operations - Create/Update/Delete
        # self.withMutationOps = True => Run Operations in parallel to failover
        # self.withMutationOps = False => Run Operations Before failover
        self.load_initial_data()
        if not self.withMutationOps:
            self.run_mutation_operations()
        # Perform view creation tasks and wait for completion before failover
        if self.withViewsOps:
            self.run_view_creation_operations(self.servers)
            if not self.createIndexesDuringFailover:
                self.query_and_monitor_view_tasks(self.servers)

        # Validate seq_no snap_start/stop values
        self.check_snap_start_corruption()

        # Take a snapshot of the data set used for validation
        record_static_data_set = dict()
        prev_vbucket_stats = dict()
        prev_failover_stats = dict()
        if not self.withMutationOps:
            record_static_data_set = self.get_data_set_all(self.servers,
                                                           self.buckets,
                                                           path=None)

        # Capture  vbucket and failover stats if test version >= 2.5.*
        if self.version_greater_than_2_5 and self.upr_check:
            prev_vbucket_stats = self.get_vbucket_seqnos(
                self.servers, self.buckets)
            prev_failover_stats = self.get_failovers_logs(
                self.servers, self.buckets)

        # Perform Operations related to failover
        if self.withMutationOps or self.withViewsOps or self.compact:
            self.run_failover_operations_with_ops(self.chosen, failover_reason)
        else:
            self.run_failover_operations(self.chosen, failover_reason)

        # TODO: Enable this even when 'flusher_batch_split_trigger' is not set
        if self.flusher_batch_split_trigger and \
                self.num_replicas >= self.num_failed_nodes:
            tasks = self._async_load_all_buckets(self.master, self.gen_update,
                                                 "update", 0)
            for task in tasks:
                task.result()

        if self.graceful:
            # Validate seq_no snap_start/stop values
            self.check_snap_start_corruption()

        # Add back + rebalance / only rebalance with verification
        if not self.gracefulFailoverFail and self.runRebalanceAfterFailover:
            if self.add_back_flag:
                self.run_add_back_operation_and_verify(self.chosen,
                                                       prev_vbucket_stats,
                                                       record_static_data_set,
                                                       prev_failover_stats)
            else:
                self.run_rebalance_after_failover_and_verify(
                    self.chosen, prev_vbucket_stats, record_static_data_set,
                    prev_failover_stats)

        if self.graceful:
            # Validate seq_no snap_start/stop values
            self.check_snap_start_corruption()

        if self.during_ops is None:
            self.verify_unacked_bytes_all_buckets(filter_list=self.filter_list,
                                                  master_node=self.master)

    def run_rebalance_after_failover_and_verify(self, chosen,
                                                prev_vbucket_stats,
                                                record_static_data_set,
                                                prev_failover_stats):
        """ Method to run rebalance after failover and verify """
        # Need a delay > min because MB-7168
        _servers_ = self.filter_servers(self.servers, chosen)
        self._wait_for_stats_all_buckets(_servers_,
                                         check_ep_items_remaining=True)
        self.sleep(5, "after failover before invoking rebalance...")
        # Rebalance after Failover operation
        self.rest.rebalance(otpNodes=[node.id for node in self.nodes],
                            ejectedNodes=[node.id for node in chosen])
        if self.during_ops:
            self.sleep(5, "Wait for some progress in rebalance")
            if self.during_ops == "change_password":
                old_pass = self.master.rest_password
                self.change_password(
                    new_password=self.input.param("new_password", "new_pass"))
                self.rest = RestConnection(self.master)
            elif self.during_ops == "change_port":
                self.change_port(new_port=self.input.param("new_port", "9090"))
                self.rest = RestConnection(self.master)
        # Perform Compaction
        if self.compact:
            for bucket in self.buckets:
                self.cluster.compact_bucket(self.master, bucket)
        # Perform View Validation if Supported
        nodes = self.filter_servers(self.servers, chosen)
        if self.withViewsOps:
            self.query_and_monitor_view_tasks(nodes)

        # Run operations if required during rebalance after failover
        if self.withMutationOps:
            self.run_mutation_operations_after_failover()

        # Kill or restart operations
        if self.killNodes or self.stopNodes or self.firewallOnNodes:
            self.victim_node_operations(node=chosen[0])
            self.sleep(60)
            self.log.info(" Start Rebalance Again !")
            self.rest.rebalance(otpNodes=[node.id for node in self.nodes],
                                ejectedNodes=[node.id for node in chosen])

        # Rebalance Monitoring
        msg = "rebalance failed while removing failover nodes {0}".format(
            [node.id for node in chosen])
        self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True), msg=msg)

        # Reset password or port
        if self.during_ops:
            if self.during_ops == "change_password":
                self.change_password(new_password=old_pass)
            elif self.during_ops == "change_port":
                self.change_port(new_port='8091',
                                 current_port=self.input.param(
                                     "new_port", "9090"))
            return

        #  Drain Queue and make sure intra-cluster replication is complete
        self.log.info("Begin VERIFICATION for Rebalance after Failover Only")
        self.verify_cluster_stats(_servers_,
                                  self.master,
                                  check_bucket_stats=True,
                                  check_ep_items_remaining=True)
        # Verify the full data set against recorded metadata when no mutations ran during failover
        if not self.withMutationOps:
            self.sleep(60)
            self.data_analysis_all(record_static_data_set,
                                   _servers_,
                                   self.buckets,
                                   path=None,
                                   addedItems=None)

        # Check Cluster Stats and Data as well if max_verify > 0
        # Check Failover logs :: Not sure about this logic, currently not checking, will update code once confirmed
        # Currently, only  for checking case where we  have graceful failover
        if self.version_greater_than_2_5 and self.graceful and self.upr_check:
            new_failover_stats = self.compare_failovers_logs(
                prev_failover_stats, _servers_, self.buckets)
            new_vbucket_stats = self.compare_vbucket_seqnos(
                prev_vbucket_stats, _servers_, self.buckets)
            self.compare_vbucketseq_failoverlogs(new_vbucket_stats,
                                                 new_failover_stats)
        # Verify Active and Replica Bucket Count
        if self.num_replicas > 0:
            nodes = self.get_nodes_in_cluster(self.master)
            self.vb_distribution_analysis(servers=nodes,
                                          buckets=self.buckets,
                                          std=20.0,
                                          total_vbuckets=self.total_vbuckets)
        self.log.info("End VERIFICATION for Rebalance after Failover Only")

    def run_add_back_operation_and_verify(self, chosen, prev_vbucket_stats,
                                          record_static_data_set,
                                          prev_failover_stats):
        """
            Method to run add-back operation with recovery type = (delta/full)
            It also verifies that the operations are correct with data verification steps
        """
        _servers_ = self.filter_servers(self.servers, chosen)
        self._wait_for_stats_all_buckets(_servers_,
                                         check_ep_items_remaining=True)
        recoveryTypeMap = self.define_maps_during_failover(self.recoveryType)
        fileMapsForVerification = self.create_file(chosen, self.buckets,
                                                   self.server_map)
        index = 0
        for node in chosen:
            self.sleep(5)
            if self.recoveryType:
                # define precondition for recoverytype
                self.rest.set_recovery_type(
                    otpNode=node.id, recoveryType=self.recoveryType[index])
                index += 1
            else:
                self.rest.add_back_node(node.id)

        # Doc_mutation before triggering rebalance
        tasks = self._async_load_all_buckets(self.master, self.gen_update,
                                             "update", 0)
        for task in tasks:
            task.result()

        self.sleep(20, "After failover before invoking rebalance...")
        self.rest.rebalance(otpNodes=[node.id for node in self.nodes],
                            ejectedNodes=[],
                            deltaRecoveryBuckets=self.deltaRecoveryBuckets)

        # Perform Compaction
        if self.compact:
            for bucket in self.buckets:
                self.cluster.compact_bucket(self.master, bucket)

        # Perform View Validation if Supported
        nodes = self.filter_servers(self.servers, chosen)
        if self.withViewsOps:
            self.query_and_monitor_view_tasks(nodes)

        # Run operations if required during rebalance after failover
        if self.withMutationOps:
            self.run_mutation_operations_after_failover()

        # Kill or restart operations
        if self.killNodes or self.stopNodes or self.firewallOnNodes:
            self.victim_node_operations(node=chosen[0])
            self.sleep(60)
            self.log.info("Start Rebalance Again!")
            self.rest.rebalance(otpNodes=[node.id for node in self.nodes],
                                ejectedNodes=[],
                                deltaRecoveryBuckets=self.deltaRecoveryBuckets)

        # Check if node has to be killed or restarted during rebalance
        # Monitor Rebalance
        msg = "rebalance failed while removing failover nodes {0}".format(
            chosen)
        self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True), msg=msg)

        # Drain ep_queue and make sure that intra-cluster replication is complete
        self._wait_for_stats_all_buckets(self.servers,
                                         check_ep_items_remaining=True)

        self.log.info("Begin VERIFICATION for Add-back and rebalance")

        # Verify Stats of cluster and Data is max_verify > 0
        self.verify_cluster_stats(self.servers,
                                  self.master,
                                  check_bucket_stats=True,
                                  check_ep_items_remaining=True)

        # Verify recovery Type succeeded if we added-back nodes
        self.verify_for_recovery_type(chosen, self.server_map, self.buckets,
                                      recoveryTypeMap, fileMapsForVerification,
                                      self.deltaRecoveryBuckets)

        # Comparison of all data if required
        if not self.withMutationOps:
            self.sleep(60)
            self.data_analysis_all(record_static_data_set,
                                   self.servers,
                                   self.buckets,
                                   path=None,
                                   addedItems=None)

        # Verify if vbucket sequence numbers and failover logs are as expected
        # We will check only for version  > 2.5.* and if the failover is graceful
        if self.version_greater_than_2_5 and self.graceful and self.upr_check:
            new_vbucket_stats = self.compare_vbucket_seqnos(prev_vbucket_stats,
                                                            self.servers,
                                                            self.buckets,
                                                            perNode=False)
            new_failover_stats = self.compare_failovers_logs(
                prev_failover_stats, self.servers, self.buckets)
            self.compare_vbucketseq_failoverlogs(new_vbucket_stats,
                                                 new_failover_stats)

        # Verify Active and Replica Bucket Count
        if self.num_replicas > 0:
            nodes = self.get_nodes_in_cluster(self.master)
            self.vb_distribution_analysis(servers=nodes,
                                          buckets=self.buckets,
                                          std=20.0,
                                          total_vbuckets=self.total_vbuckets)

        self.log.info("End VERIFICATION for Add-back and rebalance")

    def print_test_params(self, failover_reason):
        """ Method to print test parameters """
        self.log.info("num_replicas : {0}".format(self.num_replicas))
        self.log.info("recoveryType : {0}".format(self.recoveryType))
        self.log.info("failover_reason : {0}".format(failover_reason))
        self.log.info("num_failed_nodes : {0}".format(self.num_failed_nodes))
        self.log.info('picking server : {0} as the master'.format(self.master))

    def run_failover_operations(self, chosen, failover_reason):
        """ Method to run fail over operations used in the test scenario based on failover reason """
        # Perform Operations related to failover
        graceful_count = 0
        graceful_failover = True
        failed_over = True
        for node in chosen:
            unreachable = False
            if failover_reason == 'stop_server':
                unreachable = True
                self.stop_server(node)
                self.log.info(
                    "10 seconds delay to wait for membase-server to shutdown")
                # wait for 5 minutes until node is down
                self.assertTrue(
                    RestHelper(self.rest).wait_for_node_status(
                        node, "unhealthy", self.wait_timeout * 10),
                    msg=
                    "node status is not unhealthy even after waiting for 5 minutes"
                )
            elif failover_reason == "firewall":
                unreachable = True
                self.filter_list.append(node.ip)
                server = [srv for srv in self.servers if node.ip == srv.ip][0]
                RemoteUtilHelper.enable_firewall(
                    server, bidirectional=self.bidirectional)
                status = RestHelper(self.rest).wait_for_node_status(
                    node, "unhealthy", self.wait_timeout * 10)
                if status:
                    self.log.info(
                        "node {0}:{1} is 'unhealthy' as expected".format(
                            node.ip, node.port))
                else:
                    # verify iptables on the node if something wrong
                    for server in self.servers:
                        if server.ip == node.ip:
                            shell = RemoteMachineShellConnection(server)
                            info = shell.extract_remote_info()
                            if info.type.lower() == "windows":
                                o, r = shell.execute_command(
                                    "netsh advfirewall show allprofiles")
                                shell.log_command_output(o, r)
                            else:
                                o, r = shell.execute_command(
                                    "/sbin/iptables --list")
                                shell.log_command_output(o, r)
                            shell.disconnect()
                    self.rest.print_UI_logs()
                    api = self.rest.baseUrl + 'nodeStatuses'
                    status, content, header = self.rest._http_request(api)
                    json_parsed = json.loads(content)
                    self.log.info("nodeStatuses: {0}".format(json_parsed))
                    self.fail(
                        "node status is not unhealthy even after waiting for 5 minutes"
                    )
            # verify the failover type
            if self.check_verify_failover_type:
                graceful_count, graceful_failover = self.verify_failover_type(
                    node, graceful_count, self.num_replicas, unreachable)
            # define precondition check for failover
            success_failed_over = self.rest.fail_over(
                node.id, graceful=(self.graceful and graceful_failover))
            if self.graceful and graceful_failover:
                if self.stopGracefulFailover or self.killNodes or self.stopNodes or self.firewallOnNodes:
                    self.victim_node_operations(node)
                    # Start Graceful Again
                    self.log.info(" Start Graceful Failover Again !")
                    self.sleep(60)
                    success_failed_over = self.rest.fail_over(
                        node.id,
                        graceful=(self.graceful and graceful_failover))
                    msg = "graceful failover failed for nodes {0}".format(
                        node.id)
                    self.assertTrue(
                        self.rest.monitorRebalance(stop_if_loop=True), msg=msg)
                else:
                    msg = "rebalance failed while removing failover nodes {0}".format(
                        node.id)
                    self.assertTrue(
                        self.rest.monitorRebalance(stop_if_loop=True), msg=msg)
            failed_over = failed_over and success_failed_over

        # Check for negative cases
        if self.graceful and (failover_reason in ['stop_server', 'firewall']):
            if failed_over:
                # MB-10479
                self.rest.print_UI_logs()
            self.assertFalse(
                failed_over,
                "Graceful Falover was started for unhealthy node!!! ")
            return
        elif self.gracefulFailoverFail and not failed_over:
            """ Check if the fail_over fails as expected """
            self.assertFalse(
                failed_over,
                """ Graceful failover should fail due to not enough replicas """
            )
            return

        # Check if failover happened as expected or re-try one more time
        if not failed_over:
            self.log.info(
                "unable to failover the node the first time. Trying again in 75 seconds ..."
            )
            # retry once after 75 seconds
            self.sleep(75)
            failed_over = self.rest.fail_over(node.id,
                                              graceful=(self.graceful
                                                        and graceful_failover))
        if self.graceful and (failover_reason
                              not in ['stop_server', 'firewall']):
            reached = RestHelper(self.rest).rebalance_reached()
            self.assertTrue(
                reached,
                "rebalance failed for Graceful Failover, stuck or did not completed"
            )

        # Verify Active and Replica Bucket Count
        if self.num_replicas > 0:
            nodes = self.filter_servers(self.servers, chosen)
            self.vb_distribution_analysis(servers=nodes,
                                          buckets=self.buckets,
                                          std=20.0,
                                          total_vbuckets=self.total_vbuckets,
                                          type="failover",
                                          graceful=(self.graceful
                                                    and graceful_failover))

    def run_failover_operations_with_ops(self, chosen, failover_reason):
        """ Method to run fail over operations used in the test scenario based on failover reason """
        # Perform Operations related to failover
        failed_over = True
        for node in chosen:
            unreachable = False
            if failover_reason == 'stop_server':
                unreachable = True
                self.stop_server(node)
                self.log.info(
                    "10 seconds delay to wait for membase-server to shutdown")
                # wait for 5 minutes until node is down
                self.assertTrue(
                    RestHelper(self.rest).wait_for_node_status(
                        node, "unhealthy", 300),
                    msg=
                    "node status is not unhealthy even after waiting for 5 minutes"
                )
            elif failover_reason == "firewall":
                unreachable = True
                self.filter_list.append(node.ip)
                server = [srv for srv in self.servers if node.ip == srv.ip][0]
                RemoteUtilHelper.enable_firewall(
                    server, bidirectional=self.bidirectional)
                status = RestHelper(self.rest).wait_for_node_status(
                    node, "unhealthy", 300)
                if status:
                    self.log.info(
                        "node {0}:{1} is 'unhealthy' as expected".format(
                            node.ip, node.port))
                else:
                    # verify iptables on the node if something wrong
                    for server in self.servers:
                        if server.ip == node.ip:
                            shell = RemoteMachineShellConnection(server)
                            info = shell.extract_remote_info()
                            if info.type.lower() == "windows":
                                o, r = shell.execute_command(
                                    "netsh advfirewall show allprofiles")
                                shell.log_command_output(o, r)
                            else:
                                o, r = shell.execute_command(
                                    "/sbin/iptables --list")
                                shell.log_command_output(o, r)
                            shell.disconnect()
                    self.rest.print_UI_logs()
                    api = self.rest.baseUrl + 'nodeStatuses'
                    status, content, header = self.rest._http_request(api)
                    json_parsed = json.loads(content)
                    self.log.info("nodeStatuses: {0}".format(json_parsed))
                    self.fail(
                        "node status is not unhealthy even after waiting for 5 minutes"
                    )
        nodes = self.filter_servers(self.servers, chosen)
        failed_over = self.cluster.async_failover([self.master],
                                                  failover_nodes=chosen,
                                                  graceful=self.graceful)
        # Perform Compaction
        compact_tasks = []
        if self.compact:
            for bucket in self.buckets:
                compact_tasks.append(
                    self.cluster.async_compact_bucket(self.master, bucket))
        # Run View Operations
        if self.withViewsOps:
            self.query_and_monitor_view_tasks(nodes)
        # Run mutation operations
        if self.withMutationOps:
            self.run_mutation_operations()
        failed_over.result()
        for task in compact_tasks:
            task.result()
        msg = "rebalance failed while removing failover nodes {0}".format(
            node.id)
        self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True), msg=msg)

    def load_initial_data(self):
        """ Method to run operations Update/Delete/Create """
        # Load All Buckets if num_items > 0
        tasks = []
        tasks += self._async_load_all_buckets(self.master,
                                              self.gen_initial_create,
                                              "create",
                                              0,
                                              flag=2,
                                              batch_size=20000)
        for task in tasks:
            task.result()
        self._wait_for_stats_all_buckets(self.servers,
                                         check_ep_items_remaining=True)
        self._verify_stats_all_buckets(self.servers, timeout=120)

    def run_mutation_operations(self):
        mutation_ops_tasks = []
        if "create" in self.doc_ops:
            mutation_ops_tasks += self._async_load_all_buckets(
                self.master, self.gen_create, "create", 0)
        if "update" in self.doc_ops:
            mutation_ops_tasks += self._async_load_all_buckets(
                self.master, self.gen_update, "update", 0)
        if "delete" in self.doc_ops:
            mutation_ops_tasks += self._async_load_all_buckets(
                self.master, self.gen_delete, "delete", 0)
        try:
            for task in mutation_ops_tasks:
                task.result()
        except Exception, ex:
            self.log.info(ex)
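
run_add_back_operation_and_verify above is built around one rebalance call: each failed-over node is either marked with a recovery type or plainly added back, then a rebalance with an empty ejection list, optionally carrying deltaRecoveryBuckets, re-integrates them. A minimal sketch of that core follows; the standalone function and its parameters are illustrative, while the REST calls are the ones used above.

def add_back_with_recovery(rest, all_nodes, chosen, recovery_types=None,
                           delta_recovery_buckets=None):
    # rest           : RestConnection to the reference node
    # all_nodes      : rest.node_statuses() captured before the failover
    # chosen         : failed-over nodes to bring back
    # recovery_types : per-node list of "delta"/"full"; None means plain add-back
    for index, node in enumerate(chosen):
        if recovery_types:
            rest.set_recovery_type(otpNode=node.id,
                                   recoveryType=recovery_types[index])
        else:
            rest.add_back_node(node.id)

    # Rebalance with no ejections re-integrates the nodes; delta recovery can
    # optionally be restricted to specific buckets.
    rest.rebalance(otpNodes=[n.id for n in all_nodes],
                   ejectedNodes=[],
                   deltaRecoveryBuckets=delta_recovery_buckets)
    assert rest.monitorRebalance(stop_if_loop=True), \
        "rebalance failed while adding back nodes {0}".format(chosen)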
Ejemplo n.º 47
0
class FailoverTests(FailoverBaseTest):
    def setUp(self):
        super(FailoverTests, self).setUp()
        self.server_map = self.get_server_map(self.servers)

    def tearDown(self):
        super(FailoverTests, self).tearDown()

    def test_failover_firewall(self):
        self.common_test_body('firewall')

    def test_failover_normal(self):
        self.common_test_body('normal')

    def test_failover_stop_server(self):
        self.common_test_body('stop_server')

    def test_failover_then_add_back(self):
        self.add_back_flag = True
        self.common_test_body('normal')

    def common_test_body(self, failover_reason):
        """
            Main Test body which contains the flow of the failover basic steps
            1. Starts Operations if programmed into the test case(before/after)
            2. Start View and Index Building operations
            3. Failover K out of N nodes (failover can be HARD/GRACEFUL)
            4.1 Rebalance the cluster after failover of K nodes
            4.2 Run Add-Back operation with recoveryType = (full/delta) with rebalance
            5. Verify all expected operations completed by checking stats,
               replication, views, data correctness
        """
        # Pick the reference node for communication
        # We pick a node in the cluster which will NOT be failed over
        self.filter_list = []
        if self.failoverMaster:
            self.master = self.servers[1]
        self.log.info("Picking node {0} as reference node for test case"
                      .format(self.master.ip))
        self.print_test_params(failover_reason)
        self.rest = RestConnection(self.master)
        self.nodes = self.rest.node_statuses()
        # Set the data path for the cluster
        self.data_path = self.rest.get_data_path()

        # Check if the test case has to be run for 3.0.0
        versions = self.rest.get_nodes_versions()
        self.version_greater_than_2_5 = True
        for version in versions:
            if "3" > version:
                self.version_greater_than_2_5 = False

        # Do not run this test if the graceful category is being used
        if not self.version_greater_than_2_5 and (self.graceful or (self.recoveryType != None)):
            self.log.error("Graceful failover can't be applied to nodes with version less than 3.*")
            self.log.error("Please check configuration parameters: SKIPPING TEST.")
            return

        # Find nodes that will undergo failover
        if self.failoverMaster:
            self.chosen = RebalanceHelper.pick_nodes(
                self.master, howmany=1, target_node=self.servers[0])
        else:
            self.chosen = RebalanceHelper.pick_nodes(
                self.master, howmany=self.num_failed_nodes)

        # Perform operations - Create/Update/Delete
        # self.withMutationOps = True => Run Operations in parallel to failover
        # self.withMutationOps = False => Run Operations Before failover
        self.load_initial_data()
        if not self.withMutationOps:
            self.run_mutation_operations()
        # Perform view creation tasks and wait for completion before failover
        if self.withViewsOps:
            self.run_view_creation_operations(self.servers)
            if not self.createIndexesDuringFailover:
                self.query_and_monitor_view_tasks(self.servers)

        # Validate seq_no snap_start/stop values
        self.check_snap_start_corruption()

        # Take a snapshot of the data set used for validation
        record_static_data_set = dict()
        prev_vbucket_stats = dict()
        prev_failover_stats = dict()
        if not self.withMutationOps:
            record_static_data_set = self.get_data_set_all(
                self.servers, self.buckets, path=None)

        # Capture  vbucket and failover stats if test version >= 2.5.*
        if self.version_greater_than_2_5 and self.upr_check:
            prev_vbucket_stats = self.get_vbucket_seqnos(self.servers,
                                                         self.buckets)
            prev_failover_stats = self.get_failovers_logs(self.servers,
                                                          self.buckets)

        # Perform Operations related to failover
        if self.withMutationOps or self.withViewsOps or self.compact:
            self.run_failover_operations_with_ops(self.chosen, failover_reason)
        else:
            self.run_failover_operations(self.chosen, failover_reason)

        # TODO: Enable this even when 'flusher_total_batch_limit' is not set
        if self.flusher_total_batch_limit and \
                self.num_replicas >= self.num_failed_nodes:
            tasks = self._async_load_all_buckets(
                self.master, self.gen_update, "update", 0)
            for task in tasks:
                task.result()

        if self.graceful:
            # Validate seq_no snap_start/stop values
            self.check_snap_start_corruption()

        # Add back + rebalance // only rebalance with verification
        if not self.gracefulFailoverFail and self.runRebalanceAfterFailover:
            if self.add_back_flag:
                self.run_add_back_operation_and_verify(
                    self.chosen, prev_vbucket_stats, record_static_data_set,
                    prev_failover_stats)
            else:
                self.run_rebalance_after_failover_and_verify(
                    self.chosen, prev_vbucket_stats, record_static_data_set,
                    prev_failover_stats)

        if self.graceful:
            # Validate seq_no snap_start/stop values
            self.check_snap_start_corruption()

        if self.during_ops is None:
            self.verify_unacked_bytes_all_buckets(filter_list=self.filter_list,
                                                  master_node=self.master)

    def run_rebalance_after_failover_and_verify(self, chosen, prev_vbucket_stats, record_static_data_set, prev_failover_stats):
        """ Method to run rebalance after failover and verify """
        # Need a delay > min because MB-7168
        _servers_ = self.filter_servers(self.servers, chosen)
        self._wait_for_stats_all_buckets(_servers_, check_ep_items_remaining=True)
        self.sleep(5, "after failover before invoking rebalance...")
        # Rebalance after Failover operation
        self.rest.rebalance(otpNodes=[node.id for node in self.nodes], ejectedNodes=[node.id for node in chosen])
        if self.during_ops:
            self.sleep(5, "Wait for some progress in rebalance")
            if self.during_ops == "change_password":
                old_pass = self.master.rest_password
                self.change_password(new_password=self.input.param("new_password", "new_pass"))
                self.rest = RestConnection(self.master)
            elif self.during_ops == "change_port":
                self.change_port(new_port=self.input.param("new_port", "9090"))
                self.rest = RestConnection(self.master)
        # Perform Compaction
        if self.compact:
            for bucket in self.buckets:
                self.cluster.compact_bucket(self.master, bucket)
        # Perform View Validation if Supported
        nodes = self.filter_servers(self.servers, chosen)
        if self.withViewsOps:
            self.query_and_monitor_view_tasks(nodes)

        # Run operations if required during rebalance after failover
        if self.withMutationOps:
            self.run_mutation_operations_after_failover()

        # Kill or restart operations
        if self.killNodes or self.stopNodes or self.firewallOnNodes:
            self.victim_node_operations(node=chosen[0])
            self.sleep(60)
            self.log.info(" Start Rebalance Again !")
            self.rest.rebalance(otpNodes=[node.id for node in self.nodes], ejectedNodes=[node.id for node in chosen])

        # Rebalance Monitoring
        msg = "rebalance failed while removing failover nodes {0}".format([node.id for node in chosen])
        self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True), msg=msg)

        # Reset password or port
        if self.during_ops:
            if self.during_ops == "change_password":
                self.change_password(new_password=old_pass)
            elif self.during_ops == "change_port":
                self.change_port(new_port='8091',
                                 current_port=self.input.param("new_port", "9090"))
            return

        #  Drain Queue and make sure intra-cluster replication is complete
        self.log.info("Begin VERIFICATION for Rebalance after Failover Only")
        self.verify_cluster_stats(_servers_, self.master, check_bucket_stats=True, check_ep_items_remaining=True)
        # Verify the full data set with metadata after the failover
        if not self.withMutationOps:
            self.sleep(60)
            self.data_analysis_all(record_static_data_set, _servers_, self.buckets, path=None, addedItems=None)

        # Check Cluster Stats and Data as well if max_verify > 0
        # Check failover logs: currently verified only for graceful failover;
        # the logic for other cases is still to be confirmed
        if self.version_greater_than_2_5 and self.graceful and self.upr_check:
            new_failover_stats = self.compare_failovers_logs(prev_failover_stats, _servers_, self.buckets)
            new_vbucket_stats = self.compare_vbucket_seqnos(prev_vbucket_stats, _servers_, self.buckets)
            self.compare_vbucketseq_failoverlogs(new_vbucket_stats, new_failover_stats)
        # Verify Active and Replica Bucket Count
        if self.num_replicas > 0:
            nodes = self.get_nodes_in_cluster(self.master)
            self.vb_distribution_analysis(servers=nodes, buckets=self.buckets, std=20.0, total_vbuckets=self.total_vbuckets)
        self.log.info("End VERIFICATION for Rebalance after Failover Only")

    def run_add_back_operation_and_verify(self, chosen, prev_vbucket_stats, record_static_data_set, prev_failover_stats):
        """
            Method to run add-back operation with recovery type = (delta/full)
            It also verifies that the operations are correct with data verification steps
        """
        _servers_ = self.filter_servers(self.servers, chosen)
        self._wait_for_stats_all_buckets(_servers_, check_ep_items_remaining=True)
        recoveryTypeMap = self.define_maps_during_failover(self.recoveryType)
        fileMapsForVerification = self.create_file(chosen, self.buckets, self.server_map)
        index = 0
        for node in chosen:
            self.sleep(5)
            if self.recoveryType:
                # define precondition for recoverytype
                self.rest.set_recovery_type(otpNode=node.id, recoveryType=self.recoveryType[index])
                index += 1
            else:
                self.rest.add_back_node(node.id)

        # Doc_mutation before triggering rebalance
        if self.flusher_total_batch_limit and \
                self.num_replicas >= self.num_failed_nodes:
            tasks = self._async_load_all_buckets(
                self.master, self.gen_update, "update", 0)
            for task in tasks:
                task.result()

        self.sleep(20, "After failover before invoking rebalance...")
        self.rest.rebalance(otpNodes=[node.id for node in self.nodes],
                            ejectedNodes=[],
                            deltaRecoveryBuckets=self.deltaRecoveryBuckets)

        # Perform Compaction
        if self.compact:
            for bucket in self.buckets:
                self.cluster.compact_bucket(self.master, bucket)

        # Perform View Validation if Supported
        nodes = self.filter_servers(self.servers, chosen)
        if self.withViewsOps:
            self.query_and_monitor_view_tasks(nodes)

        # Run operations if required during rebalance after failover
        if self.withMutationOps:
            self.run_mutation_operations_after_failover()

        # Kill or restart operations
        if self.killNodes or self.stopNodes or self.firewallOnNodes:
            self.victim_node_operations(node=chosen[0])
            self.sleep(60)
            self.log.info("Start Rebalance Again!")
            self.rest.rebalance(otpNodes=[node.id for node in self.nodes],
                                ejectedNodes=[],
                                deltaRecoveryBuckets=self.deltaRecoveryBuckets)
            self.sleep(10, "Wait for rebalance to start")

        # Check if node has to be killed or restarted during rebalance
        # Monitor Rebalance
        msg = "rebalance failed while removing failover nodes {0}".format(chosen)
        self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True), msg=msg)

        # Drain ep_queue and make sure that intra-cluster replication is complete
        self._wait_for_stats_all_buckets(self.servers, check_ep_items_remaining=True)

        self.log.info("Begin VERIFICATION for Add-back and rebalance")

        # Verify Stats of cluster and Data if max_verify > 0
        self.verify_cluster_stats(self.servers, self.master, check_bucket_stats=True, check_ep_items_remaining=True)

        # Verify recovery Type succeeded if we added-back nodes
        self.verify_for_recovery_type(chosen, self.server_map, self.buckets, recoveryTypeMap, fileMapsForVerification, self.deltaRecoveryBuckets)

        # Comparison of all data if required
        if not self.withMutationOps and self.flusher_total_batch_limit is None:
            self.sleep(60)
            self.data_analysis_all(record_static_data_set, self.servers, self.buckets, path=None, addedItems=None)

        # Verify if vbucket sequence numbers and failover logs are as expected
        # We will check only for version > 2.5.* and if the failover is graceful
        if self.version_greater_than_2_5 and self.graceful and self.upr_check:
            new_vbucket_stats = self.compare_vbucket_seqnos(prev_vbucket_stats, self.servers, self.buckets, perNode=False)
            new_failover_stats = self.compare_failovers_logs(prev_failover_stats, self.servers, self.buckets)
            self.compare_vbucketseq_failoverlogs(new_vbucket_stats, new_failover_stats)

        # Verify Active and Replica Bucket Count
        if self.num_replicas > 0:
            nodes = self.get_nodes_in_cluster(self.master)
            self.vb_distribution_analysis(servers=nodes, buckets=self.buckets, std=20.0, total_vbuckets=self.total_vbuckets)

        self.log.info("End VERIFICATION for Add-back and rebalance")

    def print_test_params(self, failover_reason):
        """ Method to print test parameters """
        self.log.info("num_replicas : {0}".format(self.num_replicas))
        self.log.info("recoveryType : {0}".format(self.recoveryType))
        self.log.info("failover_reason : {0}".format(failover_reason))
        self.log.info("num_failed_nodes : {0}".format(self.num_failed_nodes))
        self.log.info('picking server : {0} as the master'.format(self.master))

    def run_failover_operations(self, chosen, failover_reason):
        """ Method to run fail over operations used in the test scenario based on failover reason """
        # Perform Operations related to failover
        graceful_count = 0
        graceful_failover = True
        failed_over = True
        for node in chosen:
            unreachable = False
            if failover_reason == 'stop_server':
                unreachable = True
                self.stop_server(node)
                self.log.info("10 seconds delay to wait for membase-server to shutdown")
                # wait for 5 minutes until node is down
                self.assertTrue(RestHelper(self.rest).wait_for_node_status(node, "unhealthy", self.wait_timeout * 10),
                                    msg="node status is not unhealthy even after waiting for 5 minutes")
            elif failover_reason == "firewall":
                unreachable = True
                self.filter_list.append(node.ip)
                server = [srv for srv in self.servers if node.ip == srv.ip][0]
                RemoteUtilHelper.enable_firewall(server, bidirectional=self.bidirectional)
                status = RestHelper(self.rest).wait_for_node_status(node, "unhealthy", self.wait_timeout * 10)
                if status:
                    self.log.info("node {0}:{1} is 'unhealthy' as expected".format(node.ip, node.port))
                else:
                    # verify iptables on the node if something wrong
                    for server in self.servers:
                        if server.ip == node.ip:
                            shell = RemoteMachineShellConnection(server)
                            info = shell.extract_remote_info()
                            if info.type.lower() == "windows":
                                o, r = shell.execute_command("netsh advfirewall show allprofiles")
                                shell.log_command_output(o, r)
                            else:
                                o, r = shell.execute_command("/sbin/iptables --list")
                                shell.log_command_output(o, r)
                            shell.disconnect()
                    self.rest.print_UI_logs()
                    api = self.rest.baseUrl + 'nodeStatuses'
                    status, content, header = self.rest._http_request(api)
                    json_parsed = json.loads(content)
                    self.log.info("nodeStatuses: {0}".format(json_parsed))
                    self.fail("node status is not unhealthy even after waiting for 5 minutes")
            # verify the failover type
            if self.check_verify_failover_type:
                graceful_count, graceful_failover = self.verify_failover_type(node, graceful_count, self.num_replicas, unreachable)
            # define precondition check for failover
            success_failed_over = self.rest.fail_over(node.id, graceful=(self.graceful and graceful_failover))
            if self.graceful and graceful_failover:
                if self.stopGracefulFailover or self.killNodes or self.stopNodes or self.firewallOnNodes:
                    self.victim_node_operations(node)
                    # Start Graceful Again
                    self.log.info(" Start Graceful Failover Again !")
                    self.sleep(120)
                    success_failed_over = self.rest.fail_over(node.id, graceful=(self.graceful and graceful_failover))
                    self.sleep(180)
                    msg = "graceful failover failed for nodes {0}".format(node.id)
                    self.log.info("chosen: {0} get_failover_count: {1}".format(len(chosen),
                                                                               self.get_failover_count()))
                    self.assertEqual(len(chosen), self.get_failover_count(), msg=msg)
                else:
                    msg = "rebalance failed while removing failover nodes {0}".format(node.id)
                    self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True), msg=msg)
            failed_over = failed_over and success_failed_over

        # Check for negative cases
        if self.graceful and (failover_reason in ['stop_server', 'firewall']):
            if failed_over:
                # MB-10479
                self.rest.print_UI_logs()
            self.assertFalse(failed_over, "Graceful Falover was started for unhealthy node!!! ")
            return
        elif self.gracefulFailoverFail and not failed_over:
            """ Check if the fail_over fails as expected """
            self.assertFalse(failed_over, """ Graceful failover should fail due to not enough replicas """)
            return

        # Check if failover happened as expected or re-try one more time
        if not failed_over:
            self.log.info("unable to failover the node the first time. try again in  60 seconds..")
            # try again in 75 seconds
            self.sleep(75)
            failed_over = self.rest.fail_over(node.id, graceful=(self.graceful and graceful_failover))
        if self.graceful and (failover_reason not in ['stop_server', 'firewall']):
            reached = RestHelper(self.rest).rebalance_reached()
            self.assertTrue(reached, "rebalance for Graceful Failover is stuck or did not complete")

        # Verify Active and Replica Bucket Count
        if self.num_replicas > 0:
            nodes = self.filter_servers(self.servers, chosen)
            self.vb_distribution_analysis(servers=nodes, buckets=self.buckets, std=20.0, total_vbuckets=self.total_vbuckets, type="failover", graceful=(self.graceful and graceful_failover))

    def run_failover_operations_with_ops(self, chosen, failover_reason):
        """ Method to run fail over operations used in the test scenario based on failover reason """
        # Perform Operations related to failover
        failed_over = True
        for node in chosen:
            unreachable = False
            if failover_reason == 'stop_server':
                unreachable = True
                self.stop_server(node)
                self.log.info("10 seconds delay to wait for membase-server to shutdown")
                # wait for 5 minutes until node is down
                self.assertTrue(RestHelper(self.rest).wait_for_node_status(node, "unhealthy", 300),
                                    msg="node status is not unhealthy even after waiting for 5 minutes")
            elif failover_reason == "firewall":
                unreachable = True
                self.filter_list.append(node.ip)
                server = [srv for srv in self.servers if node.ip == srv.ip][0]
                RemoteUtilHelper.enable_firewall(server, bidirectional=self.bidirectional)
                status = RestHelper(self.rest).wait_for_node_status(node, "unhealthy", 300)
                if status:
                    self.log.info("node {0}:{1} is 'unhealthy' as expected".format(node.ip, node.port))
                else:
                    # verify iptables on the node if something wrong
                    for server in self.servers:
                        if server.ip == node.ip:
                            shell = RemoteMachineShellConnection(server)
                            info = shell.extract_remote_info()
                            if info.type.lower() == "windows":
                                o, r = shell.execute_command("netsh advfirewall show allprofiles")
                                shell.log_command_output(o, r)
                            else:
                                o, r = shell.execute_command("/sbin/iptables --list")
                                shell.log_command_output(o, r)
                            shell.disconnect()
                    self.rest.print_UI_logs()
                    api = self.rest.baseUrl + 'nodeStatuses'
                    status, content, header = self.rest._http_request(api)
                    json_parsed = json.loads(content)
                    self.log.info("nodeStatuses: {0}".format(json_parsed))
                    self.fail("node status is not unhealthy even after waiting for 5 minutes")
        nodes = self.filter_servers(self.servers, chosen)
        failed_over = self.cluster.async_failover([self.master], failover_nodes=chosen, graceful=self.graceful)
        # Perform Compaction
        compact_tasks = []
        if self.compact:
            for bucket in self.buckets:
                compact_tasks.append(self.cluster.async_compact_bucket(self.master, bucket))
        # Run View Operations
        if self.withViewsOps:
            self.query_and_monitor_view_tasks(nodes)
        # Run mutation operations
        if self.withMutationOps:
            self.run_mutation_operations()
        failed_over.result()
        for task in compact_tasks:
            task.result()
        msg = "rebalance failed while removing failover nodes {0}".format(node.id)
        self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True), msg=msg)


    def load_initial_data(self):
        """ Method to run operations Update/Delete/Create """
        # Load All Buckets if num_items > 0
        tasks = []
        tasks += self._async_load_all_buckets(self.master, self.gen_initial_create, "create", 0, flag=2, batch_size=20000)
        for task in tasks:
            task.result()
        self._wait_for_stats_all_buckets(self.servers, check_ep_items_remaining=True)
        self._verify_stats_all_buckets(self.servers, timeout=120)

    def run_mutation_operations(self):
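        """ Run create/update/delete doc loaders in parallel, as selected by doc_ops """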
        mutation_ops_tasks = []
        if "create" in self.doc_ops:
            mutation_ops_tasks += self._async_load_all_buckets(
                self.master, self.gen_create, "create", 0)
        if "update" in self.doc_ops:
            mutation_ops_tasks += self._async_load_all_buckets(
                self.master, self.gen_update, "update", 0)
        if "delete" in self.doc_ops:
            mutation_ops_tasks += self._async_load_all_buckets(
                self.master, self.gen_delete, "delete", 0)
        try:
            for task in mutation_ops_tasks:
                task.result()
        except Exception as ex:
            self.log.info(ex)

    def run_mutation_operations_after_failover(self):
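        """ Run the post-failover create/update/delete doc loaders, as selected by doc_ops """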
        mutation_ops_tasks = []
        if "create" in self.doc_ops:
            mutation_ops_tasks += self._async_load_all_buckets(
                self.master, self.afterfailover_gen_create, "create", 0)
        if "update" in self.doc_ops:
            mutation_ops_tasks += self._async_load_all_buckets(
                self.master, self.afterfailover_gen_update, "update", 0)
        if "delete" in self.doc_ops:
            mutation_ops_tasks += self._async_load_all_buckets(
                self.master, self.afterfailover_gen_delete, "delete", 0)
        try:
            for task in mutation_ops_tasks:
                task.result()
        except Exception as ex:
            self.log.info(ex)

    def define_maps_during_failover(self, recoveryType=None):
        """ Method to define the node ip -> recovery type map """
        recoveryTypeMap = {}
        index = 0
        for server in self.chosen:
            if recoveryType:
                recoveryTypeMap[server.ip] = recoveryType[index]
            index += 1
        return recoveryTypeMap

    def filter_servers(self, original_servers, filter_servers):
        """ Filter servers that have not failed over """
        _servers_ = copy.deepcopy(original_servers)
        for failed in filter_servers:
            for server in _servers_:
                if server.ip == failed.ip:
                    _servers_.remove(server)
                    self._cleanup_nodes.append(server)
        return _servers_

    def verify_for_recovery_type(self, chosen=[], serverMap={}, buckets=[], recoveryTypeMap={}, fileMap={}, deltaRecoveryBuckets=[]):
        """ Verify recovery type is delta or full """
        summary = ""
        logic = True
        for server in self.chosen:
            shell = RemoteMachineShellConnection(serverMap[server.ip])
            os_type = shell.extract_remote_info()
            if os_type.type.lower() == 'windows':
                return
            for bucket in buckets:
                path = fileMap[server.ip][bucket.name]
                exists = shell.file_exists(path, "check.txt")
                if deltaRecoveryBuckets is not None:
                    if recoveryTypeMap[server.ip] == "delta" and (bucket.name in deltaRecoveryBuckets) and not exists:
                        logic = False
                        summary += "\n Failed Condition :: node {0}, bucket {1} :: Expected Delta, Actual Full".format(server.ip, bucket.name)
                    elif recoveryTypeMap[server.ip] == "delta" and (bucket.name not in deltaRecoveryBuckets) and exists:
                        summary += "\n Failed Condition :: node {0}, bucket {1} :: Expected Full, Actual Delta".format(server.ip, bucket.name)
                        logic = False
                else:
                    if recoveryTypeMap[server.ip] == "delta"  and not exists:
                        logic = False
                        summary += "\n Failed Condition :: node {0}, bucket {1} :: Expected Delta, Actual Full".format(server.ip, bucket.name)
                    elif recoveryTypeMap[server.ip] == "full" and exists:
                        logic = False
                        summary += "\n Failed Condition :: node {0}, bucket {1}  :: Expected Full, Actual Delta".format(server.ip, bucket.name)
            shell.disconnect()
        self.assertTrue(logic, summary)

    def run_view_creation_operations(self, servers):
        """" Run view Creation and indexing building tasks on servers """
        num_views = self.input.param("num_views", 5)
        is_dev_ddoc = self.input.param("is_dev_ddoc", True)
        num_tries = self.input.param("num_tries", 10)
        ddoc_name = "ddoc1"
        prefix = ("", "dev_")[is_dev_ddoc]

        query = {}
        query["connectionTimeout"] = 60000
        query["full_set"] = "true"

        views = []
        tasks = []
        for bucket in self.buckets:
            temp = self.make_default_views(self.default_view_name, num_views,
                                           is_dev_ddoc, different_map=False)
            temp_tasks = self.async_create_views(self.master, ddoc_name, temp, bucket)
            views += temp
            tasks += temp_tasks

        timeout = max(self.wait_timeout * 4, len(self.buckets) * self.wait_timeout * self.num_items // 50000)

        for task in tasks:
            task.result(self.wait_timeout * 20)

    def query_and_monitor_view_tasks(self, servers):
        """ Monitor Query Tasks for their completion """
        num_views = self.input.param("num_views", 5)
        is_dev_ddoc = self.input.param("is_dev_ddoc", True)
        ddoc_name = "ddoc1"
        prefix = ("", "dev_")[is_dev_ddoc]
        self.verify_query_task()
        active_tasks = self.cluster.async_monitor_active_task(servers, "indexer", "_design/" + prefix + ddoc_name, wait_task=False)
        for active_task in active_tasks:
            result = active_task.result()
            self.assertTrue(result)

    def verify_query_task(self):
        """ Verify Query Results """
        num_views = self.input.param("num_views", 5)
        is_dev_ddoc = self.input.param("is_dev_ddoc", True)
        ddoc_name = "ddoc1"
        prefix = ("", "dev_")[is_dev_ddoc]
        query = {}
        query["connectionTimeout"] = 60000
        query["full_set"] = "true"
        expected_rows = None
        timeout = None
        if self.active_resident_threshold == 0:
            timeout = 2400
        if self.max_verify:
            expected_rows = self.max_verify
            query["limit"] = expected_rows
        query["stale"] = "false"
        for bucket in self.buckets:
            self.perform_verify_queries(num_views, prefix, ddoc_name, query, bucket=bucket, wait_time=timeout, expected_rows=expected_rows)

    def create_file(self, chosen, buckets, serverMap):
        """ Created files in data paths for checking if delta/full recovery occured """
        fileMap = {}
        for server in self.chosen:
            shell = RemoteMachineShellConnection(serverMap[server.ip])
            dist_type = shell.extract_remote_info().distribution_type
            bucket_paths = {}
            for bucket in buckets:
                if dist_type.lower() == 'windows':
                    self.data_path = 'c:/Program\ Files/Couchbase/Server/var/lib/couchbase/data'
                bucket_data_path = self.data_path + "/" + bucket.name + "/" + "check.txt"
                full_path = self.data_path + "/" + bucket.name + "/"
                bucket_paths[bucket.name] = full_path
                shell.create_file(bucket_data_path, "check")
            fileMap[server.ip] = bucket_paths
            shell.disconnect()
        return fileMap

    def verify_failover_type(self, chosen=None, graceful_count=0, replica_count=0, unreachable=False):
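        """ Check the gracefulFailoverPossible flag reported for the chosen node against
            the replica count and node reachability; returns the updated graceful count
            and whether a graceful failover is expected """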
        logic = True
        summary = ""
        nodes = self.rest.node_statuses()
        node_count = len(nodes)
        change_graceful_count = graceful_count
        graceful_failover = True
        if unreachable:
            node_count -= 1
        else:
            change_graceful_count += 1
        if replica_count != 0:
            for node in nodes:
                if unreachable and node.ip == chosen.ip:
                    graceful_failover = node.gracefulFailoverPossible
                    if node.gracefulFailoverPossible:
                        logic = False
                        summary += "\n failover type for unreachable node {0} Expected :: Hard, Actual :: Graceful".format(node.ip)
                elif node.ip == chosen.ip:
                    graceful_failover = node.gracefulFailoverPossible
                    if replica_count > graceful_count and (node_count - 1) + graceful_count >= replica_count:
                        if not node.gracefulFailoverPossible:
                            logic = False
                            summary += "\n failover type for node {0} Expected :: Graceful, Actual :: Hard".format(node.ip)
                    else:
                        if node.gracefulFailoverPossible:
                            logic = False
                            summary += "\n failover type for  {0} Expected :: Hard, Actual :: Graceful".format(node.ip)
        else:
            for node in nodes:
                if node.ip == chosen.ip:
                    graceful_failover = node.gracefulFailoverPossible
                    if node.gracefulFailoverPossible:
                        logic = False
                        summary += "\n failover type for node {0} Expected :: Hard, Actual :: Graceful".format(node.ip)
        self.assertTrue(logic, summary)
        return change_graceful_count, graceful_failover

    def get_server_map(self, node):
        """ Map of ips and server information """
        map = {}
        for server in self.servers:
            map[server.ip] = server
        return map

    def victim_node_operations(self, node=None):
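        """ Disruptive actions on victim nodes, driven by test params: stop a graceful
            failover, kill memcached, stop/start the server, or toggle the firewall """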
        if self.stopGracefulFailover:
            self.log.info(" Stopping Graceful Failover ")
            stopped = self.rest.stop_rebalance(wait_timeout=self.wait_timeout // 3)
            self.assertTrue(stopped, msg="unable to stop rebalance")
        if self.killNodes:
            self.log.info(" Killing Memcached ")
            kill_nodes = self.get_victim_nodes(self.servers, self.master, node, self.victim_type, self.victim_count)
            for kill_node in kill_nodes:
                self.kill_server_memcached(kill_node)
        if self.stopNodes:
            self.log.info(" Stopping Node")
            stop_nodes = self.get_victim_nodes(self.servers, self.master, node, self.victim_type, self.victim_count)
            for stop_node in stop_nodes:
                self.stop_server(stop_node)
            self.sleep(10)
            self.log.info(" Starting Node")
            for start_node in stop_nodes:
                self.start_server(start_node)
        if self.firewallOnNodes:
            self.log.info(" Enabling Firewall for Node ")
            stop_nodes = self.get_victim_nodes(self.servers, self.master, node, self.victim_type, self.victim_count)
            for stop_node in stop_nodes:
                self.start_firewall_on_node(stop_node)
            self.sleep(30)
            self.log.info(" Disable Firewall for Node ")
            for start_node in stop_nodes:
                self.stop_firewall_on_node(start_node)
        self.sleep(120)

    def get_failover_count(self):
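        """ Count cluster nodes whose clusterMembership is 'inactiveFailed' """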
        rest = RestConnection(self.master)
        cluster_status = rest.cluster_status()
        failover_count = 0
        # check for inactiveFailed
        for node in cluster_status['nodes']:
            if node['clusterMembership'] == "inactiveFailed":
                failover_count += 1
        return failover_count


class RebalanceProgressTests(RebalanceBaseTest):
    def setUp(self):
        super(RebalanceProgressTests, self).setUp()
        self.rest = RestConnection(self.master)
        self.num_views = self.input.param("num_views", 3)
        if self.num_views:
            self._create_indexes()

    def tearDown(self):
        super(RebalanceProgressTests, self).tearDown()

    def test_progress_rebalance_in(self):
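        """ Track detailedProgress stats while rebalancing nodes in and verify the
            per-node ingoing/outgoing counters behave as expected """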
        servers_in = self.servers[self.nodes_init:self.nodes_init +
                                  self.nodes_in]
        servers_init = self.servers[:self.nodes_init]

        rebalance = self.cluster.async_rebalance(servers_init, servers_in, [])
        self.sleep(5, "wait for rebalance start")
        previous_stats = self._get_detailed_progress()

        while rebalance.state != "FINISHED":
            new_stats = self._get_detailed_progress()
            if new_stats == {}:
                self.log.info("Got empty progress")
                break
            #vbuckets left should go decreasing
            #docsTotal and docsTransferred should be 0 in added nodes
            #docsTotal should not change
            #docsTransferred should go increasing
            self._check_stats(servers_in,
                              previous_stats,
                              new_stats,
                              "outgoing",
                              docs_total=0,
                              docs_transf=0)
            self._check_stats(servers_in, previous_stats, new_stats, "ingoing")
            self._check_stats(servers_init,
                              previous_stats,
                              new_stats,
                              "ingoing",
                              docs_total=0,
                              docs_transf=0)
            self._check_stats(servers_init, previous_stats, new_stats,
                              "outgoing")
            #sum of sending and receiving vbuckets should coincide
            self._check_vb_sums(servers_init, servers_in, new_stats)
            previous_stats = copy.deepcopy(new_stats)
            time.sleep(10)
        rebalance.result()

    def test_progress_rebalance_out(self):
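        """ Track detailedProgress stats while rebalancing nodes out (optionally after a failover) """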
        with_failover = self.input.param("with_failover", False)
        servers_init = self.servers[:self.nodes_init]
        servers_out = self.servers[(self.nodes_init -
                                    self.nodes_out):self.nodes_init]

        if with_failover:
            self.cluster.failover(servers_init, servers_out)
        rebalance = self.cluster.async_rebalance(servers_init, [], servers_out)
        self.sleep(5, "wait for rebalance start")
        previous_stats = self._get_detailed_progress()
        while rebalance.state != "FINISHED":
            new_stats = self._get_detailed_progress()
            if new_stats == {}:
                self.log.info("Got empty progress")
                break
            #vbuckets left should go decreasing
            #docsTotal should not change
            #docsTransferred should go increasing
            self._check_stats(servers_init, previous_stats, new_stats,
                              "ingoing")
            self._check_stats(servers_init, previous_stats, new_stats,
                              "outgoing")
            previous_stats = copy.deepcopy(new_stats)
            time.sleep(1)
        rebalance.result()

    def test_progress_rebalance_swap(self):
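        """ Track detailedProgress stats during a swap rebalance (nodes_in == nodes_out) """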
        if self.nodes_in != self.nodes_out:
            self.fail("nodes_in != nodes_out. Not a swap rebalance")
        if len(self.servers) < (self.nodes_init + self.nodes_in):
            self.log.error("Not enough VMs!")
            return
        servers_in = self.servers[self.nodes_init:self.nodes_init +
                                  self.nodes_in]
        servers_init = self.servers[:self.nodes_init]
        servers_unchanged = self.servers[:(self.nodes_init - self.nodes_out)]
        servers_out = self.servers[(self.nodes_init -
                                    self.nodes_out):self.nodes_init]

        rebalance = self.cluster.async_rebalance(servers_init, servers_in,
                                                 servers_out)
        self.sleep(5, "wait for rebalance start")
        previous_stats = self._get_detailed_progress()
        while rebalance.state != "FINISHED":
            new_stats = self._get_detailed_progress()
            if new_stats == {}:
                self.log.info("Got empty progress")
                break
            #vbuckets left should go decreasing
            #docsTotal and docsTransferred should be 0 in added nodes
            #no vbuckets moving for unchanged nodes
            #docsTotal should not change
            #docsTransferred should go increasing
            self._check_stats(servers_in,
                              previous_stats,
                              new_stats,
                              "outgoing",
                              docs_total=0,
                              docs_transf=0)
            self._check_stats(servers_in, previous_stats, new_stats, "ingoing")
            self._check_stats(servers_unchanged,
                              previous_stats,
                              new_stats,
                              "ingoing",
                              active_vb=0,
                              replica_vb=0)
            self._check_stats(servers_unchanged,
                              previous_stats,
                              new_stats,
                              "outgoing",
                              active_vb=0,
                              replica_vb=0)
            self._check_stats(servers_out, previous_stats, new_stats,
                              "outgoing")

            #sum of sending and receiving vbuckets should coincide
            self._check_vb_sums(servers_in, servers_out, new_stats)
            previous_stats = copy.deepcopy(new_stats)
            time.sleep(1)
        rebalance.result()

    def test_progress_add_back_after_failover(self):
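        """ Track detailedProgress stats while rebalancing after failover and add-back """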
        servers_init = self.servers[:self.nodes_init]
        servers_failover = self.servers[(self.nodes_init -
                                         self.nodes_out):self.nodes_init]
        servers_unchanged = self.servers[:(self.nodes_init - self.nodes_out)]
        nodes_all = self.rest.node_statuses()

        failover_nodes = []
        for failover_server in servers_failover:
            failover_nodes.extend([node for node in nodes_all if node.ip == failover_server.ip and \
                                         str(node.port) == failover_server.port])
        self.cluster.failover(servers_init, servers_failover)
        self.sleep(30)
        for node in failover_nodes:
            self.rest.add_back_node(node.id)

        rebalance = self.cluster.async_rebalance(servers_init, [], [])
        self.sleep(5, "wait for rebalance start")
        previous_stats = self._get_detailed_progress()
        while rebalance.state != "FINISHED":
            new_stats = self._get_detailed_progress()
            if new_stats == {}:
                self.log.info("Got empty progress")
                break
            #vbuckets left should go decreasing
            #docsTotal should not change
            #docsTransferred should go increasing
            self._check_stats(servers_unchanged, previous_stats, new_stats,
                              "outgoing")
            self._check_stats(servers_failover, previous_stats, new_stats,
                              "ingoing")
            previous_stats = copy.deepcopy(new_stats)
            time.sleep(1)
        rebalance.result()

    def _check_vb_sums(self, servers_ingoing, servers_outgoing, new_stats):
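        """ The total active vbuckets still to be received by the ingoing servers should
            match the total still to be sent by the outgoing servers """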
        active_vb_sum_1 = sum([
            new_stats[server.ip]["ingoing"]['activeVBucketsLeft']
            for server in servers_ingoing
        ])
        active_vb_sum_2 = sum([
            new_stats[server.ip]["outgoing"]['activeVBucketsLeft']
            for server in servers_outgoing
        ])
        self.assertTrue(
            active_vb_sum_1 == active_vb_sum_2,
            "Active vbuckets left should be equal in servers_in and init. %s" %
            new_stats)

    def _check_stats(self,
                     servers,
                     previous_stats,
                     new_stats,
                     type,
                     docs_total=None,
                     docs_transf=None,
                     active_vb=None,
                     replica_vb=None):
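        """ Compare a node's detailedProgress stats between two samples: vbuckets left
            should not increase (unless the bucket under rebalance changed), docsTotal
            should stay constant and docsTransferred should not decrease; optional
            expected values can be asserted for docsTotal, docsTransferred and the
            active/replica vbuckets left """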
        self.assertTrue(
            new_stats["buckets_count"] == len(self.buckets),
            "Expected buckets %s. Actual stat %s" %
            (len(self.buckets), new_stats))
        for server in servers:
            current_stat = new_stats[server.ip][type]
            previous_stat = previous_stats[server.ip][type]
            if new_stats["bucket"] != previous_stats["bucket"]:
                self.assertTrue(
                    current_stat['activeVBucketsLeft'] >=
                    previous_stat['activeVBucketsLeft'],
                    "activeVBucketsLeft for node %s increased! Previous stat %s. Actual: %s"
                    % (server.ip, current_stat, previous_stat))
                self.assertTrue(
                    current_stat['replicaVBucketsLeft'] >=
                    previous_stat['replicaVBucketsLeft'],
                    "replicaVBucketsLeft for node %s increased! Previous stat %s. Actual: %s"
                    % (server.ip, current_stat, previous_stat))
            else:
                self.assertTrue(
                    current_stat['activeVBucketsLeft'] <=
                    previous_stat['activeVBucketsLeft'],
                    "activeVBucketsLeft for node %s increased! Previous stat %s. Actual: %s"
                    % (server.ip, previous_stat, current_stat))
                self.assertTrue(
                    current_stat['replicaVBucketsLeft'] <=
                    previous_stat['replicaVBucketsLeft'],
                    "replicaVBucketsLeft for node %s increased! Previous stat %s. Actual: %s"
                    % (server.ip, previous_stat, current_stat))
                try:
                    if current_stat['docsTotal'] != previous_stat['docsTotal']:
                        self.log.warning(
                            "docsTotal for node %s changed! Previous stat %s. Actual: %s"
                            % (server.ip, previous_stat, current_stat))
                except Exception as ex:
                    if previous_stat['docsTotal'] != 0 and current_stat[
                            'docsTotal'] == 0:
                        command = "sys:get_status({global, ns_rebalance_observer})."
                        self.log.info("posting: %s" % command)
                        self.rest.diag_eval(command)
                    raise ex
                self.assertTrue(
                    current_stat['docsTransferred'] >=
                    previous_stat['docsTransferred'],
                    "docsTransferred for node %s decreased! Previous stat %s. Actual: %s"
                    % (server.ip, previous_stat, current_stat))
            if docs_total is not None:
                self.assertTrue(
                    current_stat['docsTotal'] == docs_total,
                    "DocTotal for %s is %s, but should be %s. Stat %s" %
                    (server.ip, current_stat['docsTotal'], docs_total,
                     current_stat))
            if docs_transf is not None:
                self.assertTrue(
                    current_stat['docsTransferred'] == docs_transf,
                    "docsTransferred for %s is %s, but should be %s. Stat %s" %
                    (server.ip, current_stat['docsTransferred'], docs_transf,
                     current_stat))
            if active_vb is not None:
                self.assertTrue(
                    current_stat['activeVBucketsLeft'] == active_vb,
                    "docsTransferred for %s is %s, but should be %s. Stat %s" %
                    (server.ip, current_stat['activeVBucketsLeft'], active_vb,
                     current_stat))
            if replica_vb is not None:
                self.assertTrue(
                    current_stat['replicaVBucketsLeft'] == replica_vb,
                    "docsTransferred for %s is %s, but should be %s. Stat %s" %
                    (server.ip, current_stat['activeVBucketsLeft'], active_vb,
                     current_stat))
            self.log.info("Checked stat: %s" % new_stats)

    def _get_detailed_progress(self):
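        """ Pull the rebalance task from the ns_server tasks endpoint and flatten its
            detailedProgress section into {node_ip: {...}, 'bucket': ..., 'buckets_count': ...} """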
        detailed_progress = {}
        tasks = self.rest.ns_server_tasks()
        for task in tasks:
            if "detailedProgress" in task:
                try:
                    if "perNode" in task["detailedProgress"]:
                        nodes = task["detailedProgress"]["perNode"]
                        for node in nodes:
                            detailed_progress[node.split('@')[1]] = nodes[node]
                    detailed_progress["bucket"] = task["detailedProgress"][
                        "bucket"]
                    detailed_progress["buckets_count"] = task[
                        "detailedProgress"]["bucketsCount"]
                    break
                except Exception as ex:
                    self.log.warning("Didn't get statistics %s" % str(ex))
        return detailed_progress

    def _create_indexes(self):
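        """ Create default views on every bucket and query them with stale=false
            to build the indexes """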
        tasks = []
        views = []
        for bucket in self.buckets:
            temp = self.make_default_views(self.default_view_name,
                                           self.num_views,
                                           False,
                                           different_map=True)
            temp_tasks = self.async_create_views(self.master,
                                                 self.default_view_name, temp,
                                                 bucket)
            tasks += temp_tasks
            views += temp

        timeout = max(
            self.wait_timeout * 4,
            len(self.buckets) * self.wait_timeout * self.num_items // 50000)

        for task in tasks:
            task.result(timeout)

        for bucket in self.buckets:
            for view in views:
                # run queries to create indexes
                self.cluster.query_view(self.master, self.default_view_name,
                                        view.name, {
                                            "stale": "false",
                                            "limit": 1000
                                        })