def test_add_remove_graceful_add_back_node_with_cert(self, recovery_type=None):
    recovery_type = self.input.param('recovery_type')
    rest = RestConnection(self.master)
    known_nodes = ['ns_1@' + self.master.ip]
    servs_inout = self.servers[1:]
    serv_out = 'ns_1@' + servs_inout[1].ip
    rest.create_bucket(bucket='default', ramQuotaMB=100)
    x509main(self.master).setup_master()
    x509main().setup_cluster_nodes_ssl(servs_inout)
    for server in servs_inout:
        rest.add_node('Administrator', 'password', server.ip)
        known_nodes.append('ns_1@' + server.ip)
    rest.rebalance(known_nodes)
    self.assertTrue(self.check_rebalance_complete(rest), "Issue with rebalance")
    for server in servs_inout:
        status = x509main(server)._validate_ssl_login()
        self.assertEqual(status, 200, "Not able to login via SSL code")
    rest.fail_over(serv_out, graceful=True)
    self.assertTrue(self.check_rebalance_complete(rest), "Issue with rebalance")
    rest.set_recovery_type(serv_out, recovery_type)
    rest.add_back_node(serv_out)
    rest.rebalance(known_nodes)
    self.assertTrue(self.check_rebalance_complete(rest), "Issue with rebalance")
    for server in servs_inout:
        status = x509main(server)._validate_ssl_login()
        self.assertEqual(status, 200, "Not able to login via SSL code")
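# NOTE (editorial sketch): check_rebalance_complete() is called throughout the
# certificate/add-back tests above and below but is defined elsewhere. A minimal
# hypothetical implementation, reusing the RestConnection.monitorRebalance()
# call that the FailoverTests class further down already relies on, could be:
def check_rebalance_complete(self, rest):
    # Assumption: monitorRebalance() returns a truthy value once the running
    # rebalance finishes cleanly, as it is used with assertTrue later in this file.
    try:
        return rest.monitorRebalance(stop_if_loop=True)
    except Exception as ex:
        self.log.info("rebalance did not complete cleanly: {0}".format(ex))
        return False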
def test_failover_add_back(self):
    recoveryType = self.input.param("recoveryType", "full")
    servr_out = self.nodes_out_list
    nodes_all = RestConnection(self.master).node_statuses()
    self.check_and_run_operations(buckets=self.buckets, before=True)
    failover_task = self.cluster.async_failover([self.master],
                                                failover_nodes=servr_out,
                                                graceful=self.graceful)
    self.check_and_run_operations(buckets=self.buckets, in_between=True)
    failover_task.result()
    self.log.info(servr_out)
    rest = RestConnection(self.master)
    nodes_all = rest.node_statuses()
    nodes = []
    # Match failed-over servers to their otpNodes; on cluster_run (127.0.0.1)
    # nodes are distinguished by port rather than by IP.
    for failover_node in servr_out:
        if failover_node.ip == "127.0.0.1":
            nodes.extend([node for node in nodes_all
                          if str(node.port) == failover_node.port])
        else:
            nodes.extend([node for node in nodes_all
                          if node.ip == failover_node.ip])
    for node in nodes:
        rest.add_back_node(node.id)
        rest.set_recovery_type(otpNode=node.id, recoveryType=recoveryType)
    rebalance = self.cluster.async_rebalance(self.servers[:self.nodes_init], [], [])
    self._run_aync_taks()
    rebalance.result()
    self.check_and_run_operations(buckets=self.buckets, after=True)
def test_capi_with_failover(self):
    repl_id = self._start_es_replication()
    rest_conn = RestConnection(self.src_master)
    rest_conn.pause_resume_repl_by_id(repl_id, REPL_PARAM.PAUSE_REQUESTED, 'true')
    gen = DocumentGenerator('es', '{{"key":"value","mutated":0}}',
                            xrange(100), start=0, end=self._num_items)
    self.src_cluster.load_all_buckets_from_generator(gen)
    rest_conn.pause_resume_repl_by_id(repl_id, REPL_PARAM.PAUSE_REQUESTED, 'false')
    graceful = self._input.param("graceful", False)
    self.recoveryType = self._input.param("recoveryType", None)
    self.src_cluster.failover(graceful=graceful)
    self.sleep(30)
    if self.recoveryType:
        server_nodes = rest_conn.node_statuses()
        for node in server_nodes:
            if node.ip == self._input.servers[1].ip:
                rest_conn.set_recovery_type(otpNode=node.id,
                                            recoveryType=self.recoveryType)
                self.sleep(30)
                rest_conn.add_back_node(otpNode=node.id)
        rebalance = self.cluster.async_rebalance(self.src_cluster.get_nodes(), [], [])
        rebalance.result()
    self._verify_es_results()
def test_failover_add_back(self):
    try:
        rest = RestConnection(self.master)
        recoveryType = self.input.param("recoveryType", "full")
        servr_out = self.nodes_out_list
        nodes_all = rest.node_statuses()
        tasks = self.async_run_operations(buckets=self.buckets, phase="before")
        for task in tasks:
            task.result()
        failover_task = self.cluster.async_failover([self.master],
                                                    failover_nodes=servr_out,
                                                    graceful=self.graceful)
        failover_task.result()
        nodes_all = rest.node_statuses()
        nodes = []
        if servr_out[0].ip == "127.0.0.1":
            for failover_node in servr_out:
                nodes.extend([node for node in nodes_all
                              if str(node.port) == failover_node.port])
        else:
            for failover_node in servr_out:
                nodes.extend([node for node in nodes_all
                              if node.ip == failover_node.ip])
        for node in nodes:
            self.log.info(node)
            rest.add_back_node(node.id)
            rest.set_recovery_type(otpNode=node.id, recoveryType=recoveryType)
        rebalance = self.cluster.async_rebalance(self.servers[:self.nodes_init], [], [])
        self._run_aync_tasks()
        rebalance.result()
        self.run_after_operations()
    except Exception as ex:
        raise
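# NOTE (editorial sketch): the "map failed-over servers to cluster otpNodes" block
# above is repeated in most of the add-back tests in this file. A hypothetical
# shared helper (the name and placement are assumptions; the matching rules are
# copied verbatim from the tests themselves) could factor it out:
def _otp_nodes_for(self, rest, failed_over_servers):
    # On cluster_run all nodes share the 127.0.0.1 address, so match by port;
    # otherwise match by IP, exactly as the inline blocks do.
    nodes_all = rest.node_statuses()
    matched = []
    for srv in failed_over_servers:
        if srv.ip == "127.0.0.1":
            matched.extend([node for node in nodes_all
                            if str(node.port) == srv.port])
        else:
            matched.extend([node for node in nodes_all if node.ip == srv.ip])
    return matched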
def test_failover_add_back(self):
    gen_load = BlobGenerator('buckettest', 'buckettest-', self.value_size,
                             start=0, end=self.num_items)
    self._load_all_buckets(self.master, gen_load, "create", 0)
    try:
        for server in self.servers:
            self.secretmgmt_base_obj.setup_pass_node(server, self.password)
        self.sleep(30)
        rest = RestConnection(self.master)
        self.graceful = self.input.param('graceful', False)
        recoveryType = self.input.param("recoveryType", "full")
        self.find_nodes_in_list()
        self.generate_map_nodes_out_dist()
        servr_out = self.nodes_out_list
        nodes_all = rest.node_statuses()
        failover_task = self.cluster.async_failover([self.master],
                                                    failover_nodes=servr_out,
                                                    graceful=self.graceful)
        failover_task.result()
        nodes_all = rest.node_statuses()
        nodes = []
        if servr_out[0].ip == "127.0.0.1":
            for failover_node in servr_out:
                nodes.extend([node for node in nodes_all
                              if str(node.port) == failover_node.port])
        else:
            for failover_node in servr_out:
                nodes.extend([node for node in nodes_all
                              if node.ip == failover_node.ip])
        for node in nodes:
            self.log.info(node)
            rest.add_back_node(node.id)
            rest.set_recovery_type(otpNode=node.id, recoveryType=recoveryType)
        rebalance = self.cluster.async_rebalance(self.servers[:self.nodes_init], [], [])
        self.assertTrue(rebalance.result(), "Failover with different servers")
    except Exception as ex:
        raise
def test_failover_add_back(self):
    try:
        self.run_async_data()
        rest = RestConnection(self.master)
        recoveryType = self.input.param("recoveryType", "full")
        servr_out = self.nodes_out_list
        nodes_all = rest.node_statuses()
        failover_task = self.cluster.async_failover([self.master],
                                                    failover_nodes=servr_out,
                                                    graceful=self.graceful)
        failover_task.result()
        nodes_all = rest.node_statuses()
        nodes = []
        if servr_out[0].ip == "127.0.0.1":
            for failover_node in servr_out:
                nodes.extend([node for node in nodes_all
                              if str(node.port) == failover_node.port])
        else:
            for failover_node in servr_out:
                nodes.extend([node for node in nodes_all
                              if node.ip == failover_node.ip])
        for node in nodes:
            self.log.info(node)
            rest.add_back_node(node.id)
            rest.set_recovery_type(otpNode=node.id, recoveryType=recoveryType)
        rebalance = self.cluster.async_rebalance(self.servers[:self.nodes_init], [], [])
        self.run_mutation_operations_for_situational_tests()
        self.sleep(120, "Wait for rebalance")
        # Signal any still-running load threads to stop.
        for t in self.load_thread_list:
            if t is not None and t.is_alive():
                t.signal = False
    except Exception as ex:
        raise
def test_add_remove_add_back_node_with_cert(self, rebalance=None):
    rebalance = self.input.param('rebalance')
    rest = RestConnection(self.master)
    servs_inout = self.servers[1:3]
    serv_out = 'ns_1@' + servs_inout[1].ip
    known_nodes = ['ns_1@' + self.master.ip]
    x509main(self.master).setup_master()
    x509main().setup_cluster_nodes_ssl(servs_inout)
    for server in servs_inout:
        rest.add_node('Administrator', 'password', server.ip)
        known_nodes.append('ns_1@' + server.ip)
    rest.rebalance(known_nodes)
    self.assertTrue(self.check_rebalance_complete(rest), "Issue with rebalance")
    for server in servs_inout:
        status = x509main(server)._validate_ssl_login()
        self.assertEqual(status, 200, "Not able to login via SSL code")
    rest.fail_over(serv_out, graceful=False)
    if rebalance:
        rest.rebalance(known_nodes, [serv_out])
        self.assertTrue(self.check_rebalance_complete(rest), "Issue with rebalance")
        rest.add_node('Administrator', 'password', servs_inout[1].ip)
    else:
        rest.add_back_node(serv_out)
    rest.rebalance(known_nodes)
    self.assertTrue(self.check_rebalance_complete(rest), "Issue with rebalance")
    for server in servs_inout:
        # Assert on the status actually returned for this server's SSL login.
        status = x509main(server)._validate_ssl_login()
        self.assertEqual(status, 200, "Not able to login via SSL code")
def test_failover_add_back(self):
    try:
        rest = RestConnection(self.master)
        recoveryType = self.input.param("recoveryType", "full")
        servr_out = self.nodes_out_list
        nodes_all = rest.node_statuses()
        tasks = self.async_check_and_run_operations(buckets=self.buckets, before=True)
        for task in tasks:
            task.result()
        failover_task = self.cluster.async_failover([self.master],
                                                    failover_nodes=servr_out,
                                                    graceful=self.graceful)
        failover_task.result()
        nodes_all = rest.node_statuses()
        nodes = []
        if servr_out[0].ip == "127.0.0.1":
            for failover_node in servr_out:
                nodes.extend([node for node in nodes_all
                              if str(node.port) == failover_node.port])
        else:
            for failover_node in servr_out:
                nodes.extend([node for node in nodes_all
                              if node.ip == failover_node.ip])
        for node in nodes:
            self.log.info(node)
            rest.add_back_node(node.id)
            rest.set_recovery_type(otpNode=node.id, recoveryType=recoveryType)
        rebalance = self.cluster.async_rebalance(self.servers[:self.nodes_init], [], [])
        self._run_aync_tasks()
        rebalance.result()
        self.run_after_operations()
    except Exception as ex:
        raise
def test_rename_failover_add_back(self):
    if len(self.servers) < 2:
        self.fail("test requires more than 1 node")
    failover_factor = self.input.param("failover-factor", 1)
    failover_nodes = self.servers[self.nodes_in:self.nodes_in + failover_factor + 1]
    hostnames = self.rename_nodes(self.servers[:self.nodes_in + failover_factor + 1])
    self._set_hostames_to_servers_objs(hostnames)
    self.verify_referenced_by_names(self.servers[:self.nodes_in + failover_factor + 1],
                                    hostnames)
    self.cluster.rebalance(self.servers[:self.nodes_init],
                           self.servers[self.nodes_init:self.nodes_in + failover_factor + 1],
                           [], use_hostnames=True)
    rest = RestConnection(self.master)
    nodes_all = rest.node_statuses()
    nodes = []
    for failover_node in failover_nodes:
        nodes.extend([node for node in nodes_all
                      if node.ip == failover_node.hostname
                      and str(node.port) == failover_node.port])
    self.cluster.failover(self.servers, failover_nodes, use_hostnames=True)
    self.verify_referenced_by_names(self.servers[:self.nodes_in + failover_factor + 1],
                                    hostnames)
    for node in nodes:
        rest.add_back_node(node.id)
    self.cluster.rebalance(self.servers[:self.nodes_in + failover_factor + 1],
                           [], [], use_hostnames=True)
    self.verify_referenced_by_names(self.servers[:self.nodes_in + failover_factor + 1],
                                    hostnames)
def test_failover_indexer_add_back(self): """ Indexer add back scenarios :return: """ self._calculate_scan_vector() rest = RestConnection(self.master) recoveryType = self.input.param("recoveryType", "full") indexer_out = int(self.input.param("nodes_out", 0)) nodes = self.get_nodes_from_services_map(service_type="index", get_all_nodes=True) self.assertGreaterEqual(len(nodes), indexer_out, "Existing Indexer Nodes less than Indexer out nodes") pre_recovery_tasks = self.async_run_operations(phase="before") self._run_tasks([pre_recovery_tasks]) self._start_disk_writes_for_plasma() kvOps_tasks = self._run_kvops_tasks() try: self.use_replica = False self._create_replica_indexes() servr_out = nodes[:indexer_out] failover_task =self.cluster.async_failover( [self.master], failover_nodes=servr_out, graceful=self.graceful) failover_task.result() nodes_all = rest.node_statuses() nodes = [] if servr_out[0].ip == "127.0.0.1": for failover_node in servr_out: nodes.extend([node for node in nodes_all if (str(node.port) == failover_node.port)]) else: for failover_node in servr_out: nodes.extend([node for node in nodes_all if node.ip == failover_node.ip]) for node in nodes: log.info("Adding back {0} with recovery type {1}...".format( node.ip, recoveryType)) rest.add_back_node(node.id) rest.set_recovery_type(otpNode=node.id, recoveryType=recoveryType) log.info("Rebalancing nodes in...") mid_recovery_tasks = self.async_run_operations(phase="in_between") rebalance = self.cluster.async_rebalance( self.servers[:self.nodes_init], [], []) rebalance.result() self._run_tasks([mid_recovery_tasks, kvOps_tasks]) #check if the nodes in cluster are healthy msg = "Cluster not in Healthy state" self.assertTrue(self.wait_until_cluster_is_healthy(), msg) log.info("==== Cluster in healthy state ====") self._check_all_bucket_items_indexed() post_recovery_tasks = self.async_run_operations(phase="after") self._run_tasks([post_recovery_tasks]) except Exception as ex: log.info(str(ex)) raise
def test_online_upgrade_with_failover(self): upgrade_nodes = self.servers[:self.nodes_init] if self.disable_plasma_upgrade: self._install(self.nodes_in_list, version=self.upgrade_to) rebalance = self.cluster.async_rebalance( self.servers[:self.nodes_init], [self.nodes_in_list[0]], [], services=["index"]) rebalance.result() self.disable_upgrade_to_plasma(self.nodes_in_list[0]) for node in upgrade_nodes: node_rest = RestConnection(node) node_info = "{0}:{1}".format(node.ip, node.port) node_services_list = node_rest.get_nodes_services()[node_info] if "index" in node_services_list: self._create_equivalent_indexes(node) failover_task = self.cluster.async_failover([self.master], failover_nodes=[node], graceful=False) failover_task.result() log.info("Node Failed over...") upgrade_th = self._async_update(self.upgrade_to, [node]) for th in upgrade_th: th.join() log.info("==== Upgrade Complete ====") self.sleep(120) rest = RestConnection(self.master) nodes_all = rest.node_statuses() for cluster_node in nodes_all: if cluster_node.ip == node.ip: log.info("Adding Back: {0}".format(node)) rest.add_back_node(cluster_node.id) rest.set_recovery_type(otpNode=cluster_node.id, recoveryType="full") log.info("Adding node back to cluster...") active_nodes = [ srvr for srvr in self.servers if srvr.ip != node.ip ] rebalance = self.cluster.async_rebalance(active_nodes, [], []) rebalance.result() self._remove_equivalent_indexes(node) self.sleep(60) msg = "Cluster is not healthy after upgrade" self.assertTrue(self.wait_until_cluster_is_healthy(), msg) log.info("Cluster is healthy") if self.initial_version.split("-")[0] in UPGRADE_VERS: self.multi_drop_index() self.sleep(100) self._create_indexes() self.sleep(100) self.assertTrue(self.wait_until_indexes_online(), "Some indexes are not online") log.info("All indexes are online") self._query_index("post_upgrade") self._verify_post_upgrade_results() self._update_int64_dataset() self._query_for_long_num()
def test_failover_indexer_add_back(self):
    """
    Indexer add-back scenarios
    :return:
    """
    rest = RestConnection(self.master)
    recoveryType = self.input.param("recoveryType", "full")
    indexer_out = int(self.input.param("nodes_out", 0))
    nodes = self.get_nodes_from_services_map(service_type="index", get_all_nodes=True)
    self.assertGreaterEqual(len(nodes), indexer_out,
                            "Existing Indexer Nodes less than Indexer out nodes")
    pre_recovery_tasks = self.async_run_operations(phase="before")
    self._run_tasks([pre_recovery_tasks])
    self.get_dgm_for_plasma()
    kvOps_tasks = self._run_kvops_tasks()
    try:
        self.use_replica = False
        self._create_replica_indexes()
        servr_out = nodes[:indexer_out]
        failover_task = self.cluster.async_failover([self.master],
                                                    failover_nodes=servr_out,
                                                    graceful=self.graceful)
        failover_task.result()
        nodes_all = rest.node_statuses()
        nodes = []
        if servr_out[0].ip == "127.0.0.1":
            for failover_node in servr_out:
                nodes.extend([node for node in nodes_all
                              if str(node.port) == failover_node.port])
        else:
            for failover_node in servr_out:
                nodes.extend([node for node in nodes_all
                              if node.ip == failover_node.ip])
        for node in nodes:
            log.info("Adding back {0} with recovery type {1}...".format(node.ip, recoveryType))
            rest.add_back_node(node.id)
            rest.set_recovery_type(otpNode=node.id, recoveryType=recoveryType)
        log.info("Rebalancing nodes in...")
        mid_recovery_tasks = self.async_run_operations(phase="in_between")
        rebalance = self.cluster.async_rebalance(self.servers[:self.nodes_init], [], [])
        rebalance.result()
        self._run_tasks([mid_recovery_tasks, kvOps_tasks])
        # Check that the nodes in the cluster are healthy
        msg = "Cluster not in Healthy state"
        self.assertTrue(self.wait_until_cluster_is_healthy(), msg)
        log.info("==== Cluster in healthy state ====")
        self._check_all_bucket_items_indexed()
        post_recovery_tasks = self.async_run_operations(phase="after")
        self._run_tasks([post_recovery_tasks])
    except Exception as ex:
        log.info(str(ex))
        raise
def test_online_upgrade_with_failover(self): upgrade_nodes = self.servers[:self.nodes_init] if self.disable_plasma_upgrade: self._install(self.nodes_in_list, version=self.upgrade_to) rebalance = self.cluster.async_rebalance(self.servers[:self.nodes_init], [self.nodes_in_list[0]], [], services=["index"]) rebalance.result() self.sleep(100) self.disable_upgrade_to_plasma(self.nodes_in_list[0]) for node in upgrade_nodes: node_rest = RestConnection(node) node_info = "{0}:{1}".format(node.ip, node.port) node_services_list = node_rest.get_nodes_services()[node_info] if "index" in node_services_list: self._create_equivalent_indexes(node) failover_task = self.cluster.async_failover([self.master], failover_nodes=[node], graceful=False) failover_task.result() self.sleep(100) log.info("Node Failed over...") upgrade_th = self._async_update(self.upgrade_to, [node]) for th in upgrade_th: th.join() log.info("==== Upgrade Complete ====") self.sleep(120) rest = RestConnection(self.master) nodes_all = rest.node_statuses() for cluster_node in nodes_all: if cluster_node.ip == node.ip: log.info("Adding Back: {0}".format(node)) rest.add_back_node(cluster_node.id) rest.set_recovery_type(otpNode=cluster_node.id, recoveryType="full") log.info("Adding node back to cluster...") active_nodes = [srvr for srvr in self.servers if srvr.ip != node.ip] rebalance = self.cluster.async_rebalance(active_nodes, [], []) rebalance.result() self.sleep(100) self._remove_equivalent_indexes(node) self.sleep(60) msg = "Cluster is not healthy after upgrade" self.assertTrue(self.wait_until_cluster_is_healthy(), msg) log.info("Cluster is healthy") self.add_built_in_server_user() self.sleep(20) if self.initial_version.split("-")[0] in UPGRADE_VERS: self.multi_drop_index() self.sleep(100) self._create_indexes() self.sleep(100) self.assertTrue(self.wait_until_indexes_online(), "Some indexes are not online") log.info("All indexes are online") self._query_index("post_upgrade") self._verify_post_upgrade_results() self._update_int64_dataset() self._query_for_long_num()
def test_failover_indexer_add_back(self): """ Indexer add back scenarios :return: """ self._calculate_scan_vector() rest = RestConnection(self.master) recoveryType = self.input.param("recoveryType", "full") indexer_out = int(self.input.param("nodes_out", 0)) nodes = self.get_nodes_from_services_map(service_type="index", get_all_nodes=True) self.assertGreaterEqual( len(nodes), indexer_out, "Existing Indexer Nodes less than Indexer out nodes") log.info("Running kv Mutations...") kvOps_tasks = self.kv_mutations() servr_out = nodes[:indexer_out] failover_task = self.cluster.async_failover([self.master], failover_nodes=servr_out, graceful=self.graceful) self._run_tasks([[failover_task], kvOps_tasks]) before_index_ops = self._run_before_index_tasks() nodes_all = rest.node_statuses() nodes = [] if servr_out[0].ip == "127.0.0.1": for failover_node in servr_out: nodes.extend([ node for node in nodes_all if (str(node.port) == failover_node.port) ]) else: for failover_node in servr_out: nodes.extend([ node for node in nodes_all if node.ip == failover_node.ip ]) for node in nodes: log.info("Adding back {0} with recovery type {1}...".format( node.ip, recoveryType)) rest.add_back_node(node.id) rest.set_recovery_type(otpNode=node.id, recoveryType=recoveryType) log.info("Rebalancing nodes in...") rebalance = self.cluster.async_rebalance( self.servers[:self.nodes_init], [], []) log.info("Running KV mutations...") kvOps_tasks = self.kv_mutations() self._run_tasks([[rebalance], kvOps_tasks]) self.sleep(100) self._verify_bucket_count_with_index_count(self.load_query_definitions) self.multi_query_using_index( buckets=self.buckets, query_definitions=self.load_query_definitions)
def test_indexer_failover_add_back(self):
    rest = RestConnection(self.master)
    self.generate_map_nodes_out_dist()
    index_names_defn = self._create_array_index_definitions()
    try:
        failover_task = self.cluster.async_failover([self.master],
                                                    failover_nodes=self.nodes_out_list,
                                                    graceful=self.graceful)
        failover_task.result()
        nodes_all = rest.node_statuses()
        nodes = []
        if self.nodes_out_list[0].ip == "127.0.0.1":
            for failover_node in self.nodes_out_list:
                nodes.extend([node for node in nodes_all
                              if str(node.port) == failover_node.port])
        else:
            for failover_node in self.nodes_out_list:
                nodes.extend([node for node in nodes_all
                              if node.ip == failover_node.ip])
        for node in nodes:
            log.info("Adding back {0} with recovery type Full...".format(node.ip))
            rest.add_back_node(node.id)
            rest.set_recovery_type(otpNode=node.id, recoveryType="full")
        log.info("Rebalancing nodes in...")
        rebalance = self.cluster.async_rebalance(self.servers[:self.nodes_init], [], [])
        mid_recovery_tasks = threading.Thread(target=self._aggregate_query_using_index,
                                              args=(index_names_defn,))
        mid_recovery_tasks.start()
        rebalance.result()
        mid_recovery_tasks.join()
        # Check that the nodes in the cluster are healthy
        msg = "Cluster not in Healthy state"
        self.assertTrue(self.wait_until_cluster_is_healthy(), msg)
        log.info("==== Cluster in healthy state ====")
        self.sleep(60)
    except Exception as ex:
        log.info(str(ex))
        raise
def test_failover_add_back(self): try: rest = RestConnection(self.master) recoveryType = self.input.param("recoveryType", "full") servr_out = self.nodes_out_list failover_task = self.cluster.async_failover( [self.master], failover_nodes=servr_out, graceful=self.graceful) failover_task.result() pre_recovery_tasks = self.async_run_operations(phase="before") self._run_tasks([pre_recovery_tasks]) self.get_dgm_for_plasma() kvOps_tasks = self._run_kvops_tasks() nodes_all = rest.node_statuses() nodes = [] if servr_out[0].ip == "127.0.0.1": for failover_node in servr_out: nodes.extend([ node for node in nodes_all if (str(node.port) == failover_node.port) ]) else: for failover_node in servr_out: nodes.extend([ node for node in nodes_all if node.ip == failover_node.ip ]) for node in nodes: log.info("Adding Back: {0}".format(node)) rest.add_back_node(node.id) rest.set_recovery_type(otpNode=node.id, recoveryType=recoveryType) rebalance = self.cluster.async_rebalance( self.servers[:self.nodes_init], [], []) mid_recovery_tasks = self.async_run_operations(phase="in_between") rebalance.result() self._run_tasks([kvOps_tasks, mid_recovery_tasks]) #check if the nodes in cluster are healthy msg = "Cluster not in Healthy state" self.assertTrue(self.wait_until_cluster_is_healthy(), msg) log.info("==== Cluster in healthy state ====") self._check_all_bucket_items_indexed() post_recovery_tasks = self.async_run_operations(phase="after") self._run_tasks([post_recovery_tasks]) except Exception as ex: log.info(str(ex)) raise
def test_failover_add_back(self):
    try:
        rest = RestConnection(self.master)
        recoveryType = self.input.param("recoveryType", "full")
        servr_out = self.nodes_out_list
        self._run_initial_index_tasks()
        failover_task = self.cluster.async_failover([self.master],
                                                    failover_nodes=servr_out,
                                                    graceful=self.graceful)
        failover_task.result()
        kvOps_tasks = self._run_kvops_tasks()
        before_index_ops = self._run_before_index_tasks()
        nodes_all = rest.node_statuses()
        nodes = []
        if servr_out[0].ip == "127.0.0.1":
            for failover_node in servr_out:
                nodes.extend([node for node in nodes_all
                              if str(node.port) == failover_node.port])
        else:
            for failover_node in servr_out:
                nodes.extend([node for node in nodes_all
                              if node.ip == failover_node.ip])
        for node in nodes:
            self.log.info(node)
            rest.add_back_node(node.id)
            rest.set_recovery_type(otpNode=node.id, recoveryType=recoveryType)
        rebalance = self.cluster.async_rebalance(self.servers[:self.nodes_init], [], [])
        in_between_index_ops = self._run_in_between_tasks()
        rebalance.result()
        self.sleep(120)
        self._run_tasks([kvOps_tasks, before_index_ops, in_between_index_ops])
        self._run_after_index_tasks()
    except Exception as ex:
        raise
def online_upgrade_with_failover(self, upgrade_servers):
    self.log.info("online upgrade servers: {0}".format(str(upgrade_servers)))
    for server in upgrade_servers:
        self.log.info("upgrading: {0}".format(str(server)))
        participating_servers = [s for s in self.servers]
        failover_task = self.cluster.async_failover([self.master],
                                                    failover_nodes=[server],
                                                    graceful=False)
        failover_task.result()
        upgrade_th = self._async_update(self.upgrade_versions[0], [server])
        for th in upgrade_th:
            th.join()
        rest = RestConnection(self.master)
        nodes_all = rest.node_statuses()
        for cluster_node in nodes_all:
            if cluster_node.ip == server.ip:
                rest.add_back_node(cluster_node.id)
                rest.set_recovery_type(otpNode=cluster_node.id, recoveryType="full")
        participating_servers.remove(server)
        self.log.info("participating servers: {0}".format(str(participating_servers)))
        rebalance = self.cluster.async_rebalance(participating_servers, [], [])
        rebalance.result()
def perform_failover(self):
    rest = RestConnection(self.master)
    nodes = rest.node_statuses()
    failover_servers = self.servers[:self.nodes_init][-self.failover_factor:]
    failover_nodes = []
    for server in failover_servers:
        for node in nodes:
            if node.ip == server.ip and str(node.port) == server.port:
                failover_nodes.append(node)
    for node in failover_nodes:
        rest.fail_over(node.id)
        self.sleep(5)
    if self.failover == GetrTests.FAILOVER_REBALANCE:
        self.cluster.rebalance(self.servers[:self.nodes_init], [], failover_servers)
    if self.failover == GetrTests.FAILOVER_ADD_BACK:
        for node in failover_nodes:
            rest.add_back_node(node.id)
        self.cluster.rebalance(self.servers[:self.nodes_init], [], [])
def test_backwards_compatability_indexes(self): create_index_query = "CREATE INDEX idx_name ON {0}(name)".format( self.bucket_name) self.n1ql_helper.run_cbq_query(query=create_index_query, server=self.n1ql_node) self.wait_until_indexes_online() result = self.n1ql_helper.run_cbq_query( query='SELECT * FROM {0} where name = "employee-9"'.format( self.bucket_name)) self.assertEqual(result['metrics']['resultCount'], 72) upgrade_nodes = self.servers[:self.nodes_init] for server in upgrade_nodes: remote = RemoteMachineShellConnection(server) remote.stop_server() remote.disconnect() upgrade_threads = self._async_update(self.upgrade_to, [server]) for upgrade_thread in upgrade_threads: upgrade_thread.join() self.upgrade_servers.append(server) self.sleep(180) msg = "Cluster is not healthy after upgrade" self.assertTrue(self.wait_until_cluster_is_healthy(), msg) self.log.info("Cluster is healthy") rest = RestConnection(self.master) nodes_all = rest.node_statuses() try: for cluster_node in nodes_all: if cluster_node.ip == self.master.ip: self.log.info("Adding Back: {0}".format(self.master.ip)) rest.add_back_node(cluster_node.id) rest.set_recovery_type(otpNode=cluster_node.id, recoveryType="full") except Exception as e: self.log.error(str(e)) self.log.info("Adding node back to cluster...") rebalance = self.cluster.async_rebalance( self.servers[:self.nodes_init], [], []) rebalance.result() self.assertTrue(self.wait_until_indexes_online(), "Some indexes are not online") self.log.info("All indexes are online") self.add_built_in_server_user() self.sleep(20) try: create_index_query = "CREATE INDEX idx_name ON {0}(name)".format( self.bucket_name) except Exception as e: self.log.info("indexes already exist") try: self.n1ql_helper.run_cbq_query(query=create_index_query, server=self.n1ql_node) create_index_query = "CREATE INDEX idx_day ON {0}(join_day)".format( self.bucket_name) self.n1ql_helper.run_cbq_query(query=create_index_query, server=self.n1ql_node) self.wait_until_indexes_online() except Exception as e: self.log.info("indexes already exist") result = self.n1ql_helper.run_cbq_query( query='SELECT * FROM {0} where name = "employee-9"'.format( self.bucket_name)) self.assertEqual(result['metrics']['resultCount'], 72) result2 = self.n1ql_helper.run_cbq_query( query='SELECT * FROM {0} where join_day = 9'.format( self.bucket_name)) self.assertEqual(result2['metrics']['resultCount'], 72) self.n1ql_helper.create_scope(server=self.master, bucket_name=self.bucket_name, scope_name="test") self.n1ql_helper.create_collection(server=self.master, bucket_name=self.bucket_name, scope_name="test", collection_name="test1") self.n1ql_helper.create_collection(server=self.master, bucket_name=self.bucket_name, scope_name="test", collection_name="test2") self.n1ql_helper.run_cbq_query(query=( 'INSERT INTO default:{0}.test.test1'.format(self.bucket_name) + '(KEY, VALUE) VALUES ("key2", { "type" : "hotel", "name" : "new hotel" })' )) self.n1ql_helper.run_cbq_query(query=( 'INSERT INTO default:{0}.test.test1'.format(self.bucket_name) + '(KEY, VALUE) VALUES ("key1", { "type" : "hotel", "name" : "old hotel" })' )) self.n1ql_helper.run_cbq_query(query=( 'INSERT INTO default:{0}.test.test1'.format(self.bucket_name) + ' (KEY, VALUE) VALUES ("key3", { "nested" : {"fields": "fake"}, "name" : "old hotel" })' )) self.n1ql_helper.run_cbq_query(query=( 'INSERT INTO default:{0}.test.test1'.format(self.bucket_name) + ' (KEY, VALUE) VALUES ("key4", { "numbers": [1,2,3,4] , "name" : "old hotel" })' )) time.sleep(20) 
self.n1ql_helper.run_cbq_query( query="CREATE INDEX idx1 on default:{0}.test.test1(name) ".format( self.bucket_name)) self.n1ql_helper.run_cbq_query( query="CREATE INDEX idx2 on default:{0}.test.test1(name) ".format( self.bucket_name)) self.n1ql_helper.run_cbq_query( query="CREATE INDEX idx3 on default:{0}.test.test1(nested)".format( self.bucket_name)) self.n1ql_helper.run_cbq_query( query="CREATE INDEX idx4 on default:{0}.test.test1(ALL numbers)". format(self.bucket_name))
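    # NOTE (editorial sketch): the collection-level indexes created above can then
    # be exercised against the four documents inserted into
    # default:{bucket}.test.test1. The result counts below are assumptions derived
    # from those INSERT statements (three docs named "old hotel", one doc carrying
    # the numbers array), not checks taken from the original test.
    result = self.n1ql_helper.run_cbq_query(
        query='SELECT * FROM default:{0}.test.test1 WHERE name = "old hotel"'.format(
            self.bucket_name))
    self.assertEqual(result['metrics']['resultCount'], 3)
    result = self.n1ql_helper.run_cbq_query(
        query='SELECT * FROM default:{0}.test.test1 '
              'WHERE ANY v IN numbers SATISFIES v = 4 END'.format(self.bucket_name))
    self.assertEqual(result['metrics']['resultCount'], 1)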
class FailoverTests(FailoverBaseTest): def setUp(self): super(FailoverTests, self).setUp(self) def tearDown(self): super(FailoverTests, self).tearDown(self) def test_failover_firewall(self): self.common_test_body('firewall') def test_failover_normal(self): self.common_test_body('normal') def test_failover_stop_server(self): self.common_test_body('stop_server') def test_failover_then_add_back(self): self.add_back_flag = True self.common_test_body('normal') def common_test_body(self, failover_reason): """ Main Test body which contains the flow of the failover basic steps 1. Starts Operations if programmed into the test case (before/after) 2. Start View and Index Building operations 3. Failover K out of N nodes (failover can be HARDFAILOVER/GRACEFUL) 4.1 Rebalance the cluster is failover of K nodeStatuses 4.2 Run Add-Back operation with recoveryType = (full/delta) with rebalance 5. Verify all expected operations completed by checking stats, replicaiton, views, data correctness """ # Pick the reference node for communication # We pick a node in the cluster which will NOT be failed over self.filter_list = [] if self.failoverMaster: self.master = self.servers[1] self.log.info(" Picking node {0} as reference node for test case".format(self.master.ip)) self.print_test_params(failover_reason) self.rest = RestConnection(self.master) self.nodes = self.rest.node_statuses() # Set the data path for the cluster self.data_path = self.rest.get_data_path() # Check if the test case has to be run for 3.0.0 versions = self.rest.get_nodes_versions() self.version_greater_than_2_5 = True for version in versions: if "3" > version: self.version_greater_than_2_5 = False # Do not run this this test if graceful category is being used if not self.version_greater_than_2_5 and (self.graceful or (self.recoveryType != None)): self.log.error("Graceful failover can't be applied to nodes with version less then 3.*") self.log.error("Please check configuration parameters: SKIPPING TEST.") return # Find nodes that will under go failover if self.failoverMaster: self.chosen = RebalanceHelper.pick_nodes(self.master, howmany=1, target_node = self.servers[0]) else: self.chosen = RebalanceHelper.pick_nodes(self.master, howmany=self.num_failed_nodes) # Perform operations - Create/Update/Delete # self.withMutationOps = True => Run Operations in parallel to failover # self.withMutationOps = False => Run Operations Before failover self.load_initial_data() if not self.withMutationOps: self.run_mutation_operations() # Perform View Creation Tasks and check for completion if required before failover if self.withViewsOps: self.run_view_creation_operations(self.servers) if not self.createIndexesDuringFailover: self.query_and_monitor_view_tasks(self.servers) # Take snap-shot of data set used for validaiton record_static_data_set ={} prev_vbucket_stats = {} prev_failover_stats = {} if not self.withMutationOps: record_static_data_set = self.get_data_set_all(self.servers, self.buckets, path = None) # Capture vbucket and failover stats if test version >= 2.5.* if self.version_greater_than_2_5 and self.upr_check: prev_vbucket_stats = self.get_vbucket_seqnos(self.servers, self.buckets) prev_failover_stats = self.get_failovers_logs(self.servers, self.buckets) # Perform Operations relalted to failover if self.withMutationOps or self.withViewsOps or self.compact: self.run_failover_operations_with_ops(self.chosen, failover_reason) else: self.run_failover_operations(self.chosen, failover_reason) # Perform Add Back Operation with Rebalance Or only Rebalance 
with Verificaitons if not self.gracefulFailoverFail and self.runRebalanceAfterFailover: if self.add_back_flag: self.run_add_back_operation_and_verify(self.chosen, prev_vbucket_stats, record_static_data_set, prev_failover_stats) else: self.run_rebalance_after_failover_and_verify(self.chosen, prev_vbucket_stats, record_static_data_set, prev_failover_stats) else: return if self.during_ops == None: self.verify_unacked_bytes_all_buckets(filter_list = self.filter_list, master_node = self.master) def run_rebalance_after_failover_and_verify(self, chosen, prev_vbucket_stats, record_static_data_set, prev_failover_stats): """ Method to run rebalance after failover and verify """ # Need a delay > min because MB-7168 _servers_ = self.filter_servers(self.servers, chosen) self._wait_for_stats_all_buckets(_servers_, check_ep_items_remaining = True) self.sleep(5, "after failover before invoking rebalance...") # Rebalance after Failover operation self.rest.rebalance(otpNodes=[node.id for node in self.nodes],ejectedNodes=[node.id for node in chosen]) if self.during_ops: self.sleep(5, "Wait for some progress in rebalance") if self.during_ops == "change_password": old_pass = self.master.rest_password self.change_password(new_password=self.input.param("new_password", "new_pass")) self.rest = RestConnection(self.master) elif self.during_ops == "change_port": self.change_port(new_port=self.input.param("new_port", "9090")) self.rest = RestConnection(self.master) # Perform Compaction if self.compact: for bucket in self.buckets: self.cluster.compact_bucket(self.master,bucket) # Peform View Validation if Supported nodes = self.filter_servers(self.servers,chosen) if self.withViewsOps: self.query_and_monitor_view_tasks(nodes) # Run operations if required during rebalance after failover if self.withMutationOps: self.run_mutation_operations_after_failover() # Kill or restart operations if self.killNodes or self.stopNodes or self.firewallOnNodes: self.victim_node_operations(node = chosen[0]) self.log.info(" Start Rebalance Again !") self.rest.rebalance(otpNodes=[node.id for node in self.nodes],ejectedNodes=[node.id for node in chosen]) # Rebalance Monitoring msg = "rebalance failed while removing failover nodes {0}".format([node.id for node in chosen]) self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True), msg=msg) # Reset password or port if self.during_ops: if self.during_ops == "change_password": self.change_password(new_password=old_pass) elif self.during_ops == "change_port": self.change_port(new_port='8091', current_port=self.input.param("new_port", "9090")) return # Drain Queue and make sure intra-cluster replication is complete self.log.info("Begin VERIFICATION for Rebalance after Failover Only") self.verify_cluster_stats(_servers_, self.master, check_bucket_stats = True, check_ep_items_remaining = True) # Verify all data set with meta data if failover happens after failover if not self.withMutationOps: self.sleep(60) self.data_analysis_all(record_static_data_set, _servers_, self.buckets, path = None, addedItems = None) # Check Cluster Stats and Data as well if max_verify > 0 # Check Failover logs :: Not sure about this logic, currently not checking, will update code once confirmed # Currently, only for checking case where we have graceful failover if self.version_greater_than_2_5 and self.graceful and self.upr_check: new_failover_stats = self.compare_failovers_logs(prev_failover_stats, _servers_, self.buckets) new_vbucket_stats = self.compare_vbucket_seqnos(prev_vbucket_stats, _servers_, self.buckets) 
self.compare_vbucketseq_failoverlogs(new_vbucket_stats, new_failover_stats) # Verify Active and Replica Bucket Count if self.num_replicas > 0: nodes = self.get_nodes_in_cluster(self.master) self.vb_distribution_analysis(servers = nodes, buckets = self.buckets, std = 20.0 , total_vbuckets = self.total_vbuckets) self.log.info("End VERIFICATION for Rebalance after Failover Only") def run_add_back_operation_and_verify(self, chosen, prev_vbucket_stats, record_static_data_set, prev_failover_stats): """ Method to run add-back operation with recovery type = (delta/full) It also verifies if the operations are correct with data verificaiton steps """ _servers_ = self.filter_servers(self.servers, chosen) self._wait_for_stats_all_buckets(_servers_, check_ep_items_remaining = True) serverMap = self.get_server_map(self.servers) recoveryTypeMap = self.define_maps_during_failover(self.recoveryType) fileMapsForVerification = self.create_file(chosen, self.buckets, serverMap) index = 0 for node in chosen: self.rest.add_back_node(node.id) self.sleep(5) if self.recoveryType: # define precondition for recoverytype self.rest.set_recovery_type(otpNode=node.id, recoveryType=self.recoveryType[index]) index += 1 self.sleep(20, "After failover before invoking rebalance...") self.rest.rebalance(otpNodes=[node.id for node in self.nodes],ejectedNodes=[],deltaRecoveryBuckets = self.deltaRecoveryBuckets) # Perform Compaction if self.compact: for bucket in self.buckets: self.cluster.compact_bucket(self.master,bucket) # Peform View Validation if Supported nodes = self.filter_servers(self.servers,chosen) if self.withViewsOps: self.query_and_monitor_view_tasks(nodes) # Run operations if required during rebalance after failover if self.withMutationOps: self.run_mutation_operations_after_failover() # Kill or restart operations if self.killNodes or self.stopNodes or self.firewallOnNodes: self.victim_node_operations(node = chosen[0]) self.log.info(" Start Rebalance Again !") self.rest.rebalance(otpNodes=[node.id for node in self.nodes],ejectedNodes=[],deltaRecoveryBuckets = self.deltaRecoveryBuckets) # Check if node has to be killed or restarted during rebalance # Monitor Rebalance msg = "rebalance failed while removing failover nodes {0}".format(chosen) self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True), msg=msg) # Drain ep_queue and make sure that intra-cluster replication is complete self._wait_for_stats_all_buckets(self.servers, check_ep_items_remaining = True) self.log.info("Begin VERIFICATION for Add-back and rebalance") # Verify Stats of cluster and Data is max_verify > 0 self.verify_cluster_stats(self.servers, self.master, check_bucket_stats = True, check_ep_items_remaining = True) # Verify recovery Type succeeded if we added-back nodes self.verify_for_recovery_type(chosen, serverMap, self.buckets,recoveryTypeMap, fileMapsForVerification, self.deltaRecoveryBuckets) # Comparison of all data if required if not self.withMutationOps: self.sleep(60) self.data_analysis_all(record_static_data_set,self.servers, self.buckets, path = None, addedItems = None) # Verify if vbucket sequence numbers and failover logs are as expected # We will check only for version > 2.5.* and if the failover is graceful if self.version_greater_than_2_5 and self.graceful and self.upr_check: new_vbucket_stats = self.compare_vbucket_seqnos(prev_vbucket_stats, self.servers, self.buckets,perNode= False) new_failover_stats = self.compare_failovers_logs(prev_failover_stats,self.servers,self.buckets) 
self.compare_vbucketseq_failoverlogs(new_vbucket_stats, new_failover_stats) # Verify Active and Replica Bucket Count if self.num_replicas > 0: nodes = self.get_nodes_in_cluster(self.master) self.vb_distribution_analysis(servers = nodes, buckets = self.buckets, std = 20.0 , total_vbuckets = self.total_vbuckets) self.log.info("End VERIFICATION for Add-back and rebalance") def print_test_params(self, failover_reason): """ Method to print test parameters """ self.log.info("num_replicas : {0}".format(self.num_replicas)) self.log.info("recoveryType : {0}".format(self.recoveryType)) self.log.info("failover_reason : {0}".format(failover_reason)) self.log.info("num_failed_nodes : {0}".format(self.num_failed_nodes)) self.log.info('picking server : {0} as the master'.format(self.master)) def run_failover_operations(self, chosen, failover_reason): """ Method to run fail over operations used in the test scenario based on failover reason """ # Perform Operations relalted to failover graceful_count = 0 graceful_failover = True failed_over = True for node in chosen: unreachable = False if failover_reason == 'stop_server': unreachable=True self.stop_server(node) self.log.info("10 seconds delay to wait for membase-server to shutdown") # wait for 5 minutes until node is down self.assertTrue(RestHelper(self.rest).wait_for_node_status(node, "unhealthy", self.wait_timeout * 10), msg="node status is not unhealthy even after waiting for 5 minutes") elif failover_reason == "firewall": unreachable=True self.filter_list.append (node.ip) server = [srv for srv in self.servers if node.ip == srv.ip][0] RemoteUtilHelper.enable_firewall(server, bidirectional=self.bidirectional) status = RestHelper(self.rest).wait_for_node_status(node, "unhealthy", self.wait_timeout * 10) if status: self.log.info("node {0}:{1} is 'unhealthy' as expected".format(node.ip, node.port)) else: # verify iptables on the node if something wrong for server in self.servers: if server.ip == node.ip: shell = RemoteMachineShellConnection(server) info = shell.extract_remote_info() if info.type.lower() == "windows": o, r = shell.execute_command("netsh advfirewall show allprofiles") shell.log_command_output(o, r) else: o, r = shell.execute_command("/sbin/iptables --list") shell.log_command_output(o, r) shell.disconnect() self.rest.print_UI_logs() api = self.rest.baseUrl + 'nodeStatuses' status, content, header = self.rest._http_request(api) json_parsed = json.loads(content) self.log.info("nodeStatuses: {0}".format(json_parsed)) self.fail("node status is not unhealthy even after waiting for 5 minutes") # verify the failover type if self.check_verify_failover_type: graceful_count, graceful_failover = self.verify_failover_type(node, graceful_count, self.num_replicas, unreachable) # define precondition check for failover success_failed_over = self.rest.fail_over(node.id, graceful=(self.graceful and graceful_failover)) if self.graceful and graceful_failover: if self.stopGracefulFailover or self.killNodes or self.stopNodes or self.firewallOnNodes: self.victim_node_operations(node) # Start Graceful Again self.log.info(" Start Graceful Failover Again !") success_failed_over = self.rest.fail_over(node.id, graceful=(self.graceful and graceful_failover)) msg = "graceful failover failed for nodes {0}".format(node.id) self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True), msg=msg) else: msg = "rebalance failed while removing failover nodes {0}".format(node.id) self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True), msg=msg) failed_over = failed_over and 
success_failed_over # Check for negative cases if self.graceful and (failover_reason in ['stop_server', 'firewall']): if failed_over: # MB-10479 self.rest.print_UI_logs() self.assertFalse(failed_over, "Graceful Falover was started for unhealthy node!!! ") return elif self.gracefulFailoverFail and not failed_over: """ Check if the fail_over fails as expected """ self.assertFalse(failed_over,""" Graceful failover should fail due to not enough replicas """) return # Check if failover happened as expected or re-try one more time if not failed_over: self.log.info("unable to failover the node the first time. try again in 60 seconds..") # try again in 75 seconds self.sleep(75) failed_over = self.rest.fail_over(node.id, graceful=(self.graceful and graceful_failover)) if self.graceful and (failover_reason not in ['stop_server', 'firewall']): reached = RestHelper(self.rest).rebalance_reached() self.assertTrue(reached, "rebalance failed for Graceful Failover, stuck or did not completed") # Verify Active and Replica Bucket Count if self.num_replicas > 0: nodes = self.filter_servers(self.servers,chosen) self.vb_distribution_analysis(servers = nodes, buckets = self.buckets, std = 20.0 , total_vbuckets = self.total_vbuckets, type = "failover", graceful = (self.graceful and graceful_failover) ) def run_failover_operations_with_ops(self, chosen, failover_reason): """ Method to run fail over operations used in the test scenario based on failover reason """ # Perform Operations relalted to failover failed_over = True for node in chosen: unreachable = False if failover_reason == 'stop_server': unreachable=True self.stop_server(node) self.log.info("10 seconds delay to wait for membase-server to shutdown") # wait for 5 minutes until node is down self.assertTrue(RestHelper(self.rest).wait_for_node_status(node, "unhealthy", 300), msg="node status is not unhealthy even after waiting for 5 minutes") elif failover_reason == "firewall": unreachable=True self.filter_list.append (node.ip) server = [srv for srv in self.servers if node.ip == srv.ip][0] RemoteUtilHelper.enable_firewall(server, bidirectional=self.bidirectional) status = RestHelper(self.rest).wait_for_node_status(node, "unhealthy", 300) if status: self.log.info("node {0}:{1} is 'unhealthy' as expected".format(node.ip, node.port)) else: # verify iptables on the node if something wrong for server in self.servers: if server.ip == node.ip: shell = RemoteMachineShellConnection(server) info = shell.extract_remote_info() if info.type.lower() == "windows": o, r = shell.execute_command("netsh advfirewall show allprofiles") shell.log_command_output(o, r) else: o, r = shell.execute_command("/sbin/iptables --list") shell.log_command_output(o, r) shell.disconnect() self.rest.print_UI_logs() api = self.rest.baseUrl + 'nodeStatuses' status, content, header = self.rest._http_request(api) json_parsed = json.loads(content) self.log.info("nodeStatuses: {0}".format(json_parsed)) self.fail("node status is not unhealthy even after waiting for 5 minutes") nodes = self.filter_servers(self.servers,chosen) failed_over = self.cluster.async_failover([self.master], failover_nodes = chosen, graceful=self.graceful) # Perform Compaction compact_tasks = [] if self.compact: for bucket in self.buckets: compact_tasks.append(self.cluster.async_compact_bucket(self.master,bucket)) # Run View Operations if self.withViewsOps: self.query_and_monitor_view_tasks(nodes) # Run mutation operations if self.withMutationOps: self.run_mutation_operations() failed_over.result() for task in compact_tasks: 
task.result() msg = "rebalance failed while removing failover nodes {0}".format(node.id) self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True), msg=msg) def load_initial_data(self): """ Method to run operations Update/Delete/Create """ # Load All Buckets if num_items > 0 tasks = [] tasks += self._async_load_all_buckets(self.master, self.gen_initial_create, "create", 0, flag = 2, batch_size=20000) for task in tasks: task.result() self._wait_for_stats_all_buckets(self.servers, check_ep_items_remaining = True) self._verify_stats_all_buckets(self.servers,timeout = 120) def run_mutation_operations(self): mutation_ops_tasks = [] if("create" in self.doc_ops): mutation_ops_tasks += self._async_load_all_buckets(self.master, self.gen_create, "create", 0) if("update" in self.doc_ops): mutation_ops_tasks += self._async_load_all_buckets(self.master, self.gen_update, "update", 0) if("delete" in self.doc_ops): mutation_ops_tasks += self._async_load_all_buckets(self.master, self.gen_delete, "delete", 0) try: for task in mutation_ops_tasks: task.result() except Exception, ex: self.log.info(ex)
class RebalanceProgressTests(RebalanceBaseTest): def setUp(self): super(RebalanceProgressTests, self).setUp() self.rest = RestConnection(self.master) self.num_views = self.input.param("num_views", 3) if self.num_views: self._create_indexes() def tearDown(self): super(RebalanceProgressTests, self).tearDown() def test_progress_rebalance_in(self): servers_in = self.servers[self.nodes_init : self.nodes_init + self.nodes_in] servers_init = self.servers[:self.nodes_init] rebalance = self.cluster.async_rebalance(servers_init, servers_in, []) self.sleep(5, "wait for rebalance start") previous_stats = self._get_detailed_progress() while rebalance.state != "FINISHED": new_stats = self._get_detailed_progress() if new_stats == {}: self.log.info("Got empty progress") break #vbuckets left should go decreasing #docsTotal and docsTransferred should be 0 in added nodes #docsTotal should not change #docsTransferred should go increasing self._check_stats(servers_in, previous_stats, new_stats, "outgoing", docs_total=0, docs_transf=0) self._check_stats(servers_in, previous_stats, new_stats, "ingoing") self._check_stats(servers_init, previous_stats, new_stats, "ingoing", docs_total=0, docs_transf=0) self._check_stats(servers_init, previous_stats, new_stats, "outgoing") #sum of sending and receiving vbuckets should coincide self._check_vb_sums(servers_init, servers_in, new_stats) previous_stats = copy.deepcopy(new_stats) time.sleep(1) rebalance.result() def test_progress_rebalance_out(self): with_failover = self.input.param("with_failover", False) servers_init = self.servers[:self.nodes_init] servers_out = self.servers[(self.nodes_init - self.nodes_out) : self.nodes_init] if with_failover: self.cluster.failover(servers_init, servers_out) rebalance = self.cluster.async_rebalance(servers_init, [], servers_out) self.sleep(5, "wait for rebalance start") previous_stats = self._get_detailed_progress() while rebalance.state != "FINISHED": new_stats = self._get_detailed_progress() if new_stats == {}: self.log.info("Got empty progress") break #vbuckets left should go decreasing #docsTotal should not change #docsTransferred should go increasing self._check_stats(servers_init, previous_stats, new_stats, "ingoing") self._check_stats(servers_init, previous_stats, new_stats, "outgoing") previous_stats = copy.deepcopy(new_stats) time.sleep(1) rebalance.result() def test_progress_rebalance_swap(self): if self.nodes_in != self.nodes_out: self.fail("nodes_in != nodes_out. 
Not a swap rebalance") if len(self.servers) < (self.nodes_init + self.nodes_in): self.log.error("Not enough VMs!") return servers_in = self.servers[self.nodes_init : self.nodes_init + self.nodes_in] servers_init = self.servers[:self.nodes_init] servers_unchanged = self.servers[:(self.nodes_init - self.nodes_out)] servers_out = self.servers[(self.nodes_init - self.nodes_out) : self.nodes_init] rebalance = self.cluster.async_rebalance(servers_init, servers_in, servers_out) self.sleep(5, "wait for rebalance start") previous_stats = self._get_detailed_progress() while rebalance.state != "FINISHED": new_stats = self._get_detailed_progress() if new_stats == {}: self.log.info("Got empty progress") break #vbuckets left should go decreasing #docsTotal and docsTransferred should be 0 in added nodes #no vbuckets moving for unchanged nodes #docsTotal should not change #docsTransferred should go increasing self._check_stats(servers_in, previous_stats, new_stats, "outgoing", docs_total=0, docs_transf=0) self._check_stats(servers_in, previous_stats, new_stats, "ingoing") self._check_stats(servers_unchanged, previous_stats, new_stats, "ingoing", active_vb=0, replica_vb=0) self._check_stats(servers_unchanged, previous_stats, new_stats, "outgoing", active_vb=0, replica_vb=0) self._check_stats(servers_out, previous_stats, new_stats, "outgoing") #sum of sending and receiving vbuckets should coincide self._check_vb_sums(servers_in, servers_out, new_stats) previous_stats = copy.deepcopy(new_stats) time.sleep(1) rebalance.result() def test_progress_add_back_after_failover(self): servers_init = self.servers[:self.nodes_init] servers_failover = self.servers[(self.nodes_init - self.nodes_out) : self.nodes_init] servers_unchanged = self.servers[:(self.nodes_init - self.nodes_out)] nodes_all = self.rest.node_statuses() failover_nodes = [] for failover_server in servers_failover: failover_nodes.extend(filter(lambda node: node.ip == failover_server.ip and \ str(node.port) == failover_server.port, nodes_all)) self.cluster.failover(servers_init, servers_failover) for node in failover_nodes: self.rest.add_back_node(node.id) rebalance = self.cluster.async_rebalance(servers_init, [], []) self.sleep(5, "wait for rebalance start") previous_stats = self._get_detailed_progress() while rebalance.state != "FINISHED": new_stats = self._get_detailed_progress() if new_stats == {}: self.log.info("Got empty progress") break #vbuckets left should go decreasing #docsTotal should not change #docsTransferred should go increasing self._check_stats(servers_unchanged, previous_stats, new_stats, "outgoing") self._check_stats(servers_failover, previous_stats, new_stats, "ingoing") previous_stats = copy.deepcopy(new_stats) time.sleep(1) rebalance.result() def _check_vb_sums(self, servers_ingoing, servers_outgoing, new_stats): active_vb_sum_1 = sum([new_stats[server.ip]["ingoing"]['activeVBucketsLeft'] for server in servers_ingoing]) active_vb_sum_2 = sum([new_stats[server.ip]["outgoing"]['activeVBucketsLeft'] for server in servers_outgoing]) self.assertTrue(active_vb_sum_1 == active_vb_sum_2, "Active vbuckets left should be equal in servers_in and init. %s" % new_stats) def _check_stats(self, servers, previous_stats, new_stats, type, docs_total=None, docs_transf=None, active_vb=None, replica_vb=None): self.assertTrue(new_stats["buckets_count"] == len(self.buckets), "Expected buckets %s. 
Actual stat %s" %( len(self.buckets), new_stats)) for server in servers: current_stat = new_stats[server.ip][type] previous_stat = previous_stats[server.ip][type] if new_stats["bucket"] != previous_stats["bucket"]: self.assertTrue(current_stat['activeVBucketsLeft'] >= previous_stat['activeVBucketsLeft'], "activeVBucketsLeft for node %s increased! Previous stat %s. Actual: %s" %( server.ip, current_stat, previous_stat)) self.assertTrue(current_stat['replicaVBucketsLeft'] >= previous_stat['replicaVBucketsLeft'], "replicaVBucketsLeft for node %s increased! Previous stat %s. Actual: %s" %( server.ip, current_stat, previous_stat)) else: self.assertTrue(current_stat['activeVBucketsLeft'] <= previous_stat['activeVBucketsLeft'], "activeVBucketsLeft for node %s increased! Previous stat %s. Actual: %s" %( server.ip, current_stat, previous_stat)) self.assertTrue(current_stat['replicaVBucketsLeft'] <= previous_stat['replicaVBucketsLeft'], "replicaVBucketsLeft for node %s increased! Previous stat %s. Actual: %s" %( server.ip, current_stat, previous_stat)) try: if current_stat['docsTotal'] != previous_stat['docsTotal']: self.log.warn("docsTotal for node %s changed! Previous stat %s. Actual: %s" %( server.ip, current_stat, previous_stat)) except Exception, ex: if previous_stat['docsTotal'] != 0 and current_stat['docsTotal'] == 0: command = "sys:get_status({global, ns_rebalance_observer})." self.log.info("posting: %s" % command) self.rest.diag_eval(command) raise ex self.assertTrue(current_stat['docsTransferred'] >= previous_stat['docsTransferred'], "docsTransferred for node %s decreased! Previous stat %s. Actual: %s" %( server.ip, current_stat, previous_stat)) if docs_total is not None: self.assertTrue(current_stat['docsTotal'] == docs_total, "DocTotal for %s is %s, but should be %s. Stat %s" % ( server.ip, current_stat['docsTotal'], docs_total, current_stat)) if docs_transf is not None: self.assertTrue(current_stat['docsTransferred'] == docs_transf, "docsTransferred for %s is %s, but should be %s. Stat %s" % ( server.ip, current_stat['docsTotal'], docs_transf, current_stat)) if active_vb is not None: self.assertTrue(current_stat['activeVBucketsLeft'] == active_vb, "docsTransferred for %s is %s, but should be %s. Stat %s" % ( server.ip, current_stat['activeVBucketsLeft'], active_vb, current_stat)) if replica_vb is not None: self.assertTrue(current_stat['replicaVBucketsLeft'] == replica_vb, "docsTransferred for %s is %s, but should be %s. Stat %s" % ( server.ip, current_stat['activeVBucketsLeft'], active_vb, current_stat)) self.log.info("Checked stat: %s" % new_stats)
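# A condensed sketch of the invariants _check_stats() asserts while polling:
# within a single bucket's rebalance the vBuckets-left counters may only
# shrink and docsTransferred may only grow, while docsTotal is expected to
# stay flat.  prev and curr are the per-node "ingoing"/"outgoing" dicts
# returned by _get_detailed_progress() above; the keys are the ones used there.
def progress_moved_forward(prev, curr, same_bucket=True):
    if same_bucket:
        if curr['activeVBucketsLeft'] > prev['activeVBucketsLeft']:
            return False
        if curr['replicaVBucketsLeft'] > prev['replicaVBucketsLeft']:
            return False
    if curr['docsTransferred'] < prev['docsTransferred']:
        return False
    # A change in docsTotal mid-bucket is only warned about above, so it is
    # treated as suspicious rather than fatal here as well.
    return True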
def test_volume_with_rebalance(self): self.src_bucket = RestConnection(self.master).get_buckets() rest = RestConnection(self.master) bucket = rest.get_buckets() # for bk in bucket: # rest.flush_bucket(bk) #self.sleep(30) #load initial documents self.create_ddocs_and_views() load_thread=[] import Queue queue = Queue.Queue() for b in bucket: load_thread.append(Thread(target=lambda q,args1,args2,args3: q.put(self.load(args1, args2, args3)), args=(queue, self.master, self.num_items, b))) load_thread.append(Thread(target=self.load, args=(self.master, self.num_items,b))) for t in load_thread: t.start() servers_init = self.servers[:self.nodes_init] new_server_list=self.servers[0:self.nodes_init] for t in load_thread: t.join() self.sleep(30) #Reload more data for mutations load_thread=[] for b in bucket: load_thread.append(Thread(target=self.load, args=(self.master, self.num_items,b,self.num_items))) for t in load_thread: t.start() # #Rebalance in 1 node self.log.info("==========rebalance in 1 node=========") servers_in=self.servers[self.nodes_init:self.nodes_init + 1] rebalance = self.cluster.async_rebalance(servers_init, servers_in, []) rebalance.result() for t in load_thread: t.join() for b in bucket: self.check_dataloss(self.master, b,self.num_items*2) # load more document load_thread = [] for b in bucket: load_thread.append(Thread(target=self.load, args=(self.master, self.num_items, b,self.num_items*2))) for t in load_thread: t.start() #rebalance out 1 node new_server_list = self.servers[0:self.nodes_init]+ servers_in self.log.info("==========rebalance out 1 node=========") servers_out=[self.servers[self.nodes_init]] rebalance = self.cluster.async_rebalance(servers_init,[], servers_out) rebalance.result() for t in load_thread: t.join() for b in bucket: self.check_dataloss(self.master, b,self.num_items*3) self.sleep(30) # load more document load_thread = [] for b in bucket: load_thread.append(Thread(target=self.load, args=(self.master, self.num_items, b, self.num_items*3))) for t in load_thread: t.start() new_server_list=list(set(new_server_list)- set(servers_out)) #swap rebalance 1 node self.log.info("==========swap rebalance 1 node=========") servers_in = self.servers[self.nodes_init : self.nodes_init + 1] servers_init = self.servers[:self.nodes_init] servers_out = self.servers[(self.nodes_init - 1) : self.nodes_init] rebalance = self.cluster.async_rebalance(servers_init, servers_in, servers_out) rebalance.result() for t in load_thread: t.join() self.sleep(30) for b in bucket: self.check_dataloss(self.master, b,self.num_items*4) # load more document load_thread = [] for b in bucket: load_thread.append(Thread(target=self.load, args=(self.master, self.num_items, b, self.num_items*4))) for t in load_thread: t.start() new_server_list=list(set(new_server_list + servers_in) - set(servers_out)) self.log.info("==========Rebalance out of 2 nodes and Rebalance In 1 node=========") # Rebalance out of 2 nodes and Rebalance In 1 node servers_in = [list(set(self.servers) - set(new_server_list))[0]] servers_out = list(set(new_server_list) - set([self.master]))[-2:] rebalance = self.cluster.async_rebalance(servers_init, servers_in, servers_out) rebalance.result() for t in load_thread: t.join() for b in bucket: self.check_dataloss(self.master, b,self.num_items*5) self.sleep(30) # load more document load_thread = [] for b in bucket: load_thread.append(Thread(target=self.load, args=(self.master, self.num_items, b, self.num_items*5))) for t in load_thread: t.start() new_server_list=list(set(new_server_list + 
servers_in) - set(servers_out)) self.log.info("==========Rebalance out of 1 nodes and Rebalance In 2 nodes=========") #Rebalance out of 1 nodes and Rebalance In 2 nodes servers_in = list(set(self.servers) - set(new_server_list))[0:2] servers_out = list(set(new_server_list) - set([self.master]))[0:1] rebalance = self.cluster.async_rebalance(servers_init, servers_in, servers_out) rebalance.result() for t in load_thread: t.join() for b in bucket: self.check_dataloss(self.master, b,self.num_items*6) self.sleep(30) # load more document load_thread = [] for b in bucket: load_thread.append(Thread(target=self.load, args=(self.master, self.num_items, b, self.num_items*6))) for t in load_thread: t.start() new_server_list=list(set(new_server_list + servers_in) - set(servers_out)) self.log.info("==========Rebalance in 4 nodes =========") #Rebalance in 4 nodes servers_in = list(set(self.servers) - set(new_server_list))[0:4] rebalance = self.cluster.async_rebalance(servers_init, servers_in, []) rebalance.result() for t in load_thread: t.join() for b in bucket: self.check_dataloss(self.master, b,self.num_items*7) self.sleep(30) # load more document load_thread = [] for b in bucket: load_thread.append(Thread(target=self.load, args=(self.master, self.num_items, b, self.num_items*7))) for t in load_thread: t.start() new_server_list=list(set(new_server_list + servers_in)) self.log.info("==========Rebalance out 4 nodes =========") #Rebalance out 4 nodes servers_out = list(set(new_server_list) - set([self.master]))[0:4] rebalance = self.cluster.async_rebalance(servers_init, [], servers_out) rebalance.result() for t in load_thread: t.join() for b in bucket: self.check_dataloss(self.master, b,self.num_items*8) self.sleep(30) # load more document load_thread = [] for b in bucket: load_thread.append(Thread(target=self.load, args=(self.master, self.num_items, b, self.num_items*8))) for t in load_thread: t.start() new_server_list = list(set(new_server_list) - set(servers_out)) self.log.info("======Rebalance in 4 nodes (8 nodes) wait for rebalance to finish and move between server groups=========") #Rebalance in 4 nodes (8 nodes) wait for rebalance to finish and move between server groups servers_in = list(set(self.servers) - set(new_server_list))[0:4] rebalance = self.cluster.async_rebalance(servers_init, servers_in, []) rebalance.result() for t in load_thread: t.join() for b in bucket: self.check_dataloss(self.master, b,self.num_items*9) self.sleep(30) load_thread = [] for b in bucket: load_thread.append(Thread(target=self.load, args=(self.master, self.num_items, b, self.num_items * 9))) for t in load_thread: t.start() self.shuffle_nodes_between_zones_and_rebalance() for t in load_thread: t.join() for b in bucket: self.check_dataloss(self.master, b,self.num_items*10) self.sleep(30) load_thread = [] for b in bucket: load_thread.append(Thread(target=self.load, args=(self.master, self.num_items, b, self.num_items * 10))) for t in load_thread: t.start() self.log.info("======Graceful failover 1 KV node and add back(Delta and Full)=========") #Graceful failover 1 KV node and add back(Delta and Full) kv_server = self.get_nodes_from_services_map(service_type="kv", get_all_nodes=False) fail_over_task = self.cluster.async_failover([self.master], failover_nodes=[kv_server], graceful=True) fail_over_task.result() self.sleep(120) # do a recovery and rebalance rest.set_recovery_type('ns_1@' + kv_server.ip, recoveryType=self.recoveryType) rest.add_back_node('ns_1@' + kv_server.ip) rebalance = 
self.cluster.async_rebalance(self.servers[:self.nodes_init], [], []) rebalance.result() for t in load_thread: t.join() for b in bucket: self.check_dataloss(self.master, b,self.num_items*11) self.sleep(30)
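# A minimal sketch of the graceful-failover-and-add-back step used at the end
# of the volume test above, factored into a helper.  rest is a RestConnection
# to a surviving node, cluster is the task manager (self.cluster), and
# recovery_type is "delta" or "full".  Only calls already exercised in this
# file are used; the 'ns_1@' + ip otpNode naming matches the code above.
def graceful_failover_and_add_back(cluster, rest, orchestrator, kv_server,
                                   servers, nodes_init, recovery_type):
    otp_node = 'ns_1@' + kv_server.ip
    task = cluster.async_failover([orchestrator],
                                  failover_nodes=[kv_server],
                                  graceful=True)
    task.result()
    # same order as the volume test: pick the recovery type, then add back
    rest.set_recovery_type(otp_node, recoveryType=recovery_type)
    rest.add_back_node(otp_node)
    rebalance = cluster.async_rebalance(servers[:nodes_init], [], [])
    rebalance.result()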
class GSIAutofailover(AutoFailoverBaseTest, BaseSecondaryIndexingTests): def setUp(self): super(GSIAutofailover, self).setUp() self.log.info( "============== GSIAutofailover setup has started ==============") self.rest.delete_all_buckets() self.index_field_set = powerset([ 'age', 'city', 'country', 'title', 'firstName', 'lastName', 'streetAddress', 'suffix', 'filler1', 'phone', 'zipcode' ]) if self.failover_orchestrator: self.master = self.servers[1] self.rest = RestConnection(self.master) self.log.info( "============== GSIAutofailover setup has completed ==============" ) def tearDown(self): self.log.info( "============== GSIAutofailover tearDown has started ==============" ) super(GSIAutofailover, self).tearDown() self.log.info( "============== GSIAutofailover tearDown has completed ==============" ) def suite_tearDown(self): pass def suite_setUp(self): pass def _create_indexes(self): n1ql_node = self.get_nodes_from_services_map(service_type="n1ql", get_all_nodes=False) for collection_namespace in self.namespaces: for item, index_field in zip(range(self.initial_index_num), self.index_field_set): idx = f'idx_{item}' index_gen = QueryDefinition(index_name=idx, index_fields=index_field) query = index_gen.generate_index_create_query( namespace=collection_namespace, num_replica=self.num_index_replicas) self.run_cbq_query(query=query, server=n1ql_node) def is_failover_expected(self, failure_node_number): failover_not_expected = ( self.max_count == 1 and failure_node_number > 1 and self.pause_between_failover_action < self.timeout or self.num_index_replicas < 1) failover_not_expected = failover_not_expected or ( 1 < self.max_count < failure_node_number and self.pause_between_failover_action < self.timeout or self.num_index_replicas < failure_node_number) return not failover_not_expected def gsi_multi_node_failover(self): servers_to_fail = self.server_to_fail for i in range(self.max_count): self.server_to_fail = [servers_to_fail[i]] self.failover_expected = self.is_failover_expected(i + 1) self.failover_actions[self.failover_action](self) def test_gsi_auto_failover(self): self.bucket_params = self._create_bucket_params( server=self.master, size=self.bucket_size, replicas=self.num_replicas, bucket_type=self.bucket_type, enable_replica_index=self.enable_replica_index, eviction_policy=self.eviction_policy, lww=self.lww) self.cluster.create_standard_bucket(name=self.test_bucket, port=11222, bucket_params=self.bucket_params) self.buckets = self.rest.get_buckets() self.prepare_collection_for_indexing(num_of_docs_per_collection=10**5) self._create_indexes() self.enable_autofailover_and_validate() self.sleep(5) if self.max_count > 1: self.gsi_multi_node_failover() else: self.failover_actions[self.failover_action](self) try: self.disable_autofailover_and_validate() except Exception as err: pass def test_failed_rebalance_with_gsi_autofailover(self): self.bucket_params = self._create_bucket_params( server=self.master, size=self.bucket_size, replicas=self.num_replicas, bucket_type=self.bucket_type, enable_replica_index=self.enable_replica_index, eviction_policy=self.eviction_policy, lww=self.lww) self.cluster.create_standard_bucket(name=self.test_bucket, port=11222, bucket_params=self.bucket_params) self.buckets = self.rest.get_buckets() self.prepare_collection_for_indexing(num_of_docs_per_collection=10**5) self._create_indexes() # enable auto failover self.enable_autofailover_and_validate() # Start rebalance in rebalance_task = self.cluster.async_rebalance( servers=self.servers, to_add=self.servers_to_add, 
to_remove=self.servers_to_remove, services=['kv', 'index']) self.sleep(20) reached = RestHelper(self.rest).rebalance_reached(percentage=20) self.assertTrue(reached, "Rebalance failed or did not reach {0}%".format(20)) # Do a fail over action - reboot, hang, kill. This is defined in the conf file. Test sometimes fail # because the rebalance action is completed fast and there's no way to induce a failure. self.failover_actions[self.failover_action](self) try: rebalance_task.result() except Exception as err: self.log.info("Rebalance failed with : {0}".format(str(err))) if "Rebalance failed. See logs for detailed reason. You can try again" in str( err): self.log.info( "Rebalance failed even before auto-failover had a chance to stop it self.server_to_fail.ip: {0}" .format(str(err))) elif not RestHelper(self.rest).is_cluster_rebalanced(): if self._auto_failover_message_present_in_logs( self.server_to_fail[0].ip): self.log.info( "Rebalance interrupted due to auto-failover of nodes - message was seen in logs" ) else: self.fail( "Rebalance interrupted message was not seen in logs") else: self.fail("Rebalance was not aborted by auto fail-over") self.disable_autofailover_and_validate() def test_autofailover_and_addback_of_node(self): """ Test autofailover of nodes and then addback of the node after failover 1. Enable autofailover and validate 2. Fail a node and validate if node is failed over if required 3. Addback node and validate that the addback was successful. 4. Failover the same node again. :return: Nothing """ self.bucket_params = self._create_bucket_params( server=self.master, size=self.bucket_size, replicas=self.num_replicas, bucket_type=self.bucket_type, enable_replica_index=self.enable_replica_index, eviction_policy=self.eviction_policy, lww=self.lww) self.cluster.create_standard_bucket(name=self.test_bucket, port=11222, bucket_params=self.bucket_params) self.buckets = self.rest.get_buckets() self.prepare_collection_for_indexing( num_of_docs_per_collection=self.num_of_docs_per_collection) self._create_indexes() self.enable_autofailover_and_validate() self.sleep(5) self.failover_actions[self.failover_action](self) self.bring_back_failed_nodes_up() self.sleep(30) self.log.info(self.server_to_fail[0]) self.nodes = self.rest.node_statuses() self.log.info(self.nodes[0].id) self.rest.add_back_node("ns_1@{}".format(self.server_to_fail[0].ip)) self.rest.set_recovery_type( "ns_1@{}".format(self.server_to_fail[0].ip), self.recovery_strategy) self.rest.rebalance(otpNodes=[node.id for node in self.nodes]) msg = "rebalance failed while recovering failover nodes {0}".format( self.server_to_fail[0]) self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True), msg) self.failover_actions[self.failover_action](self) try: self.disable_autofailover_and_validate() except Exception as err: pass
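# A compact sketch of the post-rebalance check performed in
# test_failed_rebalance_with_gsi_autofailover above: the rebalance task is
# expected to fail, either on its own ("Rebalance failed. See logs ...") or
# because auto-failover of the broken node interrupted it.  rest_helper is a
# RestHelper(self.rest) and log_check is the
# _auto_failover_message_present_in_logs helper used in the test.
def verify_rebalance_interrupted(rebalance_task, rest_helper, log_check,
                                 failed_ip):
    try:
        rebalance_task.result()
    except Exception as err:
        if "Rebalance failed. See logs for detailed reason" in str(err):
            return "failed_before_autofailover"
        if not rest_helper.is_cluster_rebalanced() and log_check(failed_ip):
            return "aborted_by_autofailover"
        raise AssertionError(
            "Rebalance interrupted message was not seen in logs")
    raise AssertionError("Rebalance was not aborted by auto fail-over")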
def adding_back_a_node(self, master, server): rest = RestConnection(master) nodes = rest.node_statuses() for node in nodes: if server.ip == node.ip and int(server.port) == int(node.port): rest.add_back_node(node.id)
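# Typical follow-up to adding_back_a_node(): optionally pick a recovery type,
# then rebalance the full node set and monitor it.  This is a sketch of a
# possible companion method, not part of the original class; recovery_type is
# a hypothetical parameter ("delta" or "full"), and the rebalance/monitor
# calls mirror the ones used elsewhere in this file.
def add_back_and_rebalance(self, master, server, recovery_type=None):
    rest = RestConnection(master)
    self.adding_back_a_node(master, server)
    if recovery_type:
        for node in rest.node_statuses():
            if node.ip == server.ip:
                rest.set_recovery_type(otpNode=node.id,
                                       recoveryType=recovery_type)
    rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                   ejectedNodes=[])
    self.assertTrue(rest.monitorRebalance(stop_if_loop=True),
                    "add-back rebalance failed")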
class RebalanceProgressTests(RebalanceBaseTest): def setUp(self): super(RebalanceProgressTests, self).setUp() self.rest = RestConnection(self.master) self.num_views = self.input.param("num_views", 3) if self.num_views: self._create_indexes() def tearDown(self): super(RebalanceProgressTests, self).tearDown() def test_progress_rebalance_in(self): servers_in = self.servers[self.nodes_init:self.nodes_init + self.nodes_in] servers_init = self.servers[:self.nodes_init] rebalance = self.cluster.async_rebalance(servers_init, servers_in, []) self.sleep(5, "wait for rebalance start") previous_stats = self._get_detailed_progress() while rebalance.state != "FINISHED": new_stats = self._get_detailed_progress() if new_stats == {}: self.log.info("Got empty progress") break #vbuckets left should go decreasing #docsTotal and docsTransferred should be 0 in added nodes #docsTotal should not change #docsTransferred should go increasing self._check_stats(servers_in, previous_stats, new_stats, "outgoing", docs_total=0, docs_transf=0) self._check_stats(servers_in, previous_stats, new_stats, "ingoing") self._check_stats(servers_init, previous_stats, new_stats, "ingoing", docs_total=0, docs_transf=0) self._check_stats(servers_init, previous_stats, new_stats, "outgoing") #sum of sending and receiving vbuckets should coincide self._check_vb_sums(servers_init, servers_in, new_stats) previous_stats = copy.deepcopy(new_stats) time.sleep(10) rebalance.result() def test_progress_rebalance_out(self): with_failover = self.input.param("with_failover", False) servers_init = self.servers[:self.nodes_init] servers_out = self.servers[(self.nodes_init - self.nodes_out):self.nodes_init] if with_failover: self.cluster.failover(servers_init, servers_out) rebalance = self.cluster.async_rebalance(servers_init, [], servers_out) self.sleep(5, "wait for rebalance start") previous_stats = self._get_detailed_progress() while rebalance.state != "FINISHED": new_stats = self._get_detailed_progress() if new_stats == {}: self.log.info("Got empty progress") break #vbuckets left should go decreasing #docsTotal should not change #docsTransferred should go increasing self._check_stats(servers_init, previous_stats, new_stats, "ingoing") self._check_stats(servers_init, previous_stats, new_stats, "outgoing") previous_stats = copy.deepcopy(new_stats) time.sleep(1) rebalance.result() def test_progress_rebalance_swap(self): if self.nodes_in != self.nodes_out: self.fail("nodes_in != nodes_out. 
Not a swap rebalance") if len(self.servers) < (self.nodes_init + self.nodes_in): self.log.error("Not enough VMs!") return servers_in = self.servers[self.nodes_init:self.nodes_init + self.nodes_in] servers_init = self.servers[:self.nodes_init] servers_unchanged = self.servers[:(self.nodes_init - self.nodes_out)] servers_out = self.servers[(self.nodes_init - self.nodes_out):self.nodes_init] rebalance = self.cluster.async_rebalance(servers_init, servers_in, servers_out) self.sleep(5, "wait for rebalance start") previous_stats = self._get_detailed_progress() while rebalance.state != "FINISHED": new_stats = self._get_detailed_progress() if new_stats == {}: self.log.info("Got empty progress") break #vbuckets left should go decreasing #docsTotal and docsTransferred should be 0 in added nodes #no vbuckets moving for unchanged nodes #docsTotal should not change #docsTransferred should go increasing self._check_stats(servers_in, previous_stats, new_stats, "outgoing", docs_total=0, docs_transf=0) self._check_stats(servers_in, previous_stats, new_stats, "ingoing") self._check_stats(servers_unchanged, previous_stats, new_stats, "ingoing", active_vb=0, replica_vb=0) self._check_stats(servers_unchanged, previous_stats, new_stats, "outgoing", active_vb=0, replica_vb=0) self._check_stats(servers_out, previous_stats, new_stats, "outgoing") #sum of sending and receiving vbuckets should coincide self._check_vb_sums(servers_in, servers_out, new_stats) previous_stats = copy.deepcopy(new_stats) time.sleep(1) rebalance.result() def test_progress_add_back_after_failover(self): servers_init = self.servers[:self.nodes_init] servers_failover = self.servers[(self.nodes_init - self.nodes_out):self.nodes_init] servers_unchanged = self.servers[:(self.nodes_init - self.nodes_out)] nodes_all = self.rest.node_statuses() failover_nodes = [] for failover_server in servers_failover: failover_nodes.extend(filter(lambda node: node.ip == failover_server.ip and \ str(node.port) == failover_server.port, nodes_all)) self.cluster.failover(servers_init, servers_failover) self.sleep(30) for node in failover_nodes: self.rest.add_back_node(node.id) rebalance = self.cluster.async_rebalance(servers_init, [], []) self.sleep(5, "wait for rebalance start") previous_stats = self._get_detailed_progress() while rebalance.state != "FINISHED": new_stats = self._get_detailed_progress() if new_stats == {}: self.log.info("Got empty progress") break #vbuckets left should go decreasing #docsTotal should not change #docsTransferred should go increasing self._check_stats(servers_unchanged, previous_stats, new_stats, "outgoing") self._check_stats(servers_failover, previous_stats, new_stats, "ingoing") previous_stats = copy.deepcopy(new_stats) time.sleep(1) rebalance.result() def _check_vb_sums(self, servers_ingoing, servers_outgoing, new_stats): active_vb_sum_1 = sum([ new_stats[server.ip]["ingoing"]['activeVBucketsLeft'] for server in servers_ingoing ]) active_vb_sum_2 = sum([ new_stats[server.ip]["outgoing"]['activeVBucketsLeft'] for server in servers_outgoing ]) self.assertTrue( active_vb_sum_1 == active_vb_sum_2, "Active vbuckets left should be equal in servers_in and init. %s" % new_stats) def _check_stats(self, servers, previous_stats, new_stats, type, docs_total=None, docs_transf=None, active_vb=None, replica_vb=None): self.assertTrue( new_stats["buckets_count"] == len(self.buckets), "Expected buckets %s. 
Actual stat %s" % (len(self.buckets), new_stats)) for server in servers: current_stat = new_stats[server.ip][type] previous_stat = previous_stats[server.ip][type] if new_stats["bucket"] != previous_stats["bucket"]: self.assertTrue( current_stat['activeVBucketsLeft'] >= previous_stat['activeVBucketsLeft'], "activeVBucketsLeft for node %s increased! Previous stat %s. Actual: %s" % (server.ip, current_stat, previous_stat)) self.assertTrue( current_stat['replicaVBucketsLeft'] >= previous_stat['replicaVBucketsLeft'], "replicaVBucketsLeft for node %s increased! Previous stat %s. Actual: %s" % (server.ip, current_stat, previous_stat)) else: self.assertTrue( current_stat['activeVBucketsLeft'] <= previous_stat['activeVBucketsLeft'], "activeVBucketsLeft for node %s increased! Previous stat %s. Actual: %s" % (server.ip, current_stat, previous_stat)) self.assertTrue( current_stat['replicaVBucketsLeft'] <= previous_stat['replicaVBucketsLeft'], "replicaVBucketsLeft for node %s increased! Previous stat %s. Actual: %s" % (server.ip, current_stat, previous_stat)) try: if current_stat['docsTotal'] != previous_stat['docsTotal']: self.log.warn( "docsTotal for node %s changed! Previous stat %s. Actual: %s" % (server.ip, current_stat, previous_stat)) except Exception, ex: if previous_stat['docsTotal'] != 0 and current_stat[ 'docsTotal'] == 0: command = "sys:get_status({global, ns_rebalance_observer})." self.log.info("posting: %s" % command) self.rest.diag_eval(command) raise ex self.assertTrue( current_stat['docsTransferred'] >= previous_stat['docsTransferred'], "docsTransferred for node %s decreased! Previous stat %s. Actual: %s" % (server.ip, current_stat, previous_stat)) if docs_total is not None: self.assertTrue( current_stat['docsTotal'] == docs_total, "DocTotal for %s is %s, but should be %s. Stat %s" % (server.ip, current_stat['docsTotal'], docs_total, current_stat)) if docs_transf is not None: self.assertTrue( current_stat['docsTransferred'] == docs_transf, "docsTransferred for %s is %s, but should be %s. Stat %s" % (server.ip, current_stat['docsTotal'], docs_transf, current_stat)) if active_vb is not None: self.assertTrue( current_stat['activeVBucketsLeft'] == active_vb, "docsTransferred for %s is %s, but should be %s. Stat %s" % (server.ip, current_stat['activeVBucketsLeft'], active_vb, current_stat)) if replica_vb is not None: self.assertTrue( current_stat['replicaVBucketsLeft'] == replica_vb, "docsTransferred for %s is %s, but should be %s. Stat %s" % (server.ip, current_stat['activeVBucketsLeft'], active_vb, current_stat)) self.log.info("Checked stat: %s" % new_stats)
def cluster_nodes_write(self,username,password,host,port=8091, servers=None,cluster=None,httpCode=None,user_role=None): try: _cluster_nodes_write = { "ejectNode":"/controller/ejectNode;POST", #"addNode":"/controller/addNode;POST", #"addNodeV2":"/controller/addNodeV2;POST", #"uuidAddNode":"pools/default/serverGroups/<uuid>/addNode;POST", #"uiidAddNodev1":"/pools/default/serverGroups/<uuid>/addNodeV2;POST", #"failover":"/controller/failOver;POST", #"graceFullFailover":"/controller/startGracefulFailover;POST", #"rebalance":"/controller/rebalance;POST", #"reAddNode":"/controller/reAddNode;POST", #"reFailover":"/controller/reFailOver;POST", #"stopRebalance":"/controller/stopRebalance;POST", #"setRecoveryType":"/controller/setRecoveryType;POST" } rest = RestConnection(servers[0]) known_nodes = [] #Add Node params = {'hostname': servers[1].ip,'user': '******','password': '******'} add_node = {"addNode":"controller/addNode;POST;" + str(params)} result = self._return_http_code(add_node,username,password,host=host,port=port, httpCode=httpCode, user_role=user_role) #cluster.rebalance(servers,servers[1],[]) rest.eject_node("Administrator","password",'ns_1@'+servers[1].ip) #cluster.rebalance(servers,[],servers[1:]) #time.sleep(30) #params = {'hostname': servers[1].ip,'user': '******','password': '******'} #add_node = {"addNode":"controller/addNodeV2;POST;" + str(params)} #result = self._return_http_code(add_node,username,password,host=host,port=port, httpCode=httpCode, user_role=user_role) #cluster.rebalance(servers,[],servers[1:]) time.sleep(30) cluster.rebalance(servers,servers[1:],[]) params = {'otpNode': "ns_1@"+servers[1].ip} failover_node = {"failover":"controller/failOver;POST;"+str(params)} result = self._return_http_code(failover_node,username,password,host=host,port=port, httpCode=httpCode, user_role=user_role) time.sleep(30) cluster.rebalance(servers,[],servers[1:]) time.sleep(15) cluster.rebalance(servers,servers[1:],[]) time.sleep(15) params = {'otpNode': "ns_1@"+servers[1].ip} grace_failover = {"grace_failover":"controller/startGracefulFailover;POST;"+str(params)} result = self._return_http_code(grace_failover,username,password,host=host,port=port, httpCode=httpCode, user_role=user_role) time.sleep(60) rest.set_recovery_type("ns_1@"+servers[1].ip,'delta') time.sleep(30) rest.add_back_node("ns_1@"+servers[1].ip) time.sleep(30) serv_out = 'ns_1@' + servers[2].ip rest.fail_over(serv_out,graceful=False) time.sleep(15) params = {'otpNode': "ns_1@"+servers[2].ip} radd_node = {"reAddNode":"controller/reAddNode;POST;"+ str(params)} result = self._return_http_code(radd_node,username,password,host=host,port=port, httpCode=httpCode, user_role=user_role) time.sleep(30) #serv_out = 'ns_1@' + servers[3].ip #rest.fail_over(serv_out,graceful=False) #params = {'otpNode': "ns_1@"+servers[3].ip} #radd_node = {"reFailOver":"controller/reFailOver;POST;"+ str(params)} #result = self._return_http_code(radd_node,username,password,host=host,port=port, httpCode=httpCode, user_role=user_role) cluster.rebalance(servers,[],servers[1:]) time.sleep(30) cluster.rebalance(servers,servers[1:],[]) time.sleep(30) serv_out = 'ns_1@' + servers[1].ip rest.fail_over(serv_out,graceful=True) time.sleep(60) params = {'otpNode': 'ns_1@'+servers[1].ip,'recoveryType': 'delta'} recovery_type = {"setRecoveryType":"controller/setRecoveryType;POST;"+ str(params)} result = self._return_http_code(recovery_type,username,password,host=host,port=port, httpCode=httpCode, user_role=user_role) cluster.rebalance(servers) except: log.info ("Issue 
with rebalance, going to next test case") cluster.rebalance(servers,[],servers[1:]) for server in servers: rest = RestConnection(server) rest.init_cluster(username='******', password='******') rest.init_cluster_memoryQuota(memoryQuota=rest.get_nodes_self().mcdMemoryReserved)
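# The permission checks above encode each REST call as
# "<path>;<HTTP verb>[;<params dict as str>]" and hand that map to
# _return_http_code() together with the expected status code and role.
# A small parser for that spec format, illustrative only; the framework's own
# parsing is not shown in this file.
def parse_endpoint_spec(spec):
    parts = spec.split(';', 2)
    path, verb = parts[0], parts[1]
    params = parts[2] if len(parts) > 2 else None
    return path, verb, params

# e.g. parse_endpoint_spec("controller/setRecoveryType;POST;" + str(params))
# returns ("controller/setRecoveryType", "POST", "{'otpNode': ...}")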
def common_test_body(self, keys_count, replica, failover_reason): log = logger.Logger.get_logger() log.info("keys_count : {0}".format(keys_count)) log.info("replicas : {0}".format(replica)) log.info("failover_reason : {0}".format(failover_reason)) log.info('picking server : {0} as the master'.format(self.master)) self._load_all_buckets(self.master, self.gen_create, "create", 0, batch_size=10000, pause_secs=5, timeout_secs=180) self._wait_for_stats_all_buckets(self._servers) _servers_ = self._servers rest = RestConnection(self.master) nodes = rest.node_statuses() self._wait_for_replication(self._servers, timeout=600) chosen = RebalanceHelper.pick_nodes(self.master, howmany=replica) for node in chosen: #let's do op if failover_reason == 'stop_server': self.stop_server(node) log.info( "10 seconds delay to wait for membase-server to shutdown") #wait for 5 minutes until node is down self.assertTrue( RestHelper(rest).wait_for_node_status( node, "unhealthy", 300), msg= "node status is not unhealthy even after waiting for 5 minutes" ) elif failover_reason == "firewall": RemoteUtilHelper.enable_firewall( self._servers, node, bidirectional=self.bidirectional) status = RestHelper(rest).wait_for_node_status( node, "unhealthy", 300) if status: log.info("node {0}:{1} is 'unhealthy' as expected".format( node.ip, node.port)) else: #verify iptables on the node if something wrong for server in self._servers: if server.ip == node.ip: shell = RemoteMachineShellConnection(server) o, r = shell.execute_command( "/sbin/iptables --list") shell.log_command_output(o, r) shell.disconnect() self.assertTrue( status, msg= "node status is not unhealthy even after waiting for 5 minutes" ) failed_over = rest.fail_over(node.id) if not failed_over: self.log.info( "unable to failover the node the first time. try again in 60 seconds.." ) #try again in 75 seconds time.sleep(75) failed_over = rest.fail_over(node.id) self.assertTrue( failed_over, "unable to failover node after {0}".format(failover_reason)) log.info("failed over node : {0}".format(node.id)) self._failed_nodes.append(node) if self.add_back_flag: for node in self._failed_nodes: rest.add_back_node(node.id) time.sleep(5) log.info( "10 seconds sleep after failover before invoking rebalance...") time.sleep(10) rest.rebalance(otpNodes=[node.id for node in nodes], ejectedNodes=[]) msg = "rebalance failed while removing failover nodes {0}".format( chosen) self.assertTrue(rest.monitorRebalance(stop_if_loop=True), msg=msg) else: log.info( "10 seconds sleep after failover before invoking rebalance...") time.sleep(10) rest.rebalance(otpNodes=[node.id for node in nodes], ejectedNodes=[node.id for node in chosen]) msg = "rebalance failed while removing failover nodes {0}".format( chosen) self.assertTrue(rest.monitorRebalance(stop_if_loop=True), msg=msg) for failed in chosen: for server in _servers_: if server.ip == failed.ip: _servers_.remove(server) self._cleanup_nodes.append(server) log.info("Begin VERIFICATION ...") self._wait_for_stats_all_buckets(_servers_) self._wait_for_replication(self._servers, timeout=600) self._verify_stats_all_buckets(_servers_) self._verify_all_buckets(self.master)
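# A minimal sketch of the hard-failover step in common_test_body() above:
# wait until the node is reported "unhealthy", then fail it over, retrying
# once after 75 seconds if ns_server refuses the first attempt.  rest is a
# RestConnection to a healthy node, rest_helper a RestHelper(rest), and node
# an entry from rest.node_statuses().
import time

def failover_when_unhealthy(rest, rest_helper, node, wait_secs=300):
    assert rest_helper.wait_for_node_status(node, "unhealthy", wait_secs), \
        "node status is not unhealthy even after waiting"
    failed_over = rest.fail_over(node.id)
    if not failed_over:
        # the first attempt can race with down-detection; retry once
        time.sleep(75)
        failed_over = rest.fail_over(node.id)
    return failed_over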
def replicate_correct_data_after_rollback(self): NUMBER_OF_DOCS = 10000 # populate the kvs, they will look like ... """ key: keyname-x value: { "mutated": 0, "_id": "keyname-x", "val-field-name": "serial-vals-100" } """ vals = ['serial-vals-' + str(i) for i in xrange(NUMBER_OF_DOCS)] template = '{{ "val-field-name": "{0}" }}' gen_load = DocumentGenerator('keyname', template, vals, start=0, end=NUMBER_OF_DOCS) rc = self.cluster.load_gen_docs(self.servers[0], self.buckets[0].name, gen_load, self.buckets[0].kvs[1], "create", exp=0, flag=0, batch_size=1000) # store the KVs which were modified and active on node 1 modified_kvs_active_on_node1 = {} vbucket_client = VBucketAwareMemcached(RestConnection(self.master), 'default') client = MemcachedClientHelper.direct_client(self.servers[0], 'default') for i in range(NUMBER_OF_DOCS / 100): keyname = 'keyname-' + str(i) vbId = ((zlib.crc32(keyname) >> 16) & 0x7fff) & (self.vbuckets - 1) if vbucket_client.vBucketMap[vbId].split( ':')[0] == self.servers[0].ip: rc = client.get(keyname) modified_kvs_active_on_node1[keyname] = rc[2] # stop persistence for bucket in self.buckets: for s in self.servers[:self.nodes_init]: client = MemcachedClientHelper.direct_client(s, bucket) try: client.stop_persistence() except MemcachedError as e: if self.bucket_type == 'ephemeral': self.assertTrue( "Memcached error #4 'Invalid': Flusher not running. for vbucket :0 to mc " in e.message) return else: raise # modify less than 1/2 of the keys vals = [ 'modified-serial-vals-' + str(i) for i in xrange(NUMBER_OF_DOCS / 100) ] template = '{{ "val-field-name": "{0}" }}' gen_load = DocumentGenerator('keyname', template, vals, start=0, end=NUMBER_OF_DOCS / 100) rc = self.cluster.load_gen_docs(self.servers[0], self.buckets[0].name, gen_load, self.buckets[0].kvs[1], "create", exp=0, flag=0, batch_size=1000) # kill memcached, when it comes back because persistence is disabled it will have lost the second set of mutations shell = RemoteMachineShellConnection(self.servers[0]) shell.kill_memcached() time.sleep(10) # start persistence on the second node client = MemcachedClientHelper.direct_client(self.servers[1], 'default') client.start_persistence() time.sleep(5) # failover to the second node rc = self.cluster.failover(self.servers, self.servers[1:2], graceful=True) time.sleep(30) # give time for the failover to complete # check the values, they should be what they were prior to the second update client = MemcachedClientHelper.direct_client(self.servers[0], 'default') for k, v in modified_kvs_active_on_node1.iteritems(): rc = client.get(k) self.assertTrue(v == rc[2], 'Expected {0}, actual {1}'.format(v, rc[2])) # need to rebalance the node back into the cluster # def rebalance(self, servers, to_add, to_remove, timeout=None, use_hostnames=False, services = None): rest_obj = RestConnection(self.servers[0]) node_id_for_recovery = "ns_1@" + self.servers[1].ip status = rest_obj.add_back_node(node_id_for_recovery) if status: rest_obj.set_recovery_type(node_id_for_recovery, recoveryType='delta') rc = self.cluster.rebalance(self.servers[:self.nodes_init], [], [])
def replicate_correct_data_after_rollback(self): ''' @attention: This test case has some issue with docker runs. It passes without any issue on VMs. ''' NUMBER_OF_DOCS = 10000 # populate the kvs, they will look like ... """ key: keyname-x value: { "mutated": 0, "_id": "keyname-x", "val-field-name": "serial-vals-100" } """ vals = ['serial-vals-' + str(i) for i in xrange(NUMBER_OF_DOCS)] template = '{{ "val-field-name": "{0}" }}' gen_load = DocumentGenerator('keyname', template, vals, start=0, end=NUMBER_OF_DOCS) rc = self.cluster.load_gen_docs(self.servers[0], self.buckets[0].name, gen_load, self.buckets[0].kvs[1], "create", exp=0, flag=0, batch_size=1000) # store the KVs which were modified and active on node 1 modified_kvs_active_on_node1 = {} vbucket_client = VBucketAwareMemcached(RestConnection(self.master), 'default') client = MemcachedClientHelper.direct_client(self.servers[0], 'default') for i in range(NUMBER_OF_DOCS/100): keyname = 'keyname-' + str(i) vbId = ((zlib.crc32(keyname) >> 16) & 0x7fff) & (self.vbuckets- 1) if vbucket_client.vBucketMap[vbId].split(':')[0] == self.servers[0].ip: rc = client.get( keyname ) modified_kvs_active_on_node1[ keyname ] = rc[2] # stop persistence for bucket in self.buckets: for s in self.servers[:self.nodes_init]: client = MemcachedClientHelper.direct_client(s, bucket) try: client.stop_persistence() except MemcachedError as e: if self.bucket_type == 'ephemeral': self.assertTrue( "Memcached error #4 'Invalid': Flusher not running. for vbucket :0 to mc " in e.message) return else: raise # modify less than 1/2 of the keys vals = ['modified-serial-vals-' + str(i) for i in xrange(NUMBER_OF_DOCS/100)] template = '{{ "val-field-name": "{0}" }}' gen_load = DocumentGenerator('keyname', template, vals, start=0, end=NUMBER_OF_DOCS/100) rc = self.cluster.load_gen_docs(self.servers[0], self.buckets[0].name, gen_load, self.buckets[0].kvs[1], "create", exp=0, flag=0, batch_size=1000) # kill memcached, when it comes back because persistence is disabled it will have lost the second set of mutations shell = RemoteMachineShellConnection(self.servers[0]) shell.kill_memcached() time.sleep(10) # start persistence on the second node client = MemcachedClientHelper.direct_client(self.servers[1], 'default') client.start_persistence() time.sleep(5) # failover to the second node rc = self.cluster.failover(self.servers, self.servers[1:2], graceful=True) time.sleep(30) # give time for the failover to complete # check the values, they should be what they were prior to the second update client = MemcachedClientHelper.direct_client(self.servers[0], 'default') for k,v in modified_kvs_active_on_node1.iteritems(): rc = client.get( k ) self.assertTrue( v == rc[2], 'Expected {0}, actual {1}'.format(v, rc[2])) # need to rebalance the node back into the cluster # def rebalance(self, servers, to_add, to_remove, timeout=None, use_hostnames=False, services = None): rest_obj = RestConnection(self.servers[0]) nodes_all = rest_obj.node_statuses() for node in nodes_all: if node.ip == self.servers[1].ip: break node_id_for_recovery = node.id status = rest_obj.add_back_node(node_id_for_recovery) if status: rest_obj.set_recovery_type(node_id_for_recovery, recoveryType='delta') rc = self.cluster.rebalance(self.servers[:self.nodes_init], [],[])
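# Standalone sketch of the key-to-vBucket mapping used above to find keys
# whose active vBucket lives on a given node: the key is hashed with CRC32,
# the low 15 bits of the upper half are kept, and the result is masked by the
# vBucket count (a power of two), exactly as in the test.  vbucket_map is the
# vBucketMap list obtained from VBucketAwareMemcached above.
import zlib

def vbucket_for_key(key, num_vbuckets=1024):
    return ((zlib.crc32(key.encode()) >> 16) & 0x7fff) & (num_vbuckets - 1)

def keys_active_on(keys, vbucket_map, node_ip, num_vbuckets=1024):
    active = []
    for key in keys:
        vb = vbucket_for_key(key, num_vbuckets)
        # vBucketMap entries look like "ip:port"; compare the ip part only
        if vbucket_map[vb].split(':')[0] == node_ip:
            active.append(key)
    return active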
def test_backwards_compatability(self): create_index_query = "CREATE INDEX idx_name ON {0}(name)".format( self.bucket_name) self.n1ql_helper.run_cbq_query(query=create_index_query, server=self.n1ql_node) create_index_query = "CREATE INDEX idx_day ON {0}(join_day)".format( self.bucket_name) self.n1ql_helper.run_cbq_query(query=create_index_query, server=self.n1ql_node) self.wait_until_indexes_online() result = self.n1ql_helper.run_cbq_query( query='SELECT * FROM {0} where name = "employee-9"'.format( self.bucket_name)) self.assertEqual(result['metrics']['resultCount'], 72) result2 = self.n1ql_helper.run_cbq_query( query='SELECT * FROM {0} where join_day = 9'.format( self.bucket_name)) self.assertEqual(result2['metrics']['resultCount'], 72) upgrade_nodes = self.servers[:self.nodes_init] for server in upgrade_nodes: remote = RemoteMachineShellConnection(server) remote.stop_server() remote.disconnect() upgrade_threads = self._async_update(self.upgrade_to, [server]) for upgrade_thread in upgrade_threads: upgrade_thread.join() self.upgrade_servers.append(server) self.sleep(180) msg = "Cluster is not healthy after upgrade" self.assertTrue(self.wait_until_cluster_is_healthy(), msg) self.log.info("Cluster is healthy") rest = RestConnection(self.master) nodes_all = rest.node_statuses() try: for cluster_node in nodes_all: if cluster_node.ip == self.master.ip: self.log.info("Adding Back: {0}".format(self.master.ip)) rest.add_back_node(cluster_node.id) rest.set_recovery_type(otpNode=cluster_node.id, recoveryType="full") except Exception as e: self.log.error(str(e)) self.log.info("Adding node back to cluster...") rebalance = self.cluster.async_rebalance( self.servers[:self.nodes_init], [], []) rebalance.result() self.assertTrue(self.wait_until_indexes_online(), "Some indexes are not online") self.log.info("All indexes are online") self.add_built_in_server_user() self.sleep(20) try: create_index_query = "CREATE INDEX idx_name ON {0}(name)".format( self.bucket_name) self.n1ql_helper.run_cbq_query(query=create_index_query, server=self.n1ql_node) except Exception as e: self.log.info("indexes already exist") try: create_index_query = "CREATE INDEX idx_day ON {0}(join_day)".format( self.bucket_name) self.n1ql_helper.run_cbq_query(query=create_index_query, server=self.n1ql_node) self.wait_until_indexes_online() except Exception as e: self.log.info("indexes already exist") result = self.n1ql_helper.run_cbq_query( query='SELECT * FROM {0} where name = "employee-9"'.format( self.bucket_name)) self.assertEqual(result['metrics']['resultCount'], 72) result2 = self.n1ql_helper.run_cbq_query( query='SELECT * FROM {0} where join_day = 9'.format( self.bucket_name)) self.assertEqual(result2['metrics']['resultCount'], 72) result = self.n1ql_helper.run_cbq_query( query= 'SELECT * FROM default:{0}._default._default where name = "employee-9"' .format(self.bucket_name)) self.assertEqual(result['metrics']['resultCount'], 72) result2 = self.n1ql_helper.run_cbq_query( query= 'SELECT * FROM default:{0}._default._default where join_day = 9'. 
format(self.bucket_name)) self.assertEqual(result2['metrics']['resultCount'], 72) result = self.n1ql_helper.run_cbq_query( query='SELECT * FROM _default where name = "employee-9"', query_context='default:{0}._default'.format(self.bucket_name)) self.assertEqual(result['metrics']['resultCount'], 72) result2 = self.n1ql_helper.run_cbq_query( query='SELECT * FROM _default where join_day = 9', query_context='default:{0}._default'.format(self.bucket_name)) self.assertEqual(result2['metrics']['resultCount'], 72)
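# The three query forms exercised above address the same documents after the
# upgrade: the bare bucket name, the fully qualified default collection, and
# the bare collection name resolved through query_context.  Illustrative
# only; "default" stands in for self.bucket_name, and each dict is meant to
# be passed to the run_cbq_query helper used throughout this file.
bucket = "default"  # hypothetical bucket name
equivalent_queries = [
    dict(query='SELECT * FROM {0} where join_day = 9'.format(bucket)),
    dict(query='SELECT * FROM default:{0}._default._default '
               'where join_day = 9'.format(bucket)),
    dict(query='SELECT * FROM _default where join_day = 9',
         query_context='default:{0}._default'.format(bucket)),
]
# each form is expected to return the same resultCount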
class FailoverTests(FailoverBaseTest): def setUp(self): super(FailoverTests, self).setUp(self) def tearDown(self): super(FailoverTests, self).tearDown(self) def test_failover_firewall(self): self.common_test_body('firewall') def test_failover_normal(self): self.common_test_body('normal') def test_failover_stop_server(self): self.common_test_body('stop_server') def test_failover_then_add_back(self): self.add_back_flag = True self.common_test_body('normal') def common_test_body(self, failover_reason): """ Main Test body which contains the flow of the failover basic steps 1. Starts Operations if programmed into the test case (before/after) 2. Start View and Index Building operations 3. Failover K out of N nodes (failover can be HARDFAILOVER/GRACEFUL) 4.1 Rebalance the cluster is failover of K nodeStatuses 4.2 Run Add-Back operation with recoveryType = (full/delta) with rebalance 5. Verify all expected operations completed by checking stats, replicaiton, views, data correctness """ # Pick the reference node for communication # We pick a node in the cluster which will NOT be failed over self.referenceNode = self.master if self.failoverMaster: self.referenceNode = self.servers[1] self.log.info(" Picking node {0} as reference node for test case".format(self.referenceNode.ip)) self.print_test_params(failover_reason) self.rest = RestConnection(self.referenceNode) self.nodes = self.rest.node_statuses() # Set the data path for the cluster self.data_path = self.rest.get_data_path() # Check if the test case has to be run for 3.0.0 versions = self.rest.get_nodes_versions() self.version_greater_than_2_5 = True for version in versions: if "3" > version: self.version_greater_than_2_5 = False # Do not run this this test if graceful category is being used if not self.version_greater_than_2_5 and (self.graceful or (self.recoveryType != None)): self.log.error("Graceful failover can't be applied to nodes with version less then 3.*") self.log.error("Please check configuration parameters: SKIPPING TEST.") return # Find nodes that will under go failover self.chosen = RebalanceHelper.pick_nodes(self.referenceNode, howmany=self.num_failed_nodes) # Perform operations - Create/Update/Delete # self.withOps = True => Run Operations in parallel to failover # self.withOps = False => Run Operations Before failover self.ops_tasks = self.run_operation_tasks() # Perform View Creation Tasks and check for completion if required before failover if self.runViews: self.run_view_creation_operations(self.servers) if not self.runViewsDuringFailover: self.run_view_creation_operations(self.servers) self.monitor_view_tasks(self.servers) # Take snap-shot of data set used for validaiton record_static_data_set = self.get_data_set_all(self.servers, self.buckets, path = None) prev_vbucket_stats = {} prev_failover_stats = {} # Capture vbucket and failover stats if test version >= 2.5.* if self.version_greater_than_2_5 and self.upr_check: prev_vbucket_stats = self.get_vbucket_seqnos(self.servers, self.buckets) prev_failover_stats = self.get_failovers_logs(self.servers, self.buckets) # Perform Operations relalted to failover self.run_failover_operations(self.chosen, failover_reason) # Perform Add Back Operation with Rebalance Or only Rebalance with Verificaitons if not self.gracefulFailoverFail: if self.add_back_flag: self.run_add_back_operation_and_verify(self.chosen, prev_vbucket_stats, record_static_data_set, prev_failover_stats) else: self.run_rebalance_after_failover_and_verify(self.chosen, prev_vbucket_stats, record_static_data_set, 
prev_failover_stats) def run_rebalance_after_failover_and_verify(self, chosen, prev_vbucket_stats, record_static_data_set, prev_failover_stats): """ Method to run rebalance after failover and verify """ # Need a delay > min because MB-7168 self.sleep(60, "after failover before invoking rebalance...") _servers_ = self.filter_servers(self.servers, chosen) # Rebalance after Failover operation self.rest.rebalance(otpNodes=[node.id for node in self.nodes], ejectedNodes=[node.id for node in chosen]) if self.during_ops: self.sleep(5, "Wait for some progress in rebalance") if self.during_ops == "change_password": old_pass = self.referenceNode.rest_password self.change_password(new_password=self.input.param("new_password", "new_pass")) self.rest = RestConnection(self.referenceNode) elif self.during_ops == "change_port": self.change_port(new_port=self.input.param("new_port", "9090")) self.rest = RestConnection(self.referenceNode) try: # Run operations if required during rebalance after failover if self.withOps: for task in self.ops_tasks: task.result() msg = "rebalance failed while removing failover nodes {0}".format([node.id for node in chosen]) self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True), msg=msg) # Drain Queue and make sure intra-cluster replication is complete self._verify_stats_all_buckets(_servers_,timeout = 120) self._wait_for_stats_all_buckets(_servers_) self.log.info("Begin VERIFICATION for Rebalance after Failover Only") # Verify all data set with meta data if failover happens after failover if not self.withOps: self.data_analysis_all(record_static_data_set, _servers_, self.buckets, path = None) # Check Cluster Stats and Data as well if max_verify > 0 self.verify_cluster_stats(_servers_, self.referenceNode) # If views were created they can be verified if self.runViews: if self.runViewsDuringFailover: self.monitor_view_tasks(_servers_) self.verify_query_task() # Check Failover logs :: Not sure about this logic, currently not checking, will update code once confirmed # Currently, only for checking case where we have graceful failover if self.version_greater_than_2_5 and self.graceful and self.upr_check: new_failover_stats = self.compare_failovers_logs(prev_failover_stats, _servers_, self.buckets) new_vbucket_stats = self.compare_vbucket_seqnos(prev_vbucket_stats, _servers_, self.buckets) self.compare_vbucketseq_failoverlogs(new_vbucket_stats, new_failover_stats) self.log.info("End VERIFICATION for Rebalance after Failover Only") finally: if self.during_ops: if self.during_ops == "change_password": self.change_password(new_password=old_pass) elif self.during_ops == "change_port": self.change_port(new_port='8091', current_port=self.input.param("new_port", "9090")) def run_add_back_operation_and_verify(self, chosen, prev_vbucket_stats, record_static_data_set, prev_failover_stats): """ Method to run add-back operation with recovery type = (delta/full) It also verifies if the operations are correct with data verificaiton steps """ serverMap = self.get_server_map(self.servers) recoveryTypeMap = self.define_maps_during_failover(self.recoveryType) fileMapsForVerification = self.create_file(chosen, self.buckets, serverMap) index = 0 for node in chosen: self.rest.add_back_node(node.id) self.sleep(5) if self.recoveryType: # define precondition for recoverytype self.rest.set_recovery_type(otpNode=node.id, recoveryType=self.recoveryType[index]) index += 1 self.sleep(20, "After failover before invoking rebalance...") self.rest.rebalance(otpNodes=[node.id for node in self.nodes], 
ejectedNodes=[]) msg = "rebalance failed while removing failover nodes {0}".format(chosen) # Run operations if required during rebalance after failover if self.withOps: for task in self.ops_tasks: task.result() self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True), msg=msg) # Drain ep_queue and make sure that intra-cluster replication is complete self._verify_stats_all_buckets(self.servers,timeout = 120) self._wait_for_stats_all_buckets(self.servers) self.log.info("Begin VERIFICATION for Add-back and rebalance") # Verify recovery Type succeeded if we added-back nodes self.verify_for_recovery_type(chosen, serverMap, self.buckets, recoveryTypeMap, fileMapsForVerification) # Comparison of all data if required if not self.withOps: self.data_analysis_all(record_static_data_set,self.servers, self.buckets, path = None) # Verify Stats of cluster and Data is max_verify > 0 self.verify_cluster_stats(self.servers, self.referenceNode) # Verify if vbucket sequence numbers and failover logs are as expected # We will check only for version > 2.5.* and if the failover is graceful if self.version_greater_than_2_5 and self.graceful and self.upr_check: new_vbucket_stats = self.compare_vbucket_seqnos(prev_vbucket_stats, self.servers, self.buckets,perNode= False) new_failover_stats = self.compare_failovers_logs(prev_failover_stats,self.servers,self.buckets) self.compare_vbucketseq_failoverlogs(new_vbucket_stats, new_failover_stats) # Peform View Validation if Supported if self.runViews: if self.runViewsDuringFailover: self.monitor_view_tasks(self.servers) self.verify_query_task() self.log.info("End VERIFICATION for Add-back and rebalance") def print_test_params(self, failover_reason): """ Method to print test parameters """ self.log.info("num_replicas : {0}".format(self.num_replicas)) self.log.info("recoveryType : {0}".format(self.recoveryType)) self.log.info("failover_reason : {0}".format(failover_reason)) self.log.info("num_failed_nodes : {0}".format(self.num_failed_nodes)) self.log.info('picking server : {0} as the master'.format(self.referenceNode)) def run_failover_operations(self, chosen, failover_reason): """ Method to run fail over operations used in the test scenario based on failover reason """ # Perform Operations relalted to failover for node in chosen: if failover_reason == 'stop_server': self.stop_server(node) self.log.info("10 seconds delay to wait for membase-server to shutdown") # wait for 5 minutes until node is down self.assertTrue(RestHelper(self.rest).wait_for_node_status(node, "unhealthy", 300), msg="node status is not unhealthy even after waiting for 5 minutes") elif failover_reason == "firewall": server = [srv for srv in self.servers if node.ip == srv.ip][0] RemoteUtilHelper.enable_firewall(server, bidirectional=self.bidirectional) status = RestHelper(self.rest).wait_for_node_status(node, "unhealthy", 300) if status: self.log.info("node {0}:{1} is 'unhealthy' as expected".format(node.ip, node.port)) else: # verify iptables on the node if something wrong for server in self.servers: if server.ip == node.ip: shell = RemoteMachineShellConnection(server) info = shell.extract_remote_info() if info.type.lower() == "windows": o, r = shell.execute_command("netsh advfirewall show allprofiles") shell.log_command_output(o, r) else: o, r = shell.execute_command("/sbin/iptables --list") shell.log_command_output(o, r) shell.disconnect() self.rest.print_UI_logs() api = self.rest.baseUrl + 'nodeStatuses' status, content, header = self.rest._http_request(api) json_parsed = json.loads(content) 
self.log.info("nodeStatuses: {0}".format(json_parsed)) self.fail("node status is not unhealthy even after waiting for 5 minutes") # define precondition check for failover failed_over = self.rest.fail_over(node.id, graceful=self.graceful) # Check for negative cases if self.graceful and (failover_reason in ['stop_server', 'firewall']): if failed_over: # MB-10479 self.rest.print_UI_logs() self.assertFalse(failed_over, "Graceful Falover was started for unhealthy node!!! ") return elif self.gracefulFailoverFail and failed_over: """ Check if the fail_over fails as expected """ self.assertTrue(not failed_over,""" Graceful failover should fail due to not enough replicas """) return # Check if failover happened as expected or re-try one more time if not failed_over: self.log.info("unable to failover the node the first time. try again in 60 seconds..") # try again in 75 seconds self.sleep(75) failed_over = self.rest.fail_over(node.id, graceful=self.graceful) if self.graceful and (failover_reason not in ['stop_server', 'firewall']): reached = RestHelper(self.rest).rebalance_reached() self.assertTrue(reached, "rebalance failed for Graceful Failover, stuck or did not completed") def run_operation_tasks(self): """ Method to run operations Update/Delete/Create """ # Load All Buckets if num_items > 0 tasks = [] tasks += self._async_load_all_buckets(self.referenceNode, self.gen_initial_create, "create", 0) for task in tasks: task.result() self._verify_stats_all_buckets(self.servers,timeout = 120) self._wait_for_stats_all_buckets(self.servers) # Update or Delete buckets if items > 0 and options are passed in tests # These can run in parallel (withOps = True), or before (withOps = True) ops_tasks = [] if("create" in self.doc_ops): ops_tasks += self._async_load_all_buckets(self.referenceNode, self.gen_update, "create", 0) if("update" in self.doc_ops): ops_tasks += self._async_load_all_buckets(self.referenceNode, self.gen_update, "update", 0) if("delete" in self.doc_ops): ops_tasks += self._async_load_all_buckets(self.referenceNode, self.gen_delete, "delete", 0) if not self.withOps: for task in ops_tasks: task.result() self._wait_for_stats_all_buckets(self.servers) self._verify_stats_all_buckets(self.servers,timeout = 120) return ops_tasks def define_maps_during_failover(self, recoveryType = []): """ Method to define nope ip, recovery type map """ recoveryTypeMap={} index=0 for server in self.chosen: if recoveryType: recoveryTypeMap[server.ip] = recoveryType[index] index += 1 return recoveryTypeMap def filter_servers(self, original_servers, filter_servers): """ Filter servers that have not failed over """ _servers_ = copy.deepcopy(original_servers) for failed in filter_servers: for server in _servers_: if server.ip == failed.ip: _servers_.remove(server) self._cleanup_nodes.append(server) return _servers_ def verify_for_recovery_type(self, chosen = [], serverMap = {}, buckets = [], recoveryTypeMap = {}, fileMap = {}): """ Verify recovery type is delta or full """ logic = True summary = "" for server in self.chosen: shell = RemoteMachineShellConnection(serverMap[server.ip]) for bucket in buckets: path = fileMap[server.ip][bucket.name] exists = shell.file_exists(path,"check.txt") if recoveryTypeMap[server.ip] == "delta" and not exists: logic = False summary += "\n Failed Condition :: node {0}, bucket {1} :: Expected Delta, Actual Full".format(server.ip,bucket.name) elif recoveryTypeMap[server.ip] == "full" and exists: logic = False summary += "\n Failed Condition :: node {0}, bucket {1} :: Expected Full, Actual 
Delta".format(server.ip,bucket.name) shell.disconnect() self.assertTrue(logic, summary) def run_view_creation_operations(self, servers): """" Run view Creation and indexing building tasks on servers """ num_views = self.input.param("num_views", 5) is_dev_ddoc = self.input.param("is_dev_ddoc", True) num_tries = self.input.param("num_tries", 10) ddoc_name = "ddoc1" prefix = ("", "dev_")[is_dev_ddoc] query = {} query["connectionTimeout"] = 60000; query["full_set"] = "true" views = [] tasks = [] for bucket in self.buckets: temp = self.make_default_views(self.default_view_name, num_views, is_dev_ddoc, different_map= False) temp_tasks = self.async_create_views(self.master, ddoc_name, temp, bucket) views += temp tasks += temp_tasks timeout = max(self.wait_timeout * 4, len(self.buckets) * self.wait_timeout * self.num_items / 50000) for task in tasks: task.result(self.wait_timeout * 20) for bucket in self.buckets: for view in views: # run queries to create indexes self.cluster.query_view(self.master, prefix + ddoc_name, view.name, query) self.verify_query_task() active_tasks = self.cluster.async_monitor_active_task(servers, "indexer", "_design/" + prefix + ddoc_name, wait_task=False) for active_task in active_tasks: result = active_task.result() self.assertTrue(result) def monitor_view_tasks(self, servers): """ Monitor Query Tasks for their completion """ num_views = self.input.param("num_views", 5) is_dev_ddoc = self.input.param("is_dev_ddoc", True) ddoc_name = "ddoc1" prefix = ("", "dev_")[is_dev_ddoc] active_tasks = self.cluster.async_monitor_active_task(servers, "indexer", "_design/" + prefix + ddoc_name, wait_task=False) for active_task in active_tasks: result = active_task.result() self.assertTrue(result) def verify_query_task(self): """ Verify Query Results """ num_views = self.input.param("num_views", 5) is_dev_ddoc = self.input.param("is_dev_ddoc", True) ddoc_name = "ddoc1" prefix = ("", "dev_")[is_dev_ddoc] query = {} query["connectionTimeout"] = 60000; query["full_set"] = "true" expected_rows = None if self.max_verify: expected_rows = self.max_verify query["limit"] = expected_rows query["stale"] = "false" for bucket in self.buckets: self.perform_verify_queries(num_views, prefix, ddoc_name, query, bucket=bucket, wait_time=2400, expected_rows=expected_rows) def create_file(self,chosen,buckets,serverMap): """ Created files in data paths for checking if delta/full recovery occured """ fileMap={} for server in self.chosen: shell = RemoteMachineShellConnection(serverMap[server.ip]) map = {} for bucket in buckets: bucket_data_path=self.data_path+"/"+bucket.name+"/"+"check.txt" full_path=self.data_path+"/"+bucket.name+"/" map[bucket.name] = full_path shell.create_file(bucket_data_path,"check") fileMap[server.ip] = map shell.disconnect() return fileMap def get_server_map(self,node): """ Map of ips and server information """ map = {} for server in self.servers: map[server.ip] = server return map def stop_server(self, node): """ Method to stop a server which is subject to failover """ for server in self.servers: if server.ip == node.ip: shell = RemoteMachineShellConnection(server) if shell.is_couchbase_installed(): shell.stop_couchbase() self.log.info("Couchbase stopped") else: shell.stop_membase() self.log.info("Membase stopped") shell.disconnect() break
def test_backwards_compatability_prepared(self): create_index_query = "CREATE INDEX idx_name ON {0}(name)".format( self.bucket_name) self.n1ql_helper.run_cbq_query(query=create_index_query, server=self.n1ql_node) create_index_query = "CREATE INDEX idx_day ON {0}(join_day)".format( self.bucket_name) self.n1ql_helper.run_cbq_query(query=create_index_query, server=self.n1ql_node) self.wait_until_indexes_online() self.n1ql_helper.run_cbq_query( query='PREPARE p1 as SELECT * FROM {0} where name = "employee-9"'. format(self.bucket_name)) result = self.n1ql_helper.run_cbq_query(query='EXECUTE p1') self.assertEqual(result['metrics']['resultCount'], 72) self.n1ql_helper.run_cbq_query( query='PREPARE p2 as SELECT * FROM {0} where join_day = 9'.format( self.bucket_name)) result2 = self.n1ql_helper.run_cbq_query(query='EXECUTE p2') self.assertEqual(result2['metrics']['resultCount'], 72) upgrade_nodes = self.servers[:self.nodes_init] for server in upgrade_nodes: remote = RemoteMachineShellConnection(server) remote.stop_server() remote.disconnect() upgrade_threads = self._async_update(self.upgrade_to, [server]) for upgrade_thread in upgrade_threads: upgrade_thread.join() self.upgrade_servers.append(server) self.sleep(180) msg = "Cluster is not healthy after upgrade" self.assertTrue(self.wait_until_cluster_is_healthy(), msg) self.log.info("Cluster is healthy") rest = RestConnection(self.master) nodes_all = rest.node_statuses() try: for cluster_node in nodes_all: if cluster_node.ip == self.master.ip: self.log.info("Adding Back: {0}".format(self.master.ip)) rest.add_back_node(cluster_node.id) rest.set_recovery_type(otpNode=cluster_node.id, recoveryType="full") except Exception as e: self.log.error(str(e)) self.log.info("Adding node back to cluster...") rebalance = self.cluster.async_rebalance( self.servers[:self.nodes_init], [], []) rebalance.result() self.assertTrue(self.wait_until_indexes_online(), "Some indexes are not online") self.log.info("All indexes are online") self.add_built_in_server_user() self.sleep(20) try: create_index_query = "CREATE INDEX idx_name ON {0}(name)".format( self.bucket_name) self.n1ql_helper.run_cbq_query(query=create_index_query, server=self.n1ql_node) except Exception as e: self.log.info("indexes already exist") try: create_index_query = "CREATE INDEX idx_day ON {0}(join_day)".format( self.bucket_name) self.n1ql_helper.run_cbq_query(query=create_index_query, server=self.n1ql_node) self.wait_until_indexes_online() except Exception as e: self.log.info("indexes already exist") # Make sure we are able to create prepared statements after the upgrade on default bucket try: self.n1ql_helper.run_cbq_query( query= 'PREPARE p3 as SELECT * FROM {0}.`_default`.`_default` where name = "employee-9"' .format(self.bucket_name)) except Exception as e: self.log.info( "Let's try prepare again in case could not find scope on first try" ) self.n1ql_helper.run_cbq_query( query= 'PREPARE p3 as SELECT * FROM {0}.`_default`.`_default` where name = "employee-9"' .format(self.bucket_name)) try: self.n1ql_helper.run_cbq_query( query= 'PREPARE p4 as SELECT * FROM {0}.`_default`.`_default` where join_day = 9' .format(self.bucket_name)) except Exception as e: self.log.info( "Let's try prepare again in case could not find scope onf first try" ) self.n1ql_helper.run_cbq_query( query= 'PREPARE p4 as SELECT * FROM {0}.`_default`.`_default` where join_day = 9' .format(self.bucket_name)) result = self.n1ql_helper.run_cbq_query(query='EXECUTE p1') self.assertEqual(result['metrics']['resultCount'], 72) result2 = 
self.n1ql_helper.run_cbq_query(query='EXECUTE p2') self.assertEqual(result2['metrics']['resultCount'], 72) result = self.n1ql_helper.run_cbq_query(query='EXECUTE p3') self.assertEqual(result['metrics']['resultCount'], 72) result2 = self.n1ql_helper.run_cbq_query(query='EXECUTE p4') self.assertEqual(result2['metrics']['resultCount'], 72) self.n1ql_helper.create_scope(server=self.master, bucket_name=self.bucket_name, scope_name="test") self.n1ql_helper.create_collection(server=self.master, bucket_name=self.bucket_name, scope_name="test", collection_name="test1") self.n1ql_helper.create_collection(server=self.master, bucket_name=self.bucket_name, scope_name="test", collection_name="test2") self.n1ql_helper.run_cbq_query(query=( 'INSERT INTO default:{0}.test.test1'.format(self.bucket_name) + '(KEY, VALUE) VALUES ("key2", { "type" : "hotel", "name" : "new hotel" })' )) self.n1ql_helper.run_cbq_query(query=( 'INSERT INTO default:{0}.test.test1'.format(self.bucket_name) + '(KEY, VALUE) VALUES ("key1", { "type" : "hotel", "name" : "old hotel" })' )) time.sleep(20) self.n1ql_helper.run_cbq_query( query="CREATE INDEX idx1 on default:{0}.test.test1(name) ".format( self.bucket_name)) time.sleep(20) #Create a prepared statement on a collection and make sure this works post upgrade self.n1ql_helper.run_cbq_query( query= 'PREPARE p5 as SELECT * FROM {0}.test.test1 where name = "new hotel"' .format(self.bucket_name)) result2 = self.n1ql_helper.run_cbq_query(query='EXECUTE p5') self.assertEqual(result2['metrics']['resultCount'], 1)
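# --- Illustrative sketch (not part of the original suite) ---
# The upgrade test above prepares statements (PREPARE p1..p5) and re-executes
# them by name after the cluster is upgraded and the node is added back. This
# sketch shows that round trip against a collection; 'n1ql_helper' is assumed
# to be the same helper object used above (run_cbq_query), and the
# bucket/scope/collection names are illustrative.
def prepare_and_execute(n1ql_helper, server, bucket="default",
                        scope="test", collection="test1"):
    prepare = 'PREPARE check_hotels AS SELECT * FROM {0}.{1}.{2} ' \
              'WHERE name = "new hotel"'.format(bucket, scope, collection)
    n1ql_helper.run_cbq_query(query=prepare, server=server)
    # EXECUTE refers to the prepared statement by its name; the row count is
    # read from the query metrics exactly as in the assertions above.
    result = n1ql_helper.run_cbq_query(query='EXECUTE check_hotels', server=server)
    return result['metrics']['resultCount']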
def test_volume_with_rebalance(self): self.src_bucket = RestConnection(self.master).get_buckets() rest = RestConnection(self.master) bucket = rest.get_buckets() # for bk in bucket: # rest.flush_bucket(bk) #self.sleep(30) #load initial documents self.create_ddocs_and_views() load_thread = [] import Queue queue = Queue.Queue() for b in bucket: load_thread.append( Thread(target=lambda q, args1, args2, args3: q.put( self.load(args1, args2, args3)), args=(queue, self.master, self.num_items, b))) load_thread.append( Thread(target=self.load, args=(self.master, self.num_items, b))) for t in load_thread: t.start() servers_init = self.servers[:self.nodes_init] new_server_list = self.servers[0:self.nodes_init] for t in load_thread: t.join() self.sleep(30) #Reload more data for mutations load_thread = [] for b in bucket: load_thread.append( Thread(target=self.load, args=(self.master, self.num_items, b, self.num_items))) for t in load_thread: t.start() # #Rebalance in 1 node self.log.info("==========rebalance in 1 node=========") servers_in = self.servers[self.nodes_init:self.nodes_init + 1] rebalance = self.cluster.async_rebalance(servers_init, servers_in, []) rebalance.result() for t in load_thread: t.join() for b in bucket: self.check_dataloss(self.master, b, self.num_items * 2) # load more document load_thread = [] for b in bucket: load_thread.append( Thread(target=self.load, args=(self.master, self.num_items, b, self.num_items * 2))) for t in load_thread: t.start() #rebalance out 1 node new_server_list = self.servers[0:self.nodes_init] + servers_in self.log.info("==========rebalance out 1 node=========") servers_out = [self.servers[self.nodes_init]] rebalance = self.cluster.async_rebalance(servers_init, [], servers_out) rebalance.result() for t in load_thread: t.join() for b in bucket: self.check_dataloss(self.master, b, self.num_items * 3) self.sleep(30) # load more document load_thread = [] for b in bucket: load_thread.append( Thread(target=self.load, args=(self.master, self.num_items, b, self.num_items * 3))) for t in load_thread: t.start() new_server_list = list(set(new_server_list) - set(servers_out)) #swap rebalance 1 node self.log.info("==========swap rebalance 1 node=========") servers_in = self.servers[self.nodes_init:self.nodes_init + 1] servers_init = self.servers[:self.nodes_init] servers_out = self.servers[(self.nodes_init - 1):self.nodes_init] rebalance = self.cluster.async_rebalance(servers_init, servers_in, servers_out) rebalance.result() for t in load_thread: t.join() self.sleep(30) for b in bucket: self.check_dataloss(self.master, b, self.num_items * 4) # load more document load_thread = [] for b in bucket: load_thread.append( Thread(target=self.load, args=(self.master, self.num_items, b, self.num_items * 4))) for t in load_thread: t.start() new_server_list = list( set(new_server_list + servers_in) - set(servers_out)) self.log.info( "==========Rebalance out of 2 nodes and Rebalance In 1 node=========" ) # Rebalance out of 2 nodes and Rebalance In 1 node servers_in = [list(set(self.servers) - set(new_server_list))[0]] servers_out = list(set(new_server_list) - set([self.master]))[-2:] rebalance = self.cluster.async_rebalance(servers_init, servers_in, servers_out) rebalance.result() for t in load_thread: t.join() for b in bucket: self.check_dataloss(self.master, b, self.num_items * 5) self.sleep(30) # load more document load_thread = [] for b in bucket: load_thread.append( Thread(target=self.load, args=(self.master, self.num_items, b, self.num_items * 5))) for t in load_thread: 
t.start() new_server_list = list( set(new_server_list + servers_in) - set(servers_out)) self.log.info( "==========Rebalance out of 1 nodes and Rebalance In 2 nodes=========" ) #Rebalance out of 1 nodes and Rebalance In 2 nodes servers_in = list(set(self.servers) - set(new_server_list))[0:2] servers_out = list(set(new_server_list) - set([self.master]))[0:1] rebalance = self.cluster.async_rebalance(servers_init, servers_in, servers_out) rebalance.result() for t in load_thread: t.join() for b in bucket: self.check_dataloss(self.master, b, self.num_items * 6) self.sleep(30) # load more document load_thread = [] for b in bucket: load_thread.append( Thread(target=self.load, args=(self.master, self.num_items, b, self.num_items * 6))) for t in load_thread: t.start() new_server_list = list( set(new_server_list + servers_in) - set(servers_out)) self.log.info("==========Rebalance in 4 nodes =========") #Rebalance in 4 nodes servers_in = list(set(self.servers) - set(new_server_list))[0:4] rebalance = self.cluster.async_rebalance(servers_init, servers_in, []) rebalance.result() for t in load_thread: t.join() for b in bucket: self.check_dataloss(self.master, b, self.num_items * 7) self.sleep(30) # load more document load_thread = [] for b in bucket: load_thread.append( Thread(target=self.load, args=(self.master, self.num_items, b, self.num_items * 7))) for t in load_thread: t.start() new_server_list = list(set(new_server_list + servers_in)) self.log.info("==========Rebalance out 4 nodes =========") #Rebalance out 4 nodes servers_out = list(set(new_server_list) - set([self.master]))[0:4] rebalance = self.cluster.async_rebalance(servers_init, [], servers_out) rebalance.result() for t in load_thread: t.join() for b in bucket: self.check_dataloss(self.master, b, self.num_items * 8) self.sleep(30) # load more document load_thread = [] for b in bucket: load_thread.append( Thread(target=self.load, args=(self.master, self.num_items, b, self.num_items * 8))) for t in load_thread: t.start() new_server_list = list(set(new_server_list) - set(servers_out)) self.log.info( "======Rebalance in 4 nodes (8 nodes) wait for rebalance to finish and move between server groups=========" ) #Rebalance in 4 nodes (8 nodes) wait for rebalance to finish and move between server groups servers_in = list(set(self.servers) - set(new_server_list))[0:4] rebalance = self.cluster.async_rebalance(servers_init, servers_in, []) rebalance.result() for t in load_thread: t.join() for b in bucket: self.check_dataloss(self.master, b, self.num_items * 9) self.sleep(30) load_thread = [] for b in bucket: load_thread.append( Thread(target=self.load, args=(self.master, self.num_items, b, self.num_items * 9))) for t in load_thread: t.start() self.shuffle_nodes_between_zones_and_rebalance() for t in load_thread: t.join() for b in bucket: self.check_dataloss(self.master, b, self.num_items * 10) self.sleep(30) load_thread = [] for b in bucket: load_thread.append( Thread(target=self.load, args=(self.master, self.num_items, b, self.num_items * 10))) for t in load_thread: t.start() self.log.info( "======Graceful failover 1 KV node and add back(Delta and Full)=========" ) #Graceful failover 1 KV node and add back(Delta and Full) kv_server = self.get_nodes_from_services_map(service_type="kv", get_all_nodes=False) fail_over_task = self.cluster.async_failover( [self.master], failover_nodes=[kv_server], graceful=True) fail_over_task.result() self.sleep(120) # do a recovery and rebalance rest.set_recovery_type('ns_1@' + kv_server.ip, 
                               recoveryType=self.recoveryType)
        rest.add_back_node('ns_1@' + kv_server.ip)
        rebalance = self.cluster.async_rebalance(
            self.servers[:self.nodes_init], [], [])
        rebalance.result()
        for t in load_thread:
            t.join()
        for b in bucket:
            self.check_dataloss(self.master, b, self.num_items * 11)
        self.sleep(30)
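# --- Illustrative sketch (not part of the original suite) ---
# The volume test above ends with a graceful failover of one KV node followed
# by an add-back (delta or full) and a rebalance. This condenses that sequence
# into one helper using the same RestConnection / cluster calls seen above;
# 'cluster' is assumed to expose async_failover/async_rebalance as in the tests,
# and the 'ns_1@<ip>' otp-node naming follows the convention used throughout.
def failover_and_add_back(cluster, rest, master, victim, cluster_nodes,
                          recovery_type="delta"):
    task = cluster.async_failover([master], failover_nodes=[victim], graceful=True)
    task.result()
    otp_id = 'ns_1@' + victim.ip
    # The recovery type must be set before the node is added back and rebalanced
    rest.set_recovery_type(otp_id, recoveryType=recovery_type)
    rest.add_back_node(otp_id)
    rebalance = cluster.async_rebalance(cluster_nodes, [], [])
    rebalance.result()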
def common_test_body(self, keys_count, failover_reason): log = logger.Logger.get_logger() log.info("keys_count : {0}".format(keys_count)) log.info("replicas : {0}".format(self.num_replicas)) log.info("failover_reason : {0}".format(failover_reason)) log.info('picking server : {0} as the master'.format(self.master)) self._load_all_buckets(self.master, self.gen_create, "create", 0, batch_size=10000, pause_secs=5, timeout_secs=180) self._wait_for_stats_all_buckets(self.servers) _servers_ = self.servers rest = RestConnection(self.master) nodes = rest.node_statuses() RebalanceHelper.wait_for_replication(self.servers, self.cluster) chosen = RebalanceHelper.pick_nodes(self.master, howmany=self.num_replicas) for node in chosen: #let's do op if failover_reason == 'stop_server': self.stop_server(node) log.info("10 seconds delay to wait for membase-server to shutdown") #wait for 5 minutes until node is down self.assertTrue(RestHelper(rest).wait_for_node_status(node, "unhealthy", 300), msg="node status is not unhealthy even after waiting for 5 minutes") elif failover_reason == "firewall": RemoteUtilHelper.enable_firewall(self.servers, node, bidirectional=self.bidirectional) status = RestHelper(rest).wait_for_node_status(node, "unhealthy", 300) if status: log.info("node {0}:{1} is 'unhealthy' as expected".format(node.ip, node.port)) else: #verify iptables on the node if something wrong for server in self.servers: if server.ip == node.ip: shell = RemoteMachineShellConnection(server) o, r = shell.execute_command("/sbin/iptables --list") shell.log_command_output(o, r) shell.disconnect() for i in rest.get_logs(): self.log.error(i) self.fail("node status is not unhealthy even after waiting for 5 minutes") failed_over = rest.fail_over(node.id) if not failed_over: self.log.info("unable to failover the node the first time. try again in 60 seconds..") #try again in 75 seconds time.sleep(75) failed_over = rest.fail_over(node.id) self.assertTrue(failed_over, "unable to failover node after {0}".format(failover_reason)) log.info("failed over node : {0}".format(node.id)) self._failed_nodes.append(node) if self.add_back_flag: for node in self._failed_nodes: rest.add_back_node(node.id) time.sleep(5) log.info("10 seconds sleep after failover before invoking rebalance...") time.sleep(10) rest.rebalance(otpNodes=[node.id for node in nodes], ejectedNodes=[]) msg = "rebalance failed while removing failover nodes {0}".format(chosen) self.assertTrue(rest.monitorRebalance(stop_if_loop=True), msg=msg) else: # Need a delay > min because MB-7168 log.info("30 seconds sleep after failover before invoking rebalance...") time.sleep(30) rest.rebalance(otpNodes=[node.id for node in nodes], ejectedNodes=[node.id for node in chosen]) msg = "rebalance failed while removing failover nodes {0}".format(chosen) self.assertTrue(rest.monitorRebalance(stop_if_loop=True), msg=msg) for failed in chosen: for server in _servers_: if server.ip == failed.ip: _servers_.remove(server) self._cleanup_nodes.append(server) log.info("Begin VERIFICATION ...") RebalanceHelper.wait_for_replication(_servers_, self.cluster) self.verify_cluster_stats(_servers_, self.master)
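# --- Illustrative sketch (not part of the original suite) ---
# common_test_body() above hard-fails nodes, retries the failover once if the
# first attempt is rejected, and then rebalances the failed nodes out. This is
# the same control flow reduced to a helper: 'rest' is a RestConnection,
# 'nodes' is rest.node_statuses(), and 'chosen' are the nodes picked for
# failover. The 75/30 second delays mirror the values used in the test
# (see the MB-7168 note above); 'time' is assumed imported at module level.
import time

def failover_and_eject(rest, nodes, chosen):
    for node in chosen:
        if not rest.fail_over(node.id):
            # one retry after a back-off, matching the test's behaviour
            time.sleep(75)
            assert rest.fail_over(node.id), "failover did not succeed on retry"
    time.sleep(30)  # delay before rebalance, needed per MB-7168
    rest.rebalance(otpNodes=[n.id for n in nodes],
                   ejectedNodes=[n.id for n in chosen])
    assert rest.monitorRebalance(stop_if_loop=True), \
        "rebalance failed while removing failover nodes"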
def common_test_body(self, keys_count, failover_reason): log = logger.Logger.get_logger() log.info("keys_count : {0}".format(keys_count)) log.info("replicas : {0}".format(self.num_replicas)) log.info("failover_reason : {0}".format(failover_reason)) log.info('picking server : {0} as the master'.format(self.master)) self._load_all_buckets(self.master, self.gen_create, "create", 0, batch_size=10000, pause_secs=5, timeout_secs=180) self._wait_for_stats_all_buckets(self.servers) _servers_ = self.servers rest = RestConnection(self.master) nodes = rest.node_statuses() RebalanceHelper.wait_for_replication(self.servers, self.cluster) chosen = RebalanceHelper.pick_nodes(self.master, howmany=self.num_replicas) for node in chosen: # let's do op if failover_reason == 'stop_server': self.stop_server(node) log.info("10 seconds delay to wait for membase-server to shutdown") # wait for 5 minutes until node is down self.assertTrue(RestHelper(rest).wait_for_node_status(node, "unhealthy", 300), msg="node status is not unhealthy even after waiting for 5 minutes") elif failover_reason == "firewall": server = [srv for srv in self.servers if node.ip == srv.ip][0] RemoteUtilHelper.enable_firewall(server, bidirectional=self.bidirectional) status = RestHelper(rest).wait_for_node_status(node, "unhealthy", 300) if status: log.info("node {0}:{1} is 'unhealthy' as expected".format(node.ip, node.port)) else: # verify iptables on the node if something wrong for server in self.servers: if server.ip == node.ip: shell = RemoteMachineShellConnection(server) info = shell.extract_remote_info() if info.type.lower() == "windows": o, r = shell.execute_command("netsh advfirewall show allprofiles") else: o, r = shell.execute_command("/sbin/iptables --list") shell.log_command_output(o, r) shell.disconnect() for i in rest.get_logs(): self.log.error(i) api = rest.baseUrl + 'nodeStatuses' status, content, header = rest._http_request(api) json_parsed = json.loads(content) self.log.info("nodeStatuses: {0}".format(json_parsed)) self.fail("node status is not unhealthy even after waiting for 5 minutes") failed_over = rest.fail_over(node.id) if not failed_over: self.log.info("unable to failover the node the first time. 
try again in 60 seconds..") # try again in 75 seconds time.sleep(75) failed_over = rest.fail_over(node.id) self.assertTrue(failed_over, "unable to failover node after {0}".format(failover_reason)) log.info("failed over node : {0}".format(node.id)) self._failed_nodes.append(node) if self.add_back_flag: for node in self._failed_nodes: rest.add_back_node(node.id) time.sleep(5) log.info("10 seconds sleep after failover before invoking rebalance...") time.sleep(10) rest.rebalance(otpNodes=[node.id for node in nodes], ejectedNodes=[]) msg = "rebalance failed while removing failover nodes {0}".format(chosen) self.assertTrue(rest.monitorRebalance(stop_if_loop=True), msg=msg) else: # Need a delay > min because MB-7168 log.info("60 seconds sleep after failover before invoking rebalance...") time.sleep(60) rest.rebalance(otpNodes=[node.id for node in nodes], ejectedNodes=[node.id for node in chosen]) if self.during_ops: self.sleep(5, "Wait for some progress in rebalance") if self.during_ops == "change_password": old_pass = self.master.rest_password self.change_password(new_password=self.input.param("new_password", "new_pass")) rest = RestConnection(self.master) elif self.during_ops == "change_port": self.change_port(new_port=self.input.param("new_port", "9090")) rest = RestConnection(self.master) try: msg = "rebalance failed while removing failover nodes {0}".format(chosen) self.assertTrue(rest.monitorRebalance(stop_if_loop=True), msg=msg) for failed in chosen: for server in _servers_: if server.ip == failed.ip: _servers_.remove(server) self._cleanup_nodes.append(server) log.info("Begin VERIFICATION ...") RebalanceHelper.wait_for_replication(_servers_, self.cluster) self.verify_cluster_stats(_servers_, self.master) finally: if self.during_ops: if self.during_ops == "change_password": self.change_password(new_password=old_pass) elif self.during_ops == "change_port": self.change_port(new_port='8091', current_port=self.input.param("new_port", "9090"))
def replicate_correct_data_after_rollback(self): ''' @attention: This test case has some issue with docker runs. It passes without any issue on VMs. ''' bucket = self.bucket_util.buckets[0] cluster = self.cluster gen_load = doc_generator(self.key, 0, self.num_items) for bucket in self.bucket_util.buckets: task = self.task.async_load_gen_docs( self.cluster, bucket, gen_load, "create", 0, batch_size=10, process_concurrency=8, replicate_to=self.replicate_to, persist_to=self.persist_to, timeout_secs=self.sdk_timeout, retries=self.sdk_retries) self.task.jython_task_manager.get_task_result(task) # store the KVs which were modified and active on node 1 modified_kvs_active_on_node1 = dict() vbucket_client = VBucketAwareMemcached( RestConnection(cluster.master), bucket.name) client = MemcachedClientHelper.direct_client(cluster.servers[0], bucket.name) for i in range(self.num_items/100): keyname = 'keyname-' + str(i) vbId = self.bucket_util.get_vbucket_num_for_key(keyname, self.vbuckets) if vbucket_client.vBucketMap[vbId].split(':')[0] == cluster.servers[0].ip: rc = client.get(keyname) modified_kvs_active_on_node1[keyname] = rc[2] # Stop persistence for server in cluster.servers[:self.nodes_init]: # Create cbepctl command object node_shell_conn = RemoteMachineShellConnection(server) cbepctl_obj = Cbepctl(node_shell_conn) for bucket in self.bucket_util.buckets: cbepctl_obj.persistence(bucket.name, "stop") # Disconnect the shell_connection node_shell_conn.disconnect() # modify less than 1/2 of the keys gen_load = doc_generator(self.key, 0, self.num_items/100) rc = self.cluster.load_gen_docs( cluster.servers[0], bucket.name, gen_load, bucket.kvs[1], "create", exp=0, flag=0, batch_size=10, compression=self.sdk_compression) # kill memcached, when it comes back because persistence is disabled # it will have lost the second set of mutations shell = RemoteMachineShellConnection(cluster.servers[0]) shell.kill_memcached() self.sleep(10, "Sleep after kill memcached") # Start persistence on the second node # Create cbepctl command object node_shell_conn = RemoteMachineShellConnection(cluster.servers[1]) cbepctl_obj = Cbepctl(node_shell_conn) for bucket in self.bucket_util.buckets: cbepctl_obj.persistence(bucket.name, "start") # Disconnect the shell_connection node_shell_conn.disconnect() self.sleep(10, "Sleep after start persistence") # failover to the second node rc = self.cluster.failover(cluster.servers, cluster.servers[1:2], graceful=True) self.sleep(30, "Sleep after node failover triggered") # Values should be what they were prior to the second update client = MemcachedClientHelper.direct_client( cluster.servers[0], bucket.name) for k, v in modified_kvs_active_on_node1.iteritems(): rc = client.get(k) self.assertTrue(v == rc[2], 'Expected {0}, actual {1}' .format(v, rc[2])) # need to rebalance the node back into the cluster # def rebalance(self, servers, to_add, to_remove, timeout=None, # use_hostnames=False, services = None): rest_obj = RestConnection(cluster.servers[0]) nodes_all = rest_obj.node_statuses() for node in nodes_all: if node.ip == cluster.servers[1].ip: break node_id_for_recovery = node.id status = rest_obj.add_back_node(node_id_for_recovery) if status: rest_obj.set_recovery_type(node_id_for_recovery, recoveryType='delta') rc = self.cluster.rebalance(cluster.servers[:self.nodes_init], [], [])
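# --- Illustrative sketch (not part of the original suite) ---
# Before stopping persistence, replicate_correct_data_after_rollback() above
# records the values of keys whose *active* vbucket lives on node 1, so they
# can be compared after the rollback. This sketch shows that key-to-node
# mapping in isolation; 'vbucket_client' is assumed to be a
# VBucketAwareMemcached instance and 'bucket_util.get_vbucket_num_for_key'
# the same helper used above.
def keys_active_on_node(bucket_util, vbucket_client, client, server_ip,
                        num_keys, total_vbuckets):
    active = dict()
    for i in range(num_keys):
        key = 'keyname-' + str(i)
        vb_id = bucket_util.get_vbucket_num_for_key(key, total_vbuckets)
        # vBucketMap entries look like "ip:port"; the active owner is the ip part
        if vbucket_client.vBucketMap[vb_id].split(':')[0] == server_ip:
            # rc[2] is the stored value, matching the usage in the test above
            active[key] = client.get(key)[2]
    return active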
class FailoverTests(FailoverBaseTest): def setUp(self): super(FailoverTests, self).setUp() self.server_map = self.get_server_map(self.servers) def tearDown(self): super(FailoverTests, self).tearDown() def test_failover_firewall(self): self.common_test_body('firewall') def test_failover_normal(self): self.common_test_body('normal') def test_failover_stop_server(self): self.common_test_body('stop_server') def test_failover_then_add_back(self): self.add_back_flag = True self.common_test_body('normal') def common_test_body(self, failover_reason): """ Main Test body which contains the flow of the failover basic steps 1. Starts Operations if programmed into the test case(before/after) 2. Start View and Index Building operations 3. Failover K out of N nodes (failover can be HARD/GRACEFUL) 4.1 Rebalance the cluster is failover of K nodeStatuses 4.2 Run Add-Back operation with recoveryType = (full/delta) with rebalance 5. Verify all expected operations completed by checking stats, replication, views, data correctness """ # Pick the reference node for communication # We pick a node in the cluster which will NOT be failed over self.filter_list = [] if self.failoverMaster: self.master = self.servers[1] self.log.info( "Picking node {0} as reference node for test case".format( self.master.ip)) self.print_test_params(failover_reason) self.rest = RestConnection(self.master) self.nodes = self.rest.node_statuses() # Set the data path for the cluster self.data_path = self.rest.get_data_path() # Check if the test case has to be run for 3.0.0 versions = self.rest.get_nodes_versions() self.version_greater_than_2_5 = True for version in versions: if "3" > version: self.version_greater_than_2_5 = False # Do not run this this test if graceful category is being used if not self.version_greater_than_2_5 and (self.graceful or (self.recoveryType != None)): self.log.error( "Graceful failover can't be applied to nodes with version less then 3.*" ) self.log.error( "Please check configuration parameters: SKIPPING TEST.") return # Find nodes that will under go failover if self.failoverMaster: self.chosen = RebalanceHelper.pick_nodes( self.master, howmany=1, target_node=self.servers[0]) else: self.chosen = RebalanceHelper.pick_nodes( self.master, howmany=self.num_failed_nodes) # Perform operations - Create/Update/Delete # self.withMutationOps = True => Run Operations in parallel to failover # self.withMutationOps = False => Run Operations Before failover self.load_initial_data() if not self.withMutationOps: self.run_mutation_operations() # Perform view creation tasks and wait for completion before failover if self.withViewsOps: self.run_view_creation_operations(self.servers) if not self.createIndexesDuringFailover: self.query_and_monitor_view_tasks(self.servers) # Validate seq_no snap_start/stop values self.check_snap_start_corruption() # Take snap-shot of data set used for validaiton record_static_data_set = dict() prev_vbucket_stats = dict() prev_failover_stats = dict() if not self.withMutationOps: record_static_data_set = self.get_data_set_all(self.servers, self.buckets, path=None) # Capture vbucket and failover stats if test version >= 2.5.* if self.version_greater_than_2_5 and self.upr_check: prev_vbucket_stats = self.get_vbucket_seqnos( self.servers, self.buckets) prev_failover_stats = self.get_failovers_logs( self.servers, self.buckets) # Perform Operations related to failover if self.withMutationOps or self.withViewsOps or self.compact: self.run_failover_operations_with_ops(self.chosen, failover_reason) else: 
self.run_failover_operations(self.chosen, failover_reason) # TODO: Enable this even when 'flusher_batch_split_trigger' is not set if self.flusher_batch_split_trigger and \ self.num_replicas >= self.num_failed_nodes: tasks = self._async_load_all_buckets(self.master, self.gen_update, "update", 0) for task in tasks: task.result() if self.graceful: # Validate seq_no snap_start/stop values self.check_snap_start_corruption() # Add back + rebalance / only rebalance with verification if not self.gracefulFailoverFail and self.runRebalanceAfterFailover: if self.add_back_flag: self.run_add_back_operation_and_verify(self.chosen, prev_vbucket_stats, record_static_data_set, prev_failover_stats) else: self.run_rebalance_after_failover_and_verify( self.chosen, prev_vbucket_stats, record_static_data_set, prev_failover_stats) if self.graceful: # Validate seq_no snap_start/stop values self.check_snap_start_corruption() if self.during_ops is None: self.verify_unacked_bytes_all_buckets(filter_list=self.filter_list, master_node=self.master) def run_rebalance_after_failover_and_verify(self, chosen, prev_vbucket_stats, record_static_data_set, prev_failover_stats): """ Method to run rebalance after failover and verify """ # Need a delay > min because MB-7168 _servers_ = self.filter_servers(self.servers, chosen) self._wait_for_stats_all_buckets(_servers_, check_ep_items_remaining=True) self.sleep(5, "after failover before invoking rebalance...") # Rebalance after Failover operation self.rest.rebalance(otpNodes=[node.id for node in self.nodes], ejectedNodes=[node.id for node in chosen]) if self.during_ops: self.sleep(5, "Wait for some progress in rebalance") if self.during_ops == "change_password": old_pass = self.master.rest_password self.change_password( new_password=self.input.param("new_password", "new_pass")) self.rest = RestConnection(self.master) elif self.during_ops == "change_port": self.change_port(new_port=self.input.param("new_port", "9090")) self.rest = RestConnection(self.master) # Perform Compaction if self.compact: for bucket in self.buckets: self.cluster.compact_bucket(self.master, bucket) # Peform View Validation if Supported nodes = self.filter_servers(self.servers, chosen) if self.withViewsOps: self.query_and_monitor_view_tasks(nodes) # Run operations if required during rebalance after failover if self.withMutationOps: self.run_mutation_operations_after_failover() # Kill or restart operations if self.killNodes or self.stopNodes or self.firewallOnNodes: self.victim_node_operations(node=chosen[0]) self.sleep(60) self.log.info(" Start Rebalance Again !") self.rest.rebalance(otpNodes=[node.id for node in self.nodes], ejectedNodes=[node.id for node in chosen]) # Rebalance Monitoring msg = "rebalance failed while removing failover nodes {0}".format( [node.id for node in chosen]) self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True), msg=msg) # Reset password or port if self.during_ops: if self.during_ops == "change_password": self.change_password(new_password=old_pass) elif self.during_ops == "change_port": self.change_port(new_port='8091', current_port=self.input.param( "new_port", "9090")) return # Drain Queue and make sure intra-cluster replication is complete self.log.info("Begin VERIFICATION for Rebalance after Failover Only") self.verify_cluster_stats(_servers_, self.master, check_bucket_stats=True, check_ep_items_remaining=True) # Verify all data set with meta data if failover happens after failover if not self.withMutationOps: self.sleep(60) 
self.data_analysis_all(record_static_data_set, _servers_, self.buckets, path=None, addedItems=None) # Check Cluster Stats and Data as well if max_verify > 0 # Check Failover logs :: Not sure about this logic, currently not checking, will update code once confirmed # Currently, only for checking case where we have graceful failover if self.version_greater_than_2_5 and self.graceful and self.upr_check: new_failover_stats = self.compare_failovers_logs( prev_failover_stats, _servers_, self.buckets) new_vbucket_stats = self.compare_vbucket_seqnos( prev_vbucket_stats, _servers_, self.buckets) self.compare_vbucketseq_failoverlogs(new_vbucket_stats, new_failover_stats) # Verify Active and Replica Bucket Count if self.num_replicas > 0: nodes = self.get_nodes_in_cluster(self.master) self.vb_distribution_analysis(servers=nodes, buckets=self.buckets, std=20.0, total_vbuckets=self.total_vbuckets) self.log.info("End VERIFICATION for Rebalance after Failover Only") def run_add_back_operation_and_verify(self, chosen, prev_vbucket_stats, record_static_data_set, prev_failover_stats): """ Method to run add-back operation with recovery type = (delta/full) It also verifies if the operations are correct with data verificaiton steps """ _servers_ = self.filter_servers(self.servers, chosen) self._wait_for_stats_all_buckets(_servers_, check_ep_items_remaining=True) recoveryTypeMap = self.define_maps_during_failover(self.recoveryType) fileMapsForVerification = self.create_file(chosen, self.buckets, self.server_map) index = 0 for node in chosen: self.sleep(5) if self.recoveryType: # define precondition for recoverytype self.rest.set_recovery_type( otpNode=node.id, recoveryType=self.recoveryType[index]) index += 1 else: self.rest.add_back_node(node.id) # Doc_mutation before triggering rebalance tasks = self._async_load_all_buckets(self.master, self.gen_update, "update", 0) for task in tasks: task.result() self.sleep(20, "After failover before invoking rebalance...") self.rest.rebalance(otpNodes=[node.id for node in self.nodes], ejectedNodes=[], deltaRecoveryBuckets=self.deltaRecoveryBuckets) # Perform Compaction if self.compact: for bucket in self.buckets: self.cluster.compact_bucket(self.master, bucket) # Peform View Validation if Supported nodes = self.filter_servers(self.servers, chosen) if self.withViewsOps: self.query_and_monitor_view_tasks(nodes) # Run operations if required during rebalance after failover if self.withMutationOps: self.run_mutation_operations_after_failover() # Kill or restart operations if self.killNodes or self.stopNodes or self.firewallOnNodes: self.victim_node_operations(node=chosen[0]) self.sleep(60) self.log.info("Start Rebalance Again!") self.rest.rebalance(otpNodes=[node.id for node in self.nodes], ejectedNodes=[], deltaRecoveryBuckets=self.deltaRecoveryBuckets) # Check if node has to be killed or restarted during rebalance # Monitor Rebalance msg = "rebalance failed while removing failover nodes {0}".format( chosen) self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True), msg=msg) # Drain ep_queue and make sure that intra-cluster replication is complete self._wait_for_stats_all_buckets(self.servers, check_ep_items_remaining=True) self.log.info("Begin VERIFICATION for Add-back and rebalance") # Verify Stats of cluster and Data is max_verify > 0 self.verify_cluster_stats(self.servers, self.master, check_bucket_stats=True, check_ep_items_remaining=True) # Verify recovery Type succeeded if we added-back nodes self.verify_for_recovery_type(chosen, self.server_map, self.buckets, 
recoveryTypeMap, fileMapsForVerification, self.deltaRecoveryBuckets) # Comparison of all data if required if not self.withMutationOps: self.sleep(60) self.data_analysis_all(record_static_data_set, self.servers, self.buckets, path=None, addedItems=None) # Verify if vbucket sequence numbers and failover logs are as expected # We will check only for version > 2.5.* and if the failover is graceful if self.version_greater_than_2_5 and self.graceful and self.upr_check: new_vbucket_stats = self.compare_vbucket_seqnos(prev_vbucket_stats, self.servers, self.buckets, perNode=False) new_failover_stats = self.compare_failovers_logs( prev_failover_stats, self.servers, self.buckets) self.compare_vbucketseq_failoverlogs(new_vbucket_stats, new_failover_stats) # Verify Active and Replica Bucket Count if self.num_replicas > 0: nodes = self.get_nodes_in_cluster(self.master) self.vb_distribution_analysis(servers=nodes, buckets=self.buckets, std=20.0, total_vbuckets=self.total_vbuckets) self.log.info("End VERIFICATION for Add-back and rebalance") def print_test_params(self, failover_reason): """ Method to print test parameters """ self.log.info("num_replicas : {0}".format(self.num_replicas)) self.log.info("recoveryType : {0}".format(self.recoveryType)) self.log.info("failover_reason : {0}".format(failover_reason)) self.log.info("num_failed_nodes : {0}".format(self.num_failed_nodes)) self.log.info('picking server : {0} as the master'.format(self.master)) def run_failover_operations(self, chosen, failover_reason): """ Method to run fail over operations used in the test scenario based on failover reason """ # Perform Operations relalted to failover graceful_count = 0 graceful_failover = True failed_over = True for node in chosen: unreachable = False if failover_reason == 'stop_server': unreachable = True self.stop_server(node) self.log.info( "10 seconds delay to wait for membase-server to shutdown") # wait for 5 minutes until node is down self.assertTrue( RestHelper(self.rest).wait_for_node_status( node, "unhealthy", self.wait_timeout * 10), msg= "node status is not unhealthy even after waiting for 5 minutes" ) elif failover_reason == "firewall": unreachable = True self.filter_list.append(node.ip) server = [srv for srv in self.servers if node.ip == srv.ip][0] RemoteUtilHelper.enable_firewall( server, bidirectional=self.bidirectional) status = RestHelper(self.rest).wait_for_node_status( node, "unhealthy", self.wait_timeout * 10) if status: self.log.info( "node {0}:{1} is 'unhealthy' as expected".format( node.ip, node.port)) else: # verify iptables on the node if something wrong for server in self.servers: if server.ip == node.ip: shell = RemoteMachineShellConnection(server) info = shell.extract_remote_info() if info.type.lower() == "windows": o, r = shell.execute_command( "netsh advfirewall show allprofiles") shell.log_command_output(o, r) else: o, r = shell.execute_command( "/sbin/iptables --list") shell.log_command_output(o, r) shell.disconnect() self.rest.print_UI_logs() api = self.rest.baseUrl + 'nodeStatuses' status, content, header = self.rest._http_request(api) json_parsed = json.loads(content) self.log.info("nodeStatuses: {0}".format(json_parsed)) self.fail( "node status is not unhealthy even after waiting for 5 minutes" ) # verify the failover type if self.check_verify_failover_type: graceful_count, graceful_failover = self.verify_failover_type( node, graceful_count, self.num_replicas, unreachable) # define precondition check for failover success_failed_over = self.rest.fail_over( node.id, 
graceful=(self.graceful and graceful_failover)) if self.graceful and graceful_failover: if self.stopGracefulFailover or self.killNodes or self.stopNodes or self.firewallOnNodes: self.victim_node_operations(node) # Start Graceful Again self.log.info(" Start Graceful Failover Again !") self.sleep(60) success_failed_over = self.rest.fail_over( node.id, graceful=(self.graceful and graceful_failover)) msg = "graceful failover failed for nodes {0}".format( node.id) self.assertTrue( self.rest.monitorRebalance(stop_if_loop=True), msg=msg) else: msg = "rebalance failed while removing failover nodes {0}".format( node.id) self.assertTrue( self.rest.monitorRebalance(stop_if_loop=True), msg=msg) failed_over = failed_over and success_failed_over # Check for negative cases if self.graceful and (failover_reason in ['stop_server', 'firewall']): if failed_over: # MB-10479 self.rest.print_UI_logs() self.assertFalse( failed_over, "Graceful Falover was started for unhealthy node!!! ") return elif self.gracefulFailoverFail and not failed_over: """ Check if the fail_over fails as expected """ self.assertFalse( failed_over, """ Graceful failover should fail due to not enough replicas """ ) return # Check if failover happened as expected or re-try one more time if not failed_over: self.log.info( "unable to failover the node the first time. try again in 60 seconds.." ) # try again in 75 seconds self.sleep(75) failed_over = self.rest.fail_over(node.id, graceful=(self.graceful and graceful_failover)) if self.graceful and (failover_reason not in ['stop_server', 'firewall']): reached = RestHelper(self.rest).rebalance_reached() self.assertTrue( reached, "rebalance failed for Graceful Failover, stuck or did not completed" ) # Verify Active and Replica Bucket Count if self.num_replicas > 0: nodes = self.filter_servers(self.servers, chosen) self.vb_distribution_analysis(servers=nodes, buckets=self.buckets, std=20.0, total_vbuckets=self.total_vbuckets, type="failover", graceful=(self.graceful and graceful_failover)) def run_failover_operations_with_ops(self, chosen, failover_reason): """ Method to run fail over operations used in the test scenario based on failover reason """ # Perform Operations relalted to failover failed_over = True for node in chosen: unreachable = False if failover_reason == 'stop_server': unreachable = True self.stop_server(node) self.log.info( "10 seconds delay to wait for membase-server to shutdown") # wait for 5 minutes until node is down self.assertTrue( RestHelper(self.rest).wait_for_node_status( node, "unhealthy", 300), msg= "node status is not unhealthy even after waiting for 5 minutes" ) elif failover_reason == "firewall": unreachable = True self.filter_list.append(node.ip) server = [srv for srv in self.servers if node.ip == srv.ip][0] RemoteUtilHelper.enable_firewall( server, bidirectional=self.bidirectional) status = RestHelper(self.rest).wait_for_node_status( node, "unhealthy", 300) if status: self.log.info( "node {0}:{1} is 'unhealthy' as expected".format( node.ip, node.port)) else: # verify iptables on the node if something wrong for server in self.servers: if server.ip == node.ip: shell = RemoteMachineShellConnection(server) info = shell.extract_remote_info() if info.type.lower() == "windows": o, r = shell.execute_command( "netsh advfirewall show allprofiles") shell.log_command_output(o, r) else: o, r = shell.execute_command( "/sbin/iptables --list") shell.log_command_output(o, r) shell.disconnect() self.rest.print_UI_logs() api = self.rest.baseUrl + 'nodeStatuses' status, content, 
header = self.rest._http_request(api) json_parsed = json.loads(content) self.log.info("nodeStatuses: {0}".format(json_parsed)) self.fail( "node status is not unhealthy even after waiting for 5 minutes" ) nodes = self.filter_servers(self.servers, chosen) failed_over = self.cluster.async_failover([self.master], failover_nodes=chosen, graceful=self.graceful) # Perform Compaction compact_tasks = [] if self.compact: for bucket in self.buckets: compact_tasks.append( self.cluster.async_compact_bucket(self.master, bucket)) # Run View Operations if self.withViewsOps: self.query_and_monitor_view_tasks(nodes) # Run mutation operations if self.withMutationOps: self.run_mutation_operations() failed_over.result() for task in compact_tasks: task.result() msg = "rebalance failed while removing failover nodes {0}".format( node.id) self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True), msg=msg) def load_initial_data(self): """ Method to run operations Update/Delete/Create """ # Load All Buckets if num_items > 0 tasks = [] tasks += self._async_load_all_buckets(self.master, self.gen_initial_create, "create", 0, flag=2, batch_size=20000) for task in tasks: task.result() self._wait_for_stats_all_buckets(self.servers, check_ep_items_remaining=True) self._verify_stats_all_buckets(self.servers, timeout=120) def run_mutation_operations(self): mutation_ops_tasks = [] if "create" in self.doc_ops: mutation_ops_tasks += self._async_load_all_buckets( self.master, self.gen_create, "create", 0) if "update" in self.doc_ops: mutation_ops_tasks += self._async_load_all_buckets( self.master, self.gen_update, "update", 0) if "delete" in self.doc_ops: mutation_ops_tasks += self._async_load_all_buckets( self.master, self.gen_delete, "delete", 0) try: for task in mutation_ops_tasks: task.result() except Exception, ex: self.log.info(ex)
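# --- Illustrative sketch (not part of the original suite) ---
# run_failover_operations() above simulates a dead node with a firewall, waits
# for the cluster to mark it unhealthy, and then checks the MB-10479 negative
# case: a *graceful* failover must be rejected for an unreachable node. This
# sketch condenses that check, reusing RemoteUtilHelper / RestHelper exactly as
# they are used in the class above (module-level imports assumed).
def firewall_node_and_check_graceful_rejected(rest, servers, node,
                                              bidirectional=False):
    victim = [srv for srv in servers if srv.ip == node.ip][0]
    RemoteUtilHelper.enable_firewall(victim, bidirectional=bidirectional)
    assert RestHelper(rest).wait_for_node_status(node, "unhealthy", 300), \
        "node never became unhealthy after the firewall was enabled"
    # Graceful failover has to stream data off the node, so it must refuse here
    started = rest.fail_over(node.id, graceful=True)
    assert not started, "graceful failover was started for an unhealthy node"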
class FailoverTests(FailoverBaseTest): def setUp(self): super(FailoverTests, self).setUp() self.server_map = self.get_server_map(self.servers) def tearDown(self): super(FailoverTests, self).tearDown() def test_failover_firewall(self): self.common_test_body('firewall') def test_failover_normal(self): self.common_test_body('normal') def test_failover_stop_server(self): self.common_test_body('stop_server') def test_failover_then_add_back(self): self.add_back_flag = True self.common_test_body('normal') def common_test_body(self, failover_reason): """ Main Test body which contains the flow of the failover basic steps 1. Starts Operations if programmed into the test case(before/after) 2. Start View and Index Building operations 3. Failover K out of N nodes (failover can be HARD/GRACEFUL) 4.1 Rebalance the cluster is failover of K nodeStatuses 4.2 Run Add-Back operation with recoveryType = (full/delta) with rebalance 5. Verify all expected operations completed by checking stats, replication, views, data correctness """ # Pick the reference node for communication # We pick a node in the cluster which will NOT be failed over self.filter_list = [] if self.failoverMaster: self.master = self.servers[1] self.log.info("Picking node {0} as reference node for test case" .format(self.master.ip)) self.print_test_params(failover_reason) self.rest = RestConnection(self.master) self.nodes = self.rest.node_statuses() # Set the data path for the cluster self.data_path = self.rest.get_data_path() # Check if the test case has to be run for 3.0.0 versions = self.rest.get_nodes_versions() self.version_greater_than_2_5 = True for version in versions: if "3" > version: self.version_greater_than_2_5 = False # Do not run this this test if graceful category is being used if not self.version_greater_than_2_5 and (self.graceful or (self.recoveryType != None)): self.log.error("Graceful failover can't be applied to nodes with version less then 3.*") self.log.error("Please check configuration parameters: SKIPPING TEST.") return # Find nodes that will under go failover if self.failoverMaster: self.chosen = RebalanceHelper.pick_nodes( self.master, howmany=1, target_node=self.servers[0]) else: self.chosen = RebalanceHelper.pick_nodes( self.master, howmany=self.num_failed_nodes) # Perform operations - Create/Update/Delete # self.withMutationOps = True => Run Operations in parallel to failover # self.withMutationOps = False => Run Operations Before failover self.load_initial_data() if not self.withMutationOps: self.run_mutation_operations() # Perform view creation tasks and wait for completion before failover if self.withViewsOps: self.run_view_creation_operations(self.servers) if not self.createIndexesDuringFailover: self.query_and_monitor_view_tasks(self.servers) # Validate seq_no snap_start/stop values self.check_snap_start_corruption() # Take snap-shot of data set used for validaiton record_static_data_set = dict() prev_vbucket_stats = dict() prev_failover_stats = dict() if not self.withMutationOps: record_static_data_set = self.get_data_set_all( self.servers, self.buckets, path=None) # Capture vbucket and failover stats if test version >= 2.5.* if self.version_greater_than_2_5 and self.upr_check: prev_vbucket_stats = self.get_vbucket_seqnos(self.servers, self.buckets) prev_failover_stats = self.get_failovers_logs(self.servers, self.buckets) # Perform Operations related to failover if self.withMutationOps or self.withViewsOps or self.compact: self.run_failover_operations_with_ops(self.chosen, failover_reason) else: 
self.run_failover_operations(self.chosen, failover_reason) # TODO: Enable this even when 'flusher_total_batch_limit' is not set if self.flusher_total_batch_limit and \ self.num_replicas >= self.num_failed_nodes: tasks = self._async_load_all_buckets( self.master, self.gen_update, "update", 0) for task in tasks: task.result() if self.graceful: # Validate seq_no snap_start/stop values self.check_snap_start_corruption() # Add back + rebalance // only rebalance with verification if not self.gracefulFailoverFail and self.runRebalanceAfterFailover: if self.add_back_flag: self.run_add_back_operation_and_verify( self.chosen, prev_vbucket_stats, record_static_data_set, prev_failover_stats) else: self.run_rebalance_after_failover_and_verify( self.chosen, prev_vbucket_stats, record_static_data_set, prev_failover_stats) if self.graceful: # Validate seq_no snap_start/stop values self.check_snap_start_corruption() if self.during_ops is None: self.verify_unacked_bytes_all_buckets(filter_list=self.filter_list, master_node=self.master) def run_rebalance_after_failover_and_verify(self, chosen, prev_vbucket_stats, record_static_data_set, prev_failover_stats): """ Method to run rebalance after failover and verify """ # Need a delay > min because MB-7168 _servers_ = self.filter_servers(self.servers, chosen) self._wait_for_stats_all_buckets(_servers_, check_ep_items_remaining=True) self.sleep(5, "after failover before invoking rebalance...") # Rebalance after Failover operation self.rest.rebalance(otpNodes=[node.id for node in self.nodes], ejectedNodes=[node.id for node in chosen]) if self.during_ops: self.sleep(5, "Wait for some progress in rebalance") if self.during_ops == "change_password": old_pass = self.master.rest_password self.change_password(new_password=self.input.param("new_password", "new_pass")) self.rest = RestConnection(self.master) elif self.during_ops == "change_port": self.change_port(new_port=self.input.param("new_port", "9090")) self.rest = RestConnection(self.master) # Perform Compaction if self.compact: for bucket in self.buckets: self.cluster.compact_bucket(self.master, bucket) # Peform View Validation if Supported nodes = self.filter_servers(self.servers, chosen) if self.withViewsOps: self.query_and_monitor_view_tasks(nodes) # Run operations if required during rebalance after failover if self.withMutationOps: self.run_mutation_operations_after_failover() # Kill or restart operations if self.killNodes or self.stopNodes or self.firewallOnNodes: self.victim_node_operations(node=chosen[0]) self.sleep(60) self.log.info(" Start Rebalance Again !") self.rest.rebalance(otpNodes=[node.id for node in self.nodes], ejectedNodes=[node.id for node in chosen]) # Rebalance Monitoring msg = "rebalance failed while removing failover nodes {0}".format([node.id for node in chosen]) self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True), msg=msg) # Reset password or port if self.during_ops: if self.during_ops == "change_password": self.change_password(new_password=old_pass) elif self.during_ops == "change_port": self.change_port(new_port='8091', current_port=self.input.param("new_port", "9090")) return # Drain Queue and make sure intra-cluster replication is complete self.log.info("Begin VERIFICATION for Rebalance after Failover Only") self.verify_cluster_stats(_servers_, self.master, check_bucket_stats=True, check_ep_items_remaining=True) # Verify all data set with meta data if failover happens after failover if not self.withMutationOps: self.sleep(60) self.data_analysis_all(record_static_data_set, 
            _servers_, self.buckets, path=None, addedItems=None)
        # Check Cluster Stats and Data as well if max_verify > 0
        # Check Failover logs :: Not sure about this logic, currently not checking, will update code once confirmed
        # Currently, only for checking case where we have graceful failover
        if self.version_greater_than_2_5 and self.graceful and self.upr_check:
            new_failover_stats = self.compare_failovers_logs(prev_failover_stats, _servers_, self.buckets)
            new_vbucket_stats = self.compare_vbucket_seqnos(prev_vbucket_stats, _servers_, self.buckets)
            self.compare_vbucketseq_failoverlogs(new_vbucket_stats, new_failover_stats)
        # Verify Active and Replica Bucket Count
        if self.num_replicas > 0:
            nodes = self.get_nodes_in_cluster(self.master)
            self.vb_distribution_analysis(servers=nodes, buckets=self.buckets, std=20.0,
                                          total_vbuckets=self.total_vbuckets)
        self.log.info("End VERIFICATION for Rebalance after Failover Only")

    def run_add_back_operation_and_verify(self, chosen, prev_vbucket_stats, record_static_data_set, prev_failover_stats):
        """ Method to run add-back operation with recovery type = (delta/full).
        It also verifies if the operations are correct with data verification steps """
        _servers_ = self.filter_servers(self.servers, chosen)
        self._wait_for_stats_all_buckets(_servers_, check_ep_items_remaining=True)
        recoveryTypeMap = self.define_maps_during_failover(self.recoveryType)
        fileMapsForVerification = self.create_file(chosen, self.buckets, self.server_map)
        index = 0
        for node in chosen:
            self.sleep(5)
            if self.recoveryType:
                # define precondition for recoverytype
                self.rest.set_recovery_type(otpNode=node.id, recoveryType=self.recoveryType[index])
                index += 1
            else:
                self.rest.add_back_node(node.id)
        # Doc_mutation before triggering rebalance
        if self.flusher_total_batch_limit and \
                self.num_replicas >= self.num_failed_nodes:
            tasks = self._async_load_all_buckets(self.master, self.gen_update, "update", 0)
            for task in tasks:
                task.result()
        self.sleep(20, "After failover before invoking rebalance...")
        self.rest.rebalance(otpNodes=[node.id for node in self.nodes], ejectedNodes=[],
                            deltaRecoveryBuckets=self.deltaRecoveryBuckets)
        # Perform Compaction
        if self.compact:
            for bucket in self.buckets:
                self.cluster.compact_bucket(self.master, bucket)
        # Perform View Validation if Supported
        nodes = self.filter_servers(self.servers, chosen)
        if self.withViewsOps:
            self.query_and_monitor_view_tasks(nodes)
        # Run operations if required during rebalance after failover
        if self.withMutationOps:
            self.run_mutation_operations_after_failover()
        # Kill or restart operations
        if self.killNodes or self.stopNodes or self.firewallOnNodes:
            self.victim_node_operations(node=chosen[0])
            self.sleep(60)
            self.log.info("Start Rebalance Again!")
            self.rest.rebalance(otpNodes=[node.id for node in self.nodes], ejectedNodes=[],
                                deltaRecoveryBuckets=self.deltaRecoveryBuckets)
            self.sleep(10, "Wait for rebalance to start")
        # Check if node has to be killed or restarted during rebalance
        # Monitor Rebalance
        msg = "rebalance failed while removing failover nodes {0}".format(chosen)
        self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True), msg=msg)
        # Drain ep_queue and make sure that intra-cluster replication is complete
        self._wait_for_stats_all_buckets(self.servers, check_ep_items_remaining=True)
        self.log.info("Begin VERIFICATION for Add-back and rebalance")
        # Verify Stats of cluster and Data if max_verify > 0
        self.verify_cluster_stats(self.servers, self.master, check_bucket_stats=True, check_ep_items_remaining=True)
        # Verify recovery Type succeeded if we added-back nodes
        self.verify_for_recovery_type(chosen, self.server_map, self.buckets, recoveryTypeMap,
                                      fileMapsForVerification, self.deltaRecoveryBuckets)
        # Comparison of all data if required
        if not self.withMutationOps and self.flusher_total_batch_limit is None:
            self.sleep(60)
            self.data_analysis_all(record_static_data_set, self.servers, self.buckets, path=None, addedItems=None)
        # Verify if vbucket sequence numbers and failover logs are as expected
        # We will check only for version > 2.5.* and if the failover is graceful
        if self.version_greater_than_2_5 and self.graceful and self.upr_check:
            new_vbucket_stats = self.compare_vbucket_seqnos(prev_vbucket_stats, self.servers, self.buckets, perNode=False)
            new_failover_stats = self.compare_failovers_logs(prev_failover_stats, self.servers, self.buckets)
            self.compare_vbucketseq_failoverlogs(new_vbucket_stats, new_failover_stats)
        # Verify Active and Replica Bucket Count
        if self.num_replicas > 0:
            nodes = self.get_nodes_in_cluster(self.master)
            self.vb_distribution_analysis(servers=nodes, buckets=self.buckets, std=20.0,
                                          total_vbuckets=self.total_vbuckets)
        self.log.info("End VERIFICATION for Add-back and rebalance")

    def print_test_params(self, failover_reason):
        """ Method to print test parameters """
        self.log.info("num_replicas : {0}".format(self.num_replicas))
        self.log.info("recoveryType : {0}".format(self.recoveryType))
        self.log.info("failover_reason : {0}".format(failover_reason))
        self.log.info("num_failed_nodes : {0}".format(self.num_failed_nodes))
        self.log.info('picking server : {0} as the master'.format(self.master))

    def run_failover_operations(self, chosen, failover_reason):
        """ Method to run failover operations used in the test scenario based on failover reason """
        # Perform operations related to failover
        graceful_count = 0
        graceful_failover = True
        failed_over = True
        for node in chosen:
            unreachable = False
            if failover_reason == 'stop_server':
                unreachable = True
                self.stop_server(node)
                self.log.info("10 seconds delay to wait for membase-server to shutdown")
                # wait for 5 minutes until node is down
                self.assertTrue(RestHelper(self.rest).wait_for_node_status(node, "unhealthy", self.wait_timeout * 10),
                                msg="node status is not unhealthy even after waiting for 5 minutes")
            elif failover_reason == "firewall":
                unreachable = True
                self.filter_list.append(node.ip)
                server = [srv for srv in self.servers if node.ip == srv.ip][0]
                RemoteUtilHelper.enable_firewall(server, bidirectional=self.bidirectional)
                status = RestHelper(self.rest).wait_for_node_status(node, "unhealthy", self.wait_timeout * 10)
                if status:
                    self.log.info("node {0}:{1} is 'unhealthy' as expected".format(node.ip, node.port))
                else:
                    # verify iptables on the node if something wrong
                    for server in self.servers:
                        if server.ip == node.ip:
                            shell = RemoteMachineShellConnection(server)
                            info = shell.extract_remote_info()
                            if info.type.lower() == "windows":
                                o, r = shell.execute_command("netsh advfirewall show allprofiles")
                                shell.log_command_output(o, r)
                            else:
                                o, r = shell.execute_command("/sbin/iptables --list")
                                shell.log_command_output(o, r)
                            shell.disconnect()
                    self.rest.print_UI_logs()
                    api = self.rest.baseUrl + 'nodeStatuses'
                    status, content, header = self.rest._http_request(api)
                    json_parsed = json.loads(content)
                    self.log.info("nodeStatuses: {0}".format(json_parsed))
                    self.fail("node status is not unhealthy even after waiting for 5 minutes")
            # verify the failover type
            if self.check_verify_failover_type:
                graceful_count, graceful_failover = self.verify_failover_type(node, graceful_count,
                                                                              self.num_replicas, unreachable)
            # define precondition check for failover
            success_failed_over = self.rest.fail_over(node.id, graceful=(self.graceful and graceful_failover))
            if self.graceful and graceful_failover:
                if self.stopGracefulFailover or self.killNodes or self.stopNodes or self.firewallOnNodes:
                    self.victim_node_operations(node)
                    # Start Graceful Again
                    self.log.info(" Start Graceful Failover Again !")
                    self.sleep(120)
                    success_failed_over = self.rest.fail_over(node.id, graceful=(self.graceful and graceful_failover))
                    self.sleep(180)
                msg = "graceful failover failed for nodes {0}".format(node.id)
                self.log.info("chosen: {0} get_failover_count: {1}".format(len(chosen), self.get_failover_count()))
                self.assertEqual(len(chosen), self.get_failover_count(), msg=msg)
            else:
                msg = "rebalance failed while removing failover nodes {0}".format(node.id)
                self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True), msg=msg)
            failed_over = failed_over and success_failed_over
        # Check for negative cases
        if self.graceful and (failover_reason in ['stop_server', 'firewall']):
            if failed_over:
                # MB-10479
                self.rest.print_UI_logs()
            self.assertFalse(failed_over, "Graceful Failover was started for unhealthy node!!! ")
            return
        elif self.gracefulFailoverFail and not failed_over:
            # Check if the fail_over fails as expected
            self.assertFalse(failed_over, """ Graceful failover should fail due to not enough replicas """)
            return
        # Check if failover happened as expected or re-try one more time
        if not failed_over:
            self.log.info("unable to failover the node the first time. try again in 75 seconds..")
            # try again in 75 seconds
            self.sleep(75)
            failed_over = self.rest.fail_over(node.id, graceful=(self.graceful and graceful_failover))
        if self.graceful and (failover_reason not in ['stop_server', 'firewall']):
            reached = RestHelper(self.rest).rebalance_reached()
            self.assertTrue(reached, "rebalance failed for Graceful Failover, stuck or did not complete")
        # Verify Active and Replica Bucket Count
        if self.num_replicas > 0:
            nodes = self.filter_servers(self.servers, chosen)
            self.vb_distribution_analysis(servers=nodes, buckets=self.buckets, std=20.0,
                                          total_vbuckets=self.total_vbuckets, type="failover",
                                          graceful=(self.graceful and graceful_failover))

    def run_failover_operations_with_ops(self, chosen, failover_reason):
        """ Method to run failover operations used in the test scenario based on failover reason """
        # Perform operations related to failover
        failed_over = True
        for node in chosen:
            unreachable = False
            if failover_reason == 'stop_server':
                unreachable = True
                self.stop_server(node)
                self.log.info("10 seconds delay to wait for membase-server to shutdown")
                # wait for 5 minutes until node is down
                self.assertTrue(RestHelper(self.rest).wait_for_node_status(node, "unhealthy", 300),
                                msg="node status is not unhealthy even after waiting for 5 minutes")
            elif failover_reason == "firewall":
                unreachable = True
                self.filter_list.append(node.ip)
                server = [srv for srv in self.servers if node.ip == srv.ip][0]
                RemoteUtilHelper.enable_firewall(server, bidirectional=self.bidirectional)
                status = RestHelper(self.rest).wait_for_node_status(node, "unhealthy", 300)
                if status:
                    self.log.info("node {0}:{1} is 'unhealthy' as expected".format(node.ip, node.port))
                else:
                    # verify iptables on the node if something wrong
                    for server in self.servers:
                        if server.ip == node.ip:
                            shell = RemoteMachineShellConnection(server)
                            info = shell.extract_remote_info()
                            if info.type.lower() == "windows":
                                o, r = shell.execute_command("netsh advfirewall show allprofiles")
                                shell.log_command_output(o, r)
                            else:
                                o, r = shell.execute_command("/sbin/iptables --list")
                                shell.log_command_output(o, r)
                            shell.disconnect()
                    self.rest.print_UI_logs()
                    api = self.rest.baseUrl + 'nodeStatuses'
                    status, content, header = self.rest._http_request(api)
                    json_parsed = json.loads(content)
                    self.log.info("nodeStatuses: {0}".format(json_parsed))
                    self.fail("node status is not unhealthy even after waiting for 5 minutes")
        nodes = self.filter_servers(self.servers, chosen)
        failed_over = self.cluster.async_failover([self.master], failover_nodes=chosen, graceful=self.graceful)
        # Perform Compaction
        compact_tasks = []
        if self.compact:
            for bucket in self.buckets:
                compact_tasks.append(self.cluster.async_compact_bucket(self.master, bucket))
        # Run View Operations
        if self.withViewsOps:
            self.query_and_monitor_view_tasks(nodes)
        # Run mutation operations
        if self.withMutationOps:
            self.run_mutation_operations()
        failed_over.result()
        for task in compact_tasks:
            task.result()
        msg = "rebalance failed while removing failover nodes {0}".format(node.id)
        self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True), msg=msg)

    def load_initial_data(self):
        """ Method to run operations Update/Delete/Create """
        # Load All Buckets if num_items > 0
        tasks = []
        tasks += self._async_load_all_buckets(self.master, self.gen_initial_create, "create", 0,
                                              flag=2, batch_size=20000)
        for task in tasks:
            task.result()
        self._wait_for_stats_all_buckets(self.servers, check_ep_items_remaining=True)
        self._verify_stats_all_buckets(self.servers, timeout=120)

    def run_mutation_operations(self):
        mutation_ops_tasks = []
        if "create" in self.doc_ops:
            mutation_ops_tasks += self._async_load_all_buckets(self.master, self.gen_create, "create", 0)
        if "update" in self.doc_ops:
            mutation_ops_tasks += self._async_load_all_buckets(self.master, self.gen_update, "update", 0)
        if "delete" in self.doc_ops:
            mutation_ops_tasks += self._async_load_all_buckets(self.master, self.gen_delete, "delete", 0)
        try:
            for task in mutation_ops_tasks:
                task.result()
        except Exception as ex:
            self.log.info(ex)

    def run_mutation_operations_after_failover(self):
        mutation_ops_tasks = []
        if "create" in self.doc_ops:
            mutation_ops_tasks += self._async_load_all_buckets(self.master, self.afterfailover_gen_create, "create", 0)
        if "update" in self.doc_ops:
            mutation_ops_tasks += self._async_load_all_buckets(self.master, self.afterfailover_gen_update, "update", 0)
        if "delete" in self.doc_ops:
            mutation_ops_tasks += self._async_load_all_buckets(self.master, self.afterfailover_gen_delete, "delete", 0)
        try:
            for task in mutation_ops_tasks:
                task.result()
        except Exception as ex:
            self.log.info(ex)

    def define_maps_during_failover(self, recoveryType=[]):
        """ Method to define node ip, recovery type map """
        recoveryTypeMap = {}
        index = 0
        for server in self.chosen:
            if recoveryType:
                recoveryTypeMap[server.ip] = recoveryType[index]
                index += 1
        return recoveryTypeMap

    def filter_servers(self, original_servers, filter_servers):
        """ Filter servers that have not failed over """
        _servers_ = copy.deepcopy(original_servers)
        for failed in filter_servers:
            for server in _servers_:
                if server.ip == failed.ip:
                    _servers_.remove(server)
                    self._cleanup_nodes.append(server)
        return _servers_

    def verify_for_recovery_type(self, chosen=[], serverMap={}, buckets=[],
                                 recoveryTypeMap={}, fileMap={}, deltaRecoveryBuckets=[]):
        """ Verify recovery type is delta or full """
        summary = ""
        logic = True
        for server in self.chosen:
            shell = RemoteMachineShellConnection(serverMap[server.ip])
            os_type = shell.extract_remote_info()
            if os_type.type.lower() == 'windows':
                return
            for bucket in buckets:
                path = fileMap[server.ip][bucket.name]
                exists = shell.file_exists(path, "check.txt")
                if deltaRecoveryBuckets is not None:
                    if recoveryTypeMap[server.ip] == "delta" and (bucket.name in deltaRecoveryBuckets) and not exists:
                        logic = False
                        summary += "\n Failed Condition :: node {0}, bucket {1} :: Expected Delta, Actual Full".format(server.ip, bucket.name)
                    elif recoveryTypeMap[server.ip] == "delta" and (bucket.name not in deltaRecoveryBuckets) and exists:
                        summary += "\n Failed Condition :: node {0}, bucket {1} :: Expected Full, Actual Delta".format(server.ip, bucket.name)
                        logic = False
                else:
                    if recoveryTypeMap[server.ip] == "delta" and not exists:
                        logic = False
                        summary += "\n Failed Condition :: node {0}, bucket {1} :: Expected Delta, Actual Full".format(server.ip, bucket.name)
                    elif recoveryTypeMap[server.ip] == "full" and exists:
                        logic = False
                        summary += "\n Failed Condition :: node {0}, bucket {1} :: Expected Full, Actual Delta".format(server.ip, bucket.name)
            shell.disconnect()
        self.assertTrue(logic, summary)

    def run_view_creation_operations(self, servers):
        """ Run view creation and index building tasks on servers """
        num_views = self.input.param("num_views", 5)
        is_dev_ddoc = self.input.param("is_dev_ddoc", True)
        num_tries = self.input.param("num_tries", 10)
        ddoc_name = "ddoc1"
        prefix = ("", "dev_")[is_dev_ddoc]
        query = {}
        query["connectionTimeout"] = 60000
        query["full_set"] = "true"
        views = []
        tasks = []
        for bucket in self.buckets:
            temp = self.make_default_views(self.default_view_name, num_views, is_dev_ddoc, different_map=False)
            temp_tasks = self.async_create_views(self.master, ddoc_name, temp, bucket)
            views += temp
            tasks += temp_tasks
        timeout = max(self.wait_timeout * 4,
                      len(self.buckets) * self.wait_timeout * self.num_items // 50000)
        for task in tasks:
            task.result(self.wait_timeout * 20)

    def query_and_monitor_view_tasks(self, servers):
        """ Monitor Query Tasks for their completion """
        num_views = self.input.param("num_views", 5)
        is_dev_ddoc = self.input.param("is_dev_ddoc", True)
        ddoc_name = "ddoc1"
        prefix = ("", "dev_")[is_dev_ddoc]
        self.verify_query_task()
        active_tasks = self.cluster.async_monitor_active_task(servers, "indexer",
                                                              "_design/" + prefix + ddoc_name,
                                                              wait_task=False)
        for active_task in active_tasks:
            result = active_task.result()
            self.assertTrue(result)

    def verify_query_task(self):
        """ Verify Query Results """
        num_views = self.input.param("num_views", 5)
        is_dev_ddoc = self.input.param("is_dev_ddoc", True)
        ddoc_name = "ddoc1"
        prefix = ("", "dev_")[is_dev_ddoc]
        query = {}
        query["connectionTimeout"] = 60000
        query["full_set"] = "true"
        expected_rows = None
        timeout = None
        if self.active_resident_threshold == 0:
            timeout = 2400
        if self.max_verify:
            expected_rows = self.max_verify
            query["limit"] = expected_rows
        query["stale"] = "false"
        for bucket in self.buckets:
            self.perform_verify_queries(num_views, prefix, ddoc_name, query, bucket=bucket,
                                        wait_time=timeout, expected_rows=expected_rows)

    def create_file(self, chosen, buckets, serverMap):
        """ Create files in data paths for checking if delta/full recovery occurred """
        fileMap = {}
        for server in self.chosen:
            shell = RemoteMachineShellConnection(serverMap[server.ip])
            type = shell.extract_remote_info().distribution_type
            map = {}
            for bucket in buckets:
                if type.lower() == 'windows':
                    self.data_path = 'c:/Program\ Files/Couchbase/Server/var/lib/couchbase/data'
                bucket_data_path = self.data_path + "/" + bucket.name + "/" + "check.txt"
                full_path = self.data_path + "/" + bucket.name + "/"
                map[bucket.name] = full_path
                shell.create_file(bucket_data_path, "check")
            fileMap[server.ip] = map
            shell.disconnect()
        return fileMap

    def verify_failover_type(self, chosen=None, graceful_count=0, replica_count=0, unreachable=False):
        logic = True
        summary = ""
        nodes = self.rest.node_statuses()
        node_count = len(nodes)
        change_graceful_count = graceful_count
        graceful_failover = True
        if unreachable:
            node_count -= 1
        else:
            change_graceful_count += 1
        if replica_count != 0:
            for node in nodes:
                if unreachable and node.ip == chosen.ip:
                    graceful_failover = node.gracefulFailoverPossible
                    if node.gracefulFailoverPossible:
                        logic = False
                        summary += "\n failover type for unreachable node {0} Expected :: Hard, Actual :: Graceful".format(node.ip)
                elif node.ip == chosen.ip:
                    graceful_failover = node.gracefulFailoverPossible
                    if replica_count > graceful_count and (node_count - 1) + graceful_count >= replica_count:
                        if not node.gracefulFailoverPossible:
                            logic = False
                            summary += "\n failover type for node {0} Expected :: Graceful, Actual :: Hard".format(node.ip)
                    else:
                        if node.gracefulFailoverPossible:
                            logic = False
                            summary += "\n failover type for {0} Expected :: Hard, Actual :: Graceful".format(node.ip)
        else:
            for node in nodes:
                if node.ip == chosen.ip:
                    graceful_failover = node.gracefulFailoverPossible
                    if node.gracefulFailoverPossible:
                        logic = False
                        summary += "\n failover type for node {0} Expected :: Hard, Actual :: Graceful".format(node.ip)
        self.assertTrue(logic, summary)
        return change_graceful_count, graceful_failover

    def get_server_map(self, node):
        """ Map of ips and server information """
        map = {}
        for server in self.servers:
            map[server.ip] = server
        return map

    def victim_node_operations(self, node=None):
        if self.stopGracefulFailover:
            self.log.info(" Stopping Graceful Failover ")
            stopped = self.rest.stop_rebalance(wait_timeout=self.wait_timeout // 3)
            self.assertTrue(stopped, msg="unable to stop rebalance")
        if self.killNodes:
            self.log.info(" Killing Memcached ")
            kill_nodes = self.get_victim_nodes(self.servers, self.master, node, self.victim_type, self.victim_count)
            for kill_node in kill_nodes:
                self.kill_server_memcached(kill_node)
        if self.stopNodes:
            self.log.info(" Stopping Node")
            stop_nodes = self.get_victim_nodes(self.servers, self.master, node, self.victim_type, self.victim_count)
            for stop_node in stop_nodes:
                self.stop_server(stop_node)
            self.sleep(10)
            self.log.info(" Starting Node")
            for start_node in stop_nodes:
                self.start_server(start_node)
        if self.firewallOnNodes:
            self.log.info(" Enabling Firewall for Node ")
            stop_nodes = self.get_victim_nodes(self.servers, self.master, node, self.victim_type, self.victim_count)
            for stop_node in stop_nodes:
                self.start_firewall_on_node(stop_node)
            self.sleep(30)
            self.log.info(" Disable Firewall for Node ")
            for start_node in stop_nodes:
                self.stop_firewall_on_node(start_node)
        self.sleep(120)

    def get_failover_count(self):
        rest = RestConnection(self.master)
        cluster_status = rest.cluster_status()
        failover_count = 0
        # check for inactiveFailed
        for node in cluster_status['nodes']:
            if node['clusterMembership'] == "inactiveFailed":
                failover_count += 1
        return failover_count
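    # ------------------------------------------------------------------
    # Illustrative sketch (an assumption, not part of the original suite):
    # a minimal example of how the helpers above are intended to compose
    # into a failover-plus-add-back scenario.  The snapshot arguments
    # (prev_vbucket_stats, record_static_data_set, prev_failover_stats)
    # are placeholders for whatever the caller captured before the
    # failover, and self.chosen is assumed to already hold the nodes
    # picked for failover by the calling test.
    def example_failover_add_back_flow(self, prev_vbucket_stats=None,
                                       record_static_data_set=None,
                                       prev_failover_stats=None):
        self.load_initial_data()
        self.server_map = self.get_server_map(self.master)
        self.print_test_params(failover_reason="stop_server")
        # stop the chosen nodes and fail them over (graceful or hard)
        self.run_failover_operations(self.chosen, failover_reason="stop_server")
        # add them back with the configured recovery type and verify
        self.run_add_back_operation_and_verify(self.chosen, prev_vbucket_stats,
                                               record_static_data_set,
                                               prev_failover_stats)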
class RebalanceProgressTests(RebalanceBaseTest):

    def setUp(self):
        super(RebalanceProgressTests, self).setUp()
        self.rest = RestConnection(self.master)
        self.num_views = self.input.param("num_views", 3)
        if self.num_views:
            self._create_indexes()

    def tearDown(self):
        super(RebalanceProgressTests, self).tearDown()

    def test_progress_rebalance_in(self):
        servers_in = self.servers[self.nodes_init:self.nodes_init + self.nodes_in]
        servers_init = self.servers[:self.nodes_init]
        rebalance = self.cluster.async_rebalance(servers_init, servers_in, [])
        self.sleep(5, "wait for rebalance start")
        previous_stats = self._get_detailed_progress()
        while rebalance.state != "FINISHED":
            new_stats = self._get_detailed_progress()
            if new_stats == {}:
                self.log.info("Got empty progress")
                break
            # vbuckets left should go decreasing
            # docsTotal and docsTransferred should be 0 in added nodes
            # docsTotal should not change
            # docsTransferred should go increasing
            self._check_stats(servers_in, previous_stats, new_stats, "outgoing", docs_total=0, docs_transf=0)
            self._check_stats(servers_in, previous_stats, new_stats, "ingoing")
            self._check_stats(servers_init, previous_stats, new_stats, "ingoing", docs_total=0, docs_transf=0)
            self._check_stats(servers_init, previous_stats, new_stats, "outgoing")
            # sum of sending and receiving vbuckets should coincide
            self._check_vb_sums(servers_init, servers_in, new_stats)
            previous_stats = copy.deepcopy(new_stats)
            time.sleep(10)
        rebalance.result()

    def test_progress_rebalance_out(self):
        with_failover = self.input.param("with_failover", False)
        servers_init = self.servers[:self.nodes_init]
        servers_out = self.servers[(self.nodes_init - self.nodes_out):self.nodes_init]
        if with_failover:
            self.cluster.failover(servers_init, servers_out)
        rebalance = self.cluster.async_rebalance(servers_init, [], servers_out)
        self.sleep(5, "wait for rebalance start")
        previous_stats = self._get_detailed_progress()
        while rebalance.state != "FINISHED":
            new_stats = self._get_detailed_progress()
            if new_stats == {}:
                self.log.info("Got empty progress")
                break
            # vbuckets left should go decreasing
            # docsTotal should not change
            # docsTransferred should go increasing
            self._check_stats(servers_init, previous_stats, new_stats, "ingoing")
            self._check_stats(servers_init, previous_stats, new_stats, "outgoing")
            previous_stats = copy.deepcopy(new_stats)
            time.sleep(1)
        rebalance.result()

    def test_progress_rebalance_swap(self):
        if self.nodes_in != self.nodes_out:
            self.fail("nodes_in != nodes_out. Not a swap rebalance")
        if len(self.servers) < (self.nodes_init + self.nodes_in):
            self.log.error("Not enough VMs!")
            return
        servers_in = self.servers[self.nodes_init:self.nodes_init + self.nodes_in]
        servers_init = self.servers[:self.nodes_init]
        servers_unchanged = self.servers[:(self.nodes_init - self.nodes_out)]
        servers_out = self.servers[(self.nodes_init - self.nodes_out):self.nodes_init]
        rebalance = self.cluster.async_rebalance(servers_init, servers_in, servers_out)
        self.sleep(5, "wait for rebalance start")
        previous_stats = self._get_detailed_progress()
        while rebalance.state != "FINISHED":
            new_stats = self._get_detailed_progress()
            if new_stats == {}:
                self.log.info("Got empty progress")
                break
            # vbuckets left should go decreasing
            # docsTotal and docsTransferred should be 0 in added nodes
            # no vbuckets moving for unchanged nodes
            # docsTotal should not change
            # docsTransferred should go increasing
            self._check_stats(servers_in, previous_stats, new_stats, "outgoing", docs_total=0, docs_transf=0)
            self._check_stats(servers_in, previous_stats, new_stats, "ingoing")
            self._check_stats(servers_unchanged, previous_stats, new_stats, "ingoing", active_vb=0, replica_vb=0)
            self._check_stats(servers_unchanged, previous_stats, new_stats, "outgoing", active_vb=0, replica_vb=0)
            self._check_stats(servers_out, previous_stats, new_stats, "outgoing")
            # sum of sending and receiving vbuckets should coincide
            self._check_vb_sums(servers_in, servers_out, new_stats)
            previous_stats = copy.deepcopy(new_stats)
            time.sleep(1)
        rebalance.result()

    def test_progress_add_back_after_failover(self):
        servers_init = self.servers[:self.nodes_init]
        servers_failover = self.servers[(self.nodes_init - self.nodes_out):self.nodes_init]
        servers_unchanged = self.servers[:(self.nodes_init - self.nodes_out)]
        nodes_all = self.rest.node_statuses()
        failover_nodes = []
        for failover_server in servers_failover:
            failover_nodes.extend([node for node in nodes_all
                                   if node.ip == failover_server.ip and
                                   str(node.port) == failover_server.port])
        self.cluster.failover(servers_init, servers_failover)
        self.sleep(30)
        for node in failover_nodes:
            self.rest.add_back_node(node.id)
        rebalance = self.cluster.async_rebalance(servers_init, [], [])
        self.sleep(5, "wait for rebalance start")
        previous_stats = self._get_detailed_progress()
        while rebalance.state != "FINISHED":
            new_stats = self._get_detailed_progress()
            if new_stats == {}:
                self.log.info("Got empty progress")
                break
            # vbuckets left should go decreasing
            # docsTotal should not change
            # docsTransferred should go increasing
            self._check_stats(servers_unchanged, previous_stats, new_stats, "outgoing")
            self._check_stats(servers_failover, previous_stats, new_stats, "ingoing")
            previous_stats = copy.deepcopy(new_stats)
            time.sleep(1)
        rebalance.result()

    def _check_vb_sums(self, servers_ingoing, servers_outgoing, new_stats):
        active_vb_sum_1 = sum([new_stats[server.ip]["ingoing"]['activeVBucketsLeft']
                               for server in servers_ingoing])
        active_vb_sum_2 = sum([new_stats[server.ip]["outgoing"]['activeVBucketsLeft']
                               for server in servers_outgoing])
        self.assertTrue(active_vb_sum_1 == active_vb_sum_2,
                        "Active vbuckets left should be equal in servers_in and init. %s" % new_stats)

    def _check_stats(self, servers, previous_stats, new_stats, type,
                     docs_total=None, docs_transf=None,
                     active_vb=None, replica_vb=None):
        self.assertTrue(new_stats["buckets_count"] == len(self.buckets),
                        "Expected buckets %s. Actual stat %s" % (len(self.buckets), new_stats))
        for server in servers:
            current_stat = new_stats[server.ip][type]
            previous_stat = previous_stats[server.ip][type]
            if new_stats["bucket"] != previous_stats["bucket"]:
                self.assertTrue(current_stat['activeVBucketsLeft'] >= previous_stat['activeVBucketsLeft'],
                                "activeVBucketsLeft for node %s decreased! Previous stat %s. Actual: %s" %
                                (server.ip, current_stat, previous_stat))
                self.assertTrue(current_stat['replicaVBucketsLeft'] >= previous_stat['replicaVBucketsLeft'],
                                "replicaVBucketsLeft for node %s decreased! Previous stat %s. Actual: %s" %
                                (server.ip, current_stat, previous_stat))
            else:
                self.assertTrue(current_stat['activeVBucketsLeft'] <= previous_stat['activeVBucketsLeft'],
                                "activeVBucketsLeft for node %s increased! Previous stat %s. Actual: %s" %
                                (server.ip, current_stat, previous_stat))
                self.assertTrue(current_stat['replicaVBucketsLeft'] <= previous_stat['replicaVBucketsLeft'],
                                "replicaVBucketsLeft for node %s increased! Previous stat %s. Actual: %s" %
                                (server.ip, current_stat, previous_stat))
            try:
                if current_stat['docsTotal'] != previous_stat['docsTotal']:
                    self.log.warning("docsTotal for node %s changed! Previous stat %s. Actual: %s" %
                                     (server.ip, current_stat, previous_stat))
            except Exception as ex:
                if previous_stat['docsTotal'] != 0 and current_stat['docsTotal'] == 0:
                    command = "sys:get_status({global, ns_rebalance_observer})."
                    self.log.info("posting: %s" % command)
                    self.rest.diag_eval(command)
                raise ex
            self.assertTrue(current_stat['docsTransferred'] >= previous_stat['docsTransferred'],
                            "docsTransferred for node %s decreased! Previous stat %s. Actual: %s" %
                            (server.ip, current_stat, previous_stat))
            if docs_total is not None:
                self.assertTrue(current_stat['docsTotal'] == docs_total,
                                "docsTotal for %s is %s, but should be %s. Stat %s" %
                                (server.ip, current_stat['docsTotal'], docs_total, current_stat))
            if docs_transf is not None:
                self.assertTrue(current_stat['docsTransferred'] == docs_transf,
                                "docsTransferred for %s is %s, but should be %s. Stat %s" %
                                (server.ip, current_stat['docsTransferred'], docs_transf, current_stat))
            if active_vb is not None:
                self.assertTrue(current_stat['activeVBucketsLeft'] == active_vb,
                                "activeVBucketsLeft for %s is %s, but should be %s. Stat %s" %
                                (server.ip, current_stat['activeVBucketsLeft'], active_vb, current_stat))
            if replica_vb is not None:
                self.assertTrue(current_stat['replicaVBucketsLeft'] == replica_vb,
                                "replicaVBucketsLeft for %s is %s, but should be %s. Stat %s" %
                                (server.ip, current_stat['replicaVBucketsLeft'], replica_vb, current_stat))
        self.log.info("Checked stat: %s" % new_stats)

    def _get_detailed_progress(self):
        detailed_progress = {}
        tasks = self.rest.ns_server_tasks()
        for task in tasks:
            if "detailedProgress" in task:
                try:
                    if "perNode" in task["detailedProgress"]:
                        nodes = task["detailedProgress"]["perNode"]
                        for node in nodes:
                            detailed_progress[node.split('@')[1]] = nodes[node]
                    detailed_progress["bucket"] = task["detailedProgress"]["bucket"]
                    detailed_progress["buckets_count"] = task["detailedProgress"]["bucketsCount"]
                    break
                except Exception as ex:
                    self.log.warning("Didn't get statistics %s" % str(ex))
        return detailed_progress

    def _create_indexes(self):
        tasks = []
        views = []
        for bucket in self.buckets:
            temp = self.make_default_views(self.default_view_name, self.num_views, False, different_map=True)
            temp_tasks = self.async_create_views(self.master, self.default_view_name, temp, bucket)
            tasks += temp_tasks
            views += temp
        timeout = max(self.wait_timeout * 4,
                      len(self.buckets) * self.wait_timeout * self.num_items // 50000)
        for task in tasks:
            task.result(timeout)
        for bucket in self.buckets:
            for view in views:
                # run queries to create indexes
                self.cluster.query_view(self.master, self.default_view_name, view.name,
                                        {"stale": "false", "limit": 1000})