def test_system_indexes_rebalance(self):
    index_names = []
    self.collections_helper.create_scope(bucket_name="default", scope_name='scope1')
    for y in range(0, 10):
        self.collections_helper.create_collection(bucket_name="default", scope_name='scope1',
                                                  collection_name="collection" + str(y))
    for i in range(0, 10):
        self.run_cbq_query("CREATE INDEX idx{0} on default:default.scope1.collection{0}(fake) "
                           "WITH {{'nodes': '{1}:{2}'}}".format(i, self.servers[1].ip,
                                                                self.servers[1].port))
        index_names.append(("idx" + str(i), "collection" + str(i)))

    # Rebalance in an index node
    rebalance = self.cluster.async_rebalance(self.servers, [self.servers[2]], [],
                                             services=["index"])
    reached = RestHelper(self.rest).rebalance_reached()
    self.assertTrue(reached, "rebalance failed, stuck or did not complete")
    rebalance.result()
    self.verify_all_indexes(index_names)

    # Rebalance out an index node
    rebalance = self.cluster.async_rebalance(self.servers, [], [self.servers[1]])
    reached = RestHelper(self.rest).rebalance_reached()
    self.assertTrue(reached, "rebalance failed, stuck or did not complete")
    rebalance.result()
    time.sleep(5)
    self.wait_for_all_indexes_online()
    self.verify_all_indexes(index_names)
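# Note: a minimal sketch, not part of the original suite, of the
# rebalance-and-verify pattern the tests here repeat: start an async
# rebalance, poll the REST API until it reports completion, then collect
# the task result. `cluster`, `rest` and `servers` are assumed to be the
# usual testrunner objects; the helper name is hypothetical.
def _rebalance_and_wait(cluster, rest, servers, to_add=(), to_remove=(), services=None):
    task = cluster.async_rebalance(servers, list(to_add), list(to_remove), services=services)
    # rebalance_reached() with no argument waits for a fully completed rebalance
    reached = RestHelper(rest).rebalance_reached()
    assert reached, "rebalance failed, stuck or did not complete"
    # result() re-raises any exception the rebalance task itself hit
    return task.result()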
def remove_node(self, otpnode=None, wait_for_rebalance=True, rest=None):
    """
    Method to remove nodes from a cluster.
    :param otpnode: list of nodes to be removed.
    :param wait_for_rebalance: boolean, wait for rebalance to finish
                               after removing the nodes.
    :param rest: RestConnection object
    """
    if not rest:
        rest = self.rest
    nodes = rest.node_statuses()
    # This is the case when the master node is running the cbas service as well
    if len(nodes) <= len(otpnode):
        return

    helper = RestHelper(rest)
    try:
        removed = helper.remove_nodes(
            knownNodes=[node.id for node in nodes],
            ejectedNodes=[node.id for node in otpnode],
            wait_for_rebalance=wait_for_rebalance)
    except Exception:
        self.sleep(5, "Rebalance failed on Removal. Retry.. THIS IS A BUG")
        removed = helper.remove_nodes(
            knownNodes=[node.id for node in nodes],
            ejectedNodes=[node.id for node in otpnode],
            wait_for_rebalance=wait_for_rebalance)
    if wait_for_rebalance:
        self.assertTrue(removed,
                        "Rebalance operation failed while removing %s" % otpnode)
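# Hypothetical caller of remove_node() above, assuming the same test-class
# context (self.rest pointing at the cluster master): collect every
# non-master node and let remove_node() drive the rebalance out.
def _remove_all_but_master(self):
    master_id = self.rest.get_nodes_self().id
    to_remove = [node for node in self.rest.node_statuses() if node.id != master_id]
    self.remove_node(otpnode=to_remove, wait_for_rebalance=True)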
def terminate(self):
    if self._xdcr:
        self._terminate_replications(self._s_master, "cluster1")
        if self._rdirection == "bidirection":
            self._terminate_replications(self._d_master, "cluster0")
    for key in self._clusters_keys_olst:
        nodes = self._clusters_dic[key]
        for node in nodes:
            rest = RestConnection(node)
            buckets = rest.get_buckets()
            for bucket in buckets:
                status = rest.delete_bucket(bucket.name)
                if status:
                    self._log.info('Deleted bucket : {0} from {1}'.format(bucket.name, node.ip))
        rest = RestConnection(nodes[0])
        helper = RestHelper(rest)
        servers = rest.node_statuses()
        master_id = rest.get_nodes_self().id
        if len(nodes) > 1:
            removed = helper.remove_nodes(
                knownNodes=[node.id for node in servers],
                ejectedNodes=[node.id for node in servers if node.id != master_id],
                wait_for_rebalance=True)
def test_failure_scenarios_during_recovery_of_node_A(self):
    self.recovery_type = self.input.param("recovery_type", 'full')
    # Enable auto-failover and canAbortRebalance
    self.enable_autofailover_and_validate()
    self.sleep(5)
    # Do a graceful failover
    self.cluster.failover([self.master],
                          failover_nodes=[self.servers[self.server_index_to_fail]],
                          graceful=True)
    # Wait for the failover to complete
    self.wait_for_failover_or_assert(1, 500)
    # Do a recovery of the configured type (delta or full)
    self.rest.set_recovery_type(otpNode='ns_1@' + self.servers[self.server_index_to_fail].ip,
                                recoveryType=self.recovery_type)
    # Start rebalance of the recovered nodes
    rebalance_task = self.cluster.async_rebalance(self.servers, [], [])
    reached = RestHelper(self.rest).rebalance_reached(percentage=30)
    self.assertTrue(reached, "Rebalance failed or did not reach {0}%".format(30))
    try:
        # Do a failover action - reboot, hang, kill. This is defined in the conf file
        self.failover_actions[self.failover_action](self)
        rebalance_task.result()
    except Exception as ex:
        self.log.info("Rebalance failed with : {0}".format(str(ex)))
        if "Rebalance failed. See logs for detailed reason. You can try again" in str(ex):
            self.log.info(
                "Rebalance failed even before auto-failover had a chance to stop it. "
                "self.server_to_fail.ip: {0}".format(str(ex)))
        elif not RestHelper(self.rest).is_cluster_rebalanced():
            if self._auto_failover_message_present_in_logs(self.server_to_fail[0].ip):
                self.log.info("Rebalance interrupted due to auto-failover of nodes "
                              "- message was seen in logs")
            else:
                self.fail("Rebalance interrupted message was not seen in logs")
        else:
            self.fail("Rebalance was not aborted by auto fail-over")
    # Reset auto-failover settings
    self.disable_autofailover_and_validate()
def test_failure_scenarios_during_rebalance_in_of_node_A(self):
    # Enable auto-failover and canAbortRebalance
    self.enable_autofailover_and_validate()
    self.sleep(5)
    # Start rebalance in
    rebalance_task = self.cluster.async_rebalance(self.servers,
                                                  self.servers_to_add,
                                                  self.servers_to_remove)
    reached = RestHelper(self.rest).rebalance_reached(percentage=30)
    self.assertTrue(reached, "Rebalance failed or did not reach {0}%".format(30))
    try:
        # Do a failover action - reboot, hang, kill. This is defined in the conf file
        self.failover_actions[self.failover_action](self)
        rebalance_task.result()
    except Exception as ex:
        self.log.info("Rebalance failed with : {0}".format(str(ex)))
        if "Rebalance failed. See logs for detailed reason. You can try again" in str(ex):
            self.log.info(
                "Rebalance failed even before auto-failover had a chance to stop it. "
                "self.server_to_fail.ip: {0}".format(str(ex)))
        elif not RestHelper(self.rest).is_cluster_rebalanced():
            if self._auto_failover_message_present_in_logs(self.server_to_fail[0].ip):
                self.log.info("Rebalance interrupted due to auto-failover of nodes "
                              "- message was seen in logs")
            else:
                self.fail("Rebalance interrupted message was not seen in logs")
        else:
            self.fail("Rebalance was not aborted by auto fail-over")
    # Reset auto-failover settings
    self.disable_autofailover_and_validate()
def test_stream_after_warmup(self):
    nodeA = self.servers[0]
    bucket = 'standard_bucket' + str(self.standard_buckets - 1)
    originalVbInfo = self.all_vb_info(nodeA, bucket=bucket)
    expectedVbSeqno = {}

    # load all buckets
    doc_gen = BlobGenerator('dcpdata', 'dcpdata-', self.value_size, end=self.num_items)
    self._load_all_buckets(self.master, doc_gen, "create", 0)
    self._wait_for_stats_all_buckets([nodeA])

    # store expected vb seqnos
    originalVbInfo = self.all_vb_info(nodeA, bucket=bucket)

    # restart node
    assert self.stop_node(0)
    time.sleep(5)
    assert self.start_node(0)
    rest = RestHelper(RestConnection(nodeA))
    assert rest.is_ns_server_running()
    time.sleep(2)

    # verify original vbInfo can be streamed
    dcp_client = self.dcp_client(nodeA, PRODUCER, bucket_name=bucket)
    for vbucket in originalVbInfo:
        vb_uuid, _, high_seqno = originalVbInfo[vbucket]
        stream = dcp_client.stream_req(vbucket, 0, 0, high_seqno, vb_uuid)
        responses = stream.run()
        assert high_seqno == stream.last_by_seqno
def test_rebalance_in_query_node(self):
    self.with_retry(lambda: self.ensure_primary_indexes_exist(), eval=None, delay=3, tries=5)
    self.run_cbq_query(query="PREPARE p1 from select * from default limit 5",
                       server=self.servers[0])
    self.sleep(5)
    for i in range(self.nodes_init):
        self.run_cbq_query(query="execute p1", server=self.servers[i])
    services_in = ["n1ql", "index", "data"]
    rebalance = self.cluster.async_rebalance(self.servers[:self.nodes_init],
                                             [self.servers[self.nodes_init]], [],
                                             services=services_in)
    reached = RestHelper(self.rest).rebalance_reached()
    self.assertTrue(reached, "rebalance failed, stuck or did not complete")
    rebalance.result()
    self.sleep(30)
    try:
        for i in range(self.nodes_init + 1):
            self.run_cbq_query(query="execute '[%s:%s]p1'" % (self.servers[0].ip,
                                                              self.servers[0].port),
                               server=self.servers[i])
    finally:
        rebalance = self.cluster.async_rebalance(self.servers[:self.nodes_init], [],
                                                 to_remove=[self.servers[self.nodes_init]])
        reached = RestHelper(self.rest).rebalance_reached()
        self.assertTrue(reached, "rebalance failed, stuck or did not complete")
        rebalance.result()
def test_crash_while_streaming(self):
    bucket = self.bucket_util.buckets[0]
    vbucket = randint(0, self.vbuckets)
    nodeA = self.servers[0]
    self.load_docs(bucket, vbucket, 0, self.num_items, "create")

    shell_conn = RemoteMachineShellConnection(nodeA)
    cb_stat_obj = Cbstats(shell_conn)

    dcp_client = self.dcp_client(nodeA, dcp.constants.PRODUCER)
    _ = dcp_client.stream_req(vbucket, 0, 0, 2 * self.num_items, 0)
    self.load_docs(nodeA, vbucket, self.num_items)

    self.assertTrue(self.stop_node(0), msg="Failed during stop_node")
    self.sleep(2, "Sleep after stop_node")
    self.assertTrue(self.start_node(0), msg="Failed during start_node")

    rest = RestHelper(RestConnection(nodeA))
    self.assertTrue(rest.is_ns_server_running(),
                    msg="Failed while is_ns_server_running check")
    self.sleep(30, "Sleep to wait for ns_server to run")

    vb_info = cb_stat_obj.vbucket_seqno(bucket.name)
    dcp_client = self.dcp_client(nodeA, dcp.constants.PRODUCER)
    stream = dcp_client.stream_req(vbucket, 0, 0, vb_info[vbucket]["high_seqno"], 0)
    stream.run()
    self.assertTrue(stream.last_by_seqno == vb_info[vbucket]["high_seqno"],
                    msg="Mismatch in high_seqno. {0} == {1}".format(
                        vb_info[vbucket]["high_seqno"], stream.last_by_seqno))

    # Disconnect the shell connection to the node
    shell_conn.disconnect()
def test_start_stop_rebalance(self):
    """
    Start-stop rebalance in/out, adding/removing additional nodes
    after stopping the rebalance.

    This test begins by loading a given number of items into the cluster.
    It then adds servs_in nodes, removes servs_out nodes and starts a
    rebalance. The rebalance is stopped when its progress reaches 20%.
    After that we add extra_nodes_in, remove extra_nodes_out and restart
    the rebalance with the new cluster configuration. Later the rebalance
    is stopped/restarted at 40/60/80% progress. After each iteration we
    wait for the disk queues to drain and then verify that there has been
    no data loss: sum(curr_items) must match curr_items_total. Once the
    cluster is rebalanced the test is finished.
    The order of add/remove nodes looks like:
    self.nodes_init|servs_in|extra_nodes_in|extra_nodes_out|servs_out
    """
    rest = RestConnection(self.cluster.master)
    self.bucket_util._wait_for_stats_all_buckets()
    self.log.info("Current nodes : {0}".format([node.id for node in rest.node_statuses()]))
    self.log.info("Adding nodes {0} to cluster".format(self.servs_in))
    self.log.info("Removing nodes {0} from cluster".format(self.servs_out))
    add_in_once = self.extra_servs_in
    _ = set(self.servs_init + self.servs_in) - set(self.servs_out)
    # The latest iteration will be with i=5; in that case the rebalance
    # should complete, which is also verified and tracked.
    for i in range(1, 6):
        if i == 1:
            rebalance = self.task.async_rebalance(
                self.servs_init[:self.nodes_init], self.servs_in, self.servs_out)
        else:
            rebalance = self.task.async_rebalance(
                self.servs_init[:self.nodes_init] + self.servs_in,
                add_in_once, self.servs_out + self.extra_servs_out)
            add_in_once = []
            _ = set(self.servs_init + self.servs_in + self.extra_servs_in) \
                - set(self.servs_out + self.extra_servs_out)
        self.sleep(20)
        expected_progress = 20 * i
        reached = RestHelper(rest).rebalance_reached(expected_progress)
        self.assertTrue(reached,
                        "Rebalance failed or did not reach {0}%".format(expected_progress))
        if not RestHelper(rest).is_cluster_rebalanced():
            self.log.info("Stop the rebalance")
            stopped = rest.stop_rebalance(wait_timeout=self.wait_timeout / 3)
            self.assertTrue(stopped, msg="Unable to stop rebalance")
            self.task_manager.get_task_result(rebalance)
        if RestHelper(rest).is_cluster_rebalanced():
            self.validate_docs()
            self.log.info("Rebalance was completed when tried to stop rebalance on {0}%"
                          .format(str(expected_progress)))
            break
        else:
            self.log.info("Rebalance is still required. Verifying the data in the buckets")
            self.bucket_util._wait_for_stats_all_buckets()
            self.bucket_util.verify_unacked_bytes_all_buckets()
def test_node_reboot(self):
    wait_timeout = 120
    timeout = self.timeout / 2
    status = self.rest.update_autoreprovision_settings(True, 1)
    if not status:
        self.fail('failed to change autoreprovision_settings!')
    self.sleep(5)
    shell = RemoteMachineShellConnection(self.server_fail)
    if shell.extract_remote_info().type.lower() == 'windows':
        o, r = shell.execute_command("shutdown -r -f -t 0")
    elif shell.extract_remote_info().type.lower() == 'linux':
        o, r = shell.execute_command("reboot")
    shell.log_command_output(o, r)
    if shell.extract_remote_info().type.lower() == 'windows':
        time.sleep(wait_timeout * 5)
    else:
        time.sleep(wait_timeout)
    # disable firewall on the node
    shell = RemoteMachineShellConnection(self.server_fail)
    shell.disable_firewall()
    AutoReprovisionBaseTest.wait_for_failover_or_assert(
        self.master, 0,
        timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME, self)
    helper = RestHelper(self.rest)
    self.assertTrue(helper.is_cluster_healthy(), "cluster status is not healthy")
    self.assertFalse(helper.is_cluster_rebalanced(), "cluster is balanced")
    self.rest.rebalance(otpNodes=[node.id for node in self.rest.node_statuses()],
                        ejectedNodes=[])
    self.assertTrue(self.rest.monitorRebalance())
    buckets = self.rest.get_buckets()
    for bucket in buckets:
        self.verify_loaded_data(self.master, bucket.name, self.loaded_items[bucket.name])
def test_rename_rebalance_start_stop(self):
    expected_progress = self.input.param('expected_progress', 30)
    if len(self.servers) < 2:
        self.fail("test requires more than 1 node")
    hostnames = self.rename_nodes(self.servers[:self.nodes_in + self.nodes_init])
    self._set_hostames_to_servers_objs(hostnames)
    self.verify_referenced_by_names(self.servers[:self.nodes_in + self.nodes_init],
                                    hostnames)
    rebalance = self.cluster.async_rebalance(
        self.servers[:self.nodes_init],
        self.servers[self.nodes_init:self.nodes_in + self.nodes_init],
        [], use_hostnames=True)
    self.sleep(3, 'wait for some progress in rebalance...')
    rest = RestConnection(self.master)
    reached = RestHelper(rest).rebalance_reached(expected_progress)
    self.assertTrue(reached,
                    "rebalance failed or did not reach {0}%".format(expected_progress))
    if not RestHelper(rest).is_cluster_rebalanced():
        stopped = rest.stop_rebalance(wait_timeout=self.wait_timeout // 3)
        self.assertTrue(stopped, msg="unable to stop rebalance")
    self.verify_referenced_by_names(self.servers[:self.nodes_in + self.nodes_init],
                                    hostnames)
    self.cluster.rebalance(self.servers[:self.nodes_in + self.nodes_init], [], [],
                           use_hostnames=True)
    self.verify_referenced_by_names(self.servers[:self.nodes_in + self.nodes_init],
                                    hostnames)
def test_failure_scenarios_during_rebalance_out_of_failedover_node_A(self):
    # Enable auto-failover and canAbortRebalance
    self.enable_autofailover_and_validate()
    # Failover a node
    self.cluster.failover([self.master],
                          failover_nodes=[self.servers[self.server_index_to_fail]],
                          graceful=False)
    # Wait for the failover to complete
    self.wait_for_failover_or_assert(1, 500)
    # Start rebalance out
    rebalance_task = self.cluster.async_rebalance(
        self.servers, [], [self.servers[self.server_index_to_fail]])
    reached = RestHelper(self.rest).rebalance_reached(percentage=30)
    self.assertTrue(reached, "Rebalance failed or did not reach {0}%".format(30))
    try:
        # Do a failover action - reboot, hang, kill. This is defined in the conf file
        self.failover_actions[self.failover_action](self)
        rebalance_task.result()
    except Exception as ex:
        self.log.info("Rebalance failed with : {0}".format(str(ex)))
        if "Rebalance failed. See logs for detailed reason. You can try again" in str(ex):
            self.fail("Rebalance failed when it was not expected to fail: {0}".format(str(ex)))
        elif not RestHelper(self.rest).is_cluster_rebalanced():
            if self._auto_failover_message_present_in_logs(self.server_to_fail[0].ip):
                self.fail("Rebalance interrupted due to auto-failover of nodes "
                          "- it was not expected")
            else:
                self.log.info("Rebalance was not interrupted as expected")
        else:
            self.log.info("Rebalance completed successfully")
    # Reset auto-failover settings
    self.disable_autofailover_and_validate()
def test_stream_after_warmup(self):
    nodeA = self.servers[0]
    bucket = 'standard_bucket' + str(self.standard_buckets - 1)
    originalVbInfo = self.all_vb_info(nodeA, bucket=bucket)
    expectedVbSeqno = {}

    # load all buckets
    doc_gen = BlobGenerator('dcpdata', 'dcpdata-', self.value_size, end=self.num_items)
    self._load_all_buckets(self.master, doc_gen, "create", 0)
    self._wait_for_stats_all_buckets([nodeA])

    # store expected vb seqnos
    originalVbInfo = self.all_vb_info(nodeA, bucket=bucket)

    # restart node
    assert self.stop_node(0)
    time.sleep(5)
    assert self.start_node(0)
    rest = RestHelper(RestConnection(nodeA))
    assert rest.is_ns_server_running()
    time.sleep(2)

    # verify original vbInfo can be streamed
    dcp_client = self.dcp_client(nodeA, PRODUCER, auth_user=bucket)
    for vbucket in originalVbInfo:
        vb_uuid, _, high_seqno = originalVbInfo[vbucket]
        stream = dcp_client.stream_req(vbucket, 0, 0, high_seqno, vb_uuid)
        responses = stream.run()
        assert high_seqno == stream.last_by_seqno
def test_setting_propogation_swap_rebalance(self):
    expected_curl = self.set_tmpspace()
    self.assertEqual(expected_curl['queryTmpSpaceSize'], self.tmp_size)
    expected_dir = self.set_directory()
    self.assertEqual(expected_dir['queryTmpSpaceDir'], self.directory_path)
    nodes_out_list = self.servers[1]
    to_add_nodes = [self.servers[self.nodes_init + 1]]
    to_remove_nodes = [nodes_out_list]
    services_in = ["index", "n1ql", "data"]
    # do a swap rebalance
    rebalance = self.cluster.async_rebalance(self.servers[:self.nodes_init],
                                             to_add_nodes, [], services=services_in)
    reached = RestHelper(self.rest).rebalance_reached()
    self.assertTrue(reached, "rebalance failed, stuck or did not complete")
    rebalance.result()
    rebalance = self.cluster.async_rebalance(self.servers[:self.nodes_init + 1],
                                             [], to_remove_nodes)
    reached = RestHelper(self.rest).rebalance_reached()
    self.assertTrue(reached, "rebalance failed, stuck or did not complete")
    self.sleep(5)
    curl_url = "http://%s:%s/settings/querySettings" % (
        self.servers[self.nodes_init + 1].ip, self.servers[self.nodes_init + 1].port)
    curl_output = self.shell.execute_command("%s -u Administrator:password %s"
                                             % (self.curl_path, curl_url))
    expected_curl = self.convert_list_to_json(curl_output[0])
    self.assertEqual(expected_curl['queryTmpSpaceSize'], self.tmp_size)
    self.assertEqual(expected_curl['queryTmpSpaceDir'], self.directory_path)
def test_node_cb_restart(self):
    timeout = self.timeout / 2
    status = self.rest.update_autoreprovision_settings(True, 1)
    if not status:
        self.fail('failed to change autoreprovision_settings!')
    self.sleep(5)
    shell = RemoteMachineShellConnection(self.server_fail)
    shell.restart_couchbase()
    AutoReprovisionBaseTest.wait_for_failover_or_assert(
        self.master, 1,
        timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME, self)
    AutoReprovisionBaseTest.wait_for_failover_or_assert(
        self.master, 0,
        timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME, self)
    self.sleep(5)
    helper = RestHelper(self.rest)
    self.assertTrue(helper.is_cluster_healthy(), "cluster status is not healthy")
    self.assertFalse(helper.is_cluster_rebalanced(), "cluster is balanced")
    self.rest.rebalance(otpNodes=[node.id for node in self.rest.node_statuses()],
                        ejectedNodes=[])
    self.assertTrue(self.rest.monitorRebalance())
    buckets = self.rest.get_buckets()
    for bucket in buckets:
        self.verify_loaded_data(self.master, bucket.name, self.loaded_items[bucket.name])
def test_crash_entire_cluster(self):
    self.cluster.rebalance([self.master], self.servers[1:], [])

    vbucket = 0
    nodeA = self.servers[0]
    n = 10000
    self.load_docs(nodeA, vbucket, n)

    dcp_client = self.dcp_client(nodeA, PRODUCER)
    stream = dcp_client.stream_req(vbucket, 0, 0, 2 * n, 0)
    self.load_docs(nodeA, vbucket, n)

    # stop all nodes
    node_range = list(range(len(self.servers)))
    for i in node_range:
        assert self.stop_node(i)
    time.sleep(2)

    # start all nodes in reverse order
    node_range.reverse()
    for i in node_range:
        assert self.start_node(i)

    rest = RestHelper(RestConnection(nodeA))
    assert rest.is_ns_server_running()

    _, _, high_seqno = self.vb_info(nodeA, vbucket)
    dcp_client = self.dcp_client(nodeA, PRODUCER)
    stream = dcp_client.stream_req(vbucket, 0, 0, high_seqno, 0)
    stream.run()
    assert stream.last_by_seqno == high_seqno
def test_permissions(self):
    shell = RemoteMachineShellConnection(self.master)
    info = shell.extract_remote_info()
    if info.type.lower() == 'windows':
        self.log.info('Test is designed for linux only')
        return
    shell.execute_command('chmod 000 %s' % LINUX_CB_PATH)
    self.sleep(10, 'wait for couchbase stopping')
    shell.execute_command('chmod 755 %s' % LINUX_CB_PATH)
    self.sleep(10, 'wait for couchbase start')
    try:
        rest = RestConnection(self.master)
        self.assertTrue(RestHelper(rest).is_ns_server_running(timeout_in_seconds=60),
                        'NS server is not up')
    except Exception as ex:
        self.log.error('Couchbase is not running')
        shell.execute_command('reboot')
        self.sleep(60, 'wait for reboot of VM')
        rest = RestConnection(self.master)
        self.assertTrue(RestHelper(rest).is_ns_server_running(timeout_in_seconds=60),
                        'NS server is not up')
        raise ex
    finally:
        shell.disconnect()
def test_query_swap_rebalance(self):
    self.run_cbq_query(query="PREPARE p1 from select * from default limit 5",
                       server=self.servers[0])
    self.sleep(5)
    for i in range(self.nodes_init):
        if not self.servers[i] == self.servers[1]:
            self.run_cbq_query(query="execute p1", server=self.servers[i])
    nodes_out_list = self.get_nodes_from_services_map(service_type="index",
                                                      get_all_nodes=False)
    to_add_nodes = [self.servers[self.nodes_init + 2]]
    to_remove_nodes = [nodes_out_list]
    services_in = ["index", "n1ql", "data"]
    self.log.info(self.servers[:self.nodes_init])
    # do a swap rebalance
    rebalance = self.cluster.async_rebalance(self.servers[:self.nodes_init],
                                             to_add_nodes, [], services=services_in)
    reached = RestHelper(self.rest).rebalance_reached()
    self.assertTrue(reached, "rebalance failed, stuck or did not complete")
    rebalance.result()
    rebalance = self.cluster.async_rebalance(self.servers[:self.nodes_init + 2],
                                             [], to_remove_nodes)
    reached = RestHelper(self.rest).rebalance_reached()
    self.assertTrue(reached, "rebalance failed, stuck or did not complete")
    rebalance.result()
    self.sleep(30)
    for i in range(self.nodes_init):
        if not self.servers[i] == self.servers[1]:
            self.run_cbq_query(query="execute '[%s:%s]p1'" % (self.servers[2].ip,
                                                              self.servers[2].port),
                               server=self.servers[i])
def backup_restore(self):
    try:
        backup_start = self.backups[int(self.backupset.start) - 1]
    except IndexError:
        backup_start = "{0}{1}".format(self.backups[-1], self.backupset.start)
    try:
        backup_end = self.backups[int(self.backupset.end) - 1]
    except IndexError:
        backup_end = "{0}{1}".format(self.backups[-1], self.backupset.end)
    args = "restore --archive {0} --repo {1} --host http://{2}:{3} --username {4} " \
           "--password {5} --start {6} --end {7}".format(
               self.backupset.directory, self.backupset.name,
               self.backupset.restore_cluster_host.ip,
               self.backupset.restore_cluster_host.port,
               self.backupset.restore_cluster_host_username,
               self.backupset.restore_cluster_host_password,
               backup_start, backup_end)
    if self.backupset.exclude_buckets:
        args += " --exclude-buckets {0}".format(self.backupset.exclude_buckets)
    if self.backupset.include_buckets:
        args += " --include-buckets {0}".format(self.backupset.include_buckets)
    if self.backupset.disable_bucket_config:
        args += " --disable-bucket-config {0}".format(self.backupset.disable_bucket_config)
    if self.backupset.disable_views:
        args += " --disable-views {0}".format(self.backupset.disable_views)
    if self.backupset.disable_gsi_indexes:
        args += " --disable-gsi-indexes {0}".format(self.backupset.disable_gsi_indexes)
    if self.backupset.disable_ft_indexes:
        args += " --disable-ft-indexes {0}".format(self.backupset.disable_ft_indexes)
    if self.backupset.disable_data:
        args += " --disable-data {0}".format(self.backupset.disable_data)
    if self.backupset.filter_keys:
        args += " --filter_keys {0}".format(self.backupset.filter_keys)
    if self.backupset.filter_values:
        args += " --filter_values {0}".format(self.backupset.filter_values)
    if self.backupset.force_updates:
        args += " --force-updates"
    if self.no_progress_bar:
        args += " --no-progress-bar"
    if not self.skip_buckets:
        rest_conn = RestConnection(self.backupset.restore_cluster_host)
        rest_helper = RestHelper(rest_conn)
        for bucket in self.buckets:
            if not rest_helper.bucket_exists(bucket.name):
                self.log.info("Creating bucket {0} in restore host {1}".format(
                    bucket.name, self.backupset.restore_cluster_host.ip))
                rest_conn.create_bucket(bucket=bucket.name,
                                        ramQuotaMB=512,
                                        authType=bucket.authType if bucket.authType else 'none',
                                        proxyPort=bucket.port,
                                        saslPassword=bucket.saslPassword)
                bucket_ready = rest_helper.vbucket_map_ready(bucket.name)
                if not bucket_ready:
                    self.fail("Bucket %s not created after 120 seconds." % bucket.name)
    remote_client = RemoteMachineShellConnection(self.backupset.backup_host)
    command = "{0}/cbbackupmgr {1}".format(self.cli_command_location, args)
    output, error = remote_client.execute_command(command)
    remote_client.log_command_output(output, error)
    return output, error
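# For illustration only: with a typical backupset the command assembled
# above resolves to something like the following (all paths, hosts and
# credentials below are placeholders, not values from this suite):
#   /opt/couchbase/bin/cbbackupmgr restore --archive /data/backups --repo my_repo \
#       --host http://10.0.0.2:8091 --username Administrator --password password \
#       --start backup_1 --end backup_2 --force-updates --no-progress-bar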
def run_failover_operations_with_ops(self, chosen, failover_reason):
    """ Method to run failover operations used in the test scenario
        based on the failover reason """
    # Perform operations related to failover
    failed_over = True
    for node in chosen:
        unreachable = False
        if failover_reason == 'stop_server':
            unreachable = True
            self.stop_server(node)
            self.log.info("10 seconds delay to wait for membase-server to shutdown")
            # wait for 5 minutes until node is down
            self.assertTrue(RestHelper(self.rest).wait_for_node_status(node, "unhealthy", 300),
                            msg="node status is not unhealthy even after waiting for 5 minutes")
        elif failover_reason == "firewall":
            unreachable = True
            self.filter_list.append(node.ip)
            server = [srv for srv in self.servers if node.ip == srv.ip][0]
            RemoteUtilHelper.enable_firewall(server, bidirectional=self.bidirectional)
            status = RestHelper(self.rest).wait_for_node_status(node, "unhealthy", 300)
            if status:
                self.log.info("node {0}:{1} is 'unhealthy' as expected".format(node.ip, node.port))
            else:
                # verify iptables on the node if something is wrong
                for server in self.servers:
                    if server.ip == node.ip:
                        shell = RemoteMachineShellConnection(server)
                        info = shell.extract_remote_info()
                        if info.type.lower() == "windows":
                            o, r = shell.execute_command("netsh advfirewall show allprofiles")
                            shell.log_command_output(o, r)
                        else:
                            o, r = shell.execute_command("/sbin/iptables --list")
                            shell.log_command_output(o, r)
                        shell.disconnect()
                self.rest.print_UI_logs()
                api = self.rest.baseUrl + 'nodeStatuses'
                status, content, header = self.rest._http_request(api)
                json_parsed = json.loads(content)
                self.log.info("nodeStatuses: {0}".format(json_parsed))
                self.fail("node status is not unhealthy even after waiting for 5 minutes")
    nodes = self.filter_servers(self.servers, chosen)
    failed_over = self.cluster.async_failover([self.master], failover_nodes=chosen,
                                              graceful=self.graceful)
    # Perform compaction
    compact_tasks = []
    if self.compact:
        for bucket in self.buckets:
            compact_tasks.append(self.cluster.async_compact_bucket(self.master, bucket))
    # Run view operations
    if self.withViewsOps:
        self.query_and_monitor_view_tasks(nodes)
    # Run mutation operations
    if self.withMutationOps:
        self.run_mutation_operations()
    failed_over.result()
    for task in compact_tasks:
        task.result()
    msg = "rebalance failed while removing failover nodes {0}".format(node.id)
    self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True), msg=msg)
def cleanup_cluster(servers, wait_for_rebalance=True, master=None):
    log = logger.Logger.get_logger()
    if master is None:
        master = servers[0]
    rest = RestConnection(master)
    helper = RestHelper(rest)
    helper.is_ns_server_running(timeout_in_seconds=testconstants.NS_SERVER_TIMEOUT)
    nodes = rest.node_statuses()
    master_id = rest.get_nodes_self().id
    for node in nodes:
        if int(node.port) in xrange(9091, 9991):
            rest.eject_node(node)
            nodes.remove(node)
    if len(nodes) > 1:
        log.info("rebalancing all nodes in order to remove nodes")
        rest.log_client_error("Starting rebalance from test, ejected nodes %s" %
                              [node.id for node in nodes if node.id != master_id])
        removed = helper.remove_nodes(
            knownNodes=[node.id for node in nodes],
            ejectedNodes=[node.id for node in nodes if node.id != master_id],
            wait_for_rebalance=wait_for_rebalance)
    success_cleaned = []
    for removed in [node for node in nodes if node.id != master_id]:
        removed.rest_password = servers[0].rest_password
        removed.rest_username = servers[0].rest_username
        try:
            rest = RestConnection(removed)
        except Exception as ex:
            log.error("can't create rest connection after rebalance out for ejected nodes, "
                      "will retry after 10 seconds according to MB-8430: {0} ".format(ex))
            time.sleep(10)
            rest = RestConnection(removed)
        start = time.time()
        while time.time() - start < 30:
            if len(rest.get_pools_info()["pools"]) == 0:
                success_cleaned.append(removed)
                break
            else:
                time.sleep(0.1)
        if time.time() - start > 10:
            log.error("'pools' on node {0}:{1} - {2}".format(
                removed.ip, removed.port, rest.get_pools_info()["pools"]))
    for node in set([node for node in nodes if node.id != master_id]) - set(success_cleaned):
        log.error("node {0}:{1} was not cleaned after removing from cluster".format(
            node.ip, node.port))
        try:
            rest = RestConnection(node)
            rest.force_eject_node()
        except Exception as ex:
            log.error("force_eject_node {0}:{1} failed: {2}".format(node.ip, node.port, ex))
    if len(set([node for node in nodes if node.id != master_id])
           - set(success_cleaned)) != 0:
        raise Exception("not all ejected nodes were cleaned successfully")
    log.info("removed all the nodes from cluster associated with {0} ? {1}".format(
        servers[0], [(node.id, node.port) for node in nodes if node.id != master_id]))
def wait_for_bucket_creation(bucket, rest, timeout_in_seconds=120):
    log.info('waiting for bucket creation to complete....')
    start = time.time()
    helper = RestHelper(rest)
    while (time.time() - start) <= timeout_in_seconds:
        if helper.bucket_exists(bucket):
            return True
        else:
            time.sleep(2)
    return False
def wait_for_bucket_deletion(bucket, rest, timeout_in_seconds=120):
    log = logger.Logger.get_logger()
    log.info('waiting for bucket deletion to complete....')
    start = time.time()
    helper = RestHelper(rest)
    while (time.time() - start) <= timeout_in_seconds:
        if not helper.bucket_exists(bucket):
            return True
        else:
            time.sleep(0.1)
    return False
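# The creation/deletion helpers above share one polling idiom. A generic
# form, shown only as a sketch (wait_until is not a helper in this module),
# would be:
def wait_until(predicate, timeout_in_seconds=120, interval=2):
    """Poll `predicate` until it returns True or the timeout elapses."""
    start = time.time()
    while (time.time() - start) <= timeout_in_seconds:
        if predicate():
            return True
        time.sleep(interval)
    return False

# e.g. wait_until(lambda: not helper.bucket_exists(bucket), 120)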
def _create_default_bucket(self):
    helper = RestHelper(self.rest)
    if not helper.bucket_exists(self.bucket):
        node_ram_ratio = BucketOperationHelper.base_bucket_ratio(self.servers)
        info = self.rest.get_nodes_self()
        available_ram = int(info.memoryQuota * node_ram_ratio)
        if available_ram < 256:
            available_ram = 256
        self.rest.create_bucket(bucket=self.bucket, ramQuotaMB=available_ram)
        ready = BucketOperationHelper.wait_for_memcached(self.master, self.bucket)
        self.testcase.assertTrue(ready, "wait_for_memcached failed")
    self.testcase.assertTrue(helper.bucket_exists(self.bucket),
                             "unable to create {0} bucket".format(self.bucket))
def test_reset_count(self):
    timeout = self.timeout / 2
    server_fail1 = self.servers[1]
    server_fail2 = self.servers[2]
    status = self.rest.update_autoreprovision_settings(True, 2)
    if not status:
        self.fail('failed to change autoreprovision_settings!')
    self.sleep(5)
    self.log.info("stopping the first server")
    self._stop_couchbase(server_fail1)
    AutoReprovisionBaseTest.wait_for_failover_or_assert(
        self.master, 1,
        timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME, self)
    self.log.info("resetting the autoreprovision count")
    if not self.rest.reset_autoreprovision():
        self.fail('failed to reset autoreprovision count!')
    self.log.info("stopping the second server")
    self._stop_couchbase(server_fail2)
    AutoReprovisionBaseTest.wait_for_failover_or_assert(
        self.master, 2,
        timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME, self)
    settings = self.rest.get_autoreprovision_settings()
    self.assertEquals(settings.enabled, True)
    self.assertEquals(settings.max_nodes, 2)
    self.assertEquals(settings.count, 0)
    self._start_couchbase(server_fail2)
    self._start_couchbase(server_fail1)
    self.sleep(30)
    settings = self.rest.get_autoreprovision_settings()
    self.assertEquals(settings.enabled, True)
    self.assertEquals(settings.max_nodes, 2)
    self.assertEquals(settings.count, 2)
    self.log.info("resetting the autoreprovision count")
    if not self.rest.reset_autoreprovision():
        self.fail('failed to reset autoreprovision count!')
    settings = self.rest.get_autoreprovision_settings()
    self.assertEquals(settings.enabled, True)
    self.assertEquals(settings.max_nodes, 2)
    self.assertEquals(settings.count, 0)
    helper = RestHelper(self.rest)
    self.assertTrue(helper.is_cluster_healthy(), "cluster status is not healthy")
    self.assertFalse(helper.is_cluster_rebalanced(), "cluster is balanced")
    self.rest.rebalance(otpNodes=[node.id for node in self.rest.node_statuses()],
                        ejectedNodes=[])
    self.assertTrue(self.rest.monitorRebalance())
def _create_default_bucket(self, replica=1):
    name = "default"
    master = self.servers[0]
    rest = RestConnection(master)
    helper = RestHelper(RestConnection(master))
    if not helper.bucket_exists(name):
        node_ram_ratio = BucketOperationHelper.base_bucket_ratio(self.servers)
        info = rest.get_nodes_self()
        available_ram = info.memoryQuota * node_ram_ratio
        rest.create_bucket(bucket=name, ramQuotaMB=int(available_ram),
                           replicaNumber=replica)
        ready = BucketOperationHelper.wait_for_memcached(master, name)
        self.assertTrue(ready, msg="wait_for_memcached failed")
    self.assertTrue(helper.bucket_exists(name),
                    msg="unable to create {0} bucket".format(name))
def cleanup_cluster(servers, wait_for_rebalance=True):
    log = logger.Logger.get_logger()
    rest = RestConnection(servers[0])
    helper = RestHelper(rest)
    helper.is_ns_server_running(timeout_in_seconds=testconstants.NS_SERVER_TIMEOUT)
    nodes = rest.node_statuses()
    master_id = rest.get_nodes_self().id
    if len(nodes) > 1:
        log.info("rebalancing all nodes in order to remove nodes")
        removed = helper.remove_nodes(
            knownNodes=[node.id for node in nodes],
            ejectedNodes=[node.id for node in nodes if node.id != master_id],
            wait_for_rebalance=wait_for_rebalance)
        log.info("removed all the nodes from cluster associated with {0} ? {1}".format(
            servers[0], removed))
def wait_for_bucket_deletion(bucket, rest, timeout_in_seconds=120):
    log = logger.Logger.get_logger()
    log.info('waiting for bucket deletion to complete....')
    start = time.time()
    helper = RestHelper(rest)
    while (time.time() - start) <= timeout_in_seconds:
        if not helper.bucket_exists(bucket):
            return True
        else:
            time.sleep(2)
    return False
def test_failed_rebalance_with_gsi_autofailover(self):
    self.bucket_params = self._create_bucket_params(
        server=self.master, size=self.bucket_size,
        replicas=self.num_replicas, bucket_type=self.bucket_type,
        enable_replica_index=self.enable_replica_index,
        eviction_policy=self.eviction_policy, lww=self.lww)
    self.cluster.create_standard_bucket(name=self.test_bucket, port=11222,
                                        bucket_params=self.bucket_params)
    self.buckets = self.rest.get_buckets()
    self.prepare_collection_for_indexing(num_of_docs_per_collection=10 ** 5)
    self._create_indexes()
    # enable auto-failover
    self.enable_autofailover_and_validate()
    # Start rebalance in
    rebalance_task = self.cluster.async_rebalance(servers=self.servers,
                                                  to_add=self.servers_to_add,
                                                  to_remove=self.servers_to_remove,
                                                  services=['kv', 'index'])
    self.sleep(20)
    reached = RestHelper(self.rest).rebalance_reached(percentage=20)
    self.assertTrue(reached, "Rebalance failed or did not reach {0}%".format(20))
    # Do a failover action - reboot, hang, kill. This is defined in the conf file.
    # The test sometimes fails because the rebalance completes quickly and there
    # is no window in which to induce a failure.
    self.failover_actions[self.failover_action](self)
    try:
        rebalance_task.result()
    except Exception as err:
        self.log.info("Rebalance failed with : {0}".format(str(err)))
        if "Rebalance failed. See logs for detailed reason. You can try again" in str(err):
            self.log.info(
                "Rebalance failed even before auto-failover had a chance to stop it. "
                "self.server_to_fail.ip: {0}".format(str(err)))
        elif not RestHelper(self.rest).is_cluster_rebalanced():
            if self._auto_failover_message_present_in_logs(self.server_to_fail[0].ip):
                self.log.info("Rebalance interrupted due to auto-failover of nodes "
                              "- message was seen in logs")
            else:
                self.fail("Rebalance interrupted message was not seen in logs")
        else:
            self.fail("Rebalance was not aborted by auto fail-over")
    self.disable_autofailover_and_validate()
def _create_default_bucket(self):
    name = "default"
    master = self.master
    rest = RestConnection(master)
    helper = RestHelper(RestConnection(master))
    node_ram_ratio = BucketOperationHelper.base_bucket_ratio(TestInputSingleton.input.servers)
    info = rest.get_nodes_self()
    available_ram = info.memoryQuota * node_ram_ratio
    rest.create_bucket(bucket=name, ramQuotaMB=int(available_ram))
    ready = BucketOperationHelper.wait_for_memcached(master, name)
    self.assertTrue(ready, msg="wait_for_memcached failed")
    self.assertTrue(helper.bucket_exists(name),
                    msg="unable to create {0} bucket".format(name))
    self.load_thread = None
    self.shutdown_load_data = False
def _create_default_bucket(self):
    name = "default"
    master = self.master
    rest = RestConnection(master)
    helper = RestHelper(RestConnection(master))
    if not helper.bucket_exists(name):
        node_ram_ratio = BucketOperationHelper.base_bucket_ratio(TestInputSingleton.input.servers)
        info = rest.get_nodes_self()
        available_ram = info.memoryQuota * node_ram_ratio
        rest.create_bucket(bucket=name, ramQuotaMB=int(available_ram))
        ready = BucketOperationHelper.wait_for_memcached(master, name)
        self.assertTrue(ready, msg="wait_for_memcached failed")
    self.assertTrue(helper.bucket_exists(name),
                    msg="unable to create {0} bucket".format(name))
def setUp(self):
    super(NewUpgradeBaseTest, self).setUp()
    self.product = self.input.param('product', 'couchbase-server')
    self.initial_version = self.input.param('initial_version', '1.8.1-942-rel')
    self.initial_vbuckets = self.input.param('initial_vbuckets', 64)
    self.rest_settings = self.input.membase_settings
    self.rest = RestConnection(self.master)
    self.rest_helper = RestHelper(self.rest)
    self.sleep_time = 10
    self.data_size = self.input.param('data_size', 1024)
    self.op_types = self.input.param('op_types', 'bucket')
    self.item_flag = self.input.param('item_flag', 4042322160)
    self.expire_time = self.input.param('expire_time', 0)
def _create_default_bucket(self, unittest):
    name = "default"
    master = self.master
    rest = RestConnection(master)
    helper = RestHelper(RestConnection(master))
    if not helper.bucket_exists(name):
        node_ram_ratio = BucketOperationHelper.base_bucket_ratio(TestInputSingleton.input.servers)
        info = rest.get_nodes_self()
        available_ram = info.memoryQuota * node_ram_ratio
        rest.create_bucket(bucket=name, ramQuotaMB=int(available_ram))
        ready = BucketOperationHelper.wait_for_memcached(master, name)
        BucketOperationHelper.wait_for_vbuckets_ready_state(master, name)
        unittest.assertTrue(ready, msg="wait_for_memcached failed")
    unittest.assertTrue(helper.bucket_exists(name),
                        msg="unable to create {0} bucket".format(name))
def _create_default_bucket(self, replica=1):
    name = "default"
    master = self.servers[0]
    rest = RestConnection(master)
    helper = RestHelper(RestConnection(master))
    if not helper.bucket_exists(name):
        node_ram_ratio = BucketOperationHelper.base_bucket_ratio(self.servers)
        info = rest.get_nodes_self()
        available_ram = info.memoryQuota * node_ram_ratio
        rest.create_bucket(bucket=name, ramQuotaMB=int(available_ram),
                           replicaNumber=replica,
                           storageBackend=self.bucket_storage)
        ready = BucketOperationHelper.wait_for_memcached(master, name)
        self.assertTrue(ready, msg="wait_for_memcached failed")
    self.assertTrue(helper.bucket_exists(name),
                    msg="unable to create {0} bucket".format(name))
def wait_for_ns_servers_or_assert(servers, testcase, wait_time=360, wait_if_warmup=False,
                                  debug=True):
    for server in servers:
        rest = RestConnection(server)
        log = logger.Logger.get_logger()
        if debug:
            log.info("waiting for ns_server @ {0}:{1}".format(server.ip, server.port))
        if RestHelper(rest).is_ns_server_running(wait_time):
            if debug:
                log.info("ns_server @ {0}:{1} is running".format(server.ip, server.port))
        elif wait_if_warmup:
            # wait until warmup is completed
            buckets = rest.get_buckets()
            for bucket in buckets:
                testcase.assertTrue(
                    ClusterOperationHelper._wait_warmup_completed(
                        testcase, [server], bucket.name, wait_time),
                    "warmup was not completed!")
        else:
            testcase.fail("ns_server {0} is not running in {1} sec".format(server.ip, wait_time))
def replication_verification(master, bucket, replica, inserted_count, test):
    rest = RestConnection(master)
    nodes = rest.node_statuses()
    if len(nodes) / (1 + replica) >= 1:
        final_replication_state = RestHelper(rest).wait_for_replication(900)
        msg = "replication state after waiting for up to 15 minutes : {0}"
        test.log.info(msg.format(final_replication_state))
        # in windows, we need to set timeout_in_seconds to 15+ minutes
        test.assertTrue(
            RebalanceHelper.wait_till_total_numbers_match(
                master=master, bucket=bucket, timeout_in_seconds=1200),
            msg="replication was completed but sum(curr_items) doesn't match "
                "the curr_items_total")
        start_time = time.time()
        stats = rest.get_bucket_stats()
        while time.time() < (start_time + 120) and stats["curr_items"] != inserted_count:
            test.log.info("curr_items : {0} versus {1}".format(stats["curr_items"],
                                                               inserted_count))
            time.sleep(5)
            stats = rest.get_bucket_stats()
        RebalanceHelper.print_taps_from_all_nodes(rest, bucket)
        test.log.info("curr_items : {0} versus {1}".format(stats["curr_items"],
                                                           inserted_count))
        stats = rest.get_bucket_stats()
        msg = "curr_items : {0} is not equal to actual # of keys inserted : {1}"
        test.assertEquals(stats["curr_items"], inserted_count,
                          msg=msg.format(stats["curr_items"], inserted_count))
def add_node_and_rebalance(self, master, servers):
    ClusterOperationHelper.add_all_nodes_or_assert(
        master, servers, self.input.membase_settings, self)
    rest = RestConnection(master)
    nodes = rest.node_statuses()
    otpNodeIds = []
    for node in nodes:
        otpNodeIds.append(node.id)
    rebalanceStarted = rest.rebalance(otpNodeIds, [])
    self.assertTrue(rebalanceStarted,
                    "unable to start rebalance on master node {0}".format(master.ip))
    self.log.info('started rebalance operation on master node {0}'.format(master.ip))
    rebalanceSucceeded = rest.monitorRebalance()
    self.assertTrue(rebalanceSucceeded,
                    "rebalance operation for nodes: {0} was not successful".format(otpNodeIds))
    self.log.info('rebalance operation succeeded for nodes: {0}'.format(otpNodeIds))
    # before removing the nodes, make sure the cluster is rebalanced
    # and the node statuses are healthy
    helper = RestHelper(rest)
    self.assertTrue(helper.is_cluster_healthy(), "cluster status is not healthy")
    self.assertTrue(helper.is_cluster_rebalanced(), "cluster is not balanced")
def _create_default_bucket(self):
    rest = RestConnection(self.master)
    helper = RestHelper(RestConnection(self.master))
    if not helper.bucket_exists(self.bucket):
        node_ram_ratio = BucketOperationHelper.base_bucket_ratio([self.master])
        info = rest.get_nodes_self()
        available_ram = info.memoryQuota * node_ram_ratio
        serverInfo = self.master
        rest.init_cluster(username=serverInfo.rest_username,
                          password=serverInfo.rest_password)
        rest.init_cluster_memoryQuota(
            memoryQuota=int(info.mcdMemoryReserved * node_ram_ratio))
        rest.create_bucket(bucket=self.bucket, ramQuotaMB=int(available_ram))
        ready = BucketOperationHelper.wait_for_memcached(self.master, self.bucket)
        self.assertTrue(ready, msg="wait_for_memcached failed")
    self.assertTrue(helper.bucket_exists(self.bucket),
                    msg="unable to create {0} bucket".format(self.bucket))
def wait_for_rebalance_to_complete(self, task, wait_step=120):
    self.task.jython_task_manager.get_task_result(task)
    reached = RestHelper(self.rest).rebalance_reached(wait_step=wait_step)
    self.assertTrue(reached, "Rebalance failed, stuck or did not complete")
    self.assertTrue(task.result, "Rebalance Failed")
    if self.compaction:
        self.wait_for_compaction_to_complete()
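# Hypothetical usage of wait_for_rebalance_to_complete() above, assuming
# the same class context (self.task for scheduling, self.servers listing
# the current cluster nodes): rebalance a node out and block on the result.
def _rebalance_out_and_wait(self, node):
    task = self.task.async_rebalance(self.servers, [], [node])
    self.wait_for_rebalance_to_complete(task)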
def test_node_memcached_failure(self):
    timeout = self.timeout / 2
    status = self.rest.update_autoreprovision_settings(True, 1)
    if not status:
        self.fail('failed to change autoreprovision_settings!')
    self.sleep(5)
    self._pause_couchbase(self.server_fail)
    self.sleep(5)
    AutoReprovisionBaseTest.wait_for_warmup_or_assert(
        self.master, 1,
        timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME, self)
    RemoteUtilHelper.common_basic_setup([self.server_fail])
    AutoReprovisionBaseTest.wait_for_failover_or_assert(
        self.master, 0,
        timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME, self)
    helper = RestHelper(self.rest)
    self.assertTrue(helper.is_cluster_healthy(), "cluster status is not healthy")
    self.assertTrue(helper.is_cluster_rebalanced(), "cluster is not balanced")
    buckets = self.rest.get_buckets()
    for bucket in buckets:
        self.verify_loaded_data(self.master, bucket.name, self.loaded_items[bucket.name])
def test_crash_while_streaming(self):
    vbucket = 0
    nodeA = self.servers[0]
    n = 10000
    self.load_docs(nodeA, vbucket, n)

    dcp_client = self.dcp_client(nodeA, PRODUCER)
    stream = dcp_client.stream_req(vbucket, 0, 0, 2 * n, 0)
    self.load_docs(nodeA, vbucket, n)

    assert self.stop_node(0)
    time.sleep(2)
    assert self.start_node(0)
    rest = RestHelper(RestConnection(nodeA))
    assert rest.is_ns_server_running()
    time.sleep(2)

    _, _, high_seqno = self.vb_info(nodeA, vbucket)
    dcp_client = self.dcp_client(nodeA, PRODUCER)
    stream = dcp_client.stream_req(vbucket, 0, 0, high_seqno, 0)
    stream.run()
    assert stream.last_by_seqno == high_seqno
def test_stream_after_n_crashes(self):
    crashes = 5
    vbucket = 0
    # load some data
    nodeA = self.servers[0]
    rest = RestHelper(RestConnection(nodeA))
    for i in xrange(crashes):
        self.load_docs(nodeA, vbucket, self.num_items)
        assert self.stop_node(0)
        time.sleep(5)
        assert self.start_node(0)
        assert rest.is_ns_server_running()
        time.sleep(2)

        vb_uuid, _, high_seqno = self.vb_info(nodeA, vbucket)
        dcp_client = self.dcp_client(nodeA, PRODUCER)
        stream = dcp_client.stream_req(vbucket, 0, 0, high_seqno, vb_uuid)
        stream.run()
        assert stream.last_by_seqno == high_seqno
def verify_upgrade_rebalance_in_out(self):
    self.master = self.servers[self.initial_num_servers]
    self.rest = RestConnection(self.master)
    self.rest_helper = RestHelper(self.rest)
    for bucket in self.buckets:
        if self.rest_helper.bucket_exists(bucket.name):
            continue
        else:
            raise Exception("bucket:- %s not found" % bucket.name)
    if self.op_types == "bucket":
        bucketinfo = self.rest.get_bucket(bucket.name)
        self.log.info("bucket info :- %s" % bucketinfo)
    if self.op_types == "data":
        self._wait_for_stats_all_buckets(
            self.servers[self.initial_num_servers:self.num_servers])
        self._verify_all_buckets(self.master, 1, self.wait_timeout * 50,
                                 self.max_verify, True, 1)
        self._verify_stats_all_buckets(
            self.servers[self.initial_num_servers:self.num_servers])
def test_node_memcached_failure_in_series(self):
    timeout = self.timeout / 2
    status = self.rest.update_autoreprovision_settings(True, 1)
    if not status:
        self.fail('failed to change autoreprovision_settings!')
    self.sleep(5)
    data_lost = False
    for i in reversed(xrange(len(self.servers))):
        print self.servers[i]
        operation = random.choice(['stop', 'memcached_failure', 'restart',
                                   'failover', 'reboot'])
        shell = RemoteMachineShellConnection(self.servers[i])
        print "operation", operation
        if i == 0:
            self.master = self.servers[1]
        if operation == 'stop':
            self._stop_couchbase(self.servers[i])
        elif operation == 'memcached_failure':
            self._pause_couchbase(self.servers[i])
        elif operation == 'restart':
            shell.restart_couchbase()
        elif operation == 'failover':
            RemoteUtilHelper.enable_firewall(self.servers[i])
        elif operation == 'reboot':
            if shell.extract_remote_info().type.lower() == 'windows':
                o, r = shell.execute_command("shutdown -r -f -t 0")
                self.sleep(200)
            elif shell.extract_remote_info().type.lower() == 'linux':
                o, r = shell.execute_command("reboot")
            shell.log_command_output(o, r)
            self.sleep(60)
        self.sleep(40)
        if operation == 'memcached_failure':
            AutoReprovisionBaseTest.wait_for_warmup_or_assert(
                self.master, 1,
                timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME, self)
        if operation != 'restart' and operation != 'memcached_failure' and operation != 'reboot':
            AutoReprovisionBaseTest.wait_for_failover_or_assert(
                self.master, 1,
                timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME, self)
        if operation != 'restart':
            RemoteUtilHelper.common_basic_setup([self.servers[i]])
        AutoReprovisionBaseTest.wait_for_failover_or_assert(
            self.master, 0,
            timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME, self)
        helper = RestHelper(RestConnection(self.master))
        self.assertTrue(helper.is_cluster_healthy(), "cluster status is not healthy")
        self.sleep(40)
        if operation == 'memcached_failure' or operation == 'failover':
            self.assertTrue(helper.is_cluster_rebalanced(), "cluster is not balanced")
        else:
            if 'kv' in self.servers[i].services and self.replicas > 0:
                self.assertFalse(helper.is_cluster_rebalanced(), "cluster is balanced")
                self.rest.rebalance(otpNodes=[node.id for node in self.rest.node_statuses()],
                                    ejectedNodes=[])
                self.assertTrue(self.rest.monitorRebalance())
            else:
                self.assertTrue(helper.is_cluster_rebalanced(), "cluster is not balanced")
        buckets = self.rest.get_buckets()
        if self.replicas == 0 and (operation == 'restart' or operation == 'reboot'):
            data_lost = True
        for bucket in buckets:
            if not data_lost:
                self.verify_loaded_data(self.master, bucket.name,
                                        self.loaded_items[bucket.name])
def test_ui_logs(self):
    timeout = self.timeout / 2
    server_fail1 = self.servers[1]
    server_fail2 = self.servers[2]
    status = self.rest.update_autoreprovision_settings(True, 2)
    if not status:
        self.fail('failed to change autoreprovision_settings!')
    self.sleep(5)
    logs = self.rest.get_logs(5)
    self.assertTrue(u'Enabled auto-reprovision config with max_nodes set to 2'
                    in [l['text'] for l in logs])
    self.log.info("stopping the first server")
    self._stop_couchbase(server_fail1)
    AutoReprovisionBaseTest.wait_for_failover_or_assert(
        self.master, 1,
        timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME, self)
    self.log.info("resetting the autoreprovision count")
    if not self.rest.reset_autoreprovision():
        self.fail('failed to reset autoreprovision count!')
    logs = self.rest.get_logs(5)
    self.assertTrue(u'auto-reprovision count reset from 0'
                    in [l['text'] for l in logs])
    self.log.info("stopping the second server")
    self._stop_couchbase(server_fail2)
    AutoReprovisionBaseTest.wait_for_failover_or_assert(
        self.master, 2,
        timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME, self)
    settings = self.rest.get_autoreprovision_settings()
    self.assertEquals(settings.enabled, True)
    self.assertEquals(settings.max_nodes, 2)
    self.assertEquals(settings.count, 0)
    self._start_couchbase(server_fail2)
    self._start_couchbase(server_fail1)
    self.sleep(30)
    settings = self.rest.get_autoreprovision_settings()
    self.assertEquals(settings.enabled, True)
    self.assertEquals(settings.max_nodes, 2)
    self.assertEquals(settings.count, 2)
    logs = self.rest.get_logs(5)
    self.assertTrue(u'auto-reprovision is disabled as maximum number of nodes (2) '
                    u'that can be auto-reprovisioned has been reached.'
                    in [l['text'] for l in logs])
    self.log.info("resetting the autoreprovision count")
    if not self.rest.reset_autoreprovision():
        self.fail('failed to reset autoreprovision count!')
    settings = self.rest.get_autoreprovision_settings()
    self.assertEquals(settings.enabled, True)
    self.assertEquals(settings.max_nodes, 2)
    self.assertEquals(settings.count, 0)
    logs = self.rest.get_logs(5)
    self.assertTrue(u'auto-reprovision count reset from 2'
                    in [l['text'] for l in logs])
    helper = RestHelper(self.rest)
    self.assertTrue(helper.is_cluster_healthy(), "cluster status is not healthy")
    self.assertFalse(helper.is_cluster_rebalanced(), "cluster is balanced")
    self.rest.rebalance(otpNodes=[node.id for node in self.rest.node_statuses()],
                        ejectedNodes=[])
    self.assertTrue(self.rest.monitorRebalance())
    logs = self.rest.get_logs(5)
    # https://issues.couchbase.com/browse/MB-24520
    self.assertFalse(u'Reset auto-failover count' in [l['text'] for l in logs])
    self.assertTrue(u'Rebalance completed successfully.'
                    in [l['text'] for l in logs])
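# Hedged sketch: the repeated "expected text in recent UI logs" checks above
# could go through one helper. assert_in_logs is hypothetical; it assumes
# only rest.get_logs(n) returning dicts with a 'text' key, as used above.
def assert_in_logs(test, rest, expected_text, last_n=5):
    texts = [l['text'] for l in rest.get_logs(last_n)]
    test.assertTrue(expected_text in texts,
                    "did not find '%s' in recent logs" % expected_text)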
class MultiNodesUpgradeTests(NewUpgradeBaseTest):
    def setUp(self):
        super(MultiNodesUpgradeTests, self).setUp()
        if self.initial_version.startswith("1.6") or \
                self.initial_version.startswith("1.7"):
            self.product = "membase-server"
        else:
            self.product = "couchbase-server"
        self.initial_num_servers = self.input.param("initial_num_servers", 2)

    def tearDown(self):
        super(MultiNodesUpgradeTests, self).tearDown()

    def offline_cluster_upgrade(self):
        self._install(self.servers[:self.initial_num_servers])
        self.operations(multi_nodes=True)
        upgrade_versions = self.input.param("upgrade_version", "2.0.0-1870-rel")
        upgrade_versions = upgrade_versions.split(";")
        self.log.info("Installation done, going to sleep for %s sec", self.sleep_time)
        time.sleep(self.sleep_time)
        for upgrade_version in upgrade_versions:
            for server in self.servers[:self.initial_num_servers]:
                remote = RemoteMachineShellConnection(server)
                remote.stop_server()
                time.sleep(self.sleep_time)
                remote.disconnect()
            for server in self.servers[:self.initial_num_servers]:
                remote = RemoteMachineShellConnection(server)
                self._upgrade(upgrade_version, server, remote)
                time.sleep(self.sleep_time)
                remote.disconnect()
            time.sleep(self.expire_time)
        self.num_servers = self.initial_num_servers
        self.verification(multi_nodes=True)

    def online_upgrade_rebalance_in_out(self):
        self._install(self.servers[:self.initial_num_servers])
        self.operations(multi_nodes=True)
        self.log.info("Installation of old version is done. "
                      "Wait for %s sec for upgrade" % (self.sleep_time))
        time.sleep(self.sleep_time)
        upgrade_version = self.input.param("upgrade_version", "2.0.0-1870-rel")
        self.initial_version = upgrade_version
        self.product = "couchbase-server"
        self._install(self.servers[self.initial_num_servers:self.num_servers])
        self.log.info("Installation of new version is done. "
                      "Wait for %s sec for rebalance" % (self.sleep_time))
        time.sleep(self.sleep_time)
        servers_in = self.servers[self.initial_num_servers:self.num_servers]
        self.cluster.rebalance(self.servers[:self.initial_num_servers],
                               servers_in, [])
        self.log.info("Rebalanced in all 2.0 nodes")
        time.sleep(self.sleep_time)
        status, content = ClusterHelper.find_orchestrator(self.master)
        self.assertTrue(status,
                        msg="Unable to find orchestrator: {0}:{1}".format(status, content))
        find_master = False
        for new_server in servers_in:
            if content.find(new_server.ip) >= 0:
                find_master = True
                self.log.info("2.0 node %s becomes the master" % (new_server.ip))
        if not find_master:
            raise Exception("After rebalancing in 2.0 nodes, "
                            "a 2.0 node did not become the master")
        servers_out = self.servers[:self.initial_num_servers]
        self.cluster.rebalance(self.servers[:self.num_servers], [], servers_out)
        self.log.info("Rebalanced out all old version nodes")
        time.sleep(self.sleep_time)
        self.verify_upgrade_rebalance_in_out()

    def verify_upgrade_rebalance_in_out(self):
        self.master = self.servers[self.initial_num_servers]
        self.rest = RestConnection(self.master)
        self.rest_helper = RestHelper(self.rest)
        for bucket in self.buckets:
            if not self.rest_helper.bucket_exists(bucket.name):
                raise Exception("bucket %s not found" % bucket.name)
        if self.op_types == "bucket":
            bucketinfo = self.rest.get_bucket(bucket.name)
            self.log.info("bucket info: %s" % bucketinfo)
        if self.op_types == "data":
            self._wait_for_stats_all_buckets(
                self.servers[self.initial_num_servers:self.num_servers])
            self._verify_all_buckets(self.master, 1, self.wait_timeout * 50,
                                     self.max_verify, True, 1)
            self._verify_stats_all_buckets(
                self.servers[self.initial_num_servers:self.num_servers])

    def online_upgrade_swap_rebalance(self):
        self._install(self.servers[:self.initial_num_servers])
        self.operations(multi_nodes=True)
        self.log.info("Installation of old version is done. "
                      "Wait for %s sec for upgrade" % (self.sleep_time))
        time.sleep(self.sleep_time)
        upgrade_version = self.input.param("upgrade_version", "2.0.0-1870-rel")
        self.initial_version = upgrade_version
        self.product = "couchbase-server"
        self._install(self.servers[self.initial_num_servers:self.num_servers])
        self.log.info("Installation of new version is done. "
                      "Wait for %s sec for rebalance" % (self.sleep_time))
        time.sleep(self.sleep_time)
        self.swap_num_servers = self.input.param("swap_num_servers", 1)
        old_servers = self.servers[:self.initial_num_servers]
        new_servers = []
        for i in range(self.initial_num_servers / self.swap_num_servers):
            servers_in = self.servers[
                (self.initial_num_servers + i * self.swap_num_servers):
                (self.initial_num_servers + (i + 1) * self.swap_num_servers)]
            servers_out = self.servers[
                (i * self.swap_num_servers):((i + 1) * self.swap_num_servers)]
            servers = old_servers + new_servers
            self.cluster.rebalance(servers, servers_in, servers_out)
            self.log.info("Swap rebalance: rebalanced out %s old version nodes, "
                          "rebalanced in %s 2.0 nodes"
                          % (self.swap_num_servers, self.swap_num_servers))
            time.sleep(self.sleep_time)
            old_servers = self.servers[
                ((i + 1) * self.swap_num_servers):self.initial_num_servers]
            new_servers = new_servers + servers_in
            servers = old_servers + new_servers
        status, content = ClusterHelper.find_orchestrator(servers[0])
        self.assertTrue(status,
                        msg="Unable to find orchestrator: {0}:{1}".format(status, content))
        find_master = False
        for new_server in new_servers:
            if content.find(new_server.ip) >= 0:
                find_master = True
                self.log.info("2.0 node %s becomes the master" % (new_server.ip))
        if not find_master:
            raise Exception("After rebalancing in 2.0 nodes, "
                            "a 2.0 node did not become the master")
        self.verify_upgrade_rebalance_in_out()
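# Hedged sketch: the slice arithmetic in online_upgrade_swap_rebalance,
# extracted as a pure helper for clarity. swap_batches is hypothetical; it
# only restates the index math used above.
def swap_batches(servers, initial_num_servers, swap_num_servers):
    """Yield (servers_in, servers_out) pairs for each swap-rebalance step."""
    for i in range(initial_num_servers / swap_num_servers):
        servers_in = servers[initial_num_servers + i * swap_num_servers:
                             initial_num_servers + (i + 1) * swap_num_servers]
        servers_out = servers[i * swap_num_servers:(i + 1) * swap_num_servers]
        yield servers_in, servers_out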
class NewUpgradeBaseTest(BaseTestCase):
    def setUp(self):
        super(NewUpgradeBaseTest, self).setUp()
        self.product = self.input.param('product', 'couchbase-server')
        self.initial_version = self.input.param('initial_version', '1.8.1-942-rel')
        self.initial_vbuckets = self.input.param('initial_vbuckets', 64)
        self.rest_settings = self.input.membase_settings
        self.rest = RestConnection(self.master)
        self.rest_helper = RestHelper(self.rest)
        self.sleep_time = 10
        self.data_size = self.input.param('data_size', 1024)
        self.op_types = self.input.param('op_types', 'bucket')
        self.item_flag = self.input.param('item_flag', 4042322160)
        self.expire_time = self.input.param('expire_time', 0)

    def tearDown(self):
        super(NewUpgradeBaseTest, self).tearDown()

    def _install(self, servers):
        params = {'num_nodes': len(servers),
                  'product': self.product,
                  'version': self.initial_version,
                  'vbuckets': [self.initial_vbuckets]}
        InstallerJob().parallel_install(servers, params)
        if self.product in ["couchbase", "couchbase-server", "cb"]:
            success = True
            for server in servers:
                success &= RemoteMachineShellConnection(server).is_couchbase_installed()
            if not success:
                self.log.info("some nodes were not installed successfully!")
                sys.exit(1)

    def operations(self, multi_nodes=False):
        self.quota = self._initialize_nodes(self.cluster, self.servers,
                                            self.disabled_consistent_view)
        self.buckets = []
        gc.collect()
        if self.total_buckets > 0:
            self.bucket_size = self._get_bucket_size(self.quota, self.total_buckets)
        if self.default_bucket:
            self.cluster.create_default_bucket(self.master, self.bucket_size,
                                               self.num_replicas)
            self.buckets.append(Bucket(name="default", authType="sasl",
                                       saslPassword="",
                                       num_replicas=self.num_replicas,
                                       bucket_size=self.bucket_size))
        self._create_sasl_buckets(self.master, self.sasl_buckets)
        self._create_standard_buckets(self.master, self.standard_buckets)
        if multi_nodes:
            servers_in = [self.servers[i + 1]
                          for i in range(self.initial_num_servers - 1)]
            self.cluster.rebalance(self.servers[:1], servers_in, [])
        if self.op_types == "data":
            self._load_data_all_buckets("create")
            if multi_nodes:
                self._wait_for_stats_all_buckets(self.servers[:self.initial_num_servers])
            else:
                self._wait_for_stats_all_buckets([self.master])

    def _load_data_all_buckets(self, op_type='create', start=0):
        loaded = False
        count = 0
        gen_load = BlobGenerator('upgrade-', 'upgrade-', self.data_size,
                                 start=start, end=self.num_items)
        while not loaded and count < 60:
            try:
                self._load_all_buckets(self.master, gen_load, op_type,
                                       self.expire_time, 1, self.item_flag, True,
                                       batch_size=20000, pause_secs=5,
                                       timeout_secs=180)
                loaded = True
            except MemcachedError as error:
                if error.status == 134:
                    loaded = False
                    self.log.error("Memcached error 134, wait for 5 seconds "
                                   "and then try again")
                    count += 1
                    time.sleep(self.sleep_time)

    def _get_build(self, server, version, remote, is_amazon=False):
        info = remote.extract_remote_info()
        builds, changes = BuildQuery().get_all_builds()
        self.log.info("finding build %s for machine %s" % (version, server))
        result = re.search('r', version)
        if result is None:
            appropriate_build = BuildQuery().\
                find_membase_release_build('%s-enterprise' % (self.product),
                                           info.deliverable_type,
                                           info.architecture_type,
                                           version.strip(),
                                           is_amazon=is_amazon)
        else:
            appropriate_build = BuildQuery().\
                find_membase_build(builds, '%s-enterprise' % (self.product),
                                   info.deliverable_type,
                                   info.architecture_type,
                                   version.strip(),
                                   is_amazon=is_amazon)
        return appropriate_build

    def _upgrade(self, upgrade_version, server, remote):
        appropriate_build = self._get_build(server, upgrade_version, remote)
        self.assertTrue(appropriate_build.url,
                        msg="unable to find build {0}".format(upgrade_version))
        remote.download_build(appropriate_build)
        remote.membase_upgrade(appropriate_build, save_upgrade_config=False)
        self.rest_helper.is_ns_server_running(testconstants.NS_SERVER_TIMEOUT)
        self.rest.init_cluster_port(self.rest_settings.rest_username,
                                    self.rest_settings.rest_password)
        time.sleep(self.sleep_time)

    def verification(self, multi_nodes=False):
        for bucket in self.buckets:
            if not self.rest_helper.bucket_exists(bucket.name):
                raise Exception("bucket %s not found" % bucket.name)
        if self.op_types == "bucket":
            bucketinfo = self.rest.get_bucket(bucket.name)
            self.log.info("bucket info: %s" % bucketinfo)
        if self.op_types == "data":
            if multi_nodes:
                self._wait_for_stats_all_buckets(self.servers[:self.num_servers])
                self._verify_all_buckets(self.master, 1, self.wait_timeout * 50,
                                         self.max_verify, True, 1)
                self._verify_stats_all_buckets(self.servers[:self.num_servers])
            else:
                self._wait_for_stats_all_buckets([self.master])
                self._verify_all_buckets(self.master, 1, self.wait_timeout * 50,
                                         self.max_verify, True, 1)
                self._verify_stats_all_buckets([self.master])
def backup_restore(self):
    try:
        backup_start = self.backups[int(self.backupset.start) - 1]
    except IndexError:
        backup_start = "{0}{1}".format(self.backups[-1], self.backupset.start)
    try:
        backup_end = self.backups[int(self.backupset.end) - 1]
    except IndexError:
        backup_end = "{0}{1}".format(self.backups[-1], self.backupset.end)
    args = ("restore --archive {0} --repo {1} {2} http://{3}:{4} --username {5} "
            "--password {6} --start {7} --end {8}".format(
                self.backupset.directory, self.backupset.name, self.cluster_flag,
                self.backupset.restore_cluster_host.ip,
                self.backupset.restore_cluster_host.port,
                self.backupset.restore_cluster_host_username,
                self.backupset.restore_cluster_host_password,
                backup_start, backup_end))
    if self.backupset.exclude_buckets:
        args += " --exclude-buckets {0}".format(self.backupset.exclude_buckets)
    if self.backupset.include_buckets:
        args += " --include-buckets {0}".format(self.backupset.include_buckets)
    if self.backupset.disable_bucket_config:
        args += " --disable-bucket-config {0}".format(
            self.backupset.disable_bucket_config)
    if self.backupset.disable_views:
        args += " --disable-views {0}".format(self.backupset.disable_views)
    if self.backupset.disable_gsi_indexes:
        args += " --disable-gsi-indexes {0}".format(
            self.backupset.disable_gsi_indexes)
    if self.backupset.disable_ft_indexes:
        args += " --disable-ft-indexes {0}".format(
            self.backupset.disable_ft_indexes)
    if self.backupset.disable_data:
        args += " --disable-data {0}".format(self.backupset.disable_data)
    if self.backupset.disable_conf_res_restriction is not None:
        args += " --disable-conf-res-restriction {0}".format(
            self.backupset.disable_conf_res_restriction)
    if self.backupset.filter_keys:
        args += " --filter_keys {0}".format(self.backupset.filter_keys)
    if self.backupset.filter_values:
        args += " --filter_values {0}".format(self.backupset.filter_values)
    if self.backupset.force_updates:
        args += " --force-updates"
    if self.no_progress_bar:
        args += " --no-progress-bar"
    if not self.skip_buckets:
        rest_conn = RestConnection(self.backupset.restore_cluster_host)
        rest_helper = RestHelper(rest_conn)
        for bucket in self.buckets:
            if not rest_helper.bucket_exists(bucket.name):
                self.log.info("Creating bucket {0} in restore host {1}".format(
                    bucket.name, self.backupset.restore_cluster_host.ip))
                rest_conn.create_bucket(
                    bucket=bucket.name,
                    ramQuotaMB=512,
                    authType=bucket.authType if bucket.authType else "none",
                    proxyPort=bucket.port,
                    saslPassword=bucket.saslPassword,
                    lww=self.lww_new)
                bucket_ready = rest_helper.vbucket_map_ready(bucket.name)
                if not bucket_ready:
                    self.fail("Bucket %s not created after 120 seconds." % bucket.name)
    remote_client = RemoteMachineShellConnection(self.backupset.backup_host)
    command = "{0}/cbbackupmgr {1}".format(self.cli_command_location, args)
    output, error = remote_client.execute_command(command)
    remote_client.log_command_output(output, error)
    res = output
    res.extend(error)
    error_str = "Error restoring cluster: Transfer failed. Check the logs for more information."
    if error_str in res:
        command = "cat " + self.backupset.directory + \
            "/logs/backup.log | grep '" + error_str + "' -A 10 -B 100"
        output, error = remote_client.execute_command(command)
        remote_client.log_command_output(output, error)
    if "Required Flags:" in res:
        self.fail("Command line failed. Please check test params.")
    return output, error
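# Hedged sketch: the chain of optional value-carrying cbbackupmgr flags above
# could be driven by data instead. build_optional_flags is hypothetical; the
# flag spellings are copied verbatim from the code above, not re-checked
# against any particular cbbackupmgr release.
def build_optional_flags(backupset):
    pairs = [("exclude_buckets", "--exclude-buckets"),
             ("include_buckets", "--include-buckets"),
             ("disable_bucket_config", "--disable-bucket-config"),
             ("disable_views", "--disable-views"),
             ("disable_gsi_indexes", "--disable-gsi-indexes"),
             ("disable_ft_indexes", "--disable-ft-indexes"),
             ("disable_data", "--disable-data")]
    args = ""
    for attr, flag in pairs:
        value = getattr(backupset, attr)
        if value:
            args += " {0} {1}".format(flag, value)
    return args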
def test_backup_upgrade_restore_default(self):
    if len(self.servers) < 2:
        self.log.error("At least 2 servers required for this test ..")
        return
    original_set = copy.copy(self.servers)
    worker = self.servers[len(self.servers) - 1]
    self.servers = self.servers[:len(self.servers) - 1]
    shell = RemoteMachineShellConnection(self.master)
    o, r = shell.execute_command("cat /opt/couchbase/VERSION.txt")
    fin = o[0]
    shell.disconnect()
    initial_version = self.input.param("initial_version", fin)
    final_version = self.input.param("final_version", fin)
    if initial_version == final_version:
        self.log.error("Same initial and final versions ..")
        return
    if not final_version.startswith('2.0'):
        self.log.error("Upgrade test not set to run from 1.8.1 -> 2.0 ..")
        return
    builds, changes = BuildQuery().get_all_builds(version=final_version)
    product = 'couchbase-server-enterprise'
    # Case where the worker is not on a 2.0+ build
    worker_flag = 0
    shell = RemoteMachineShellConnection(worker)
    o, r = shell.execute_command("cat /opt/couchbase/VERSION.txt")
    temp = o[0]
    if not temp.startswith('2.0'):
        worker_flag = 1
    if worker_flag == 1:
        self.log.info("Loading version {0} on worker.. ".format(final_version))
        remote = RemoteMachineShellConnection(worker)
        info = remote.extract_remote_info()
        older_build = BuildQuery().find_build(builds, product,
                                              info.deliverable_type,
                                              info.architecture_type,
                                              final_version)
        remote.stop_couchbase()
        remote.couchbase_uninstall()
        remote.download_build(older_build)
        remote.install_server(older_build)
        remote.disconnect()
    remote_tmp = "{1}/{0}".format("backup", "/root")
    perm_comm = "mkdir -p {0}".format(remote_tmp)
    if not initial_version == fin:
        for server in self.servers:
            remote = RemoteMachineShellConnection(server)
            info = remote.extract_remote_info()
            self.log.info("Loading version .. {0}".format(initial_version))
            older_build = BuildQuery().find_build(builds, product,
                                                  info.deliverable_type,
                                                  info.architecture_type,
                                                  initial_version)
            remote.stop_couchbase()
            remote.couchbase_uninstall()
            remote.download_build(older_build)
            remote.install_server(older_build)
            rest = RestConnection(server)
            RestHelper(rest).is_ns_server_running(testconstants.NS_SERVER_TIMEOUT)
            rest.init_cluster(server.rest_username, server.rest_password)
            rest.init_cluster_memoryQuota(
                memoryQuota=rest.get_nodes_self().mcdMemoryReserved)
            remote.disconnect()
    self.common_setUp()
    bucket = "default"
    if len(self.servers) > 1:
        self.add_nodes_and_rebalance()
    rest = RestConnection(self.master)
    info = rest.get_nodes_self()
    size = int(info.memoryQuota * 2.0 / 3.0)
    rest.create_bucket(bucket, ramQuotaMB=size)
    ready = BucketOperationHelper.wait_for_memcached(self.master, bucket)
    self.assertTrue(ready, "wait_for_memcached failed")
    distribution = {10: 0.2, 20: 0.5, 30: 0.25, 40: 0.05}
    inserted_keys, rejected_keys = \
        MemcachedClientHelper.load_bucket_and_return_the_keys(
            servers=[self.master], name=bucket, ram_load_ratio=0.5,
            value_size_distribution=distribution, moxi=True,
            write_only=True, delete_ratio=0.1, number_of_threads=2)
    if len(self.servers) > 1:
        rest = RestConnection(self.master)
        self.assertTrue(RebalanceHelper.wait_for_replication(rest.get_nodes(),
                                                             timeout=180),
                        msg="replication did not complete")
    ready = RebalanceHelper.wait_for_stats_on_all(self.master, bucket,
                                                  'ep_queue_size', 0)
    self.assertTrue(ready, "wait_for ep_queue_size == 0 failed")
    ready = RebalanceHelper.wait_for_stats_on_all(self.master, bucket,
                                                  'ep_flusher_todo', 0)
    self.assertTrue(ready, "wait_for ep_flusher_todo == 0 failed")
    node = RestConnection(self.master).get_nodes_self()
    shell = RemoteMachineShellConnection(worker)
    o, r = shell.execute_command(perm_comm)
    shell.log_command_output(o, r)
    shell.disconnect()
    # Backup
    # BackupHelper(self.master, self).backup(bucket, node, remote_tmp)
    shell = RemoteMachineShellConnection(worker)
    shell.execute_command("/opt/couchbase/bin/cbbackup http://{0}:{1} {2}".format(
        self.master.ip, self.master.port, remote_tmp))
    shell.disconnect()
    BucketOperationHelper.delete_bucket_or_assert(self.master, bucket, self)
    time.sleep(30)
    # Upgrade
    for server in self.servers:
        self.log.info("Upgrading to current version {0}".format(final_version))
        remote = RemoteMachineShellConnection(server)
        info = remote.extract_remote_info()
        new_build = BuildQuery().find_build(builds, product,
                                            info.deliverable_type,
                                            info.architecture_type,
                                            final_version)
        remote.stop_couchbase()
        remote.couchbase_uninstall()
        remote.download_build(new_build)
        remote.install_server(new_build)
        rest = RestConnection(server)
        RestHelper(rest).is_ns_server_running(testconstants.NS_SERVER_TIMEOUT)
        rest.init_cluster(server.rest_username, server.rest_password)
        rest.init_cluster_memoryQuota(
            memoryQuota=rest.get_nodes_self().mcdMemoryReserved)
        remote.disconnect()
    time.sleep(30)
    # Restore
    rest = RestConnection(self.master)
    info = rest.get_nodes_self()
    size = int(info.memoryQuota * 2.0 / 3.0)
    rest.create_bucket(bucket, ramQuotaMB=size)
    ready = BucketOperationHelper.wait_for_memcached(server, bucket)
    self.assertTrue(ready, "wait_for_memcached failed")
    # BackupHelper(self.master, self).restore(backup_location=remote_tmp, moxi_port=info.moxi)
    shell = RemoteMachineShellConnection(worker)
    shell.execute_command("/opt/couchbase/bin/cbrestore {2} http://{0}:{1} -b {3}".format(
        self.master.ip, self.master.port, remote_tmp, bucket))
    shell.disconnect()
    time.sleep(60)
    keys_exist = BucketOperationHelper.keys_exist_or_assert_in_parallel(
        inserted_keys, self.master, bucket, self, concurrency=4)
    self.assertTrue(keys_exist, msg="unable to verify keys after restore")
    time.sleep(30)
    BucketOperationHelper.delete_bucket_or_assert(self.master, bucket, self)
    rest = RestConnection(self.master)
    helper = RestHelper(rest)
    nodes = rest.node_statuses()
    master_id = rest.get_nodes_self().id
    if len(self.servers) > 1:
        removed = helper.remove_nodes(
            knownNodes=[node.id for node in nodes],
            ejectedNodes=[node.id for node in nodes if node.id != master_id],
            wait_for_rebalance=True)
    shell = RemoteMachineShellConnection(worker)
    shell.remove_directory(remote_tmp)
    shell.disconnect()
    self.servers = copy.copy(original_set)
    if initial_version == fin:
        builds, changes = BuildQuery().get_all_builds(version=initial_version)
        for server in self.servers:
            remote = RemoteMachineShellConnection(server)
            info = remote.extract_remote_info()
            self.log.info("Loading version .. {0}".format(initial_version))
            older_build = BuildQuery().find_build(builds, product,
                                                  info.deliverable_type,
                                                  info.architecture_type,
                                                  initial_version)
            remote.stop_couchbase()
            remote.couchbase_uninstall()
            remote.download_build(older_build)
            remote.install_server(older_build)
            rest = RestConnection(server)
            RestHelper(rest).is_ns_server_running(testconstants.NS_SERVER_TIMEOUT)
            rest.init_cluster(server.rest_username, server.rest_password)
            rest.init_cluster_memoryQuota(
                memoryQuota=rest.get_nodes_self().mcdMemoryReserved)
            remote.disconnect()
class PerfBase(unittest.TestCase):
    """
    specURL = http://hub.internal.couchbase.org/confluence/display/cbit/Black+Box+Performance+Test+Matrix
    """

    # The setUpBaseX() methods allow subclasses to resequence the setUp() and
    # skip cluster configuration.
    def setUpBase0(self):
        self.log = logger.Logger.get_logger()
        self.input = TestInputSingleton.input
        self.vbucket_count = PerfDefaults.vbuckets
        self.sc = None
        if self.parami("tear_down_on_setup",
                       PerfDefaults.tear_down_on_setup) == 1:
            self.tearDown()  # Tear down in case previous run had unclean death
        master = self.input.servers[0]
        self.set_up_rest(master)

    def setUpBase1(self):
        if self.parami('num_buckets', 1) > 1:
            bucket = 'bucket-0'
        else:
            bucket = self.param('bucket', 'default')
        vBuckets = self.rest.get_vbuckets(bucket)
        self.vbucket_count = len(vBuckets)

    def setUp(self):
        self.setUpBase0()
        master = self.input.servers[0]
        self.is_multi_node = False
        self.data_path = master.data_path
        # Number of items loaded by load() method.
        # Does not include or count any items that came from set_up_dgm().
        self.num_items_loaded = 0
        if self.input.clusters:
            for cluster in self.input.clusters.values():
                master = cluster[0]
                self.set_up_rest(master)
                self.set_up_cluster(master)
        else:
            master = self.input.servers[0]
            self.set_up_cluster(master)
        # Rebalance
        num_nodes = self.parami("num_nodes", 10)
        self.rebalance_nodes(num_nodes)
        if self.input.clusters:
            for cluster in self.input.clusters.values():
                master = cluster[0]
                self.set_up_rest(master)
                self.set_up_buckets()
        else:
            self.set_up_buckets()
        self.set_up_proxy()
        if self.input.clusters:
            for cluster in self.input.clusters.values():
                master = cluster[0]
                self.set_up_rest(master)
                self.reconfigure()
        else:
            self.reconfigure()
        if self.parami("dgm", getattr(self, "dgm", 1)):
            self.set_up_dgm()
        time.sleep(10)
        self.setUpBase1()
        if self.input.clusters:
            for cluster in self.input.clusters.values():
                self.wait_until_warmed_up(cluster[0])
        else:
            self.wait_until_warmed_up()
        ClusterOperationHelper.flush_os_caches(self.input.servers)

    def set_up_rest(self, master):
        self.rest = RestConnection(master)
        self.rest_helper = RestHelper(self.rest)

    def set_up_cluster(self, master):
        """Initialize cluster"""
        print "[perf.setUp] Setting up cluster"
        self.rest.init_cluster(master.rest_username, master.rest_password)
        memory_quota = self.parami('mem_quota', PerfDefaults.mem_quota)
        self.rest.init_cluster_memoryQuota(master.rest_username,
                                           master.rest_password,
                                           memoryQuota=memory_quota)

    def set_up_buckets(self):
        """Set up data bucket(s)"""
        print "[perf.setUp] Setting up buckets"
        num_buckets = self.parami('num_buckets', 1)
        if num_buckets > 1:
            self.buckets = ['bucket-{0}'.format(i) for i in range(num_buckets)]
        else:
            self.buckets = [self.param('bucket', 'default')]
        for bucket in self.buckets:
            bucket_ram_quota = self.parami('mem_quota', PerfDefaults.mem_quota)
            bucket_ram_quota = bucket_ram_quota / num_buckets
            replicas = self.parami('replicas', getattr(self, 'replicas', 1))
            self.rest.create_bucket(bucket=bucket,
                                    ramQuotaMB=bucket_ram_quota,
                                    replicaNumber=replicas,
                                    authType='sasl')
            status = self.rest_helper.vbucket_map_ready(bucket, 60)
            self.assertTrue(status, msg='vbucket_map not ready .. timed out')
            status = self.rest_helper.bucket_exists(bucket)
            self.assertTrue(status,
                            msg='unable to create {0} bucket'.format(bucket))

    def reconfigure(self):
        """Customize basic Couchbase setup"""
        print "[perf.setUp] Customizing setup"
        self.set_loglevel()
        self.set_max_concurrent_reps_per_doc()
        self.set_autocompaction()

    def set_loglevel(self):
        """Set custom loglevel"""
        loglevel = self.param('loglevel', None)
        if loglevel:
            self.rest.set_global_loglevel(loglevel)

    def set_max_concurrent_reps_per_doc(self):
        """Set custom MAX_CONCURRENT_REPS_PER_DOC"""
        max_concurrent_reps_per_doc = self.param('max_concurrent_reps_per_doc', None)
        if max_concurrent_reps_per_doc:
            for server in self.input.servers:
                rc = RemoteMachineShellConnection(server)
                rc.set_environment_variable('MAX_CONCURRENT_REPS_PER_DOC',
                                            max_concurrent_reps_per_doc)

    def set_ep_compaction(self, comp_ratio):
        """Set up ep_engine side compaction ratio"""
        for server in self.input.servers:
            shell = RemoteMachineShellConnection(server)
            cmd = "/opt/couchbase/bin/cbepctl localhost:11210 "\
                  "set flush_param db_frag_threshold {0}".format(comp_ratio)
            self._exec_and_log(shell, cmd)
            shell.disconnect()

    def set_autocompaction(self, disable_view_compaction=False):
        """Set custom auto-compaction settings"""
        try:
            # Parallel database and view compaction
            parallel_compaction = self.param("parallel_compaction",
                                             PerfDefaults.parallel_compaction)
            # Database fragmentation threshold
            db_compaction = self.parami("db_compaction",
                                        PerfDefaults.db_compaction)
            print "[perf.setUp] database compaction = %d" % db_compaction
            # ep_engine fragmentation threshold
            ep_compaction = self.parami("ep_compaction",
                                        PerfDefaults.ep_compaction)
            self.set_ep_compaction(ep_compaction)
            print "[perf.setUp] ep_engine compaction = %d" % ep_compaction
            # View fragmentation threshold
            if disable_view_compaction:
                view_compaction = 100
            else:
                view_compaction = self.parami("view_compaction",
                                              PerfDefaults.view_compaction)
            # Set custom auto-compaction settings
            self.rest.set_auto_compaction(
                parallelDBAndVC=parallel_compaction,
                dbFragmentThresholdPercentage=db_compaction,
                viewFragmntThresholdPercentage=view_compaction)
        except Exception as e:
            # It's very hard to determine what exceptions this can raise,
            # so fall back to a general handler.
            print "ERROR while changing compaction settings: {0}".format(e)

    def tearDown(self):
        if self.parami("tear_down", 0) == 1:
            print "[perf.tearDown] tearDown routine skipped"
            return
        print "[perf.tearDown] tearDown routine starts"
        if self.parami("tear_down_proxy", 1) == 1:
            self.tear_down_proxy()
        else:
            print "[perf.tearDown] Proxy tearDown skipped"
        if self.sc is not None:
            self.sc.stop()
            self.sc = None
        if self.parami("tear_down_bucket", 0) == 1:
            self.tear_down_buckets()
        else:
            print "[perf.tearDown] Bucket tearDown skipped"
        if self.parami("tear_down_cluster", 1) == 1:
            self.tear_down_cluster()
        else:
            print "[perf.tearDown] Cluster tearDown skipped"
        print "[perf.tearDown] tearDown routine finished"

    def tear_down_buckets(self):
        print "[perf.tearDown] Tearing down bucket"
        BucketOperationHelper.delete_all_buckets_or_assert(self.input.servers, self)
        print "[perf.tearDown] Bucket torn down"

    def tear_down_cluster(self):
        print "[perf.tearDown] Tearing down cluster"
        ClusterOperationHelper.cleanup_cluster(self.input.servers)
        ClusterOperationHelper.wait_for_ns_servers_or_assert(self.input.servers, self)
        print "[perf.tearDown] Cluster torn down"

    def set_up_proxy(self, bucket=None):
        """Set up and start Moxi"""
        if self.input.moxis:
            print '[perf.setUp] Setting up proxy'
            bucket = bucket or self.param('bucket', 'default')
            shell = RemoteMachineShellConnection(self.input.moxis[0])
            shell.start_moxi(self.input.servers[0].ip, bucket,
                             self.input.moxis[0].port)
            shell.disconnect()

    def tear_down_proxy(self):
        if len(self.input.moxis) > 0:
            shell = RemoteMachineShellConnection(self.input.moxis[0])
            shell.stop_moxi()
            shell.disconnect()

    # Returns "host:port" of moxi to hit.
    def target_host_port(self, bucket='default', use_direct=False):
        rv = self.param('moxi', None)
        if use_direct:
            return "%s:%s" % (self.input.servers[0].ip, '11210')
        if rv:
            return rv
        if len(self.input.moxis) > 0:
            return "%s:%s" % (self.input.moxis[0].ip, self.input.moxis[0].port)
        return "%s:%s" % (self.input.servers[0].ip,
                          self.rest.get_bucket(bucket).nodes[0].moxi)

    def protocol_parse(self, protocol_in, use_direct=False):
        if protocol_in.find('://') >= 0:
            if protocol_in.find("couchbase:") >= 0:
                protocol = "couchbase"
            else:
                protocol = \
                    '-'.join(((["membase"] +
                               protocol_in.split("://"))[-2] +
                              "-binary").split('-')[0:2])
            host_port = ('@' + protocol_in.split("://")[-1]).split('@')[-1]
            user, pswd = (('@' + protocol_in.split("://")[-1]).split('@')[-2] +
                          ":").split(':')[0:2]
        else:
            protocol = 'memcached-' + protocol_in
            host_port = self.target_host_port(use_direct=use_direct)
            user = self.param("rest_username", "Administrator")
            pswd = self.param("rest_password", "password")
        return protocol, host_port, user, pswd

    def mk_protocol(self, host, port='8091', prefix='membase-binary'):
        return self.param('protocol', prefix + '://' + host + ':' + port)

    def restartProxy(self, bucket=None):
        self.tear_down_proxy()
        self.set_up_proxy(bucket)

    def set_up_dgm(self):
        """Download fragmented, DGM dataset onto each cluster node, if not
        already locally available.

        The number of vbuckets and database schema must match the target
        cluster. Shutdown all cluster nodes, do a cluster-restore, then
        restart all cluster nodes."""
        bucket = self.param("bucket", "default")
        ClusterOperationHelper.stop_cluster(self.input.servers)
        for server in self.input.servers:
            remote = RemoteMachineShellConnection(server)
            # TODO: Better way to pass num_nodes and db_size?
            self.get_data_files(remote, bucket, 1, 10)
            remote.disconnect()
        ClusterOperationHelper.start_cluster(self.input.servers)

    def get_data_files(self, remote, bucket, num_nodes, db_size):
        base = 'https://s3.amazonaws.com/database-analysis'
        dir = '/tmp/'
        if remote.is_couchbase_installed():
            dir = dir + '/couchbase/{0}-{1}-{2}/'.format(num_nodes, 256, db_size)
            output, error = remote.execute_command('mkdir -p {0}'.format(dir))
            remote.log_command_output(output, error)
            file = '{0}_cb.tar.gz'.format(bucket)
            base_url = base + '/couchbase/{0}-{1}-{2}/{3}'.format(num_nodes, 256,
                                                                  db_size, file)
        else:
            dir = dir + '/membase/{0}-{1}-{2}/'.format(num_nodes, 1024, db_size)
            output, error = remote.execute_command('mkdir -p {0}'.format(dir))
            remote.log_command_output(output, error)
            file = '{0}_mb.tar.gz'.format(bucket)
            base_url = base + '/membase/{0}-{1}-{2}/{3}'.format(num_nodes, 1024,
                                                                db_size, file)
        info = remote.extract_remote_info()
        wget_command = 'wget'
        if info.type.lower() == 'windows':
            wget_command = \
                "cd {0} ;cmd /c 'c:\\automation\\wget.exe --no-check-certificate"\
                .format(dir)
        # Check if the file exists on the remote server; otherwise download
        # the gzipped version and extract if necessary
        exist = remote.file_exists(dir, file)
        if not exist:
            additional_quote = ""
            if info.type.lower() == 'windows':
                additional_quote = "'"
            command = "{0} -v -O {1}{2} {3} {4} ".format(wget_command, dir, file,
                                                         base_url, additional_quote)
            output, error = remote.execute_command(command)
            remote.log_command_output(output, error)
        if remote.is_couchbase_installed():
            if info.type.lower() == 'windows':
                destination_folder = testconstants.WIN_COUCHBASE_DATA_PATH
            else:
                destination_folder = testconstants.COUCHBASE_DATA_PATH
        else:
            if info.type.lower() == 'windows':
                destination_folder = testconstants.WIN_MEMBASE_DATA_PATH
            else:
                destination_folder = testconstants.MEMBASE_DATA_PATH
        if self.data_path:
            destination_folder = self.data_path
        untar_command = 'cd {1}; tar -xzf {0}'.format(dir + file,
                                                      destination_folder)
        output, error = remote.execute_command(untar_command)
        remote.log_command_output(output, error)

    def _exec_and_log(self, shell, cmd):
        """Helper method to execute a command and log its output"""
        if not cmd or not shell:
            return
        output, error = shell.execute_command(cmd)
        shell.log_command_output(output, error)

    def _build_tar_name(self, bucket, version="unknown_version", file_base=None):
        """Build tar file name: {file_base}-{version}-{bucket}.tar.gz"""
        if not file_base:
            file_base = os.path.splitext(
                os.path.basename(self.param("conf_file",
                                            PerfDefaults.conf_file)))[0]
        return "{0}-{1}-{2}.tar.gz".format(file_base, version, bucket)

    def _save_snapshot(self, server, bucket, file_base=None):
        """Save data files to a snapshot"""
        src_data_path = os.path.dirname(server.data_path or
                                        testconstants.COUCHBASE_DATA_PATH)
        dest_data_path = "{0}-snapshots".format(src_data_path)
        print "[perf: _save_snapshot] server = {0} , src_data_path = {1}, "\
            "dest_data_path = {2}".format(server.ip, src_data_path,
                                          dest_data_path)
        shell = RemoteMachineShellConnection(server)
        build_name, short_version, full_version = \
            shell.find_build_version("/opt/couchbase/", "VERSION.txt", "cb")
        dest_file = self._build_tar_name(bucket, full_version, file_base)
        self._exec_and_log(shell, "mkdir -p {0}".format(dest_data_path))
        # Save as a gzip file; if the file exists, overwrite it
        # TODO: multiple buckets
        zip_cmd = "cd {0}; tar -cvzf {1}/{2} {3} {3}-data _*"\
            .format(src_data_path, dest_data_path, dest_file, bucket)
        self._exec_and_log(shell, zip_cmd)
        shell.disconnect()
        return True

    def _load_snapshot(self, server, bucket, file_base=None, overwrite=True):
        """Load data files from a snapshot"""
        dest_data_path = os.path.dirname(server.data_path or
                                         testconstants.COUCHBASE_DATA_PATH)
        src_data_path = "{0}-snapshots".format(dest_data_path)
        print "[perf: _load_snapshot] server = {0} , src_data_path = {1}, "\
            "dest_data_path = {2}".format(server.ip, src_data_path,
                                          dest_data_path)
        shell = RemoteMachineShellConnection(server)
        build_name, short_version, full_version = \
            shell.find_build_version("/opt/couchbase/", "VERSION.txt", "cb")
        src_file = self._build_tar_name(bucket, full_version, file_base)
        if not shell.file_exists(src_data_path, src_file):
            print "[perf: _load_snapshot] file '{0}/{1}' does not exist"\
                .format(src_data_path, src_file)
            shell.disconnect()
            return False
        if not overwrite:
            self._save_snapshot(server, bucket,
                                "{0}.tar.gz".format(
                                    time.strftime(PerfDefaults.strftime)))  # TODO: filename
        rm_cmd = "rm -rf {0}/{1} {0}/{1}-data {0}/_*".format(dest_data_path,
                                                             bucket)
        self._exec_and_log(shell, rm_cmd)
        unzip_cmd = "cd {0}; tar -xvzf {1}/{2}".format(dest_data_path,
                                                       src_data_path, src_file)
        self._exec_and_log(shell, unzip_cmd)
        shell.disconnect()
        return True

    def save_snapshots(self, file_base, bucket):
        """Save snapshots on all servers"""
        if not self.input.servers or not bucket:
            print "[perf: save_snapshot] invalid server list or bucket name"
            return False
        ClusterOperationHelper.stop_cluster(self.input.servers)
        for server in self.input.servers:
            self._save_snapshot(server, bucket, file_base)
        ClusterOperationHelper.start_cluster(self.input.servers)
        return True

    def load_snapshots(self, file_base, bucket):
        """Load snapshots on all servers"""
        if not self.input.servers or not bucket:
            print "[perf: load_snapshot] invalid server list or bucket name"
            return False
        ClusterOperationHelper.stop_cluster(self.input.servers)
        for server in self.input.servers:
            if not self._load_snapshot(server, bucket, file_base):
                ClusterOperationHelper.start_cluster(self.input.servers)
                return False
        ClusterOperationHelper.start_cluster(self.input.servers)
        return True

    def spec(self, reference):
        self.spec_reference = self.param("spec", reference)
        self.log.info("spec: " + reference)

    def mk_stats(self, verbosity):
        return StatsCollector(verbosity)

    def _get_src_version(self):
        """Get testrunner version"""
        try:
            result = subprocess.Popen(['git', 'rev-parse', 'HEAD'],
                                      stdout=subprocess.PIPE).communicate()[0]
        except subprocess.CalledProcessError as e:
            print "[perf] unable to get src code version : {0}".format(str(e))
            return "unknown version"
        return result.rstrip()[:7]

    def start_stats(self, stats_spec, servers=None,
                    process_names=['memcached', 'beam.smp', 'couchjs'],
                    test_params=None, client_id='',
                    collect_server_stats=True, ddoc=None):
        if self.parami('stats', 1) == 0:
            return None
        servers = servers or self.input.servers
        sc = self.mk_stats(False)
        bucket = self.param("bucket", "default")
        sc.start(servers, bucket, process_names, stats_spec, 10, client_id,
                 collect_server_stats=collect_server_stats, ddoc=ddoc)
        test_params['testrunner'] = self._get_src_version()
        self.test_params = test_params
        self.sc = sc
        return self.sc

    def end_stats(self, sc, total_stats=None, stats_spec=None):
        if sc is None:
            return
        if stats_spec is None:
            stats_spec = self.spec_reference
        if total_stats:
            sc.total_stats(total_stats)
        self.log.info("stopping stats collector")
        sc.stop()
        self.log.info("stats collector is stopped")
        sc.export(stats_spec, self.test_params)

    def load(self, num_items, min_value_size=None, kind='binary',
             protocol='binary', ratio_sets=1.0, ratio_hot_sets=0.0,
             ratio_hot_gets=0.0, ratio_expirations=0.0, expiration=None,
             prefix="", doc_cache=1, use_direct=True, report=0, start_at=-1,
             collect_server_stats=True, is_eperf=False, hot_shift=0):
        cfg = {'max-items': num_items,
               'max-creates': num_items,
               'max-ops-per-sec': self.parami("load_mcsoda_max_ops_sec",
                                              PerfDefaults.mcsoda_max_ops_sec),
               'min-value-size': min_value_size or self.parami("min_value_size",
                                                               1024),
               'ratio-sets': self.paramf("load_ratio_sets", ratio_sets),
               'ratio-misses': self.paramf("load_ratio_misses", 0.0),
               'ratio-creates': self.paramf("load_ratio_creates", 1.0),
               'ratio-deletes': self.paramf("load_ratio_deletes", 0.0),
               'ratio-hot': 0.0,
               'ratio-hot-sets': ratio_hot_sets,
               'ratio-hot-gets': ratio_hot_gets,
               'ratio-expirations': ratio_expirations,
               'expiration': expiration or 0,
               'exit-after-creates': 1,
               'json': int(kind == 'json'),
               'batch': self.parami("batch", PerfDefaults.batch),
               'vbuckets': self.vbucket_count,
               'doc-cache': doc_cache,
               'prefix': prefix,
               'report': report,
               'hot-shift': hot_shift,
               'cluster_name': self.param("cluster_name", "")}
        cur = {}
        if start_at >= 0:
            cur['cur-items'] = start_at
            cur['cur-gets'] = start_at
            cur['cur-sets'] = start_at
            cur['cur-ops'] = cur['cur-gets'] + cur['cur-sets']
            cur['cur-creates'] = start_at
            cfg['max-creates'] = start_at + num_items
            cfg['max-items'] = cfg['max-creates']
        cfg_params = cfg.copy()
        cfg_params['test_time'] = time.time()
        cfg_params['test_name'] = self.id()
        # phase: 'load' or 'reload'
        phase = "load"
        if self.parami("hot_load_phase", 0) == 1:
            phase = "reload"
        if is_eperf:
            collect_server_stats = self.parami("prefix", 0) == 0
            client_id = self.parami("prefix", 0)
            # stats spec e.g.: testname.load
            sc = self.start_stats("{0}.{1}".format(self.spec_reference, phase),
                                  test_params=cfg_params, client_id=client_id,
                                  collect_server_stats=collect_server_stats)
        # For black box, multi-node tests, always use membase-binary
        if self.is_multi_node:
            protocol = self.mk_protocol(host=self.input.servers[0].ip,
                                        port=self.input.servers[0].port)
        protocol, host_port, user, pswd = \
            self.protocol_parse(protocol, use_direct=use_direct)
        if not user.strip():
            user = self.input.servers[0].rest_username
        if not pswd.strip():
            pswd = self.input.servers[0].rest_password
        self.log.info("mcsoda - %s %s %s %s" % (protocol, host_port, user, pswd))
        self.log.info("mcsoda - cfg: " + str(cfg))
        self.log.info("mcsoda - cur: " + str(cur))
        cur, start_time, end_time = \
            self.mcsoda_run(cfg, cur, protocol, host_port, user, pswd,
                            heartbeat=self.parami("mcsoda_heartbeat", 0),
                            why="load", bucket=self.param("bucket", "default"))
        self.num_items_loaded = num_items
        ops = {'tot-sets': cur.get('cur-sets', 0),
               'tot-gets': cur.get('cur-gets', 0),
               'tot-items': cur.get('cur-items', 0),
               'tot-creates': cur.get('cur-creates', 0),
               'tot-misses': cur.get('cur-misses', 0),
               "start-time": start_time,
               "end-time": end_time}
        if is_eperf:
            if self.parami("load_wait_until_drained", 1) == 1:
                self.wait_until_drained()
            if self.parami("load_wait_until_repl",
                           PerfDefaults.load_wait_until_repl) == 1:
                self.wait_until_repl()
            self.end_stats(sc, ops, "{0}.{1}".format(self.spec_reference, phase))
        return ops, start_time, end_time

    def mcsoda_run(self, cfg, cur, protocol, host_port, user, pswd,
                   stats_collector=None, stores=None, ctl=None, heartbeat=0,
                   why="", bucket="default"):
        return mcsoda.run(cfg, cur, protocol, host_port, user, pswd,
                          stats_collector=stats_collector, stores=stores,
                          ctl=ctl, heartbeat=heartbeat, why=why, bucket=bucket)

    def rebalance_nodes(self, num_nodes):
        """Rebalance cluster(s) if more than 1 node provided"""
        if len(self.input.servers) == 1 or num_nodes == 1:
            print "WARNING: running on single node cluster"
            return
        else:
            print "[perf.setUp] rebalancing nodes: num_nodes = {0}".format(num_nodes)
        if self.input.clusters:
            for cluster in self.input.clusters.values():
                status, _ = RebalanceHelper.rebalance_in(cluster, num_nodes - 1,
                                                         do_shuffle=False)
                self.assertTrue(status)
        else:
            status, _ = RebalanceHelper.rebalance_in(self.input.servers,
                                                     num_nodes - 1,
                                                     do_shuffle=False)
            self.assertTrue(status)

    @staticmethod
    def delayed_rebalance_worker(servers, num_nodes, delay_seconds, sc,
                                 max_retries=PerfDefaults.reb_max_retries):
        time.sleep(delay_seconds)
        gmt_now = time.strftime(PerfDefaults.strftime, time.gmtime())
        print "[delayed_rebalance_worker] rebalance started: %s" % gmt_now
        if not sc:
            print "[delayed_rebalance_worker] invalid stats collector"
            return
        status = False
        retries = 0
        while not status and retries <= max_retries:
            start_time = time.time()
            status, nodes = RebalanceHelper.rebalance_in(servers, num_nodes - 1,
                                                         do_check=(not retries))
            end_time = time.time()
            print "[delayed_rebalance_worker] status: {0}, nodes: {1}, retries: {2}"\
                .format(status, nodes, retries)
            if not status:
                retries += 1
                time.sleep(delay_seconds)
        sc.reb_stats(start_time, end_time - start_time)

    def delayed_rebalance(self, num_nodes, delay_seconds=10,
                          max_retries=PerfDefaults.reb_max_retries,
                          sync=False):
        print "delayed_rebalance"
        if sync:
            PerfBase.delayed_rebalance_worker(self.input.servers, num_nodes,
                                              delay_seconds, self.sc,
                                              max_retries)
        else:
            t = threading.Thread(target=PerfBase.delayed_rebalance_worker,
                                 args=(self.input.servers, num_nodes,
                                       delay_seconds, self.sc, max_retries))
            t.daemon = True
            t.start()

    @staticmethod
    def set_auto_compaction(server, parallel_compaction, percent_threshold):
        rest = RestConnection(server)
        rest.set_auto_compaction(parallel_compaction,
                                 dbFragmentThresholdPercentage=percent_threshold,
                                 viewFragmntThresholdPercentage=percent_threshold)

    @staticmethod
    def delayed_compaction_worker(servers, parallel_compaction,
                                  percent_threshold, delay_seconds):
        time.sleep(delay_seconds)
        PerfBase.set_auto_compaction(servers[0], parallel_compaction,
                                     percent_threshold)

    def delayed_compaction(self, parallel_compaction="false",
                           percent_threshold=0.01, delay_seconds=10):
        t = threading.Thread(target=PerfBase.delayed_compaction_worker,
                             args=(self.input.servers, parallel_compaction,
                                   percent_threshold, delay_seconds))
        t.daemon = True
        t.start()

    def loop(self, num_ops=None, num_items=None, max_items=None,
             max_creates=None, min_value_size=None, exit_after_creates=0,
             kind='binary', protocol='binary', clients=1, ratio_misses=0.0,
             ratio_sets=0.0, ratio_creates=0.0, ratio_deletes=0.0,
             ratio_hot=0.2, ratio_hot_sets=0.95, ratio_hot_gets=0.95,
             ratio_expirations=0.0, expiration=None, test_name=None,
             prefix="", doc_cache=1, use_direct=True,
             collect_server_stats=True, start_at=-1, report=0, ctl=None,
             hot_shift=0, is_eperf=False, ratio_queries=0, queries=0,
             ddoc=None):
        num_items = num_items or self.num_items_loaded
        hot_stack_size = \
            self.parami('hot_stack_size', PerfDefaults.hot_stack_size) or \
            (num_items * ratio_hot)
        cfg = {'max-items': max_items or num_items,
               'max-creates': max_creates or 0,
               'max-ops-per-sec': self.parami("mcsoda_max_ops_sec",
                                              PerfDefaults.mcsoda_max_ops_sec),
               'min-value-size': min_value_size or self.parami("min_value_size",
                                                               1024),
               'exit-after-creates': exit_after_creates,
               'ratio-sets': ratio_sets,
               'ratio-misses': ratio_misses,
               'ratio-creates': ratio_creates,
               'ratio-deletes': ratio_deletes,
               'ratio-hot': ratio_hot,
               'ratio-hot-sets': ratio_hot_sets,
               'ratio-hot-gets': ratio_hot_gets,
               'ratio-expirations': ratio_expirations,
               'ratio-queries': ratio_queries,
               'expiration': expiration or 0,
               'threads': clients,
               'json': int(kind == 'json'),
               'batch': self.parami("batch", PerfDefaults.batch),
               'vbuckets': self.vbucket_count,
               'doc-cache': doc_cache,
               'prefix': prefix,
               'queries': queries,
               'report': report,
               'hot-shift': hot_shift,
               'hot-stack': self.parami("hot_stack", PerfDefaults.hot_stack),
               'hot-stack-size': hot_stack_size,
               'hot-stack-rotate': self.parami("hot_stack_rotate",
                                               PerfDefaults.hot_stack_rotate),
               'cluster_name': self.param("cluster_name", ""),
               'observe': self.param("observe", PerfDefaults.observe),
               'obs-backoff': self.paramf('obs_backoff',
                                          PerfDefaults.obs_backoff),
               'obs-max-backoff': self.paramf('obs_max_backoff',
                                              PerfDefaults.obs_max_backoff),
               'obs-persist-count': self.parami('obs_persist_count',
                                                PerfDefaults.obs_persist_count),
               'obs-repl-count': self.parami('obs_repl_count',
                                             PerfDefaults.obs_repl_count),
               'woq-pattern': self.parami('woq_pattern',
                                          PerfDefaults.woq_pattern),
               'woq-verbose': self.parami('woq_verbose',
                                          PerfDefaults.woq_verbose),
               'cor-pattern': self.parami('cor_pattern',
                                          PerfDefaults.cor_pattern),
               'cor-persist': self.parami('cor_persist',
                                          PerfDefaults.cor_persist),
               'carbon': self.parami('carbon', PerfDefaults.carbon),
               'carbon-server': self.param('carbon_server',
                                           PerfDefaults.carbon_server),
               'carbon-port': self.parami('carbon_port',
                                          PerfDefaults.carbon_port),
               'carbon-timeout': self.parami('carbon_timeout',
                                             PerfDefaults.carbon_timeout),
               'carbon-cache-size': self.parami('carbon_cache_size',
                                                PerfDefaults.carbon_cache_size),
               'time': self.parami('time', 0)}
        cfg_params = cfg.copy()
        cfg_params['test_time'] = time.time()
        cfg_params['test_name'] = test_name
        client_id = ''
        stores = None
        if is_eperf:
            client_id = self.parami("prefix", 0)
        sc = None
        if self.parami("collect_stats", 1):
            sc = self.start_stats(self.spec_reference + ".loop",
                                  test_params=cfg_params, client_id=client_id,
                                  collect_server_stats=collect_server_stats,
                                  ddoc=ddoc)
        self.cur = {'cur-items': num_items}
        if start_at >= 0:
            self.cur['cur-gets'] = start_at
        if num_ops is None:
            num_ops = num_items
        if isinstance(num_ops, int):
            cfg['max-ops'] = num_ops
        else:
            # Here, num_ops is a "time to run" tuple of
            # ('seconds', integer_num_of_seconds_to_run)
            cfg['time'] = num_ops[1]
        # For black box, multi-node tests, always use membase-binary
        if self.is_multi_node:
            protocol = self.mk_protocol(host=self.input.servers[0].ip,
                                        port=self.input.servers[0].port)
        self.log.info("mcsoda - protocol %s" % protocol)
        protocol, host_port, user, pswd = \
            self.protocol_parse(protocol, use_direct=use_direct)
        if not user.strip():
            user = self.input.servers[0].rest_username
        if not pswd.strip():
            pswd = self.input.servers[0].rest_password
        self.log.info("mcsoda - %s %s %s %s" % (protocol, host_port, user, pswd))
        self.log.info("mcsoda - cfg: " + str(cfg))
        self.log.info("mcsoda - cur: " + str(self.cur))
        # For query tests, always use StoreCouchbase
        if protocol == "couchbase":
            stores = [StoreCouchbase()]
        self.cur, start_time, end_time = \
            self.mcsoda_run(cfg, self.cur, protocol, host_port, user, pswd,
                            stats_collector=sc, ctl=ctl, stores=stores,
                            heartbeat=self.parami("mcsoda_heartbeat", 0),
                            why="loop", bucket=self.param("bucket", "default"))
        ops = {'tot-sets': self.cur.get('cur-sets', 0),
               'tot-gets': self.cur.get('cur-gets', 0),
               'tot-items': self.cur.get('cur-items', 0),
               'tot-creates': self.cur.get('cur-creates', 0),
               'tot-misses': self.cur.get('cur-misses', 0),
               "start-time": start_time,
               "end-time": end_time}
        # Wait until there are no active indexing tasks
        if self.parami('wait_for_indexer', 0):
            ClusterOperationHelper.wait_for_completion(self.rest, 'indexer')
        # Wait until there are no active view compaction tasks
        if self.parami('wait_for_compaction', 0):
            ClusterOperationHelper.wait_for_completion(self.rest,
                                                       'view_compaction')
        if self.parami("loop_wait_until_drained",
                       PerfDefaults.loop_wait_until_drained):
            self.wait_until_drained()
        if self.parami("loop_wait_until_repl",
                       PerfDefaults.loop_wait_until_repl):
            self.wait_until_repl()
        if self.parami("collect_stats", 1) and \
                not self.parami("reb_no_fg", PerfDefaults.reb_no_fg):
            self.end_stats(sc, ops, self.spec_reference + ".loop")
        return ops, start_time, end_time

    def wait_until_drained(self):
        print "[perf.drain] draining disk write queue : %s"\
            % time.strftime(PerfDefaults.strftime)
        master = self.input.servers[0]
        bucket = self.param("bucket", "default")
        RebalanceHelper.wait_for_stats_on_all(master, bucket,
                                              'ep_queue_size', 0,
                                              fn=RebalanceHelper.wait_for_stats_no_timeout)
        RebalanceHelper.wait_for_stats_on_all(master, bucket,
                                              'ep_flusher_todo', 0,
                                              fn=RebalanceHelper.wait_for_stats_no_timeout)
        print "[perf.drain] disk write queue has been drained: %s"\
            % time.strftime(PerfDefaults.strftime)
        return time.time()

    def wait_until_repl(self):
        print "[perf.repl] waiting for replication: %s"\
            % time.strftime(PerfDefaults.strftime)
        master = self.input.servers[0]
        bucket = self.param("bucket", "default")
        RebalanceHelper.wait_for_stats_on_all(master, bucket,
                                              'vb_replica_queue_size', 0,
                                              fn=RebalanceHelper.wait_for_stats_no_timeout)
        RebalanceHelper.wait_for_stats_on_all(master, bucket,
                                              'ep_tap_replica_queue_itemondisk', 0,
                                              fn=RebalanceHelper.wait_for_stats_no_timeout)
        RebalanceHelper.wait_for_stats_on_all(master, bucket,
                                              'ep_tap_rebalance_queue_backfillremaining', 0,
                                              fn=RebalanceHelper.wait_for_stats_no_timeout)
        RebalanceHelper.wait_for_stats_on_all(master, bucket,
                                              'ep_tap_replica_qlen', 0,
                                              fn=RebalanceHelper.wait_for_stats_no_timeout)
        print "[perf.repl] replication is done: %s"\
            % time.strftime(PerfDefaults.strftime)

    def warmup(self, collect_stats=True, flush_os_cache=False):
        """Restart cluster and wait for it to warm up.

        In the current version, this affects the master node only."""
        if not self.input.servers:
            print "[warmup error] empty server list"
            return
        if collect_stats:
            client_id = self.parami("prefix", 0)
            test_params = {'test_time': time.time(),
                           'test_name': self.id(),
                           'json': 0}
            sc = self.start_stats(self.spec_reference + ".warmup",
                                  test_params=test_params,
                                  client_id=client_id)
        print "[warmup] preparing to warmup cluster ..."
        server = self.input.servers[0]
        shell = RemoteMachineShellConnection(server)
        start_time = time.time()
        print "[warmup] stopping couchbase ... ({0}, {1})"\
            .format(server.ip, time.strftime(PerfDefaults.strftime))
        shell.stop_couchbase()
        print "[warmup] couchbase stopped ({0}, {1})"\
            .format(server.ip, time.strftime(PerfDefaults.strftime))
        if flush_os_cache:
            print "[warmup] flushing os cache ..."
            shell.flush_os_caches()
        shell.start_couchbase()
        print "[warmup] couchbase restarted ({0}, {1})"\
            .format(server.ip, time.strftime(PerfDefaults.strftime))
        self.wait_until_warmed_up()
        print "[warmup] warmup finished"
        end_time = time.time()
        ops = {'tot-sets': 0,
               'tot-gets': 0,
               'tot-items': 0,
               'tot-creates': 0,
               'tot-misses': 0,
               "start-time": start_time,
               "end-time": end_time}
        if collect_stats:
            self.end_stats(sc, ops, self.spec_reference + ".warmup")

    def wait_until_warmed_up(self, master=None):
        if not master:
            master = self.input.servers[0]
        bucket = self.param("bucket", "default")
        fn = RebalanceHelper.wait_for_mc_stats_no_timeout
        for bucket in self.buckets:
            RebalanceHelper.wait_for_stats_on_all(master, bucket,
                                                  'ep_warmup_thread',
                                                  'complete', fn=fn)

    def param(self, name, default_value):
        input = getattr(self, "input", TestInputSingleton.input)
        return input.test_params.get(name, default_value)

    def parami(self, name, default_int):
        return int(self.param(name, default_int))

    def paramf(self, name, default_float):
        return float(self.param(name, default_float))

    def params(self, name, default_str):
        return str(self.param(name, default_str))
def _install_and_upgrade(self, initial_version='1.6.5.3',
                         create_buckets=False,
                         insert_data=False,
                         start_upgraded_first=True,
                         load_ratio=-1,
                         roll_upgrade=False,
                         upgrade_path=[]):
    node_upgrade_path = []
    node_upgrade_path.extend(upgrade_path)
    # Then start them in whatever order you want
    inserted_keys = []
    log = logger.Logger.get_logger()
    if roll_upgrade:
        log.info("performing a rolling upgrade")
    input = TestInputSingleton.input
    rest_settings = input.membase_settings
    servers = input.servers
    save_upgrade_config = False
    is_amazon = False
    if input.test_params.get('amazon', False):
        is_amazon = True
    # Install the older build on all nodes
    for server in servers:
        remote = RemoteMachineShellConnection(server)
        rest = RestConnection(server)
        info = remote.extract_remote_info()
        older_build = BuildQuery().find_membase_release_build(
            deliverable_type=info.deliverable_type,
            os_architecture=info.architecture_type,
            build_version=initial_version,
            product='membase-server-enterprise',
            is_amazon=is_amazon)
        remote.membase_uninstall()
        remote.couchbase_uninstall()
        remote.execute_command('/etc/init.d/membase-server stop')
        remote.download_build(older_build)
        # Now install it
        remote.membase_install(older_build)
        RestHelper(rest).is_ns_server_running(testconstants.NS_SERVER_TIMEOUT)
        rest.init_cluster_port(rest_settings.rest_username,
                               rest_settings.rest_password)
        rest.init_cluster_memoryQuota(
            memoryQuota=rest.get_nodes_self().mcdMemoryReserved)
        remote.disconnect()
    bucket_data = {}
    master = servers[0]
    if create_buckets:
        # Create the buckets and wait for them. The bucket port should also
        # be configurable; it could be passed as a parameter to this test.
        self._create_default_bucket(master)
        inserted_keys = self._load_data(master, load_ratio)
        _create_load_multiple_bucket(self, master, bucket_data, howmany=2)
    # Cluster all the nodes together
    ClusterOperationHelper.add_all_nodes_or_assert(master, servers,
                                                   rest_settings, self)
    rest = RestConnection(master)
    nodes = rest.node_statuses()
    otpNodeIds = []
    for node in nodes:
        otpNodeIds.append(node.id)
    rebalanceStarted = rest.rebalance(otpNodeIds, [])
    self.assertTrue(rebalanceStarted,
                    "unable to start rebalance on master node {0}".format(master.ip))
    log.info('started rebalance operation on master node {0}'.format(master.ip))
    rebalanceSucceeded = rest.monitorRebalance()
    self.assertTrue(rebalanceSucceeded,
                    "rebalance operation for nodes: {0} was not successful".format(otpNodeIds))
    if initial_version == "1.7.0" or initial_version == "1.7.1":
        self._save_config(rest_settings, master)
    input_version = input.test_params['version']
    node_upgrade_path.append(input_version)
    log.info("Upgrade path: {0} -> {1}".format(initial_version,
                                               node_upgrade_path))
    log.info("List of servers {0}".format(servers))
    # If we don't want to do a rolling upgrade:
    if not roll_upgrade:
        for version in node_upgrade_path:
            if version is not initial_version:
                log.info("Upgrading to version {0}".format(version))
                self._stop_membase_servers(servers)
                if re.search('1.8', version):
                    save_upgrade_config = True
                appropriate_build = _get_build(servers[0], version,
                                               is_amazon=is_amazon)
                self.assertTrue(appropriate_build.url,
                                msg="unable to find build {0}".format(version))
                for server in servers:
                    remote = RemoteMachineShellConnection(server)
                    remote.download_build(appropriate_build)
                    remote.membase_upgrade(appropriate_build,
                                           save_upgrade_config=save_upgrade_config)
                    RestHelper(RestConnection(server)).is_ns_server_running(
                        testconstants.NS_SERVER_TIMEOUT)
                    # Verify admin_creds are still set
                    pools_info = RestConnection(server).get_pools_info()
                    self.assertTrue(pools_info['implementationVersion'],
                                    appropriate_build.product_version)
                    if start_upgraded_first:
                        log.info("Starting server {0} post upgrade".format(server))
                        remote.start_membase()
                    else:
                        remote.stop_membase()
                    remote.disconnect()
                if not start_upgraded_first:
                    log.info("Starting all servers together")
                    self._start_membase_servers(servers)
                time.sleep(TIMEOUT_SECS)
                if version == "1.7.0" or version == "1.7.1":
                    self._save_config(rest_settings, master)
                if create_buckets:
                    self.assertTrue(
                        BucketOperationHelper.wait_for_bucket_creation(
                            'default', RestConnection(master)),
                        msg="bucket 'default' does not exist..")
                if insert_data:
                    self._verify_data(master, rest, inserted_keys)
    # Rolling upgrade
    else:
        version = input.test_params['version']
        appropriate_build = _get_build(servers[0], version,
                                       is_amazon=is_amazon)
        self.assertTrue(appropriate_build.url,
                        msg="unable to find build {0}".format(version))
        # Rebalance each node out, remove membase from it, install the
        # destination version, then rebalance it back into the cluster
        for server_index in range(len(servers)):
            server = servers[server_index]
            master = servers[server_index - 1]
            log.info("current master is {0}, rolling node is {1}".format(master,
                                                                         server))
            rest = RestConnection(master)
            nodes = rest.node_statuses()
            allNodes = []
            toBeEjectedNodes = []
            for node in nodes:
                allNodes.append(node.id)
                if "{0}:{1}".format(node.ip, node.port) == \
                        "{0}:{1}".format(server.ip, server.port):
                    toBeEjectedNodes.append(node.id)
            helper = RestHelper(rest)
            removed = helper.remove_nodes(knownNodes=allNodes,
                                          ejectedNodes=toBeEjectedNodes)
            self.assertTrue(removed,
                            msg="Unable to remove nodes {0}".format(toBeEjectedNodes))
            remote = RemoteMachineShellConnection(server)
            remote.membase_uninstall()
            remote.couchbase_uninstall()
            remote.download_build(appropriate_build)
            remote.membase_install(appropriate_build)
            RestHelper(rest).is_ns_server_running(testconstants.NS_SERVER_TIMEOUT)
            log.info("sleep for 10 seconds to wait for membase-server to start...")
            time.sleep(TIMEOUT_SECS)
            rest.init_cluster_port(rest_settings.rest_username,
                                   rest_settings.rest_password)
            rest.init_cluster_memoryQuota(
                memoryQuota=rest.get_nodes_self().mcdMemoryReserved)
            remote.disconnect()
            # Re-add this node to the cluster
            ClusterOperationHelper.add_all_nodes_or_assert(master, [server],
                                                           rest_settings, self)
            nodes = rest.node_statuses()
            otpNodeIds = []
            for node in nodes:
                otpNodeIds.append(node.id)
            rebalanceStarted = rest.rebalance(otpNodeIds, [])
            self.assertTrue(rebalanceStarted,
                            "unable to start rebalance on master node {0}".format(master.ip))
            log.info('started rebalance operation on master node {0}'.format(master.ip))
            rebalanceSucceeded = rest.monitorRebalance()
            self.assertTrue(rebalanceSucceeded,
                            "rebalance operation for nodes: {0} was not successful".format(otpNodeIds))
        # ClusterOperationHelper.verify_persistence(servers, self)
        # TODO: how can we verify that the cluster init config is preserved?
        # Verify data on the upgraded nodes
        if create_buckets:
            self.assertTrue(
                BucketOperationHelper.wait_for_bucket_creation(
                    'default', RestConnection(master)),
                msg="bucket 'default' does not exist..")
        if insert_data:
            self._verify_data(master, rest, inserted_keys)
            rest = RestConnection(master)
            buckets = rest.get_buckets()
            for bucket in buckets:
                BucketOperationHelper.keys_exist_or_assert(
                    bucket_data[bucket.name]["inserted_keys"],
                    master, bucket.name, self)
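# Hedged sketch: the rolling-upgrade rotation above pairs each node with the
# previous node acting as master (index -1 wraps around to the last server).
# rolling_pairs is hypothetical; it only restates that index arithmetic.
def rolling_pairs(servers):
    """Yield (master, node_to_roll) pairs in upgrade order."""
    for server_index in range(len(servers)):
        yield servers[server_index - 1], servers[server_index]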