def test_update_to_scramsha_auth(self):
    """
    Start with ordinary replication, then switch to use scram_sha_auth.
    Search for success log stmts.
    """
    old_count = NodeHelper.check_goxdcr_log(
        self.src_cluster.get_master_node(),
        "HttpAuthMech=ScramSha for remote cluster reference remote_cluster",
        timeout=60)
    self.setup_xdcr()
    # modify remote cluster ref to use scramsha
    for remote_cluster in self.src_cluster.get_remote_clusters() \
            + self.dest_cluster.get_remote_clusters():
        remote_cluster.use_scram_sha_auth()
    self.sleep(60, "wait before checking the logs for using scram-sha")
    for node in [self.src_cluster.get_master_node()] \
            + [self.dest_cluster.get_master_node()]:
        count = NodeHelper.check_goxdcr_log(
            node,
            "HttpAuthMech=ScramSha for remote cluster reference remote_cluster",
            timeout=60)
        if count <= old_count:
            self.fail("Node {0} does not use SCRAM-SHA authentication"
                      .format(node.ip))
        else:
            self.log.info("SCRAM-SHA auth successful on node {0}"
                          .format(node.ip))
    self.verify_results()

def test_checkpointing_with_full_rollback(self):
    bucket = self.src_cluster.get_buckets()[0]
    nodes = self.src_cluster.get_nodes()

    # Stop Persistence on Node A & Node B
    for node in nodes:
        mem_client = MemcachedClientHelper.direct_client(node, bucket)
        mem_client.stop_persistence()

    self.src_cluster.pause_all_replications()

    gen = BlobGenerator("C1-", "C1-", self._value_size, end=self._num_items)
    self.src_cluster.load_all_buckets_from_generator(gen)

    self.src_cluster.resume_all_replications()

    self.sleep(self._checkpoint_interval * 2)

    self.get_and_validate_latest_checkpoint()

    # Perform mutations on the bucket
    self.async_perform_update_delete()
    self.sleep(self._wait_timeout)

    # Kill memcached on Node A so that Node B becomes master
    shell = RemoteMachineShellConnection(self.src_cluster.get_master_node())
    shell.kill_memcached()

    # Start persistence on Node B
    mem_client = MemcachedClientHelper.direct_client(nodes[1], bucket)
    mem_client.start_persistence()

    # Failover Node B
    failover_task = self.src_cluster.async_failover()
    failover_task.result()

    # Wait for Failover & rollback to complete
    self.sleep(self._wait_timeout * 5)

    goxdcr_log = NodeHelper.get_goxdcr_log_dir(self._input.servers[0]) \
                 + '/goxdcr.log*'

    count1 = NodeHelper.check_goxdcr_log(
        nodes[0], "Received rollback from DCP stream", goxdcr_log)
    self.assertGreater(count1, 0,
                       "full rollback not received from DCP as expected")
    self.log.info("full rollback received from DCP as expected")

    count2 = NodeHelper.check_goxdcr_log(
        nodes[0], "Rolled back startSeqno to 0", goxdcr_log)
    self.assertGreater(count2, 0,
                       "startSeqno not rolled back to 0 as expected")
    self.log.info("startSeqno rolled back to 0 as expected")

    shell.disconnect()

def is_ssl_over_memcached(self, master):
    if not NodeHelper.check_goxdcr_log(
            master, "Trying to create a ssl over memcached connection"):
        if NodeHelper.check_goxdcr_log(
                master, "Get or create ssl over proxy connection"):
            self.log.error("SSL still uses ns_proxy connection!")
        return False
    self.log.info("SSL uses memcached after upgrade!")
    return True

def is_ssl_over_memcached(self, master):
    if not NodeHelper.check_goxdcr_log(
            master, "Try to create a ssl over memcached connection"):
        if NodeHelper.check_goxdcr_log(
                master, "Get or create ssl over proxy connection"):
            self.log.error("SSL still uses ns_proxy connection!")
        return False
    self.log.info("SSL uses memcached after upgrade!")
    return True

def test_retry_connections_on_errors_before_restart(self):
    """
    CBQE-3373: Do not restart pipeline as soon as connection errors are
    detected, backoff and retry 5 times before trying to restart pipeline.
    """
    passed = False
    # start data load after setting up xdcr
    load_tasks = self.setup_xdcr_async_load()
    goxdcr_log = NodeHelper.get_goxdcr_log_dir(self._input.servers[0]) \
                 + '/goxdcr.log*'

    # block port 11210 on target to simulate a connection error
    shell = RemoteMachineShellConnection(self.dest_master)
    out, err = shell.execute_command("/sbin/iptables -A INPUT -p tcp --dport"
                                     " 11210 -j DROP")
    shell.log_command_output(out, err)
    out, err = shell.execute_command("/sbin/iptables -L")
    shell.log_command_output(out, err)

    # complete loading
    for task in load_tasks:
        task.result()

    # wait for goxdcr to detect i/o timeout and try repairing
    self.sleep(self._wait_timeout * 5)

    # unblock port 11210 so replication can continue
    out, err = shell.execute_command("/sbin/iptables -D INPUT -p tcp --dport"
                                     " 11210 -j DROP")
    shell.log_command_output(out, err)
    out, err = shell.execute_command("/sbin/iptables -L")
    shell.log_command_output(out, err)
    shell.disconnect()

    # check logs for traces of retry attempts
    for node in self.src_cluster.get_nodes():
        count1 = NodeHelper.check_goxdcr_log(
            node, "Failed to repair connections to target cluster",
            goxdcr_log)
        count2 = NodeHelper.check_goxdcr_log(
            node, "Failed to set up connections to target cluster",
            goxdcr_log)
        count = count1 + count2
        if count > 0:
            self.log.info('SUCCESS: We tried to repair connections before'
                          ' restarting pipeline')
            passed = True
    if not passed:
        self.fail("No attempts were made to repair connections on %s before"
                  " restarting pipeline" % self.src_cluster.get_nodes())
    self.verify_results()

def _verify_bandwidth_usage(self, node, nw_limit, no_of_nodes,
                            event_time=None, nw_usage="[0-9][0-9]*",
                            end_time=None):
    # nw_max = (nw_limit * 1024 * 1024) / no_of_nodes
    if event_time:
        time_to_compare = self._extract_timestamp(event_time)
    else:
        matches, count = NodeHelper.check_goxdcr_log(
            node, "Success adding replication specification",
            print_matches=True, timeout=60)
        # Time when replication was set up
        if count > 0:
            time_to_compare = self._extract_timestamp(matches[-1])
        else:
            self.fail("Replication not successful")

    nw_max = self._extract_bandwith_quota(node)

    self.sleep(60, 'Waiting for bandwidth usage logs..')
    # Try 3 times to extract current bandwidth usage from logs
    iter = 0
    while iter < 3:
        self.sleep(30, 'Waiting for bandwidth usage logs..')
        valid_count = self._extract_bandwidth_usage(
            node, time_to_compare, nw_max, nw_usage, end_time)
        if (valid_count == 0
                and self._input.param("replication_type") == "capi") \
                or nw_limit == 0:
            self.log.info("Bandwidth Throttler not enabled on replication "
                          "as expected")
            break
        if valid_count > 0:
            break
        iter += 1
    else:
        self.fail("Bandwidth Throttler not enabled!")

    # Check if large docs are not getting stuck
    matches, src_count = NodeHelper.check_goxdcr_log(
        self.src_master, "The connection is ruined",
        print_matches=True, timeout=10)
    if src_count:
        for item in matches:
            item_datetime = self._extract_timestamp(item)
            # Ignore errors that happened before the replication was set up
            if item_datetime < time_to_compare:
                continue
            else:
                self.fail("Possibly hit MB-31765")

def is_goxdcr_migration_successful(self, server):
    count = NodeHelper.check_goxdcr_log(server,
                                        "Starting to migrate xdcr metadata")
    if count > 0:
        count = NodeHelper.check_goxdcr_log(
            server, "Metadata migration completed without errors")
        self.log.info(count)
        if count == 1:
            self.log.info("SUCCESS: Metadata migration completed "
                          "without errors")
            return True
        self.log.error("ERROR: Metadata migration was unsuccessful")
        return False
    return True

def test_verify_mb19802_1(self):
    load_tasks = self.setup_xdcr_async_load()

    goxdcr_log = NodeHelper.get_goxdcr_log_dir(self._input.servers[0]) \
                 + '/goxdcr.log*'

    conn = RemoteMachineShellConnection(self.dest_cluster.get_master_node())
    conn.stop_couchbase()

    for task in load_tasks:
        task.result()

    for node in self.src_cluster.get_nodes():
        count = NodeHelper.check_goxdcr_log(node, "batchGetMeta timed out",
                                            goxdcr_log)
        self.assertEqual(count, 0, "batchGetMeta timed out error message "
                                   "found in " + str(node.ip))
        self.log.info("batchGetMeta timed out error message not found in "
                      + str(node.ip))

    conn.start_couchbase()
    self.sleep(300)
    self.verify_results()

def _extract_bandwidth_usage(self, node, time_to_compare, nw_max, nw_usage,
                             end_time):
    valid_count = 0
    skip_count = 0
    matches, count = NodeHelper.check_goxdcr_log(
        node, "\\\"bandwidth_usage\\\": " + nw_usage,
        print_matches=True, timeout=60)
    for item in matches:
        item_datetime = self._extract_timestamp(item)
        # Ignore entries that happened before the replication was set up
        if item_datetime < time_to_compare:
            skip_count += 1
            continue
        if end_time:
            end_datetime = self._extract_timestamp(end_time)
            if item_datetime > end_datetime:
                skip_count += 1
                continue
        bandwidth_usage = int(float(
            ((item.split('{"bandwidth_usage": ')[1])
             .split(' ')[0]).rstrip(',')))
        if bandwidth_usage > nw_max:
            self.fail("Bandwidth usage {0} is higher than Bandwidth limit "
                      "{1} in {2}".format(bandwidth_usage, nw_max, item))
        self.log.info("BANDWIDTH_USAGE ={0}".format(bandwidth_usage))
        if nw_usage == "0" and bandwidth_usage != 0:
            self.fail("Expecting bandwidth usage to be 0 but it is {0}"
                      .format(bandwidth_usage))
        valid_count += 1
    self.log.info("Stale entries :{0}, Valid entries :{1}"
                  .format(skip_count, valid_count))
    return valid_count

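# The helpers above and below call self._extract_timestamp() on raw goxdcr
# log lines, but its definition does not appear in this section. Below is a
# minimal sketch, assuming log lines begin with an ISO-8601-style timestamp
# (e.g. "2018-01-01T12:00:00.000+05:30 ...") and that second precision
# suffices, mirroring the '%Y-%m-%dT%H:%M:%S' strptime format used by the
# _verify_bandwidth_usage variants below; the real helper may differ.
def _extract_timestamp(self, log_line):
    import datetime  # normally imported at module level in this suite
    # Take the first whitespace-separated token (the timestamp) and drop
    # the fractional seconds / timezone suffix before parsing.
    timestamp_str = log_line.split(' ')[0].split('.')[0]
    return datetime.datetime.strptime(timestamp_str, '%Y-%m-%dT%H:%M:%S')
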
def mutate_and_checkpoint(self, n=3, skip_validation=False):
    count = 1
    # get vb0 active source node
    active_src_node = self.get_active_vb0_node(self.src_master)
    while count <= n:
        remote_vbuuid, remote_highseqno = self.get_failover_log(
            self.dest_master)
        local_vbuuid, local_highseqno = self.get_failover_log(
            self.src_master)

        self.log.info("Local failover log: [{0}, {1}]"
                      .format(local_vbuuid, local_highseqno))
        self.log.info("Remote failover log: [{0}, {1}]"
                      .format(remote_vbuuid, remote_highseqno))
        self.log.info("################ New mutation:{0} ##################"
                      .format(self.key_counter + 1))
        self.load_one_mutation_into_source_vb0(active_src_node)
        self.sleep(60)
        if local_highseqno == "0":
            # avoid checking very first/empty checkpoint record
            count += 1
            continue
        stats_count = NodeHelper.check_goxdcr_log(
            active_src_node, "docs_checked,{0}".format(count),
            log_name="stats.log", timeout=30)
        if stats_count > 0:
            self.log.info("Checkpoint recorded as expected")
            if not skip_validation:
                self.log.info("Validating latest checkpoint")
                self.get_and_validate_latest_checkpoint()
        else:
            self.log.info("Checkpointing failed - may not be an error "
                          "if vb_uuid changed")
            return False
        count += 1
    return True

def test_capi_with_malformed_http_resp(self):
    repl_id = self._start_es_replication(
        xdcr_params={'workerBatchSize': '2000',
                     'docBatchSizeKb': '8096',
                     'targetNozzlePerNode': '64'})

    rest_conn = RestConnection(self.src_master)
    rest_conn.pause_resume_repl_by_id(repl_id, REPL_PARAM.PAUSE_REQUESTED,
                                      'true')

    gen = DocumentGenerator('es', '{{"key":"value","mutated":0}}',
                            xrange(100), start=0, end=self._num_items)
    self.src_cluster.load_all_buckets_from_generator(gen)

    rest_conn.pause_resume_repl_by_id(repl_id, REPL_PARAM.PAUSE_REQUESTED,
                                      'false')

    self._wait_for_es_replication_to_catchup()

    goxdcr_log = NodeHelper.get_goxdcr_log_dir(self.src_master) \
                 + '/goxdcr.log*'
    for node in self.src_cluster.get_nodes():
        count = NodeHelper.check_goxdcr_log(node, "malformed HTTP response",
                                            goxdcr_log)
        self.assertEqual(count, 0, "malformed HTTP response error message "
                                   "found in " + str(node.ip))
        self.log.info("malformed HTTP response error message not found in "
                      + str(node.ip))

    self._verify_es_results()

def _verify_bandwidth_usage(self, node, nw_limit=1, no_of_nodes=2,
                            event_time=None, nw_usage="[1-9][0-9]*",
                            end_time=None):
    goxdcr_log = NodeHelper.get_goxdcr_log_dir(node) + '/goxdcr.log'
    nw_max = (nw_limit * 1024 * 1024) / no_of_nodes
    if event_time:
        time_to_compare = datetime.datetime.strptime(event_time.group(),
                                                     '%Y-%m-%dT%H:%M:%S')
    else:
        matches, _ = NodeHelper.check_goxdcr_log(
            node, "Success adding replication specification",
            goxdcr_log, print_matches=True)
        # Time when replication was set up
        time_to_compare = self._extract_timestamp(matches[-1])

    matches, count = NodeHelper.check_goxdcr_log(
        node, "\\\"bandwidth_usage\\\": " + nw_usage,
        goxdcr_log, print_matches=True)
    if count < 1:
        self.fail("Bandwidth usage information not found in logs")

    match_count = 0
    skip_count = 0
    for item in matches:
        item_datetime = self._extract_timestamp(item)
        # Ignore entries that happened before the replication was set up
        if item_datetime < time_to_compare:
            skip_count += 1
            continue
        if end_time:
            end_datetime = datetime.datetime.strptime(end_time.group(),
                                                      '%Y-%m-%dT%H:%M:%S')
            if item_datetime > end_datetime:
                skip_count += 1
                continue
        bandwidth_usage = ((item.split('{"bandwidth_usage": ')[1])
                           .split(' ')[0]).rstrip(',')
        if int(float(bandwidth_usage)) < nw_max:
            match_count += 1
            continue
        else:
            self.fail("Bandwidth usage {0} is higher than Bandwidth limit "
                      "{1} in {2}".format(bandwidth_usage, nw_max, item))

    if match_count + skip_count == count:
        self.log.info("{0} stale entries skipped".format(skip_count))
        if match_count > 0:
            self.log.info("{0} entries checked - Bandwidth usage always "
                          "lower than Bandwidth limit as expected"
                          .format(match_count))
        else:
            if self._input.param("replication_type") == "capi":
                self.log.info("Bandwidth Throttler not enabled on "
                              "replication as expected")
            else:
                self.fail("Bandwidth Throttler not enabled on replication")

def _verify_bandwidth_usage(self, node, nw_limit=1, no_of_nodes=2,
                            event_time=None, nw_usage="[1-9][0-9]*",
                            end_time=None):
    goxdcr_log = NodeHelper.get_goxdcr_log_dir(node) + '/goxdcr.log'
    nw_max = (nw_limit * 1024 * 1024) / no_of_nodes
    if event_time:
        time_to_compare = time.strptime(event_time, '%Y-%m-%dT%H:%M:%S')
    else:
        matches, _ = NodeHelper.check_goxdcr_log(
            node, "Success adding replication specification",
            goxdcr_log, print_matches=True)
        time_to_compare_str = matches[-1].split(' ')[0].split('.')[0]
        time_to_compare = time.strptime(time_to_compare_str,
                                        '%Y-%m-%dT%H:%M:%S')

    matches, count = NodeHelper.check_goxdcr_log(
        node, "bandwidth_limit=" + str(nw_max) + ", bandwidth_usage="
        + nw_usage, goxdcr_log, print_matches=True)

    match_count = 0
    skip_count = 0
    for item in matches:
        items = item.split(' ')
        item_time = items[0].split('.')[0]
        item_datetime = time.strptime(item_time, '%Y-%m-%dT%H:%M:%S')
        if item_datetime < time_to_compare:
            skip_count += 1
            continue
        if end_time:
            end_datetime = time.strptime(end_time, '%Y-%m-%dT%H:%M:%S')
            if item_datetime > end_datetime:
                skip_count += 1
                continue
        bandwidth_usage = items[-1].split('=')[-1]
        if int(bandwidth_usage) <= nw_max:
            match_count += 1
            continue
        else:
            self.fail("Bandwidth usage higher than Bandwidth limit in {0}"
                      .format(item))

    if match_count + skip_count == count:
        self.log.info("{0} stale entries skipped".format(skip_count))
        if match_count > 0:
            self.log.info("{0} entries checked - Bandwidth usage always "
                          "lower than Bandwidth limit as expected"
                          .format(match_count))
        else:
            if self._input.param("replication_type") == "capi":
                self.log.info("Bandwidth Throttler not enabled on "
                              "replication as expected")
            else:
                self.fail("Bandwidth Throttler not enabled on replication")

def test_update_to_scramsha_auth(self):
    """
    Start with ordinary replication, then switch to use scram_sha_auth.
    Search for success log stmts.
    """
    old_count = NodeHelper.check_goxdcr_log(
        self.src_cluster.get_master_node(),
        "HttpAuthMech=ScramSha for remote cluster reference remote_cluster")
    self.setup_xdcr()
    # modify remote cluster ref to use scramsha
    for remote_cluster in self.src_cluster.get_remote_clusters() \
            + self.dest_cluster.get_remote_clusters():
        remote_cluster.use_scram_sha_auth()
    self.sleep(60, "wait before checking the logs for using scram-sha")
    for node in [self.src_cluster.get_master_node()] \
            + [self.dest_cluster.get_master_node()]:
        count = NodeHelper.check_goxdcr_log(
            node,
            "HttpAuthMech=ScramSha for remote cluster reference remote_cluster")
        if count <= old_count:
            self.fail("Node {0} does not use SCRAM-SHA authentication"
                      .format(node.ip))
        else:
            self.log.info("SCRAM-SHA auth successful on node {0}"
                          .format(node.ip))
    self.verify_results()

def _extract_bandwith_quota(self, node):
    matches, count = NodeHelper.check_goxdcr_log(
        node, "bandwidth_usage_quota=" + "[0-9][0-9]*",
        print_matches=True, timeout=60)
    bandwidth_quota = int(float(
        matches[-1].split('bandwidth_usage_quota=')[1].rstrip(' ')))
    return bandwidth_quota

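# _extract_bandwith_quota() assumes the quota is the last field of the
# matched log line, roughly "... bandwidth_usage_quota=655360 " (the value
# shown is hypothetical; per the nw_max computation above it is a per-node
# byte quota): everything after the '=' is taken, trailing spaces are
# stripped, and int(float(...)) converts the remaining token.
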
def incremental_offline_upgrade(self):
    upgrade_seq = self.input.param("upgrade_seq", "src>dest")
    self._install(self.servers[:self.src_init + self.dest_init])
    self.create_buckets()
    self._join_all_clusters()
    self.sleep(60)
    bucket = self.src_cluster.get_bucket_by_name('default')
    self._load_bucket(bucket, self.src_master, self.gen_create, 'create',
                      exp=0)
    bucket = self.src_cluster.get_bucket_by_name('sasl_bucket_1')
    self._load_bucket(bucket, self.src_master, self.gen_create, 'create',
                      exp=0)
    bucket = self.dest_cluster.get_bucket_by_name('sasl_bucket_1')
    gen_create2 = BlobGenerator('loadTwo', 'loadTwo', self._value_size,
                                end=self.num_items)
    self._load_bucket(bucket, self.dest_master, gen_create2, 'create', exp=0)
    self.sleep(self.wait_timeout)
    self._wait_for_replication_to_catchup()
    nodes_to_upgrade = []
    if upgrade_seq == "src>dest":
        nodes_to_upgrade = copy.copy(self.src_nodes)
        nodes_to_upgrade.extend(self.dest_nodes)
    elif upgrade_seq == "src<dest":
        nodes_to_upgrade = copy.copy(self.dest_nodes)
        nodes_to_upgrade.extend(self.src_nodes)
    elif upgrade_seq == "src><dest":
        min_cluster = min(len(self.src_nodes), len(self.dest_nodes))
        for i in xrange(min_cluster):
            nodes_to_upgrade.append(self.src_nodes[i])
            nodes_to_upgrade.append(self.dest_nodes[i])

    for _seq, node in enumerate(nodes_to_upgrade):
        self._offline_upgrade([node])
        self.sleep(60)
        bucket = self.src_cluster.get_bucket_by_name('sasl_bucket_1')
        itemPrefix = "loadThree" + _seq * 'a'
        gen_create3 = BlobGenerator(itemPrefix, itemPrefix, self._value_size,
                                    end=self.num_items)
        self._load_bucket(bucket, self.src_master, gen_create3, 'create',
                          exp=0)
        bucket = self.src_cluster.get_bucket_by_name('default')
        itemPrefix = "loadFour" + _seq * 'a'
        gen_create4 = BlobGenerator(itemPrefix, itemPrefix, self._value_size,
                                    end=self.num_items)
        self._load_bucket(bucket, self.src_master, gen_create4, 'create',
                          exp=0)
        self._wait_for_replication_to_catchup()
        self.merge_all_buckets()
        self.verify_results()
        self.sleep(self.wait_timeout * 5, "Let clusters work for some time")

    # note: [:3] is needed here - float("3.") from a [:2] slice could never
    # equal 3.1, which made this check dead code
    if float(self.initial_version[:3]) == 3.1 \
            and float(self.upgrade_versions[0][:3]) == 4.1:
        goxdcr_log = NodeHelper.get_goxdcr_log_dir(self._input.servers[0]) \
                     + '/goxdcr.log*'
        for node in self.src_cluster.get_nodes():
            count = NodeHelper.check_goxdcr_log(
                node, "Failed to repair connections to target cluster",
                goxdcr_log)
            self.assertEqual(count, 0,
                             "Failed to repair connections to target cluster "
                             "error message found in " + str(node.ip))
            self.log.info("Failed to repair connections to target cluster "
                          "error message not found in " + str(node.ip))

def get_checkpoint_call_history(self, node):
    chkpts, count = NodeHelper.check_goxdcr_log(
        node, "num_checkpoints", log_name="stats.log",
        print_matches=True, timeout=10)
    if count > 0:
        total_successful_chkpts = int(
            (chkpts[-1].split('num_checkpoints,')[1]).rstrip('},'))
    else:
        total_successful_chkpts = 0
    self.log.info(total_successful_chkpts)

    chkpts, count = NodeHelper.check_goxdcr_log(
        node, "num_failedckpts", log_name="stats.log",
        print_matches=True, timeout=10)
    if count > 0:
        total_failed_chkpts = int(
            (chkpts[-1].split('num_failedckpts,')[1]).rstrip('},'))
    else:
        total_failed_chkpts = 0

    return (total_successful_chkpts + total_failed_chkpts,
            total_successful_chkpts, total_failed_chkpts)

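# The parsing above assumes the counter sits at the tail of the matched
# stats.log line, e.g. a line ending in "...{num_checkpoints,3},"
# (sample value hypothetical): split() on the counter name leaves "3},"
# and rstrip('},') trims the brace/comma debris, leaving the bare integer.
# If other fields followed the counter on the matched line, the rstrip()
# approach would break, so the grep pattern is relied on to return lines
# in this shape.
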
def test_verify_mb19697(self):
    self.setup_xdcr_and_load()

    goxdcr_log = NodeHelper.get_goxdcr_log_dir(self._input.servers[0]) \
                 + '/goxdcr.log*'

    self.src_cluster.pause_all_replications()

    gen = BlobGenerator("C1-", "C1-", self._value_size, end=100000)
    self.src_cluster.load_all_buckets_from_generator(gen)

    self.src_cluster.resume_all_replications()
    self._wait_for_replication_to_catchup()

    gen = BlobGenerator("C1-", "C1-", self._value_size, end=100000)
    load_tasks = self.src_cluster.async_load_all_buckets_from_generator(gen)

    self.src_cluster.rebalance_out()

    for task in load_tasks:
        task.result()

    self._wait_for_replication_to_catchup()

    self.src_cluster.rebalance_in()

    gen = BlobGenerator("C1-", "C1-", self._value_size, end=100000)
    load_tasks = self.src_cluster.async_load_all_buckets_from_generator(gen)

    self.src_cluster.failover_and_rebalance_master()

    for task in load_tasks:
        task.result()

    self._wait_for_replication_to_catchup()

    for node in self.src_cluster.get_nodes():
        count = NodeHelper.check_goxdcr_log(
            node,
            "counter .+ goes backward, maybe due to the pipeline is restarted",
            goxdcr_log)
        self.assertEqual(count, 0,
                         "counter goes backward, maybe due to the pipeline "
                         "is restarted error message found in "
                         + str(node.ip))
        self.log.info("counter goes backward, maybe due to the pipeline is "
                      "restarted error message not found in " + str(node.ip))

    self.sleep(300)
    self.verify_results()

def mutate_and_check_error404(self, n=1):
    # get vb0 active source node
    active_src_node = self.get_active_vb0_node(self.src_master)
    num_404_errors_before_load = NodeHelper.check_goxdcr_log(
        active_src_node,
        "ERRO GOXDCR.CheckpointMgr: GetRemoteMemcachedConnection Operation "
        "failed after max retries", timeout=30)
    self.sleep(60)
    self.log.info("################ New mutation:{0} ##################"
                  .format(self.key_counter + 1))
    self.load_one_mutation_into_source_vb0(active_src_node)
    self.sleep(5)
    num_404_errors_after_load = NodeHelper.check_goxdcr_log(
        active_src_node,
        "ERRO GOXDCR.CheckpointMgr: GetRemoteMemcachedConnection Operation "
        "failed after max retries", timeout=30)
    if num_404_errors_after_load > num_404_errors_before_load:
        self.log.info("Topology change verified after dest "
                      "failover/rebalance out")
        return True
    else:
        self.log.info("404 errors on source node before last load : {0}, "
                      "after last load: {1}"
                      .format(num_404_errors_before_load,
                              num_404_errors_after_load))
        self.log.error("Topology change NOT recorded at source following "
                       "dest failover or rebalance!")
        return False

def get_pre_replicate_call_history(self, node):
    prerep_calls, count = NodeHelper.check_goxdcr_log(
        node, "POST /_goxdcr/_pre_replicate", log_name="http_access.log",
        timeout=10, print_matches=True)
    total_successful_prereps = 0
    if count > 0:
        for call in prerep_calls:
            call_datetime = self._extract_timestamp(call)
            # Ignore calls that happened before the test started
            if call_datetime < self.time_test_started:
                continue
            total_successful_prereps += 1
    return total_successful_prereps

def mutate_and_checkpoint(self, n=3, skip_validation=False):
    count = 1
    # get vb0 active source node
    stats_log = NodeHelper.get_goxdcr_log_dir(self._input.servers[0]) \
                + '/stats.log'
    active_src_node = self.get_active_vb0_node(self.src_master)
    while count <= n:
        remote_vbuuid, remote_highseqno = self.get_failover_log(
            self.dest_master)
        local_vbuuid, local_highseqno = self.get_failover_log(
            self.src_master)

        self.log.info("Local failover log: [{0}, {1}]"
                      .format(local_vbuuid, local_highseqno))
        self.log.info("Remote failover log: [{0}, {1}]"
                      .format(remote_vbuuid, remote_highseqno))
        self.log.info("################ New mutation:{0} ##################"
                      .format(self.key_counter + 1))
        self.load_one_mutation_into_source_vb0(active_src_node)
        self.sleep(60)
        if local_highseqno == "0":
            # avoid checking very first/empty checkpoint record
            count += 1
            continue
        end_time = time.time() + self._wait_timeout
        while time.time() < end_time:
            stats_count = NodeHelper.check_goxdcr_log(
                active_src_node, "docs_checked,{0}".format(count), stats_log)
            if stats_count > 0:
                self.log.info("Checkpoint recorded as expected")
                if not skip_validation:
                    self.log.info("Validating latest checkpoint")
                    self.get_and_validate_latest_checkpoint()
                break
            else:
                self.sleep(20, "Checkpoint not recorded yet, "
                               "will check after 20s")
        else:
            self.log.info("Checkpointing failed - may not be an error "
                          "if vb_uuid changed")
            return False
        count += 1
    return True

def test_scramsha(self):
    """
    Creates a new bi-xdcr replication with scram-sha.
    Make sure to pass use-scramsha=True from command line.
    """
    self.setup_xdcr()
    self.sleep(60, "wait before checking logs")
    for node in [self.src_cluster.get_master_node()] \
            + [self.dest_cluster.get_master_node()]:
        count = NodeHelper.check_goxdcr_log(
            node,
            "HttpAuthMech=ScramSha for remote cluster reference remote_cluster")
        if count <= 0:
            self.fail("Node {0} does not use SCRAM-SHA authentication"
                      .format(node.ip))
        else:
            self.log.info("SCRAM-SHA auth successful on node {0}"
                          .format(node.ip))
    self.verify_results()

def test_verify_mb19181(self):
    load_tasks = self.setup_xdcr_async_load()

    goxdcr_log = NodeHelper.get_goxdcr_log_dir(self._input.servers[0]) \
                 + '/goxdcr.log*'

    self.dest_cluster.failover_and_rebalance_master()

    for task in load_tasks:
        task.result()

    for node in self.src_cluster.get_nodes():
        count = NodeHelper.check_goxdcr_log(
            node, "Can't move update state from", goxdcr_log)
        self.assertEqual(count, 0, "Can't move update state from - error "
                                   "message found in " + str(node.ip))
        self.log.info("Can't move update state from - error message not "
                      "found in " + str(node.ip))

    self.verify_results()

def test_verify_mb19802_2(self):
    load_tasks = self.setup_xdcr_async_load()

    goxdcr_log = NodeHelper.get_goxdcr_log_dir(self._input.servers[0]) \
                 + '/goxdcr.log*'

    self.dest_cluster.failover_and_rebalance_master()

    for task in load_tasks:
        task.result()

    for node in self.src_cluster.get_nodes():
        count = NodeHelper.check_goxdcr_log(
            node, "batchGetMeta received fatal error and had to abort",
            goxdcr_log)
        self.assertEqual(count, 0,
                         "batchGetMeta received fatal error and had to "
                         "abort error message found in " + str(node.ip))
        self.log.info("batchGetMeta error message not found in "
                      + str(node.ip))

    self.sleep(300)
    self.verify_results()

def test_verify_mb19802_1(self):
    load_tasks = self.setup_xdcr_async_load()

    goxdcr_log = NodeHelper.get_goxdcr_log_dir(self._input.servers[0]) \
                 + '/goxdcr.log*'

    conn = RemoteMachineShellConnection(self.dest_cluster.get_master_node())
    conn.stop_couchbase()

    for task in load_tasks:
        task.result()

    for node in self.src_cluster.get_nodes():
        count = NodeHelper.check_goxdcr_log(node, "batchGetMeta timed out",
                                            goxdcr_log)
        self.assertEqual(count, 0, "batchGetMeta timed out error message "
                                   "found in " + str(node.ip))
        self.log.info("batchGetMeta timed out error message not found in "
                      + str(node.ip))

    conn.start_couchbase()
    self.verify_results()

def test_verify_mb19802_2(self):
    load_tasks = self.setup_xdcr_async_load()

    goxdcr_log = NodeHelper.get_goxdcr_log_dir(self._input.servers[0]) \
                 + '/goxdcr.log*'

    self.dest_cluster.failover_and_rebalance_master()

    for task in load_tasks:
        task.result()

    for node in self.src_cluster.get_nodes():
        count = NodeHelper.check_goxdcr_log(node, "batchGetMeta timed out",
                                            goxdcr_log)
        self.assertEqual(count, 0, "batchGetMeta timed out error message "
                                   "found in " + str(node.ip))
        self.log.info("batchGetMeta timed out error message not found in "
                      + str(node.ip))

    self.verify_results()

def test_scramsha(self):
    """
    Creates a new bi-xdcr replication with scram-sha.
    Make sure to pass use-scramsha=True from command line.
    """
    self.setup_xdcr()
    self.sleep(60, "wait before checking logs")
    for node in [self.src_cluster.get_master_node()] \
            + [self.dest_cluster.get_master_node()]:
        count = NodeHelper.check_goxdcr_log(
            node,
            "HttpAuthMech=ScramSha for remote cluster reference remote_cluster",
            timeout=60)
        if count <= 0:
            self.fail("Node {0} does not use SCRAM-SHA authentication"
                      .format(node.ip))
        else:
            self.log.info("SCRAM-SHA auth successful on node {0}"
                          .format(node.ip))
    self.verify_results()

def test_capi_with_malformed_http_resp(self):
    self.setup_xdcr()

    rest_conn = RestConnection(self.src_master)
    rest_conn.set_xdcr_param('default', 'default', 'workerBatchSize', 2000)
    rest_conn.set_xdcr_param('default', 'default', 'docBatchSizeKb', 8096)
    rest_conn.set_xdcr_param('default', 'default', 'targetNozzlePerNode', 64)

    self.src_cluster.pause_all_replications()
    gen = DocumentGenerator('es', '{{"key":"value","mutated":0}}',
                            xrange(100), start=0, end=self._num_items)
    self.src_cluster.load_all_buckets_from_generator(gen)
    self.src_cluster.resume_all_replications()

    self._wait_for_replication_to_catchup()

    goxdcr_log = NodeHelper.get_goxdcr_log_dir(self.src_master) \
                 + '/goxdcr.log*'
    for node in self.src_cluster.get_nodes():
        count = NodeHelper.check_goxdcr_log(node, "malformed HTTP response",
                                            goxdcr_log)
        self.assertEqual(count, 0, "malformed HTTP response error message "
                                   "found in " + str(node.ip))
        self.log.info("malformed HTTP response error message not found in "
                      + str(node.ip))

    self._verify_es_results()

def online_cluster_upgrade(self):
    self._install(self.servers[:self.src_init + self.dest_init])
    prev_initial_version = self.initial_version
    self.initial_version = self.upgrade_versions[0]
    self._install(self.servers[self.src_init + self.dest_init:])
    self.create_buckets()
    self._join_all_clusters()

    if float(prev_initial_version[:2]) < 3.0:
        self.pause_xdcr_cluster = None

    bucket_default = self.src_cluster.get_bucket_by_name('default')
    bucket_sasl = self.src_cluster.get_bucket_by_name('sasl_bucket_1')
    bucket_standard = self.dest_cluster.get_bucket_by_name(
        'standard_bucket_1')
    bucket_sasl_2 = self.dest_cluster.get_bucket_by_name('sasl_bucket_1')
    gen_create2 = BlobGenerator('loadTwo', 'loadTwo-', self._value_size,
                                end=self.num_items)
    gen_delete2 = BlobGenerator('loadTwo', 'loadTwo-', self._value_size,
                                start=int(self.num_items *
                                          float(100 - self._perc_del) / 100),
                                end=self.num_items)
    gen_update2 = BlobGenerator('loadTwo', 'loadTwo-', self._value_size,
                                start=0,
                                end=int(self.num_items *
                                        float(self._perc_upd) / 100))

    self._load_bucket(bucket_default, self.src_master, self.gen_create,
                      'create', exp=0)
    self._load_bucket(bucket_sasl, self.src_master, self.gen_create,
                      'create', exp=0)

    if self.pause_xdcr_cluster:
        for cluster in self.get_cb_clusters():
            for remote_cluster in cluster.get_remote_clusters():
                remote_cluster.pause_all_replications()

    self._online_upgrade(self.src_nodes,
                         self.servers[self.src_init + self.dest_init:])
    self.src_master = self.servers[self.src_init + self.dest_init]

    if not self.is_goxdcr_migration_successful(self.src_master):
        self.fail("C1: Metadata migration failed after old nodes "
                  "were removed")

    self._load_bucket(bucket_standard, self.dest_master, self.gen_create,
                      'create', exp=0)
    self._load_bucket(bucket_default, self.src_master, self.gen_update,
                      'create', exp=self._expires)
    self._load_bucket(bucket_sasl, self.src_master, self.gen_update,
                      'create', exp=self._expires)

    self._install(self.src_nodes)
    self._online_upgrade(self.servers[self.src_init + self.dest_init:],
                         self.src_nodes, False)

    self._load_bucket(bucket_sasl_2, self.dest_master, gen_create2,
                      'create', exp=0)

    self.src_master = self.servers[0]
    self.log.info("###### Upgrading C1: completed ######")

    self._install(self.servers[self.src_init + self.dest_init:])
    self.sleep(60)
    self._online_upgrade(self.dest_nodes,
                         self.servers[self.src_init + self.dest_init:])
    self.dest_master = self.servers[self.src_init + self.dest_init]

    if not self.is_goxdcr_migration_successful(self.dest_master):
        self.fail("C2: Metadata migration failed after old nodes "
                  "were removed")

    self._install(self.dest_nodes)
    self.sleep(60)

    if float(self.initial_version[:2]) >= 3.0 and self._demand_encryption:
        if not self.is_ssl_over_memcached(self.src_master):
            self.fail("C1: After old nodes were replaced, C1 still uses "
                      "proxy connection to C2 which is >= 3.0")
        if not self.is_ssl_over_memcached(self.dest_master):
            self.fail("C2: After old nodes were replaced, C2 still uses "
                      "proxy connection to C1 which is >= 3.0")

    self._online_upgrade(self.servers[self.src_init + self.dest_init:],
                         self.dest_nodes, False)
    self.dest_master = self.servers[self.src_init]
    self.log.info("###### Upgrading C2: completed ######")

    if self.pause_xdcr_cluster:
        for cluster in self.get_cb_clusters():
            for remote_cluster in cluster.get_remote_clusters():
                remote_cluster.resume_all_replications()

    self._load_bucket(bucket_default, self.src_master, self.gen_delete,
                      'delete', exp=0)
    self._load_bucket(bucket_sasl, self.src_master, self.gen_delete,
                      'delete', exp=0)
    self._load_bucket(bucket_standard, self.dest_master, self.gen_delete,
                      'delete', exp=0)
    self._load_bucket(bucket_sasl_2, self.dest_master, gen_delete2,
                      'delete', exp=0)
    self._wait_for_replication_to_catchup()

    self._post_upgrade_ops()
    self.sleep(120)
    self.verify_results()
    self.max_verify = None
    if self.ddocs_src:
        for bucket_name in self.buckets_on_src:
            bucket = self.src_cluster.get_bucket_by_name(bucket_name)
            expected_rows = sum([len(kv_store)
                                 for kv_store in bucket.kvs.values()])
            self._verify_ddocs(expected_rows, [bucket_name], self.ddocs_src,
                               self.src_master)

    if self.ddocs_dest:
        for bucket_name in self.buckets_on_dest:
            bucket = self.dest_cluster.get_bucket_by_name(bucket_name)
            expected_rows = sum([len(kv_store)
                                 for kv_store in bucket.kvs.values()])
            self._verify_ddocs(expected_rows, [bucket_name], self.ddocs_dest,
                               self.dest_master)

    if float(self.upgrade_versions[0][:3]) == 4.6:
        self.log.info("##### Testing LWW as we are upgrading to 4.6 #####")

        src_conn = RestConnection(self.src_master)
        dest_conn = RestConnection(self.dest_master)

        src_conn.delete_bucket(bucket='default')
        dest_conn.delete_bucket(bucket='default')

        src_conn.create_bucket(bucket='lww', ramQuotaMB=100, authType='none',
                               saslPassword='', replicaNumber=1,
                               proxyPort=STANDARD_BUCKET_PORT + 1,
                               bucketType='membase', replica_index=1,
                               threadsNumber=3, flushEnabled=1, lww=True)
        dest_conn.create_bucket(bucket='lww', ramQuotaMB=100,
                                authType='none', saslPassword='',
                                replicaNumber=1,
                                proxyPort=STANDARD_BUCKET_PORT + 1,
                                bucketType='membase', replica_index=1,
                                threadsNumber=3, flushEnabled=1, lww=True)

        self.assertTrue(src_conn.is_lww_enabled(bucket='lww'),
                        "LWW not enabled on source bucket")
        self.log.info("LWW enabled on source bucket as expected")
        self.assertTrue(dest_conn.is_lww_enabled(bucket='lww'),
                        "LWW not enabled on dest bucket")
        self.log.info("LWW enabled on dest bucket as expected")

    if float(self.initial_version[:3]) == 3.1 \
            and float(self.upgrade_versions[0][:3]) == 4.1:
        goxdcr_log = NodeHelper.get_goxdcr_log_dir(self._input.servers[0]) \
                     + '/goxdcr.log*'
        for node in self.src_cluster.get_nodes():
            count1 = NodeHelper.check_goxdcr_log(
                node,
                "Received error response from memcached in target cluster",
                goxdcr_log)
            count2 = NodeHelper.check_goxdcr_log(node, "EINVAL", goxdcr_log)
            count3 = NodeHelper.check_goxdcr_log(
                node, "Failed to repair connections to target cluster",
                goxdcr_log)
            if count1 > 0 or count2 > 0:
                self.assertEqual(count3, 0,
                                 "Failed to repair connections to target "
                                 "cluster error message found in "
                                 + str(node.ip))
                self.log.info("Failed to repair connections to target "
                              "cluster error message not found as expected "
                              "in " + str(node.ip))

def test_backward_compatibility(self):
    self.c1_version = self.initial_version
    self.c2_version = self.upgrade_versions[0]
    # install older version on C1
    self._install(self.servers[:self.src_init])
    # install latest version on C2
    self.initial_version = self.c2_version
    self._install(self.servers[self.src_init:])
    self.initial_version = self.c1_version
    self.create_buckets()
    # workaround for MB-15761
    if float(self.initial_version[:2]) < 3.0 and self._demand_encryption:
        rest = RestConnection(self.dest_master)
        rest.set_internalSetting('certUseSha1', "true")
        rest.regenerate_cluster_certificate()
    self._join_all_clusters()

    if float(self.c1_version[:2]) >= 3.0:
        for cluster in self.get_cb_clusters():
            for remote_cluster in cluster.get_remote_clusters():
                remote_cluster.pause_all_replications()
    self.sleep(60)
    bucket = self.src_cluster.get_bucket_by_name('default')
    self._operations()
    self._load_bucket(bucket, self.src_master, self.gen_create, 'create',
                      exp=0)
    bucket = self.src_cluster.get_bucket_by_name('sasl_bucket_1')
    self._load_bucket(bucket, self.src_master, self.gen_create, 'create',
                      exp=0)
    bucket = self.dest_cluster.get_bucket_by_name('standard_bucket_1')
    gen_create2 = BlobGenerator('loadTwo', 'loadTwo', self._value_size,
                                end=self.num_items)
    self._load_bucket(bucket, self.dest_master, gen_create2, 'create', exp=0)
    if float(self.c1_version[:2]) >= 3.0:
        for cluster in self.get_cb_clusters():
            for remote_cluster in cluster.get_remote_clusters():
                remote_cluster.resume_all_replications()
    self._wait_for_replication_to_catchup()
    if float(self.c1_version[:2]) > 2.5:
        for remote_cluster in self.src_cluster.get_remote_clusters():
            remote_cluster.modify()
        for remote_cluster in self.dest_cluster.get_remote_clusters():
            remote_cluster.modify()
    self.sleep(30)
    bucket = self.src_cluster.get_bucket_by_name('sasl_bucket_1')
    gen_create3 = BlobGenerator('loadThree', 'loadThree', self._value_size,
                                end=self.num_items)
    self._load_bucket(bucket, self.src_master, gen_create3, 'create', exp=0)
    bucket = self.dest_cluster.get_bucket_by_name('sasl_bucket_1')
    gen_create4 = BlobGenerator('loadFour', 'loadFour', self._value_size,
                                end=self.num_items)
    self._load_bucket(bucket, self.dest_master, gen_create4, 'create', exp=0)
    bucket = self.src_cluster.get_bucket_by_name('default')
    self._load_bucket(bucket, self.src_master, gen_create2, 'create', exp=0)
    self.merge_all_buckets()
    self.sleep(60)
    self._post_upgrade_ops()
    self.sleep(60)
    self.verify_results()

    # note: [:3] is needed here - float("3.") from a [:2] slice could never
    # equal 3.1, which made this check dead code
    if float(self.initial_version[:3]) == 3.1 \
            and float(self.upgrade_versions[0][:3]) == 4.1:
        goxdcr_log = NodeHelper.get_goxdcr_log_dir(self._input.servers[0]) \
                     + '/goxdcr.log*'
        for node in self.src_cluster.get_nodes():
            count = NodeHelper.check_goxdcr_log(
                node, "Failed to repair connections to target cluster",
                goxdcr_log)
            self.assertEqual(count, 0,
                             "Failed to repair connections to target cluster "
                             "error message found in " + str(node.ip))
            self.log.info("Failed to repair connections to target cluster "
                          "error message not found in " + str(node.ip))

def online_cluster_upgrade(self):
    if self.bucket_type == "ephemeral" \
            and float(self.initial_version[:3]) < 5.0:
        self.log.info("Ephemeral buckets not available in version "
                      + str(self.initial_version))
        self.skip_this_version = True
        return

    if self.initial_version[:3] >= self.upgrade_versions[0][:3]:
        self.log.info("Initial version greater than or equal to upgrade "
                      "version - not supported")
        self.skip_this_version = True
        return

    self._install(self.servers[:self.src_init + self.dest_init])
    prev_initial_version = self.initial_version
    self.initial_version = self.upgrade_versions[0]
    self._install(self.servers[self.src_init + self.dest_init:])
    self.create_buckets()
    self._join_all_clusters()

    if float(prev_initial_version[:2]) < 3.0:
        self.pause_xdcr_cluster = None

    bucket_default = self.src_cluster.get_bucket_by_name('default')
    bucket_sasl = self.src_cluster.get_bucket_by_name('sasl_bucket_1')
    bucket_standard = self.dest_cluster.get_bucket_by_name(
        'standard_bucket_1')
    bucket_sasl_2 = self.dest_cluster.get_bucket_by_name('sasl_bucket_1')
    gen_create2 = BlobGenerator('loadTwo', 'loadTwo-', self._value_size,
                                end=self.num_items)
    gen_delete2 = BlobGenerator('loadTwo', 'loadTwo-', self._value_size,
                                start=int(self.num_items *
                                          float(100 - self._perc_del) / 100),
                                end=self.num_items)
    gen_update2 = BlobGenerator('loadTwo', 'loadTwo-', self._value_size,
                                start=0,
                                end=int(self.num_items *
                                        float(self._perc_upd) / 100))

    self._load_bucket(bucket_default, self.src_master, self.gen_create,
                      'create', exp=0)
    self._load_bucket(bucket_sasl, self.src_master, self.gen_create,
                      'create', exp=0)

    if self.pause_xdcr_cluster:
        for cluster in self.get_cb_clusters():
            for remote_cluster in cluster.get_remote_clusters():
                remote_cluster.pause_all_replications()

    self._online_upgrade(self.src_nodes,
                         self.servers[self.src_init + self.dest_init:])
    self.src_master = self.servers[self.src_init + self.dest_init]

    if not self.is_goxdcr_migration_successful(self.src_master):
        self.fail("C1: Metadata migration failed after old nodes "
                  "were removed")

    # note: the version slice must be converted to float before comparing
    # against 5.0; a bare str >= float comparison never does what is intended
    if float(self.upgrade_versions[0][:3]) >= 5.0:
        # Add built-in user to C1
        testuser = [{'id': 'cbadminbucket', 'name': 'cbadminbucket',
                     'password': '******'}]
        RbacBase().create_user_source(testuser, 'builtin', self.src_master)
        self.sleep(10)

        # Assign user to role
        role_list = [{'id': 'cbadminbucket', 'name': 'cbadminbucket',
                      'roles': 'admin'}]
        RbacBase().add_user_role(role_list, RestConnection(self.src_master),
                                 'builtin')
        self.sleep(10)

    self._load_bucket(bucket_standard, self.dest_master, self.gen_create,
                      'create', exp=0)
    self._load_bucket(bucket_default, self.src_master, self.gen_update,
                      'create', exp=self._expires)
    self._load_bucket(bucket_sasl, self.src_master, self.gen_update,
                      'create', exp=self._expires)

    self._install(self.src_nodes)
    self._online_upgrade(self.servers[self.src_init + self.dest_init:],
                         self.src_nodes, False)

    self._load_bucket(bucket_sasl_2, self.dest_master, gen_create2,
                      'create', exp=0)

    self.src_master = self.servers[0]
    self.log.info("###### Upgrading C1: completed ######")

    self._install(self.servers[self.src_init + self.dest_init:])
    self.sleep(60)
    self._online_upgrade(self.dest_nodes,
                         self.servers[self.src_init + self.dest_init:])
    self.dest_master = self.servers[self.src_init + self.dest_init]

    if not self.is_goxdcr_migration_successful(self.dest_master):
        self.fail("C2: Metadata migration failed after old nodes "
                  "were removed")

    self._install(self.dest_nodes)
    self.sleep(60)

    if float(self.initial_version[:2]) >= 3.0 and self._demand_encryption:
        if not self.is_ssl_over_memcached(self.src_master):
            self.fail("C1: After old nodes were replaced, C1 still uses "
                      "proxy connection to C2 which is >= 3.0")
        if not self.is_ssl_over_memcached(self.dest_master):
            self.fail("C2: After old nodes were replaced, C2 still uses "
                      "proxy connection to C1 which is >= 3.0")

    self._online_upgrade(self.servers[self.src_init + self.dest_init:],
                         self.dest_nodes, False)
    self.dest_master = self.servers[self.src_init]

    if float(self.upgrade_versions[0][:3]) >= 5.0:
        # Add built-in user to C2
        testuser = [{'id': 'cbadminbucket', 'name': 'cbadminbucket',
                     'password': '******'}]
        RbacBase().create_user_source(testuser, 'builtin', self.dest_master)
        self.sleep(10)

        # Assign user to role
        role_list = [{'id': 'cbadminbucket', 'name': 'cbadminbucket',
                      'roles': 'admin'}]
        RbacBase().add_user_role(role_list, RestConnection(self.dest_master),
                                 'builtin')
        self.sleep(10)

    self.log.info("###### Upgrading C2: completed ######")

    if self.pause_xdcr_cluster:
        for cluster in self.get_cb_clusters():
            for remote_cluster in cluster.get_remote_clusters():
                remote_cluster.resume_all_replications()

    self._load_bucket(bucket_default, self.src_master, self.gen_delete,
                      'delete', exp=0)
    self._load_bucket(bucket_sasl, self.src_master, self.gen_delete,
                      'delete', exp=0)
    self._load_bucket(bucket_standard, self.dest_master, self.gen_delete,
                      'delete', exp=0)
    self._load_bucket(bucket_sasl_2, self.dest_master, gen_delete2,
                      'delete', exp=0)
    self._wait_for_replication_to_catchup(timeout=600)

    self._post_upgrade_ops()
    self.sleep(120)
    self.verify_results()
    self.max_verify = None
    if self.ddocs_src:
        for bucket_name in self.buckets_on_src:
            bucket = self.src_cluster.get_bucket_by_name(bucket_name)
            expected_rows = sum([len(kv_store)
                                 for kv_store in bucket.kvs.values()])
            self._verify_ddocs(expected_rows, [bucket_name], self.ddocs_src,
                               self.src_master)

    if self.ddocs_dest:
        for bucket_name in self.buckets_on_dest:
            bucket = self.dest_cluster.get_bucket_by_name(bucket_name)
            expected_rows = sum([len(kv_store)
                                 for kv_store in bucket.kvs.values()])
            self._verify_ddocs(expected_rows, [bucket_name], self.ddocs_dest,
                               self.dest_master)

    if float(self.upgrade_versions[0][:3]) == 4.6:
        self.log.info("##### Testing LWW as we are upgrading to 4.6 #####")

        src_conn = RestConnection(self.src_master)
        dest_conn = RestConnection(self.dest_master)

        src_conn.delete_bucket(bucket='default')
        dest_conn.delete_bucket(bucket='default')

        src_conn.create_bucket(bucket='lww', ramQuotaMB=100, authType='none',
                               saslPassword='', replicaNumber=1,
                               proxyPort=STANDARD_BUCKET_PORT + 1,
                               bucketType='membase', replica_index=1,
                               threadsNumber=3, flushEnabled=1, lww=True)
        dest_conn.create_bucket(bucket='lww', ramQuotaMB=100,
                                authType='none', saslPassword='',
                                replicaNumber=1,
                                proxyPort=STANDARD_BUCKET_PORT + 1,
                                bucketType='membase', replica_index=1,
                                threadsNumber=3, flushEnabled=1, lww=True)

        self.assertTrue(src_conn.is_lww_enabled(bucket='lww'),
                        "LWW not enabled on source bucket")
        self.log.info("LWW enabled on source bucket as expected")
        self.assertTrue(dest_conn.is_lww_enabled(bucket='lww'),
                        "LWW not enabled on dest bucket")
        self.log.info("LWW enabled on dest bucket as expected")

    if float(self.initial_version[:3]) == 3.1 \
            and float(self.upgrade_versions[0][:3]) == 4.1:
        goxdcr_log = NodeHelper.get_goxdcr_log_dir(self._input.servers[0]) \
                     + '/goxdcr.log*'
        for node in self.src_cluster.get_nodes():
            count1 = NodeHelper.check_goxdcr_log(
                node,
                "Received error response from memcached in target cluster",
                goxdcr_log)
            count2 = NodeHelper.check_goxdcr_log(node, "EINVAL", goxdcr_log)
            count3 = NodeHelper.check_goxdcr_log(
                node, "Failed to repair connections to target cluster",
                goxdcr_log)
            count4 = NodeHelper.check_goxdcr_log(
                node, "received error response from setMeta client. "
                      "Repairing connection. response status=EINVAL",
                goxdcr_log)
            count5 = NodeHelper.check_goxdcr_log(
                node, "GOGC in new global setting is 0, which is not a "
                      "valid value and can only have come from upgrade. "
                      "Changed it to 100 instead.",
                goxdcr_log)
            if count1 > 0 or count2 > 0:
                self.assertEqual(count3, 0,
                                 "Failed to repair connections to target "
                                 "cluster error message found in "
                                 + str(node.ip))
                self.log.info("Failed to repair connections to target "
                              "cluster error message not found as expected "
                              "in " + str(node.ip))
            self.assertEqual(count4, 0,
                             "Disconnect errors found in " + str(node.ip))
            self.assertEqual(count5, 0,
                             "GOGC reset to 0 during upgrade in "
                             + str(node.ip))

def online_cluster_upgrade(self):
    self._install(self.servers[:self.src_init + self.dest_init])
    prev_initial_version = self.initial_version
    self.initial_version = self.upgrade_versions[0]
    self._install(self.servers[self.src_init + self.dest_init:])
    self.create_buckets()
    self._join_all_clusters()

    if float(prev_initial_version[:2]) < 3.0:
        self.pause_xdcr_cluster = None

    bucket_default = self.src_cluster.get_bucket_by_name('default')
    bucket_sasl = self.src_cluster.get_bucket_by_name('sasl_bucket_1')
    bucket_standard = self.dest_cluster.get_bucket_by_name(
        'standard_bucket_1')
    bucket_sasl_2 = self.dest_cluster.get_bucket_by_name('sasl_bucket_1')
    gen_create2 = BlobGenerator('loadTwo', 'loadTwo-', self._value_size,
                                end=self.num_items)
    gen_delete2 = BlobGenerator('loadTwo', 'loadTwo-', self._value_size,
                                start=int(self.num_items *
                                          float(100 - self._perc_del) / 100),
                                end=self.num_items)
    gen_update2 = BlobGenerator('loadTwo', 'loadTwo-', self._value_size,
                                start=0,
                                end=int(self.num_items *
                                        float(self._perc_upd) / 100))

    self._load_bucket(bucket_default, self.src_master, self.gen_create,
                      'create', exp=0)
    self._load_bucket(bucket_sasl, self.src_master, self.gen_create,
                      'create', exp=0)

    if self.pause_xdcr_cluster:
        for cluster in self.get_cb_clusters():
            for remote_cluster in cluster.get_remote_clusters():
                remote_cluster.pause_all_replications()

    self._online_upgrade(self.src_nodes,
                         self.servers[self.src_init + self.dest_init:])
    self.src_master = self.servers[self.src_init + self.dest_init]

    if not self.is_goxdcr_migration_successful(self.src_master):
        self.fail("C1: Metadata migration failed after old nodes "
                  "were removed")

    self._load_bucket(bucket_standard, self.dest_master, self.gen_create,
                      'create', exp=0)
    self._load_bucket(bucket_default, self.src_master, self.gen_update,
                      'create', exp=self._expires)
    self._load_bucket(bucket_sasl, self.src_master, self.gen_update,
                      'create', exp=self._expires)

    self._install(self.src_nodes)
    self._online_upgrade(self.servers[self.src_init + self.dest_init:],
                         self.src_nodes, False)

    self._load_bucket(bucket_sasl_2, self.dest_master, gen_create2,
                      'create', exp=0)

    self.src_master = self.servers[0]
    self.log.info("###### Upgrading C1: completed ######")

    self._install(self.servers[self.src_init + self.dest_init:])
    self.sleep(60)
    self._online_upgrade(self.dest_nodes,
                         self.servers[self.src_init + self.dest_init:])
    self.dest_master = self.servers[self.src_init + self.dest_init]

    if not self.is_goxdcr_migration_successful(self.dest_master):
        self.fail("C2: Metadata migration failed after old nodes "
                  "were removed")

    self._install(self.dest_nodes)
    self.sleep(60)

    if float(self.initial_version[:2]) >= 3.0 and self._demand_encryption:
        if not self.is_ssl_over_memcached(self.src_master):
            self.fail("C1: After old nodes were replaced, C1 still uses "
                      "proxy connection to C2 which is >= 3.0")
        if not self.is_ssl_over_memcached(self.dest_master):
            self.fail("C2: After old nodes were replaced, C2 still uses "
                      "proxy connection to C1 which is >= 3.0")

    self._online_upgrade(self.servers[self.src_init + self.dest_init:],
                         self.dest_nodes, False)
    self.dest_master = self.servers[self.src_init]
    self.log.info("###### Upgrading C2: completed ######")

    if self.pause_xdcr_cluster:
        for cluster in self.get_cb_clusters():
            for remote_cluster in cluster.get_remote_clusters():
                remote_cluster.resume_all_replications()

    self._load_bucket(bucket_default, self.src_master, self.gen_delete,
                      'delete', exp=0)
    self._load_bucket(bucket_sasl, self.src_master, self.gen_delete,
                      'delete', exp=0)
    self._load_bucket(bucket_standard, self.dest_master, self.gen_delete,
                      'delete', exp=0)
    self._load_bucket(bucket_sasl_2, self.dest_master, gen_delete2,
                      'delete', exp=0)
    self._wait_for_replication_to_catchup()

    self._post_upgrade_ops()
    self.sleep(120)
    self.verify_results()
    self.max_verify = None
    if self.ddocs_src:
        for bucket_name in self.buckets_on_src:
            bucket = self.src_cluster.get_bucket_by_name(bucket_name)
            expected_rows = sum([len(kv_store)
                                 for kv_store in bucket.kvs.values()])
            self._verify_ddocs(expected_rows, [bucket_name], self.ddocs_src,
                               self.src_master)

    if self.ddocs_dest:
        for bucket_name in self.buckets_on_dest:
            bucket = self.dest_cluster.get_bucket_by_name(bucket_name)
            expected_rows = sum([len(kv_store)
                                 for kv_store in bucket.kvs.values()])
            self._verify_ddocs(expected_rows, [bucket_name], self.ddocs_dest,
                               self.dest_master)

    # note: [:3] is needed here - float("3.") from a [:2] slice could never
    # equal 3.1, which made this check dead code
    if float(self.initial_version[:3]) == 3.1 \
            and float(self.upgrade_versions[0][:3]) == 4.1:
        goxdcr_log = NodeHelper.get_goxdcr_log_dir(self._input.servers[0]) \
                     + '/goxdcr.log*'
        for node in self.src_cluster.get_nodes():
            count = NodeHelper.check_goxdcr_log(
                node, "Failed to repair connections to target cluster",
                goxdcr_log)
            self.assertEqual(count, 0,
                             "Failed to repair connections to target cluster "
                             "error message found in " + str(node.ip))
            self.log.info("Failed to repair connections to target cluster "
                          "error message not found in " + str(node.ip))

def incremental_offline_upgrade(self):
    if self.bucket_type == "ephemeral" and float(self.initial_version[:3]) < 5.0:
        self.log.info("Ephemeral buckets not available in version " + str(self.initial_version))
        self.skip_this_version = True
        return
    if self.initial_version[:3] >= self.upgrade_versions[0][:3]:
        self.log.info("Initial version greater than upgrade version - not supported")
        self.skip_this_version = True
        return
    upgrade_seq = self.input.param("upgrade_seq", "src>dest")
    self._install(self.servers[:self.src_init + self.dest_init])
    self.create_buckets()
    self._join_all_clusters()
    self.sleep(60)
    bucket = self.src_cluster.get_bucket_by_name('default')
    self._load_bucket(bucket, self.src_master, self.gen_create, 'create', exp=0)
    bucket = self.src_cluster.get_bucket_by_name('sasl_bucket_1')
    self._load_bucket(bucket, self.src_master, self.gen_create, 'create', exp=0)
    bucket = self.dest_cluster.get_bucket_by_name('sasl_bucket_1')
    gen_create2 = BlobGenerator('loadTwo', 'loadTwo', self._value_size, end=self.num_items)
    self._load_bucket(bucket, self.dest_master, gen_create2, 'create', exp=0)
    self.sleep(self.wait_timeout)
    self._wait_for_replication_to_catchup()
    nodes_to_upgrade = []
    if upgrade_seq == "src>dest":
        nodes_to_upgrade = copy.copy(self.src_nodes)
        nodes_to_upgrade.extend(self.dest_nodes)
    elif upgrade_seq == "src<dest":
        nodes_to_upgrade = copy.copy(self.dest_nodes)
        nodes_to_upgrade.extend(self.src_nodes)
    elif upgrade_seq == "src><dest":
        min_cluster = min(len(self.src_nodes), len(self.dest_nodes))
        for i in xrange(min_cluster):
            nodes_to_upgrade.append(self.src_nodes[i])
            nodes_to_upgrade.append(self.dest_nodes[i])
    for _seq, node in enumerate(nodes_to_upgrade):
        self._offline_upgrade([node])
        self.sleep(60)
        # float() added here: comparing the raw string slice against 5.0 is always
        # true under Python 2's mixed-type ordering, so the gate never worked
        if float(self.upgrade_versions[0][:3]) >= 5.0:
            # Add built-in user to C1
            testuser = [{'id': 'cbadminbucket', 'name': 'cbadminbucket', 'password': '******'}]
            RbacBase().create_user_source(testuser, 'builtin', self.src_master)
            self.sleep(10)
            # Assign user to role
            role_list = [{'id': 'cbadminbucket', 'name': 'cbadminbucket', 'roles': 'admin'}]
            RbacBase().add_user_role(role_list, RestConnection(self.src_master), 'builtin')
            self.sleep(10)
            # Add built-in user to C2
            testuser = [{'id': 'cbadminbucket', 'name': 'cbadminbucket', 'password': '******'}]
            RbacBase().create_user_source(testuser, 'builtin', self.dest_master)
            self.sleep(10)
            # Assign user to role
            role_list = [{'id': 'cbadminbucket', 'name': 'cbadminbucket', 'roles': 'admin'}]
            RbacBase().add_user_role(role_list, RestConnection(self.dest_master), 'builtin')
            self.sleep(10)
        bucket = self.src_cluster.get_bucket_by_name('sasl_bucket_1')
        itemPrefix = "loadThree" + _seq * 'a'
        gen_create3 = BlobGenerator(itemPrefix, itemPrefix, self._value_size, end=self.num_items)
        self._load_bucket(bucket, self.src_master, gen_create3, 'create', exp=0)
        bucket = self.src_cluster.get_bucket_by_name('default')
        itemPrefix = "loadFour" + _seq * 'a'
        gen_create4 = BlobGenerator(itemPrefix, itemPrefix, self._value_size, end=self.num_items)
        self._load_bucket(bucket, self.src_master, gen_create4, 'create', exp=0)
        self._wait_for_replication_to_catchup(timeout=600)
        self.merge_all_buckets()
        self.verify_results()
        self.sleep(self.wait_timeout * 5, "Let clusters work for some time")
    if float(self.initial_version[:3]) == 3.1 and float(self.upgrade_versions[0][:3]) == 4.1:
        goxdcr_log = NodeHelper.get_goxdcr_log_dir(self._input.servers[0]) \
                     + '/goxdcr.log*'
        for node in self.src_cluster.get_nodes():
            count1 = NodeHelper.check_goxdcr_log(
                node, "Received error response from memcached in target cluster", goxdcr_log)
            count2 = NodeHelper.check_goxdcr_log(node, "EINVAL", goxdcr_log)
            count3 = NodeHelper.check_goxdcr_log(
                node, "Failed to repair connections to target cluster", goxdcr_log)
            count4 = NodeHelper.check_goxdcr_log(
                node,
                "received error response from setMeta client. Repairing connection. "
                "response status=EINVAL",
                goxdcr_log)
            count5 = NodeHelper.check_goxdcr_log(
                node,
                "GOGC in new global setting is 0, which is not a valid value and can only have come from "
                "upgrade. Changed it to 100 instead.",
                goxdcr_log)
            if count1 > 0 or count2 > 0:
                self.assertEqual(count3, 0,
                                 "Failed to repair connections to target cluster "
                                 "error message found in " + str(node.ip))
                self.log.info("Failed to repair connections to target cluster "
                              "error message not found as expected in " + str(node.ip))
            self.assertEqual(count4, 0, "Disconnect errors found in " + str(node.ip))
            self.assertEqual(count5, 0, "GOGC reset to 0 during upgrade in " + str(node.ip))
def test_backward_compatibility(self):
    if self.bucket_type == "ephemeral" and float(self.initial_version[:3]) < 5.0:
        self.log.info("Ephemeral buckets not available in version " + str(self.initial_version))
        self.skip_this_version = True
        return
    self.c1_version = self.initial_version
    self.c2_version = self.upgrade_versions[0]
    if self.c1_version[:3] >= self.c2_version[:3]:
        self.log.info("Initial version greater than upgrade version - not supported")
        self.skip_this_version = True
        return
    # install older version on C1
    self._install(self.servers[:self.src_init])
    # install latest version on C2
    self.initial_version = self.c2_version
    self._install(self.servers[self.src_init:])
    self.initial_version = self.c1_version
    self.create_buckets()
    # workaround for MB-15761
    if float(self.initial_version[:2]) < 3.0 and self._demand_encryption:
        rest = RestConnection(self.dest_master)
        rest.set_internalSetting('certUseSha1', "true")
        rest.regenerate_cluster_certificate()
    self._join_all_clusters()
    if float(self.c1_version[:2]) >= 3.0:
        for cluster in self.get_cb_clusters():
            for remote_cluster in cluster.get_remote_clusters():
                remote_cluster.pause_all_replications()
    self.sleep(60)
    bucket = self.src_cluster.get_bucket_by_name('default')
    self._operations()
    self._load_bucket(bucket, self.src_master, self.gen_create, 'create', exp=0)
    bucket = self.src_cluster.get_bucket_by_name('sasl_bucket_1')
    self._load_bucket(bucket, self.src_master, self.gen_create, 'create', exp=0)
    bucket = self.dest_cluster.get_bucket_by_name('standard_bucket_1')
    gen_create2 = BlobGenerator('loadTwo', 'loadTwo', self._value_size, end=self.num_items)
    self._load_bucket(bucket, self.dest_master, gen_create2, 'create', exp=0)
    if float(self.c1_version[:2]) >= 3.0:
        for cluster in self.get_cb_clusters():
            for remote_cluster in cluster.get_remote_clusters():
                remote_cluster.resume_all_replications()
    self._wait_for_replication_to_catchup()
    if float(self.c1_version[:2]) > 2.5:
        for remote_cluster in self.src_cluster.get_remote_clusters():
            remote_cluster.modify()
        for remote_cluster in self.dest_cluster.get_remote_clusters():
            remote_cluster.modify()
    self.sleep(30)
    bucket = self.src_cluster.get_bucket_by_name('sasl_bucket_1')
    gen_create3 = BlobGenerator('loadThree', 'loadThree', self._value_size, end=self.num_items)
    self._load_bucket(bucket, self.src_master, gen_create3, 'create', exp=0)
    bucket = self.dest_cluster.get_bucket_by_name('sasl_bucket_1')
    gen_create4 = BlobGenerator('loadFour', 'loadFour', self._value_size, end=self.num_items)
    self._load_bucket(bucket, self.dest_master, gen_create4, 'create', exp=0)
    bucket = self.src_cluster.get_bucket_by_name('default')
    self._load_bucket(bucket, self.src_master, gen_create2, 'create', exp=0)
    self.merge_all_buckets()
    self.sleep(60)
    self._post_upgrade_ops()
    self.sleep(60)
    self.verify_results()
    if float(self.initial_version[:3]) == 3.1 and float(self.upgrade_versions[0][:3]) == 4.1:
        goxdcr_log = NodeHelper.get_goxdcr_log_dir(self._input.servers[0]) \
                     + '/goxdcr.log*'
        for node in self.src_cluster.get_nodes():
            count1 = NodeHelper.check_goxdcr_log(
                node, "Received error response from memcached in target cluster", goxdcr_log)
            count2 = NodeHelper.check_goxdcr_log(node, "EINVAL", goxdcr_log)
            count3 = NodeHelper.check_goxdcr_log(
                node, "Failed to repair connections to target cluster", goxdcr_log)
            count4 = NodeHelper.check_goxdcr_log(
                node,
                "received error response from setMeta client. Repairing connection. "
                "response status=EINVAL",
                goxdcr_log)
            count5 = NodeHelper.check_goxdcr_log(
                node,
                "GOGC in new global setting is 0, which is not a valid value and can only have come from "
                "upgrade. Changed it to 100 instead.",
                goxdcr_log)
            if count1 > 0 or count2 > 0:
                self.assertEqual(count3, 0,
                                 "Failed to repair connections to target cluster "
                                 "error message found in " + str(node.ip))
                self.log.info("Failed to repair connections to target cluster "
                              "error message not found as expected in " + str(node.ip))
            self.assertEqual(count4, 0, "Disconnect errors found in " + str(node.ip))
            self.assertEqual(count5, 0, "GOGC reset to 0 during upgrade in " + str(node.ip))
def offline_cluster_upgrade(self):
    # install on src and dest nodes
    self._install(self.servers[:self.src_init + self.dest_init])
    upgrade_nodes = self.input.param('upgrade_nodes', "src").split(";")
    self.create_buckets()
    self._join_all_clusters()
    if float(self.initial_version[:2]) < 3.0:
        self.pause_xdcr_cluster = None
    bucket = self.src_cluster.get_bucket_by_name('default')
    self._operations()
    self._load_bucket(bucket, self.src_master, self.gen_create, 'create', exp=0)
    bucket = self.src_cluster.get_bucket_by_name('sasl_bucket_1')
    self._load_bucket(bucket, self.src_master, self.gen_create, 'create', exp=0)
    bucket = self.dest_cluster.get_bucket_by_name('standard_bucket_1')
    gen_create2 = BlobGenerator('loadTwo', 'loadTwo', self._value_size, end=self.num_items)
    self._load_bucket(bucket, self.dest_master, gen_create2, 'create', exp=0)
    self._wait_for_replication_to_catchup()
    if self.pause_xdcr_cluster:
        for cluster in self.get_cb_clusters():
            for remote_cluster in cluster.get_remote_clusters():
                remote_cluster.pause_all_replications()
    nodes_to_upgrade = []
    if "src" in upgrade_nodes:
        nodes_to_upgrade += self.src_nodes
    if "dest" in upgrade_nodes:
        nodes_to_upgrade += self.dest_nodes
    self._offline_upgrade(nodes_to_upgrade)
    self.log.info("######### Upgrade of C1 and C2 completed ##########")
    if not self.is_goxdcr_migration_successful(self.src_master):
        self.fail("C1: Metadata migration failed after offline upgrade of C1")
    if not self.is_goxdcr_migration_successful(self.dest_master):
        self.fail("C2: Metadata migration failed after offline upgrade of C2")
    if self._use_encryption_after_upgrade and "src" in upgrade_nodes \
            and "dest" in upgrade_nodes and self.upgrade_versions[0] >= "2.5.0":
        if "src" in self._use_encryption_after_upgrade:
            for remote_cluster in self.src_cluster.get_remote_clusters():
                remote_cluster._modify()
        if "dest" in self._use_encryption_after_upgrade:
            for remote_cluster in self.dest_cluster.get_remote_clusters():
                remote_cluster._modify()
    self.sleep(60)
    if self._demand_encryption or self._use_encryption_after_upgrade:
        if not self.is_ssl_over_memcached(self.src_master):
            self.fail("C1: After old nodes were replaced, C1 still uses "
                      "ns_proxy connection to C2 which is >= 3.0")
        if not self.is_ssl_over_memcached(self.dest_master):
            self.fail("C2: After old nodes were replaced, C2 still uses "
                      "ns_proxy connection to C1 which is >= 3.0")
    bucket = self.src_cluster.get_bucket_by_name('sasl_bucket_1')
    gen_create3 = BlobGenerator('loadThree', 'loadThree', self._value_size, end=self.num_items)
    self._load_bucket(bucket, self.src_master, gen_create3, 'create', exp=0)
    bucket = self.dest_cluster.get_bucket_by_name('sasl_bucket_1')
    gen_create4 = BlobGenerator('loadFour', 'loadFour', self._value_size, end=self.num_items)
    self._load_bucket(bucket, self.dest_master, gen_create4, 'create', exp=0)
    if self.pause_xdcr_cluster:
        for cluster in self.get_cb_clusters():
            for remote_cluster in cluster.get_remote_clusters():
                remote_cluster.resume_all_replications()
    bucket = self.src_cluster.get_bucket_by_name('default')
    gen_create5 = BlobGenerator('loadFive', 'loadFive', self._value_size, end=self.num_items)
    self._load_bucket(bucket, self.src_master, gen_create5, 'create', exp=0)
    self.merge_all_buckets()
    self.sleep(60)
    self._post_upgrade_ops()
    self.sleep(60)
    self.verify_results()
    self.max_verify = None
    if self.ddocs_src:
        for bucket_name in self.buckets_on_src:
            bucket = self.src_cluster.get_bucket_by_name(bucket_name)
            expected_rows = sum([len(kv_store) for kv_store in bucket.kvs.values()])
            self._verify_ddocs(expected_rows, [bucket_name], self.ddocs_src, self.src_master)
    if self.ddocs_dest:
        for bucket_name in self.buckets_on_dest:
            bucket = self.dest_cluster.get_bucket_by_name(bucket_name)
            expected_rows = sum([len(kv_store) for kv_store in bucket.kvs.values()])
            self._verify_ddocs(expected_rows, [bucket_name], self.ddocs_dest, self.dest_master)
    # use [:3] so "3.1"/"4.1" parse as 3.1/4.1; a [:2] slice floats to 3.0/4.0
    # and the check could never fire
    if float(self.initial_version[:3]) == 3.1 and float(self.upgrade_versions[0][:3]) == 4.1:
        goxdcr_log = NodeHelper.get_goxdcr_log_dir(self._input.servers[0]) \
                     + '/goxdcr.log*'
        for node in self.src_cluster.get_nodes():
            count = NodeHelper.check_goxdcr_log(
                node, "Failed to repair connections to target cluster", goxdcr_log)
            self.assertEqual(count, 0,
                             "Failed to repair connections to target cluster "
                             "error message found in " + str(node.ip))
            self.log.info("Failed to repair connections to target cluster "
                          "error message not found in " + str(node.ip))
def offline_cluster_upgrade(self):
    # install on src and dest nodes
    self._install(self.servers[:self.src_init + self.dest_init])
    upgrade_nodes = self.input.param('upgrade_nodes', "src").split(";")
    self.create_buckets()
    self._join_all_clusters()
    if float(self.initial_version[:2]) < 3.0:
        self.pause_xdcr_cluster = None
    bucket = self.src_cluster.get_bucket_by_name('default')
    self._operations()
    self._load_bucket(bucket, self.src_master, self.gen_create, 'create', exp=0)
    bucket = self.src_cluster.get_bucket_by_name('sasl_bucket_1')
    self._load_bucket(bucket, self.src_master, self.gen_create, 'create', exp=0)
    bucket = self.dest_cluster.get_bucket_by_name('standard_bucket_1')
    gen_create2 = BlobGenerator('loadTwo', 'loadTwo', self._value_size, end=self.num_items)
    self._load_bucket(bucket, self.dest_master, gen_create2, 'create', exp=0)
    self._wait_for_replication_to_catchup()
    if self.pause_xdcr_cluster:
        for cluster in self.get_cb_clusters():
            for remote_cluster in cluster.get_remote_clusters():
                remote_cluster.pause_all_replications()
    nodes_to_upgrade = []
    if "src" in upgrade_nodes:
        nodes_to_upgrade += self.src_nodes
    if "dest" in upgrade_nodes:
        nodes_to_upgrade += self.dest_nodes
    self._offline_upgrade(nodes_to_upgrade)
    self.log.info("######### Upgrade of C1 and C2 completed ##########")
    if not self.is_goxdcr_migration_successful(self.src_master):
        self.fail("C1: Metadata migration failed after offline upgrade of C1")
    if not self.is_goxdcr_migration_successful(self.dest_master):
        self.fail("C2: Metadata migration failed after offline upgrade of C2")
    if self._use_encryption_after_upgrade and "src" in upgrade_nodes \
            and "dest" in upgrade_nodes and self.upgrade_versions[0] >= "2.5.0":
        if "src" in self._use_encryption_after_upgrade:
            for remote_cluster in self.src_cluster.get_remote_clusters():
                remote_cluster._modify()
        if "dest" in self._use_encryption_after_upgrade:
            for remote_cluster in self.dest_cluster.get_remote_clusters():
                remote_cluster._modify()
    self.sleep(60)
    if self._demand_encryption or self._use_encryption_after_upgrade:
        if not self.is_ssl_over_memcached(self.src_master):
            self.fail("C1: After old nodes were replaced, C1 still uses "
                      "ns_proxy connection to C2 which is >= 3.0")
        if not self.is_ssl_over_memcached(self.dest_master):
            self.fail("C2: After old nodes were replaced, C2 still uses "
                      "ns_proxy connection to C1 which is >= 3.0")
    bucket = self.src_cluster.get_bucket_by_name('sasl_bucket_1')
    gen_create3 = BlobGenerator('loadThree', 'loadThree', self._value_size, end=self.num_items)
    self._load_bucket(bucket, self.src_master, gen_create3, 'create', exp=0)
    bucket = self.dest_cluster.get_bucket_by_name('sasl_bucket_1')
    gen_create4 = BlobGenerator('loadFour', 'loadFour', self._value_size, end=self.num_items)
    self._load_bucket(bucket, self.dest_master, gen_create4, 'create', exp=0)
    if self.pause_xdcr_cluster:
        for cluster in self.get_cb_clusters():
            for remote_cluster in cluster.get_remote_clusters():
                remote_cluster.resume_all_replications()
    bucket = self.src_cluster.get_bucket_by_name('default')
    gen_create5 = BlobGenerator('loadFive', 'loadFive', self._value_size, end=self.num_items)
    self._load_bucket(bucket, self.src_master, gen_create5, 'create', exp=0)
    self.merge_all_buckets()
    self.sleep(60)
    self._post_upgrade_ops()
    self.sleep(60)
    self.verify_results()
    self.max_verify = None
    if self.ddocs_src:
        for bucket_name in self.buckets_on_src:
            bucket = self.src_cluster.get_bucket_by_name(bucket_name)
            expected_rows = sum([len(kv_store) for kv_store in bucket.kvs.values()])
            self._verify_ddocs(expected_rows, [bucket_name], self.ddocs_src, self.src_master)
    if self.ddocs_dest:
        for bucket_name in self.buckets_on_dest:
            bucket = self.dest_cluster.get_bucket_by_name(bucket_name)
            expected_rows = sum([len(kv_store) for kv_store in bucket.kvs.values()])
            self._verify_ddocs(expected_rows, [bucket_name], self.ddocs_dest, self.dest_master)
    if float(self.upgrade_versions[0][:3]) == 4.6:
        self.log.info("##### Testing LWW as we are upgrading to 4.6 #####")
        if "src" in upgrade_nodes:
            src_conn = RestConnection(self.src_master)
            src_conn.delete_bucket(bucket='default')
            src_conn.create_bucket(bucket='lww', ramQuotaMB=100, authType='none',
                                   saslPassword='', replicaNumber=1,
                                   proxyPort=STANDARD_BUCKET_PORT + 1,
                                   bucketType='membase', replica_index=1,
                                   threadsNumber=3, flushEnabled=1, lww=True)
            self.assertTrue(src_conn.is_lww_enabled(bucket='lww'),
                            "LWW not enabled on source bucket")
            self.log.info("LWW enabled on source bucket as expected")
        if "dest" in upgrade_nodes:
            dest_conn = RestConnection(self.dest_master)
            dest_conn.delete_bucket(bucket='default')
            dest_conn.create_bucket(bucket='lww', ramQuotaMB=100, authType='none',
                                    saslPassword='', replicaNumber=1,
                                    proxyPort=STANDARD_BUCKET_PORT + 1,
                                    bucketType='membase', replica_index=1,
                                    threadsNumber=3, flushEnabled=1, lww=True)
            self.assertTrue(dest_conn.is_lww_enabled(bucket='lww'),
                            "LWW not enabled on dest bucket")
            self.log.info("LWW enabled on dest bucket as expected")
    if float(self.initial_version[:3]) == 3.1 and float(self.upgrade_versions[0][:3]) == 4.1:
        goxdcr_log = NodeHelper.get_goxdcr_log_dir(self._input.servers[0]) \
                     + '/goxdcr.log*'
        for node in self.src_cluster.get_nodes():
            count1 = NodeHelper.check_goxdcr_log(
                node, "Received error response from memcached in target cluster", goxdcr_log)
            count2 = NodeHelper.check_goxdcr_log(node, "EINVAL", goxdcr_log)
            count3 = NodeHelper.check_goxdcr_log(
                node, "Failed to repair connections to target cluster", goxdcr_log)
            if count1 > 0 or count2 > 0:
                self.assertEqual(count3, 0,
                                 "Failed to repair connections to target cluster "
                                 "error message found in " + str(node.ip))
                self.log.info("Failed to repair connections to target cluster "
                              "error message not found as expected in " + str(node.ip))
def test_rollback(self):
    bucket = self.src_cluster.get_buckets()[0]
    nodes = self.src_cluster.get_nodes()

    # Stop Persistence on Node A & Node B
    for node in nodes:
        mem_client = MemcachedClientHelper.direct_client(node, bucket)
        mem_client.stop_persistence()

    goxdcr_log = NodeHelper.get_goxdcr_log_dir(self._input.servers[0]) \
                 + '/goxdcr.log*'
    self.setup_xdcr()

    self.src_cluster.pause_all_replications()
    gen = BlobGenerator("C1-", "C1-", self._value_size, end=self._num_items)
    self.src_cluster.load_all_buckets_from_generator(gen)
    self.src_cluster.resume_all_replications()

    # Perform mutations on the bucket
    self.async_perform_update_delete()

    rest1 = RestConnection(self.src_cluster.get_master_node())
    rest2 = RestConnection(self.dest_cluster.get_master_node())
    # Fetch count of docs in src and dest cluster
    _count1 = rest1.fetch_bucket_stats(bucket=bucket.name)["op"]["samples"]["curr_items"][-1]
    _count2 = rest2.fetch_bucket_stats(bucket=bucket.name)["op"]["samples"]["curr_items"][-1]
    self.log.info("Before rollback src cluster count = {0} dest cluster count = {1}"
                  .format(_count1, _count2))

    # Kill memcached on Node A so that Node B becomes master
    shell = RemoteMachineShellConnection(self.src_cluster.get_master_node())
    shell.kill_memcached()

    # Start persistence on Node B
    mem_client = MemcachedClientHelper.direct_client(nodes[1], bucket)
    mem_client.start_persistence()

    # Failover Node B
    failover_task = self.src_cluster.async_failover()
    failover_task.result()

    # Wait for Failover & rollback to complete
    self.sleep(60)

    # Fetch count of docs in src and dest cluster
    _count1 = rest1.fetch_bucket_stats(bucket=bucket.name)["op"]["samples"]["curr_items"][-1]
    _count2 = rest2.fetch_bucket_stats(bucket=bucket.name)["op"]["samples"]["curr_items"][-1]
    self.log.info("After rollback src cluster count = {0} dest cluster count = {1}"
                  .format(_count1, _count2))

    self.assertTrue(self.src_cluster.wait_for_outbound_mutations(),
                    "Mutations in source cluster not replicated to target after rollback")
    self.log.info("Mutations in source cluster replicated to target after rollback")

    count = NodeHelper.check_goxdcr_log(
        nodes[0], "Received rollback from DCP stream", goxdcr_log)
    self.assertGreater(count, 0, "rollback did not happen as expected")
    self.log.info("rollback happened as expected")
def incremental_offline_upgrade(self):
    upgrade_seq = self.input.param("upgrade_seq", "src>dest")
    self._install(self.servers[:self.src_init + self.dest_init])
    self.create_buckets()
    self._join_all_clusters()
    self.sleep(60)
    bucket = self.src_cluster.get_bucket_by_name('default')
    self._load_bucket(bucket, self.src_master, self.gen_create, 'create', exp=0)
    bucket = self.src_cluster.get_bucket_by_name('sasl_bucket_1')
    self._load_bucket(bucket, self.src_master, self.gen_create, 'create', exp=0)
    bucket = self.dest_cluster.get_bucket_by_name('sasl_bucket_1')
    gen_create2 = BlobGenerator('loadTwo', 'loadTwo', self._value_size, end=self.num_items)
    self._load_bucket(bucket, self.dest_master, gen_create2, 'create', exp=0)
    self.sleep(self.wait_timeout)
    self._wait_for_replication_to_catchup()
    nodes_to_upgrade = []
    if upgrade_seq == "src>dest":
        nodes_to_upgrade = copy.copy(self.src_nodes)
        nodes_to_upgrade.extend(self.dest_nodes)
    elif upgrade_seq == "src<dest":
        nodes_to_upgrade = copy.copy(self.dest_nodes)
        nodes_to_upgrade.extend(self.src_nodes)
    elif upgrade_seq == "src><dest":
        min_cluster = min(len(self.src_nodes), len(self.dest_nodes))
        for i in xrange(min_cluster):
            nodes_to_upgrade.append(self.src_nodes[i])
            nodes_to_upgrade.append(self.dest_nodes[i])
    for _seq, node in enumerate(nodes_to_upgrade):
        self._offline_upgrade([node])
        self.sleep(60)
        bucket = self.src_cluster.get_bucket_by_name('sasl_bucket_1')
        itemPrefix = "loadThree" + _seq * 'a'
        gen_create3 = BlobGenerator(itemPrefix, itemPrefix, self._value_size, end=self.num_items)
        self._load_bucket(bucket, self.src_master, gen_create3, 'create', exp=0)
        bucket = self.src_cluster.get_bucket_by_name('default')
        itemPrefix = "loadFour" + _seq * 'a'
        gen_create4 = BlobGenerator(itemPrefix, itemPrefix, self._value_size, end=self.num_items)
        self._load_bucket(bucket, self.src_master, gen_create4, 'create', exp=0)
        self._wait_for_replication_to_catchup()
        self.merge_all_buckets()
        self.verify_results()
        self.sleep(self.wait_timeout * 5, "Let clusters work for some time")
    if float(self.initial_version[:3]) == 3.1 and float(self.upgrade_versions[0][:3]) == 4.1:
        goxdcr_log = NodeHelper.get_goxdcr_log_dir(self._input.servers[0]) \
                     + '/goxdcr.log*'
        for node in self.src_cluster.get_nodes():
            count1 = NodeHelper.check_goxdcr_log(
                node, "Received error response from memcached in target cluster", goxdcr_log)
            count2 = NodeHelper.check_goxdcr_log(node, "EINVAL", goxdcr_log)
            count3 = NodeHelper.check_goxdcr_log(
                node, "Failed to repair connections to target cluster", goxdcr_log)
            if count1 > 0 or count2 > 0:
                self.assertEqual(count3, 0,
                                 "Failed to repair connections to target cluster "
                                 "error message found in " + str(node.ip))
                self.log.info("Failed to repair connections to target cluster "
                              "error message not found as expected in " + str(node.ip))
def test_rollback(self):
    goxdcr_log = NodeHelper.get_goxdcr_log_dir(self._input.servers[0]) \
                 + '/goxdcr.log*'
    self.setup_xdcr()

    self.src_cluster.pause_all_replications()
    gen = BlobGenerator("C1-", "C1-", self._value_size, end=self._num_items)
    self.src_cluster.load_all_buckets_from_generator(gen)
    self.src_cluster.resume_all_replications()

    bucket = self.src_cluster.get_buckets()[0]
    nodes = self.src_cluster.get_nodes()

    # Stop Persistence on Node A & Node B
    for node in nodes:
        mem_client = MemcachedClientHelper.direct_client(node, bucket)
        mem_client.stop_persistence()

    # Perform mutations on the bucket
    self.async_perform_update_delete()

    rest1 = RestConnection(self.src_cluster.get_master_node())
    rest2 = RestConnection(self.dest_cluster.get_master_node())
    # Fetch count of docs in src and dest cluster
    _count1 = rest1.fetch_bucket_stats(bucket=bucket.name)["op"]["samples"]["curr_items"][-1]
    _count2 = rest2.fetch_bucket_stats(bucket=bucket.name)["op"]["samples"]["curr_items"][-1]
    self.log.info("Before rollback src cluster count = {0} dest cluster count = {1}"
                  .format(_count1, _count2))

    # Kill memcached on Node A so that Node B becomes master
    shell = RemoteMachineShellConnection(self.src_cluster.get_master_node())
    shell.kill_memcached()

    # Start persistence on Node B
    mem_client = MemcachedClientHelper.direct_client(nodes[1], bucket)
    mem_client.start_persistence()

    # Failover Node B
    failover_task = self.src_cluster.async_failover()
    failover_task.result()

    # Wait for Failover & rollback to complete
    self.sleep(60)

    # Fetch count of docs in src and dest cluster
    _count1 = rest1.fetch_bucket_stats(bucket=bucket.name)["op"]["samples"]["curr_items"][-1]
    _count2 = rest2.fetch_bucket_stats(bucket=bucket.name)["op"]["samples"]["curr_items"][-1]
    self.log.info("After rollback src cluster count = {0} dest cluster count = {1}"
                  .format(_count1, _count2))

    self.assertTrue(self.src_cluster.wait_for_outbound_mutations(),
                    "Mutations in source cluster not replicated to target after rollback")
    self.log.info("Mutations in source cluster replicated to target after rollback")

    count = NodeHelper.check_goxdcr_log(
        nodes[0], "Received rollback from DCP stream", goxdcr_log)
    self.assertGreater(count, 0, "rollback did not happen as expected")
    self.log.info("rollback happened as expected")
def test_backward_compatibility(self):
    self.c1_version = self.initial_version
    self.c2_version = self.upgrade_versions[0]
    # install older version on C1
    self._install(self.servers[:self.src_init])
    # install latest version on C2
    self.initial_version = self.c2_version
    self._install(self.servers[self.src_init:])
    self.initial_version = self.c1_version
    self.create_buckets()
    # workaround for MB-15761
    if float(self.initial_version[:2]) < 3.0 and self._demand_encryption:
        rest = RestConnection(self.dest_master)
        rest.set_internalSetting('certUseSha1', "true")
        rest.regenerate_cluster_certificate()
    self._join_all_clusters()
    if float(self.c1_version[:2]) >= 3.0:
        for cluster in self.get_cb_clusters():
            for remote_cluster in cluster.get_remote_clusters():
                remote_cluster.pause_all_replications()
    self.sleep(60)
    bucket = self.src_cluster.get_bucket_by_name('default')
    self._operations()
    self._load_bucket(bucket, self.src_master, self.gen_create, 'create', exp=0)
    bucket = self.src_cluster.get_bucket_by_name('sasl_bucket_1')
    self._load_bucket(bucket, self.src_master, self.gen_create, 'create', exp=0)
    bucket = self.dest_cluster.get_bucket_by_name('standard_bucket_1')
    gen_create2 = BlobGenerator('loadTwo', 'loadTwo', self._value_size, end=self.num_items)
    self._load_bucket(bucket, self.dest_master, gen_create2, 'create', exp=0)
    if float(self.c1_version[:2]) >= 3.0:
        for cluster in self.get_cb_clusters():
            for remote_cluster in cluster.get_remote_clusters():
                remote_cluster.resume_all_replications()
    self._wait_for_replication_to_catchup()
    if float(self.c1_version[:2]) > 2.5:
        for remote_cluster in self.src_cluster.get_remote_clusters():
            remote_cluster.modify()
        for remote_cluster in self.dest_cluster.get_remote_clusters():
            remote_cluster.modify()
    self.sleep(30)
    bucket = self.src_cluster.get_bucket_by_name('sasl_bucket_1')
    gen_create3 = BlobGenerator('loadThree', 'loadThree', self._value_size, end=self.num_items)
    self._load_bucket(bucket, self.src_master, gen_create3, 'create', exp=0)
    bucket = self.dest_cluster.get_bucket_by_name('sasl_bucket_1')
    gen_create4 = BlobGenerator('loadFour', 'loadFour', self._value_size, end=self.num_items)
    self._load_bucket(bucket, self.dest_master, gen_create4, 'create', exp=0)
    bucket = self.src_cluster.get_bucket_by_name('default')
    self._load_bucket(bucket, self.src_master, gen_create2, 'create', exp=0)
    self.merge_all_buckets()
    self.sleep(60)
    self._post_upgrade_ops()
    self.sleep(60)
    self.verify_results()
    if float(self.initial_version[:3]) == 3.1 and float(self.upgrade_versions[0][:3]) == 4.1:
        goxdcr_log = NodeHelper.get_goxdcr_log_dir(self._input.servers[0]) \
                     + '/goxdcr.log*'
        for node in self.src_cluster.get_nodes():
            count1 = NodeHelper.check_goxdcr_log(
                node, "Received error response from memcached in target cluster", goxdcr_log)
            count2 = NodeHelper.check_goxdcr_log(node, "EINVAL", goxdcr_log)
            count3 = NodeHelper.check_goxdcr_log(
                node, "Failed to repair connections to target cluster", goxdcr_log)
            if count1 > 0 or count2 > 0:
                self.assertEqual(count3, 0,
                                 "Failed to repair connections to target cluster "
                                 "error message found in " + str(node.ip))
                self.log.info("Failed to repair connections to target cluster "
                              "error message not found as expected in " + str(node.ip))
def _verify_bandwidth_usage(self, node, nw_limit=1, no_of_nodes=2, event_time=None,
                            nw_usage="[1-9][0-9]*", end_time=None):
    goxdcr_log = NodeHelper.get_goxdcr_log_dir(node) + '/goxdcr.log'
    # per-node byte budget: the MB/s limit is split evenly across the nodes
    nw_max = (nw_limit * 1024 * 1024) / no_of_nodes

    if event_time:
        time_to_compare = time.strptime(event_time, '%Y-%m-%dT%H:%M:%S')
    else:
        matches, _ = NodeHelper.check_goxdcr_log(
            node, "Success adding replication specification",
            goxdcr_log, print_matches=True)
        # timestamp of the last "replication added" entry marks the start of
        # the window we care about
        time_to_compare_str = matches[-1].split(' ')[0].split('.')[0]
        time_to_compare = time.strptime(time_to_compare_str, '%Y-%m-%dT%H:%M:%S')

    matches, count = NodeHelper.check_goxdcr_log(
        node, "bandwidth_limit=" + str(nw_max) + ", bandwidth_usage=" + nw_usage,
        goxdcr_log, print_matches=True)

    match_count = 0
    skip_count = 0
    for item in matches:
        items = item.split(' ')
        item_time = items[0].split('.')[0]
        item_datetime = time.strptime(item_time, '%Y-%m-%dT%H:%M:%S')
        # skip stale entries logged before the replication was created
        if item_datetime < time_to_compare:
            skip_count += 1
            continue
        if end_time:
            end_datetime = time.strptime(end_time, '%Y-%m-%dT%H:%M:%S')
            if item_datetime > end_datetime:
                skip_count += 1
                continue
        bandwidth_usage = items[-1].split('=')[-1]
        if int(bandwidth_usage) <= nw_max:
            match_count += 1
            continue
        else:
            self.fail("Bandwidth usage higher than Bandwidth limit in {0}".format(item))

    if match_count + skip_count == count:
        self.log.info("{0} stale entries skipped".format(skip_count))
        if match_count > 0:
            self.log.info("{0} entries checked - Bandwidth usage always lower than "
                          "Bandwidth limit as expected".format(match_count))
        else:
            if self._input.param("replication_type") == "capi":
                self.log.info("Bandwidth Throttler not enabled on replication as expected")
            else:
                self.fail("Bandwidth Throttler not enabled on replication")
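# Illustrative usage sketch (an assumption, not part of the original suite): a test
# enforcing a 1 MB/s network limit split across 2 nodes might drive the helper above
# as below. set_xdcr_param is a hypothetical setter standing in for however the
# suite configures the XDCR networkUsageLimit; the setup/load helpers mirror the
# patterns used elsewhere in this file.
def test_nwusage_sketch(self):
    self.setup_xdcr()
    nw_limit = self._input.param("nw_limit", 1)  # MB/s cap for the replication
    self.src_cluster.set_xdcr_param("networkUsageLimit", nw_limit)  # hypothetical
    gen = BlobGenerator("nwOne", "nwOne", self._value_size, end=self._num_items)
    self.src_cluster.load_all_buckets_from_generator(gen)
    self._wait_for_replication_to_catchup()
    # _verify_bandwidth_usage scans goxdcr.log entries newer than the
    # replication-creation timestamp and fails if any bandwidth_usage sample
    # exceeds nw_limit * 1024 * 1024 / no_of_nodes bytes
    self._verify_bandwidth_usage(self.src_cluster.get_master_node(),
                                 nw_limit=nw_limit, no_of_nodes=2)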