def clear_host_from_docker(self):
    self.docker_manager = DockerManager(self.config, self.ssh)
    containers_count = self.docker_manager.print_and_terminate_containers(True)
    # tests require a host without running docker containers
    tiden_assert(containers_count == 0, 'Tests require no dockers on host')
    self.docker_manager.remove_containers(image_name='openmicroscopy/apacheds')
    self.docker_manager.remove_images(name='openmicroscopy/apacheds:latest')

def assert_no_errors_in_utility_output(self, tx_check=False, reverse=False):
    if tx_check:
        self.cu.control_utility('--tx', reverse=reverse)
        tiden_assert('Error' not in self.cu.latest_utility_output,
                     'Error found in control.sh utility output')

    self.cu.control_utility('--cache', 'idle_verify', reverse=reverse)
    tiden_assert(self.no_idle_verify_conflicts_msg in self.cu.latest_utility_output,
                 'idle_verify is expected to report no conflicts')

def generate_dr_topology(self, cluster_count, server_node_per_cluster, client_node_per_cluster):
    nodes_count_per_cluster = len(self.tiden.config['environment']['server_hosts']) * \
                              self.tiden.config['environment']['servers_per_host']
    tiden_assert(
        server_node_per_cluster + client_node_per_cluster <= nodes_count_per_cluster,
        '(server_node_per_cluster + client_node_per_cluster) <= (server_hosts * servers_per_host)')

    clusters = []
    for counter in range(1, cluster_count + 1):
        cluster = Cluster(counter, self.tiden.config)
        cluster.add_nodes(server_node_per_cluster, 'server')
        cluster.add_nodes(client_node_per_cluster, 'client')
        clusters.append(cluster)
    return clusters

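# Usage sketch (hypothetical invocation, not part of this suite): a DR test would
# typically build two symmetric clusters and then bind each Cluster to a started grid.
#
#     self.clusters = self.generate_dr_topology(cluster_count=2,
#                                               server_node_per_cluster=2,
#                                               client_node_per_cluster=1)
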
def zookeeper_fail_test(self, scenario, expecting_broken_cluster=False):
    node_connection = self.get_server_connections()
    try:
        with PiClient(self.ignite, self.get_client_config()):
            with TransactionalLoading(self, skip_consistency_check=True):
                util_sleep_for_a_while(10, msg='Wait until load started')

                self.zookeeper_fail_scenario(scenario)

                self.su.snapshot_utility('SNAPSHOT', '-type=full')
                self.ignite.kill_node(2)
                util_sleep_for_a_while(60, msg='Wait after zookeeper issue')
                self.ignite.start_node(2)

                for node_id in node_connection.keys():
                    tiden_assert(self.ignite.check_node_is_alive(node_id),
                                 "Node {} is expected to be alive".format(node_id))

                if expecting_broken_cluster:
                    tiden_assert(False, 'splitting all zookeeper hosts is expected to break the cluster')
    except Exception as e:
        if expecting_broken_cluster:
            util_sleep_for_a_while(60, msg='Wait all nodes segmented')
            for node_id in node_connection.keys():
                tiden_assert(not self.ignite.check_node_is_alive(node_id),
                             "Node {} is expected to be dead".format(node_id))
        else:
            raise e

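# Usage sketch (scenario names below are hypothetical; the real values are whatever
# zookeeper_fail_scenario accepts):
#
#     self.zookeeper_fail_test('stop_single_zk_node')   # cluster expected to survive
#     self.zookeeper_fail_test('stop_all_zk_nodes', expecting_broken_cluster=True)
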
def server_check_cluster_behaviour(self, node_segmented_group):
    """
    Given the group of nodes expected to be segmented:
    1. check that all nodes in this group are dead
    2. start all of these nodes
    3. wait for the rebalance timeout
    4. check that there was no data corruption:
        - call idle_verify
        - run some load
        - call idle_verify again and check transactions

    :param node_segmented_group: group of nodes expected to be dead
    """
    # check all nodes are dead
    for node_id in node_segmented_group.keys():
        tiden_assert(
            not self.ignite.check_node_is_alive(node_id),
            "Node {} is expected to be dead".format(node_segmented_group.get(node_id)))

    second_hosts_node_ids = [int(node) for node in node_segmented_group.keys()]

    # start all nodes and wait for rebalance to complete
    self.ignite.start_nodes(*second_hosts_node_ids, force=True)
    util_sleep_for_a_while(90, msg='Wait rebalance timeout')

    # check that idle_verify does not return any errors
    self.assert_no_errors_in_utility_output()

    # check again under some load
    with PiClient(self.ignite, self.get_client_config()):
        with TransactionalLoading(self, skip_consistency_check=True):
            util_sleep_for_a_while(15, msg='Little load')

    util_sleep_for_a_while(5, msg='Wait after load')
    self.assert_no_errors_in_utility_output(tx_check=True)

def calculate_checksum_and_validate(self, last_loaded_key, iteration=0):
    diff_keys = {}

    client_config = Ignite.config_builder.get_config(
        'client', config_set_name='cluster_1_node_without_dr')
    with PiClient(self.clusters[0].grid, client_config, new_instance=True) as piclient:
        checksum_master = create_distributed_checksum_operation().evaluate()

        if COLLECT_KEYS:
            cache_names = piclient.get_ignite().cacheNames().toArray()
            diff_keys['master'] = {}
            for cache in cache_names:
                diff_keys['master'][cache] = []
                iterator = piclient.get_ignite().cache(cache).iterator()
                while iterator.hasNext():
                    diff_keys['master'][cache].append(iterator.next().getKey())
                diff_keys['master'][cache].sort()

    client_config = Ignite.config_builder.get_config(
        'client', config_set_name='cluster_2_node_without_dr')
    with PiClient(self.clusters[1].grid, client_config, new_instance=True) as piclient:
        checksum_slave = create_distributed_checksum_operation().evaluate()

        if COLLECT_KEYS:
            cache_names = piclient.get_ignite().cacheNames().toArray()
            diff_keys['replica'] = {}
            for cache in cache_names:
                diff_keys['replica'][cache] = []
                iterator = piclient.get_ignite().cache(cache).iterator()
                while iterator.hasNext():
                    diff_keys['replica'][cache].append(iterator.next().getKey())
                diff_keys['replica'][cache].sort()

    if checksum_master != checksum_slave:
        print(f"Checksum master\n{checksum_master}")
        print(f"Checksum slave\n{checksum_slave}")

        comparison = ''.join(difflib.Differ().compare(
            checksum_master.splitlines(True), checksum_slave.splitlines(True)))

        if iteration > 3:
            print(comparison)
            tiden_assert(False, 'Unable to get checksum equality on both clusters after FST')

        if iteration > 2:
            if DO_FST_ON_DIFFERENCE:
                # caches that show up only on the master side of the diff
                caches_with_diff = []
                for line in comparison.split('\n'):
                    val = None
                    if line.startswith('-'):
                        m = search('Cache.*\'(.*)\'.rows', line)
                        if m:
                            val = m.group(1)
                    if val:
                        caches_with_diff.append(val)

                log_print('Difference detected in some caches.', color='red')
                log_print(f'Starting clear() caches: {caches_with_diff}.', color='red')

                client_config = Ignite.config_builder.get_config(
                    'client', config_set_name='cluster_2_node_without_dr')
                with PiClient(self.clusters[1].grid, client_config, new_instance=True) as piclient:
                    ignite = piclient.get_ignite()
                    for cache in caches_with_diff:
                        ignite.cache(cache).clear()

                util_sleep_for_a_while(RECHECK_CHECKSUM_TIMEOUT)

                log_print(f'Starting full state transfer on caches: {caches_with_diff}.', color='red')
                client_config = Ignite.config_builder.get_config(
                    'client', config_set_name='cluster_1_node_without_dr')
                with PiClient(self.clusters[0].grid, client_config, new_instance=True) as piclient:
                    ignite = piclient.get_ignite()
                    try:
                        futs = []
                        for cache in caches_with_diff:
                            # TODO https://ggsystems.atlassian.net/browse/GG-22669
                            ignite.cache(cache)
                            futs.append(ignite.plugin('GridGain').dr().stateTransfer(cache, bytes([2])))
                        for fut in futs:
                            fut.get()
                    except Exception as e:
                        log_print('Exception caught on FST\n{}'.format(e), color='red')
                        log_print('Going to restart replication with FST', color='yellow')
                        futs = []
                        for cache in caches_with_diff:
                            ignite.cache(cache)
                            ignite.plugin("GridGain").dr().startReplication(cache)
                            futs.append(ignite.plugin('GridGain').dr().stateTransfer(cache, bytes([2])))
                        for fut in futs:
                            fut.get()

                util_sleep_for_a_while(FST_TIMEOUT)

        log_print(f'Going to collect checksum again after timeout - {RECHECK_CHECKSUM_TIMEOUT} seconds',
                  color='red')
        util_sleep_for_a_while(RECHECK_CHECKSUM_TIMEOUT)

        return self.calculate_checksum_and_validate(last_loaded_key, iteration + 1)

    return checksum_master, checksum_slave

def verify_cluster(self, cluster_to_verify_id, nodes_before, last_loaded_key=None):
    client_config = Ignite.config_builder.get_config(
        'client', config_set_name='cluster_1_node_without_dr')
    servers = 0
    ignite = self.clusters[cluster_to_verify_id].grid

    for i in range(3):
        for res in ignite.last_topology_snapshot():
            if res['servers'] > servers:
                servers = res['servers']
            else:
                break
        util_sleep_for_a_while(5)

    if nodes_before != servers:
        log_print(
            f"There are missing nodes on cluster: Nodes in cluster: {servers} expecting {nodes_before}",
            color='yellow')

        self.verify_no_meaning_errors()

        log_print("Wait for topology messages again.", color='yellow')
        for node_id in ignite.get_all_default_nodes():
            ignite.update_started_node_status(node_id)

        log_print("Missing nodes case confirmed. Trying to restart node.", color='red')

        current_cluster_nodes = ignite.get_nodes_num('server')
        if nodes_before != current_cluster_nodes:
            log_print(f"Current nodes in cluster {current_cluster_nodes}")

            nodes_to_start = []
            for node_id in ignite.get_alive_default_nodes():
                # if the node is reported dead, schedule it for restart
                if not ignite.check_node_status(node_id):
                    log_print("Restarting node %s" % node_id, color='yellow')
                    nodes_to_start.append(node_id)

            log_print(f"Going to restart nodes: {nodes_to_start}", color='debug')
            for node_id in nodes_to_start:
                ignite.start_node(node_id, skip_nodes_check=True, check_only_servers=True)

            current_cluster_nodes = ignite.get_nodes_num('server')
            if nodes_before != current_cluster_nodes:
                log_print(
                    f"Current amount of nodes in cluster: {current_cluster_nodes}, expecting {nodes_before}",
                    color='debug')
                for node_id in ignite.get_alive_default_nodes():
                    self.util_get_threads_from_jstack(ignite, node_id, "FAILED")
                assert False, "Failed to restart node"

    ignite.cu.control_utility('--activate')

    activate_failed = False
    log_print('Check that there is no Error in activate logs', color='yellow')
    if 'Error' in ignite.cu.latest_utility_output:
        activate_failed = True
        log_print('Failed!', color='red')
    sleep(5)

    ignite.cu.control_utility('--baseline')
    self.verify_no_meaning_errors()

    log_print('Check that there is no Error in control.sh --baseline logs', color='yellow')
    if 'Error' in ignite.cu.latest_utility_output:
        log_print('Failed! Second try after sleeping 60 seconds', color='red')
        sleep(60)
        ignite.cu.control_utility('--baseline')
        if 'Error' in ignite.cu.latest_utility_output or activate_failed:
            log_print('Cluster looks hung.')

    log_print('Check that there is no AssertionError in logs', color='yellow')
    self.verify_no_meaning_errors()

    if last_loaded_key:
        try:
            new_last_key = last_loaded_key - int(random.uniform(0, 1) * LOAD_DATA_SIZE)
            log_print(
                f'Trying to remove data from survivor caches ({new_last_key}, {last_loaded_key})',
                color='yellow')
            PiClientIgniteUtils.remove_data(
                ignite,
                client_config,
                start_key=new_last_key,
                end_key=last_loaded_key,
                check_clients=False,
            )
            last_loaded_key = new_last_key
        except Exception:
            for node_id in ignite.get_alive_default_nodes():
                self.util_get_threads_from_jstack(ignite, node_id, "FAILED")
            assert False, "Unable to connect client"
        finally:
            self.verify_no_meaning_errors()

    util_sleep_for_a_while(MINI_DR_STABILIZATION_TIMEOUT)

    checksum_master, checksum_slave = self.calculate_checksum_and_validate(last_loaded_key)
    tiden_assert(checksum_master == checksum_slave, 'Hash sum master and slave should be equal')

    return last_loaded_key

def run_stress_restarts(self, cluster_id_to_restart, iterations, nodes_to_restart, time_to_sleep_range):
    client_config = self.preconfigure_cluster_0()

    with PiClient(self.clusters[0].grid, client_config, jvm_options=['-ea']) as piclient:
        ignite = piclient.get_ignite()

        self.start_dynamic_caches_with_node_filter(client_config)

        last_loaded_key = START_DATA_SIZE
        PiClientIgniteUtils.load_data_with_putall(self.clusters[0].grid,
                                                  client_config,
                                                  end_key=last_loaded_key,
                                                  jvm_options=['-ea'],
                                                  check_clients=False)

        util_sleep_for_a_while(MINI_DR_STABILIZATION_TIMEOUT)

        nodes_before = 6

        last_loaded_key += 1
        for i in range(iterations):
            log_print(f'Current iteration {i + 1} from {iterations}', color='debug')

            sleep_for_time = random.uniform(time_to_sleep_range[0], time_to_sleep_range[1])
            log_print(
                f'In this run we are going to sleep for {sleep_for_time} seconds after each node restart',
                color='green')

            log_print('Trying to load data into created/existing caches', color='yellow')
            self.start_dynamic_caches_with_node_filter(client_config)

            PiClientIgniteUtils.load_data_with_putall(self.clusters[0].grid,
                                                      client_config,
                                                      start_key=last_loaded_key,
                                                      end_key=last_loaded_key + LOAD_DATA_SIZE,
                                                      jvm_options=['-ea'],
                                                      check_clients=False)
            last_loaded_key += LOAD_DATA_SIZE

            self.increment_atomic(ignite)

            log_print("Round restart")
            for node_id in nodes_to_restart:
                self.clusters[cluster_id_to_restart].grid.kill_node(node_id)
                self.clusters[cluster_id_to_restart].grid.start_node(node_id, skip_topology_check=True)
                sleep(sleep_for_time)

            log_print("Wait for topology messages")
            for node_id in nodes_to_restart:
                self.clusters[cluster_id_to_restart].grid.update_started_node_status(node_id)

            util_sleep_for_a_while(MINI_DR_STABILIZATION_TIMEOUT)

            last_loaded_key = self.verify_cluster(0, nodes_before, last_loaded_key)

    util_sleep_for_a_while(DR_STABILIZATION_TIMEOUT)

    checksum_master1, checksum_slave1 = self.calculate_checksum_and_validate(last_loaded_key)
    tiden_assert(checksum_master1 == checksum_slave1, 'Hash sum master and slave do not match')

    self.put_data(self.clusters[1], 1, 'cluster_2_node_without_dr')

    util_sleep_for_a_while(MINI_DR_STABILIZATION_TIMEOUT)

    checksum_master2, checksum_slave2 = self.calculate_checksum_and_validate(last_loaded_key)
    tiden_assert(checksum_master2 == checksum_slave2, 'Hash sum master and slave do not match')

def smoke(self):
    cu_master = ControlUtility(self.clusters[0].grid)
    cu_replica = ControlUtility(self.clusters[1].grid)
    master = self.clusters[0].grid
    replica = self.clusters[1].grid
    master.jmx.start_utility()
    replica.jmx.start_utility()
    known_issues = []

    with PiClient(master, self.master_client_config) as piclient_master:
        caches = list(piclient_master.get_ignite().cacheNames().toArray())

        # start stopped caches
        for cache in caches:
            if master.jmx.dr_status(cache, node_id=1)['DrStatus'] != 'Active':
                master.jmx.dr_start(cache, node_id=1)
            if replica.jmx.dr_status(cache, node_id=1)['DrStatus'] != 'Active':
                replica.jmx.dr_start(cache, node_id=1)

    try:
        # topology
        all_topology_commands = '--dr', 'topology', '--sender-hubs', '--receiver-hubs', '--data-nodes', '--other-nodes'
        actual_topology_master = cu_master.control_utility(*all_topology_commands).dr().parse()
        actual_topology_replica = cu_replica.control_utility(*all_topology_commands).dr().parse()
        expected_topology = self.get_topology_data()
        tiden_assert_equal(expected_topology[1], actual_topology_master, 'master topology')
        tiden_assert_equal(expected_topology[2], actual_topology_replica, 'replica topology')
    except DRControlUtilityException:
        message = 'Known issue: GG-24679 - DR: control.sh --dr topology throws an exception during topology change'
        known_issues.append(message)
        log_print(message, color='red')

    try:
        # state
        actual_state_master = cu_master.control_utility('--dr', 'state').dr().parse()
        actual_state_replica = cu_replica.control_utility('--dr', 'state').dr().parse()
        tiden_assert_equal({'dc_id': '1', 'receiver_caches': '120'}, actual_state_master, 'master state')
        tiden_assert_equal({'dc_id': '2', 'receiver_caches': '120'}, actual_state_replica, 'replica state')

        # sender groups
        actual_state_verb_master = cu_master.control_utility('--dr', 'state', '--verbose').dr().parse()
        actual_state_verb_replica = cu_replica.control_utility('--dr', 'state', '--verbose').dr().parse()
        tiden_assert_equal({'dc_id': '1', 'sender_groups': 'dr, group1'}, actual_state_verb_master,
                           'master state verbose')
        tiden_assert_equal({'dc_id': '2', 'sender_groups': 'dr, group1'}, actual_state_verb_replica,
                           'replica state verbose')
    except AssertionError:
        message = 'Known issue: GG-24460 - DR: control.sh --dr state throws exceptions on client senders/receivers'
        known_issues.append(message)
        log_print(message, color='red')

    # node
    try:
        for cluster in self.clusters:
            for node_id, node in cluster.grid.nodes.items():
                if node['status'] == NodeStatus.STARTED and node_id < 100:
                    actual_node = ControlUtility(cluster.grid).control_utility(
                        '--dr', 'node', node['id'], '--config', '--metrics').dr().parse()
                    tiden_assert_equal({'addresses': node['host'],
                                        'mode': 'Server, Baseline node',
                                        'streamer_pool_size': '16',
                                        'thread_pool_size': '4',
                                        'dc_id': str(cluster.id)},
                                       actual_node,
                                       f'cluster {cluster.id} node {node_id} info')
    except DRControlUtilityException:
        message = 'Known issue: GG-24463 - DR: control.sh --dr node failed to execute on client sender/receiver node'
        known_issues.append(message)
        log_print(message, color='red')

    # cache config
    actual_cache_metrics = cu_master.control_utility('--dr', 'cache', '.+', '--config').dr().parse()
    tiden_assert_equal(120, len(actual_cache_metrics['sender_configuration']), 'configs sender count')
    tiden_assert_equal(120, len(actual_cache_metrics['receiver_configuration']), 'configs receiver count')
    actual_cache_metrics = cu_replica.control_utility('--dr', 'cache', '.+', '--config').dr().parse()
    tiden_assert_equal(120, len(actual_cache_metrics['sender_configuration']), 'configs sender count')
    tiden_assert_equal(120, len(actual_cache_metrics['receiver_configuration']), 'configs receiver count')

    try:
        # cache metrics
        actual_cache_metrics = cu_master.control_utility('--dr', 'cache', '.+', '--metrics').dr().parse()
        tiden_assert_equal(120, len(actual_cache_metrics.get('sender_metrics')), 'metrics sender count')
        tiden_assert_equal(120, len(actual_cache_metrics.get('receiver_metrics')), 'metrics receiver count')
        actual_cache_metrics = cu_replica.control_utility('--dr', 'cache', '.+', '--metrics').dr().parse()
        tiden_assert_equal(120, len(actual_cache_metrics.get('sender_metrics')), 'metrics sender count')
        tiden_assert_equal(120, len(actual_cache_metrics.get('receiver_metrics')), 'metrics receiver count')
    except Exception:
        message = 'Known issue: GG-24725 - DR: control.sh --dr cache .+ --metrics does not show metrics before replication'
        known_issues.append(message)
        log_print(message, color='red')

    started_cache_counter = 101
    with PiClient(master, self.master_client_config) as piclient_master:
        with PiClient(replica, self.replica_client_config, new_instance=True) as piclient_replica:
            for cluster, piclient, cu, compare_piclient, transfer, cluster_id in [
                    (master, piclient_master, cu_master, piclient_replica, 'fst', 1),
                    (replica, piclient_replica, cu_replica, piclient_master, 'action_fst', 2)]:

                # action stop
                actual_stop = cu.control_utility('--dr', 'cache', '.+', '--action', 'stop', '--yes').dr().parse()
                tiden_assert_equal('120', actual_stop['caches_affected'][0], 'affected caches')

                for cache_name in caches:
                    # all should be stopped
                    cache_status = cluster.jmx.dr_status(cache_name, node_id=1)
                    tiden_assert_equal({'DrStatus': 'Stopped [reason=USER_REQUEST]'}, cache_status,
                                       f'{cache_name} dr status')

                    # put data for FST
                    create_put_all_operation(cache_name, started_cache_counter, started_cache_counter + 100, 100,
                                             key_type='java.lang.Long',
                                             value_type='java.lang.Long',
                                             gateway=piclient.get_gateway()).evaluate()
                    started_cache_counter = started_cache_counter + 101

                # action start
                actual_start = cu.control_utility('--dr', 'cache', '.+', '--action', 'start', '--yes').dr().parse()
                tiden_assert_equal('120', actual_start['caches_affected'][0], 'affected caches')

                for cache_name in caches:
                    # all should be active
                    cache_status = cluster.jmx.dr_status(cache_name, node_id=1)
                    tiden_assert_equal({'DrStatus': 'Active'}, cache_status, f'{cache_name} dr status')

                if transfer == 'fst':
                    # fst
                    actual_fst = cu.control_utility('--dr', 'full-state-transfer', '--yes').dr().parse()
                    tiden_assert_equal(120, len(actual_fst['transferred_caches'].split(',')), 'transferred caches')
                else:
                    # action fst
                    actual_fst = cu.control_utility('--dr', 'cache', '.+', '--action', 'full-state-transfer',
                                                    '--yes').dr().parse()
                    tiden_assert_equal('120', actual_fst['caches_affected'][0], 'transferred caches with action')

                try:
                    # data should be consistent after fst
                    self._wait_for_same_caches_size(piclient, compare_piclient, how_long=60)
                except TidenException:
                    if getattr(self.clusters[0], 'client1', False):
                        message = 'Known issue GG-24760 - DR: control.sh --dr full-state-transfer does ...'
                        known_issues.append(message)
                        log_print(message, color='red')
                    else:
                        raise

                self.assert_checksums()

                # pause
                actual_pause = cu.control_utility('--dr', 'pause', str(cluster_id), '--yes').dr().parse()
                tiden_assert_equal(str(cluster_id), actual_pause['dc_id'], 'pause data center id')

                try:
                    for cache_name in caches:
                        cache_status = cluster.jmx.dr_status(cache_name, node_id=1)
                        tiden_assert_equal({'DrStatus': 'Stopped [reason=USER_REQUEST]'}, cache_status,
                                           f'{cache_name} dr status')
                except AssertionError:
                    # todo: Remove when https://ggsystems.atlassian.net/browse/GG-24383 is fixed
                    message = 'Known issue GG-24383 - DR: control.sh --dr pause does not pause replication'
                    known_issues.append(message)
                    log_print(message, color='red')

                # resume
                actual_resume = cu.control_utility('--dr', 'resume', str(cluster_id), '--yes').dr().parse()
                tiden_assert_equal(str(cluster_id), actual_resume['dc_id'], 'resume data center id')

                try:
                    for cache_name in caches:
                        cache_status = cluster.jmx.dr_status(cache_name, node_id=1)
                        tiden_assert_equal({'DrStatus': 'Active'}, cache_status, f'{cache_name} dr status')
                except AssertionError:
                    # todo: Remove when https://ggsystems.atlassian.net/browse/GG-24383 is fixed
                    message = 'Known issue GG-24383 - DR: control.sh --dr pause does not pause replication'
                    known_issues.append(message)
                    log_print(message, color='red')

    tiden_assert(not bool(known_issues), '\n'.join(known_issues))