def test_baseline_two_nodes_removed_one_added_with_loading(self):
    """Change baseline under transactional load: kill two baseline nodes,
    add one extra node, reset the baseline topology, then verify there is
    no data corruption and no AssertionError in the server logs.
    """
    with PiClient(self.ignite, self.get_client_config()):
        with TransactionalLoading(self):
            # two default nodes are killed below, so expect topology to shrink by 2
            new_server_num = len(self.ignite.get_alive_default_nodes()) - 2
            self.ignite.kill_node(2)
            self.ignite.kill_node(3)
            self.ignite.wait_for_topology_snapshot(
                server_num=new_server_num)
            self.ignite.add_additional_nodes(self.get_server_config(), 1)
            self.ignite.start_additional_nodes(
                self.ignite.get_all_additional_nodes())
            # reset baseline while transactions are still running
            self._set_baseline_few_times()
    # loading/client contexts are closed; wait until no clients remain in topology
    self.ignite.wait_for_topology_snapshot(client_num=0)
    # give rebalance time to complete before consistency checks
    util_sleep_for_a_while(self.rebalance_timeout)
    self.util_verify(save_lfs_on_exception=True)
    tiden_assert_equal(
        0,
        self.ignite.find_exception_in_logs('java.lang.AssertionError'),
        "No AssertionError in logs")
def test_baseline_adding_one_node_with_loading(self):
    """Add one node to the baseline under transactional load, collecting
    tx metrics and plotting them; then run idle_verify and check logs.
    """
    with PiClient(self.ignite, self.get_client_config()):
        with TransactionalLoading(self,
                                  loading_profile=LoadingProfile(delay=1, transaction_timeout=50),
                                  skip_consistency_check=True,
                                  tx_metrics=['txCreated', 'txCommit', 'txRollback', 'txFailed']) as tx_loading:
            self.ignite.add_additional_nodes(self.get_server_config(), 1)
            # custom events mark phases on the metrics timeline
            self._sleep_and_custom_event(tx_loading, 'start nodes')
            self.ignite.start_additional_nodes(self.ignite.get_all_additional_nodes())
            self._sleep_and_custom_event(tx_loading, 'set blt')
            self._set_baseline_few_times()
            self._sleep_and_custom_event(tx_loading, 'sleep')
            self._sleep_and_custom_event(tx_loading, 'end loading')
            # snapshot metrics before the loading context closes
            metrics = tx_loading.metrics
    self.ignite.wait_for_topology_snapshot(client_num=0)
    self.create_loading_metrics_graph('test_baseline_adding_one_node_with_loading', metrics)
    util_sleep_for_a_while(self.rebalance_timeout)
    self.cu.control_utility('--cache', 'idle_verify')
    tiden_assert_equal(0,
                       self.ignite.find_exception_in_logs('java.lang.AssertionError'),
                       "No AssertionError in logs"
                       )
def zookeeper_fail_test(self, scenario, expecting_broken_cluster=False):
    """Run a ZooKeeper failure scenario under transactional load and check
    cluster survival.

    :param scenario: scenario name passed to zookeeper_fail_scenario()
    :param expecting_broken_cluster: when True, the scenario is expected to
        segment every server node; any raised exception is then swallowed
        and the test instead verifies that all nodes are dead.
    """
    node_connection = self.get_server_connections()
    try:
        with PiClient(self.ignite, self.get_client_config()):
            with TransactionalLoading(self, skip_consistency_check=True):
                util_sleep_for_a_while(10, msg='Wait until load started')
                self.zookeeper_fail_scenario(scenario)
                # cluster operations exercised while ZooKeeper is degraded
                self.su.snapshot_utility('SNAPSHOT', '-type=full')
                self.ignite.kill_node(2)
                util_sleep_for_a_while(60, msg='Wait after zookeeper issue')
                self.ignite.start_node(2)
                for node_id in node_connection:
                    tiden_assert(self.ignite.check_node_is_alive(node_id),
                                 "Node {} is expected to be alive".format(node_id))
                if expecting_broken_cluster:
                    # reaching here means the cluster survived when it should not have
                    tiden_assert(False, 'split up all zookeeper host expected to broke cluster')
    except Exception:
        if expecting_broken_cluster:
            # expected failure path: verify every server node segmented and died
            util_sleep_for_a_while(60, msg='Wait all node segmented')
            for node_id in node_connection:
                tiden_assert(
                    not self.ignite.check_node_is_alive(node_id),
                    "Node {} is expected to be dead".format(node_id))
        else:
            # unexpected failure: bare raise preserves the original traceback
            raise
def test_baseline_removing_two_nodes_with_loading(self):
    """Remove two nodes from the baseline under transactional load,
    plot tx metrics, then run idle_verify and check logs.
    """
    with PiClient(self.ignite, self.get_client_config()):
        with TransactionalLoading(self,
                                  loading_profile=LoadingProfile(delay=1, transaction_timeout=50),
                                  tx_metrics=['txCreated', 'txCommit', 'txRollback', 'txFailed']) as tx_loading:
            # two default nodes are killed below, so expect topology to shrink by 2
            new_server_num = len(self.ignite.get_alive_default_nodes()) - 2
            self._sleep_and_custom_event(tx_loading, 'kill nodes')
            self.ignite.kill_node(2)
            self.ignite.kill_node(3)
            self.ignite.wait_for_topology_snapshot(server_num=new_server_num)
            self._sleep_and_custom_event(tx_loading, 'set blt')
            self._set_baseline_few_times()
            self._sleep_and_custom_event(tx_loading, 'sleep')
            self._sleep_and_custom_event(tx_loading, 'end loading')
            # snapshot metrics before the loading context closes
            metrics = tx_loading.metrics
    self.ignite.wait_for_topology_snapshot(client_num=0)
    self.create_loading_metrics_graph('test_baseline_removing_two_nodes_with_loading', metrics)
    util_sleep_for_a_while(self.rebalance_timeout)
    self.cu.control_utility('--cache', 'idle_verify')
    tiden_assert_equal(0,
                       self.ignite.find_exception_in_logs('java.lang.AssertionError'),
                       "No AssertionError in logs"
                       )
def test_baseline_restart_node_add_one_additional_node(self):
    """Restart one node with a wiped LFS and add one additional node under
    transactional load; reset baseline and verify cluster consistency.
    """
    with PiClient(self.ignite, self.get_client_config()):
        with TransactionalLoading(self):
            self.ignite.kill_node(2)
            # wipe local file storage so node 2 rejoins empty and must rebalance
            self.delete_lfs(node_ids=[
                2,
            ])
            self.ignite.start_node(2)
            util_sleep_for_a_while(5)
            new_nodes = self.ignite.add_additional_nodes(
                self.get_server_config(), 1)
            self.ignite.start_additional_nodes(new_nodes)
            # self.cu.kill_transactions()
            self._set_baseline_few_times(5)
    self.util_verify(save_lfs_on_exception=True)
    tiden_assert_equal(
        0,
        self.ignite.find_exception_in_logs('java.lang.AssertionError'),
        "No AssertionError in logs")
def test_cluster_stress_tolerance(self, node_under_test, other_node, fault_combination):
    """Inject a combination of host-level faults (disk/network/CPU/RAM load,
    SIGSTOP, packet loss/duplicate/corrupt) while transactional loading runs,
    then plot tx metrics and verify the cluster stayed consistent.

    :param node_under_test: dict with at least 'host' and 'ignite_home' of the stressed node
    :param other_node: dict with 'host' of the peer used for network faults
    :param fault_combination: mapping of fault name -> bool (enabled)
    """
    timeout = 15
    thread_timeout = 10
    take_a_rest_timeout = 10
    host_under_test = node_under_test.get('host')
    other_host = other_node.get('host')
    stress = StressT(self.ssh)
    with PiClient(self.ignite, self.get_client_config()):
        with TransactionalLoading(self,
                                  loading_profile=LoadingProfile(delay=1, commit_possibility=1.0,
                                                                 transaction_timeout=5000),
                                  tx_metrics=['txCreated', 'txCommit', 'txRollback', 'txFailed']) as tx_loading:
            util_sleep_for_a_while(take_a_rest_timeout, msg='Loading warm up for')
            self._custom_event(tx_loading, 'start')
            util_sleep_for_a_while(take_a_rest_timeout)
            for key, value in fault_combination.items():
                if not value:
                    continue
                self._sleep_and_custom_event(tx_loading, '%s start' % key)
                print_red('%s start' % key)
                # dispatch the enabled fault; keys are mutually exclusive per iteration
                if key == 'disc load':
                    stress.load_disk(node_under_test['ignite_home'], host_under_test, timeout=timeout)
                elif key == 'network load':
                    stress.load_network(host_under_test, other_host, timeout=timeout)
                elif key == 'cpu load':
                    stress.load_cpu(host_under_test, timeout=timeout)
                elif key == 'ram load':
                    stress.load_ram(host_under_test, timeout=timeout)
                elif key in ['sigstop_server', 'sigstop_client']:
                    if key == 'sigstop_server':
                        pid = stress.get_random_server_pid(host_under_test)
                    else:
                        pid = stress.get_random_client_pid(host_under_test)
                    stress.sigstop_process(host_under_test, pid, timeout=thread_timeout)
                # BUGFIX: 'packets duplicate' had a leading space and could never match
                elif key in ['packets loss', 'packets duplicate', 'packets corrupt']:
                    stress.network_emulate_packet(host_under_test, other_host, lost_rate='5.0%',
                                                  timeout=timeout, type=key.split()[-1])
                self._custom_event(tx_loading, ' ')
                print_red('%s stop' % key)
            # util_sleep_for_a_while(take_a_rest_timeout, msg='Rest between tests for')
            util_sleep_for_a_while(take_a_rest_timeout)
            self._custom_event(tx_loading, 'end')
            # snapshot metrics before the loading context closes
            metrics = tx_loading.metrics
    self.ignite.wait_for_topology_snapshot(client_num=0)
    # BUGFIX: graph was saved under a name copy-pasted from another test
    self.create_loading_metrics_graph('test_cluster_stress_tolerance', metrics)
    self.cu.list_transactions()
    self.cu.control_utility('--cache', 'idle_verify')
    tiden_assert_equal(0,
                       self.ignite.find_exception_in_logs('java.lang.AssertionError'),
                       "No AssertionError in logs"
                       )
def test_mixed_cluster_load_caches_old_server(self):
    """
    1. start mixed cluster (new version servers + old version servers)
    2. activate from new version control.sh
    3. start old version server
    4. add it to baseline
    5. smoke check:
    5.1. create dynamic caches from old server node
    5.2. do some load from old server node
    """
    self.ignite_new_version.cu.activate()
    created_caches = []
    self.server_config = Ignite.config_builder.get_config(
        'server', config_set_name='base')
    ignite = self.ignite_old_version
    with PiClient(ignite, self.server_config, nodes_num=1) as piclient:
        ignite.cu.add_node_to_baseline(
            ignite.get_node_consistent_id(piclient.node_ids[0]))
        dynamic_caches_factory = DynamicCachesFactory()
        async_ops = []
        for method in dynamic_caches_factory.dynamic_cache_configs:
            cache_name = "cache_group_%s" % method
            log_print('Loading {}...'.format(cache_name), color='green')
            # FIX: call the gateway directly instead of rebinding `ignite`,
            # which shadowed the old-version app object (and matches the
            # sibling new-client test).
            piclient.get_ignite().getOrCreateCache(
                getattr(dynamic_caches_factory, method)(cache_name))
            async_operation = create_async_operation(
                create_put_all_operation,
                cache_name, 1, 1001, 10,
                value_type=self.data_model)
            async_ops.append(async_operation)
            async_operation.evaluate()
            created_caches.append(cache_name)
        log_print('Waiting async results...', color='debug')
        # wait for streamer to complete
        for async_op in async_ops:
            async_op.getResult()
    # short transactional smoke load through the old server node
    with TransactionalLoading(MixedTestLoadingAdapter(self), config_file=self.server_config,
                              loading_profile=LoadingProfile(delay=1, transaction_timeout=100000)):
        sleep(60)
def test_baseline_remove_and_back_one_node_with_additional_in_baseline(self):
    """Remove one baseline node (while an additional node is also down),
    load more data, then bring the node back into the baseline and verify
    cluster consistency.
    """
    with PiClient(self.ignite, self.get_client_config()):
        with TransactionalLoading(self,
                                  loading_profile=LoadingProfile(run_for_seconds=30, delay=10)):
            current_server_nodes = self.ignite.get_nodes_num('server')
            self.start_additional_nodes(
                self.ignite.add_additional_nodes(self.get_server_config(), 2))
            self.ignite.wait_for_topology_snapshot(
                server_num=current_server_nodes + 2)
            log_print('Kill one nodes: baseline and additional')
            self.ignite.kill_node(2)
            self.ignite.kill_node(
                self.ignite.get_alive_additional_nodes()[0])
            log_print('Remove node from baseline')
            self.ignite.wait_for_topology_snapshot(
                server_num=current_server_nodes)
            # self._set_baseline_few_times()
            self.cu.control_utility('--baseline')
            self.cu.remove_node_from_baseline(
                self.ignite.get_node_consistent_id(2))
            # load extra keys while node 2 is out of the baseline
            self.load_data_with_streamer(1001, 1501)
            log_print('Start node again')
            self.ignite.start_node(2)
            self.ignite.wait_for_topology_snapshot(
                server_num=current_server_nodes + 1)
            # self._set_baseline_few_times()
            self.cu.add_node_to_baseline(
                self.ignite.get_node_consistent_id(2))
            self.ignite.wait_for_topology_snapshot(
                server_num=current_server_nodes + 1)
    print_red("AssertExceptions: %s" % str(
        self.ignite.find_exception_in_logs("java.lang.AssertionError")))
    self.util_verify(save_lfs_on_exception=True)
    tiden_assert_equal(
        0,
        self.ignite.find_exception_in_logs('java.lang.AssertionError'),
        "No AssertionError in logs")
def test_master_master_master_blinking_blt(self):
    """Repeatedly blink (kill/restart) node 2 of cluster 0 while resetting
    the baseline each time, under transactional load, and verify the
    cluster after every iteration.
    """
    self.prepare_clusters()
    client_config = self.preconfigure_cluster_0()
    iterations = 10
    last_loaded_key = START_DATA_SIZE
    nodes_before = 6
    with PiClient(self.clusters[0].grid, client_config, jvm_options=['-ea']) as piclient:
        PiClientIgniteUtils.load_data_with_streamer(self.clusters[0].grid,
                                                    client_config,
                                                    end_key=last_loaded_key,
                                                    jvm_options=['-ea'],
                                                    check_clients=False)
        sleep(60)
        with TransactionalLoading(self,
                                  ignite=self.clusters[0].grid,
                                  config_file=client_config,
                                  skip_consistency_check=True):
            for i in range(0, iterations):
                log_print(f'Current iteration {i + 1} from {iterations}', color='debug')
                self.clusters[0].grid.kill_node(2)
                utility_baseline_log = 'control-utility-baseline.log'
                # baseline reset runs in background so the restart below overlaps it
                self.clusters[0].grid.cu.set_current_topology_as_baseline(
                    background=True, log=utility_baseline_log)
                self.clusters[0].grid.start_node(2, skip_topology_check=True)
                self.clusters[0].grid.wait_for_topology_snapshot(server_num=6)
                self.clusters[0].grid.update_started_node_status(2)
                self.clusters[0].grid.cu.set_current_topology_as_baseline(
                    background=True, log=utility_baseline_log)
                self.verify_cluster(0, nodes_before, last_loaded_key)
def test_baseline_two_nodes_removed_one_added_with_cpu_loading(self):
    """Same as the non-CPU variant (kill two nodes, add one, reset baseline
    under load) but additionally applies CPU load during the run; plots tx
    metrics and verifies consistency.
    """
    with PiClient(self.ignite, self.get_client_config()):
        with TransactionalLoading(self,
                                  loading_profile=LoadingProfile(delay=1, commit_possibility=0.2,
                                                                 transaction_timeout=1000),
                                  tx_metrics=['txCreated', 'txCommit', 'txRollback', 'txFailed']) as tx_loading:
            # two default nodes are killed below, so expect topology to shrink by 2
            new_server_num = len(self.ignite.get_alive_default_nodes()) - 2
            self._sleep_and_custom_event(tx_loading, 'kill nodes')
            self.ignite.kill_node(2)
            self.ignite.kill_node(3)
            self.ignite.wait_for_topology_snapshot(server_num=new_server_num)
            self.ignite.add_additional_nodes(self.get_server_config(), 1)
            self._sleep_and_custom_event(tx_loading, 'add new nodes')
            self.ignite.start_additional_nodes(self.ignite.get_all_additional_nodes())
            self._sleep_and_custom_event(tx_loading, 'set blt')
            self._set_baseline_few_times()
            self._sleep_and_custom_event(tx_loading, 'sleep')
            self._sleep_and_custom_event(tx_loading, 'cpu_load')
            # apply synthetic CPU pressure while the baseline settles
            cpu_load_operation = create_cpu_load_operation(1.0, 1.0, 2)
            cpu_load_operation.evaluate()
            self._sleep_and_custom_event(tx_loading, 'cpu_load_sleep_end')
            cpu_load_operation.interrupt()
            self._sleep_and_custom_event(tx_loading, 'end loading')
            # snapshot metrics before the loading context closes
            metrics = tx_loading.metrics
    self.ignite.wait_for_topology_snapshot(client_num=0)
    log_print(inspect.stack()[0].function)
    # BUGFIX: graph name was copy-pasted from the non-CPU test and collided with it
    self.create_loading_metrics_graph('test_baseline_two_nodes_removed_one_added_with_cpu_loading', metrics)
    util_sleep_for_a_while(self.rebalance_timeout)
    self.cu.control_utility('--cache', 'idle_verify')
    tiden_assert_equal(0,
                       self.ignite.find_exception_in_logs('java.lang.AssertionError'),
                       "No AssertionError in logs"
                       )
def make_data_loading(self, duration, role='master', func_on_load=None):
    """Run transactional loading against the master or replica DR grid for
    `duration` seconds, optionally performing a replication-utility action
    mid-load.

    :param duration: seconds of loading after the optional action
    :param role: 'master' or anything else for replica — selects the target app
    :param func_on_load: one of 'switch', 'bootstrap', 'restart_on_load',
        'pitr', or None for plain loading
    """
    app = self.ignite_master_app if role == 'master' else self.ignite_replica_app
    with PiClient(app, self.get_client_config(role),
                  jvm_options=self.get_dr_jvm_options(role), cu=app.cu) as piclient:
        with TransactionalLoading(piclient,
                                  ignite=app,
                                  skip_consistency_check=True,
                                  cross_cache_batch=100,
                                  skip_atomic=True,
                                  config_file=self.get_client_config(role),
                                  wait_before_kill=False,
                                  loading_profile=LoadingProfile(delay=1,
                                                                 start_key=0,
                                                                 end_key=100,
                                                                 transaction_timeout=500,
                                                                 run_for_seconds=duration + 10),
                                  tx_metrics=['txCreated', 'txCommit', 'txRollback', 'txFailed']) as tx_loading:
            # let the load stabilize before triggering the DR action
            sleep(20)
            if func_on_load == 'switch':
                self.ignite_master_app.ru.replication_utility('switch')
            elif func_on_load == 'bootstrap':
                # full bootstrap: snapshot on master, then restore on replica
                self.ignite_master_app.ru.replication_utility(
                    'bootstrap',
                    '-role=master -archive=ZIP -single_copy -parallelism=4 -snapshot_folder=%s/snapshot' % self.dr_storage,
                    timeout=1200)
                self.ignite_replica_app.ru.replication_utility(
                    'bootstrap',
                    '-role=replica -snapshot_folder=%s/snapshot -snapshot_id=%s' % (
                        self.dr_storage,
                        self.ignite_master_app.ru.get_session_id_from_bootstrap_command()),
                    timeout=1200)
            elif func_on_load == 'restart_on_load':
                # pause replication, restart the whole replica grid, resume
                self.ignite_replica_app.ru.replication_utility('pause')
                sleep(10)
                self.restart_ignite_grid('replica')
                sleep(10)
                self.ignite_replica_app.ru.replication_utility('resume')
            elif func_on_load == 'pitr':
                cache = piclient.get_ignite().getOrCreateCache(
                    'com.sbt.bm.ucp.published.api.retail.PublishedIndividual')
                cache.put(10000, 1)
                sleep(45)
                self.ignite_replica_app.ru.replication_utility('stop', '-recovery')
            sleep(duration)
            log_print(tx_loading.metrics['txCommit'])
    # wait until all clients have left the topology of the target app
    app.wait_for_topology_snapshot(
        None,
        0,
        ''
    )
    log_print('Loading done')
def test_loading_rest(self):
    """Pure loading smoke test: run transactional load for ~100 seconds with
    no topology changes, plot tx metrics, then run idle_verify and check logs.
    """
    with PiClient(self.ignite, self.get_client_config()):
        with TransactionalLoading(self,
                                  loading_profile=LoadingProfile(delay=1, transaction_timeout=5000),
                                  tx_metrics=True) as tx_loading:
            self._sleep_and_custom_event(tx_loading, 'start')
            util_sleep_for_a_while(100)
            self._sleep_and_custom_event(tx_loading, 'end')
    self.ignite.wait_for_topology_snapshot(client_num=0)
    # BUGFIX: pass the metrics dict (as every sibling test does) instead of the
    # TransactionalLoading object, and save the graph under this test's own name
    # rather than a copy-pasted one.
    self.create_loading_metrics_graph('test_loading_rest', tx_loading.metrics)
    self.cu.list_transactions()
    self.cu.control_utility('--cache', 'idle_verify')
    tiden_assert_equal(0,
                       self.ignite.find_exception_in_logs('java.lang.AssertionError'),
                       "No AssertionError in logs"
                       )
def test_old_cluster_load_caches_new_client(self):
    """
    1. start old version grid
    2. activate from old version control.sh
    3. start new version client
    4. smoke check:
    4.1. create dynamic caches
    4.2. do some load
    """
    created_caches = []
    self.ignite_old_version.cu.activate()
    with PiClient(self.ignite_new_version, self.client_config, nodes_num=1) as piclient:
        dynamic_caches_factory = DynamicCachesFactory()
        async_ops = []
        for method in dynamic_caches_factory.dynamic_cache_configs:
            cache_name = "cache_group_%s" % method
            log_print('Loading {}...'.format(cache_name), color='green')
            piclient.get_ignite().getOrCreateCache(
                getattr(dynamic_caches_factory, method)(cache_name))
            # stream keys 1..1001 in batches of 10 asynchronously
            async_operation = create_async_operation(
                create_put_all_operation,
                cache_name, 1, 1001, 10,
                value_type=self.data_model)
            async_ops.append(async_operation)
            async_operation.evaluate()
            created_caches.append(cache_name)
        log_print('Waiting async results...', color='debug')
        # wait for streamer to complete
        for async_op in async_ops:
            async_op.getResult()
    # short transactional smoke load through the new-version client
    with TransactionalLoading(MixedTestLoadingAdapter(self), config_file=self.client_config,
                              loading_profile=LoadingProfile(delay=1, transaction_timeout=100000)):
        sleep(60)
def do_snapshot_bench(self):
    """PME benchmark: take a full snapshot while transactional loading runs,
    measure exchange timings, and plot the metrics graph even if the run
    fails (the captured exception is re-raised after plotting).
    """
    ex = None
    metrics = None
    try:
        with PiClient(self.ignite.ignite_cli_load, self.get_client_config()):
            with TransactionalLoading(
                    self,
                    kill_transactions_on_exit=True,
                    cross_cache_batch=self.cross_cache_batch,
                    skip_atomic=True,
                    skip_consistency_check=not self.consistency_check,
                    collect_timeout=self.collect_timeout,
                    collect_timeout_metrics_thread=self.collect_timeout_metrics_thread,
                    loading_profile=LoadingProfile(
                        delay=self.tx_delay,
                        commit_possibility=0.97,
                        start_key=1,
                        end_key=self.LOAD_FACTOR - 1,
                        transaction_timeout=10000),
                    tx_metrics=[
                        'txCreated', 'txCommit', 'txRollback', 'txFailed'
                    ]) as tx_loading:
                # keep a reference so the graph can be built even on failure
                metrics = tx_loading.metrics
                self._prepare_before_test(tx_loading, 'snapshot')
                self.ignite.ignite_srvs.su.snapshot_utility(
                    'SNAPSHOT', '-type=FULL')
                self._measurements_after_test('snapshot', skip_minor_exch=0)
        self.ignite.ignite_srvs.wait_for_topology_snapshot(client_num=0)
    except Exception as e:
        # defer the failure so the metrics graph is still produced below
        ex = e
    if metrics:
        self.create_loading_metrics_graph('pme_snapshot',
                                          metrics,
                                          dpi_factor=0.75)
    if ex:
        raise ex
def test_baseline_sbt_model_loading(self):
    """Add one node to the baseline while loading runs over the SBT cache
    model read from json_model.json; plot tx metrics and verify consistency.
    """
    with PiClient(self.ignite, self.get_client_config()):
        import json
        with open("%s/json_model.json" % self.config['rt']['test_resource_dir'], 'r') as f:
            model_descriptor_file = f.read()
        # the resource holds a JSON-encoded JSON string, hence the double decode
        model_descriptor = json.loads(json.loads(model_descriptor_file))
        caches_to_run = [item for item in model_descriptor.values()]
        with TransactionalLoading(self,
                                  caches_to_run=caches_to_run,
                                  kill_transactions_on_exit=True,
                                  cross_cache_batch=50,
                                  skip_atomic=True,
                                  loading_profile=LoadingProfile(delay=1,
                                                                 start_key=1,
                                                                 end_key=99,
                                                                 transaction_timeout=1000),
                                  tx_metrics=['txCreated', 'txCommit', 'txRollback', 'txFailed']) as tx_loading:
            self.ignite.add_additional_nodes(self.get_server_config(), 1)
            self._sleep_and_custom_event(tx_loading, 'start nodes')
            self.ignite.start_additional_nodes(self.ignite.get_all_additional_nodes())
            self._sleep_and_custom_event(tx_loading, 'set blt')
            self._set_baseline_few_times()
            self._sleep_and_custom_event(tx_loading, 'sleep')
            self._sleep_and_custom_event(tx_loading, 'end loading')
            # snapshot metrics before the loading context closes
            metrics = tx_loading.metrics
    self.ignite.wait_for_topology_snapshot(client_num=0)
    self.create_loading_metrics_graph('test_baseline_sbt_model_loading', metrics)
    util_sleep_for_a_while(self.rebalance_timeout)
    self.cu.control_utility('--cache', 'idle_verify')
    tiden_assert_equal(0,
                       self.ignite.find_exception_in_logs('java.lang.AssertionError'),
                       "No AssertionError in logs"
                       )
def test_loading_blinking_node_baseline(self):
    """Blink (kill/restart) node 2 repeatedly, resetting the baseline on each
    kill and restart, under continuous transactional load; optionally also
    restarts ZooKeeper nodes when enabled in config.
    """
    with PiClient(self.ignite, self.get_client_config()) as piclient:
        self.wait_for_running_clients_num(piclient.nodes_num, 90)
        # ExitStack lets the ZK-restart context be entered only when configured
        with ExitStack() as stack:
            stack.enter_context(
                TransactionalLoading(
                    self,
                    cross_cache_batch=2,
                    skip_atomic=True,
                    post_checksum_action=self.idle_verify_action))
            if is_enabled(self.config.get('zookeeper_enabled')) and \
                    is_enabled(self.config.get('zookeeper_nodes_restart')):
                stack.enter_context(ZkNodesRestart(self.zoo, 2))
            for iteration in range(0, self.iterations):
                log_print("Iteration {}/{}".format(str(iteration + 1), str(self.iterations)),
                          color='blue')
                self.assert_nodes_alive()
                self.ignite.kill_node(2)
                self.ignite.wait_for_topology_snapshot(
                    server_num=len(self.ignite.get_alive_default_nodes()))
                self.cu.set_current_topology_as_baseline()
                util_sleep(5)
                self.start_node(2)
                self.ignite.wait_for_topology_snapshot(
                    server_num=len(self.ignite.get_alive_default_nodes()))
                self.cu.set_current_topology_as_baseline()
                self.ignite.jmx.wait_for_finish_rebalance(
                    self.rebalance_timeout, self.group_names)
    tiden_assert_equal(
        0,
        self.ignite.find_exception_in_logs('java.lang.AssertionError'),
        "# of AssertionError")
def server_segmentation_emulate(self, first_hosts_list, second_hosts_list, reverse=False):
    """Emulate network segmentation between two host groups while a
    transactional load runs, then assert the utilities report no errors.

    :param first_hosts_list: hosts on one side of the split
    :param second_hosts_list: hosts on the other side of the split
    :param reverse: forwarded to the tx-check assertion
    """
    try:
        # start from a clean iptables state and a healthy cluster
        self.iptables_clear()
        self.assert_no_errors_in_utility_output()
        with PiClient(self.ignite, self.get_client_config()):
            with TransactionalLoading(self, skip_consistency_check=True):
                util_sleep_for_a_while(10, msg='Wait until load started')
                self.split_nodes(first_hosts_list, second_hosts_list)
                util_sleep_for_a_while(90, msg='Wait after network issue')
        util_sleep_for_a_while(5, msg='Wait after load')
        self.assert_no_errors_in_utility_output(tx_check=True, reverse=reverse)
    finally:
        # always remove the firewall rules, even if assertions failed
        self.iptables_clear()
def test_loading_blinking_two_nodes_blt_and_extra_node(self):
    """Blink a baseline node and an additional (non-baseline) node together,
    under continuous transactional load; optionally restarts ZooKeeper
    nodes when enabled in config.
    """
    with PiClient(self.ignite, self.get_client_config()):
        additional_node = self.ignite.add_additional_nodes(
            self.get_server_config())[0]
        self.ignite.start_additional_nodes(additional_node)
        # ExitStack lets the ZK-restart context be entered only when configured
        with ExitStack() as stack:
            stack.enter_context(
                TransactionalLoading(
                    self,
                    cross_cache_batch=2,
                    skip_atomic=True,
                    post_checksum_action=self.idle_verify_action))
            if is_enabled(self.config.get('zookeeper_enabled')) and \
                    is_enabled(self.config.get('zookeeper_nodes_restart')):
                stack.enter_context(ZkNodesRestart(self.zoo, 2))
            for iteration in range(0, self.iterations):
                log_print("Iteration {}/{}".format(str(iteration + 1), str(self.iterations)),
                          color='blue')
                self.assert_nodes_alive()
                self.ignite.kill_node(2)
                self.ignite.kill_node(additional_node)
                # self.ignite.start_node(2)
                # self.ignite.start_additional_nodes(additional_node)
                self.start_node(2)
                self.start_node(additional_node)
                self.ignite.jmx.wait_for_finish_rebalance(
                    self.rebalance_timeout, self.group_names)
    tiden_assert_equal(
        0,
        self.ignite.find_exception_in_logs('java.lang.AssertionError'),
        "# of AssertionError")
def server_check_cluster_behaviour(self, node_segmented_group): """ Func got expected segmented group: 1. check that all nodes in this group are dead 2. start all these nodes 3. wait rebalance timeout 4. check there was no data corruption: - call idle_verify - try to do some loading - call idle_verify again and check transactions :param node_segmented_group: group of nodes expected to be dead """ # check all nodes are dead for node_id in node_segmented_group.keys(): tiden_assert( not self.ignite.check_node_is_alive(node_id), "Node {} is expected to be dead".format( node_segmented_group.get(node_id))) second_hosts_node_ids = [ int(node) for node in node_segmented_group.keys() ] # start all nodes and wait for rebalance completed self.ignite.start_nodes(*second_hosts_node_ids, force=True) util_sleep_for_a_while(90, msg='Wait rebalance timeout') # check idle verify does not return any errors self.assert_no_errors_in_utility_output() # check with some loading with PiClient(self.ignite, self.get_client_config()): with TransactionalLoading(self, skip_consistency_check=True): util_sleep_for_a_while(15, msg='Little load') util_sleep_for_a_while(5, msg='Wait after load') self.assert_no_errors_in_utility_output(tx_check=True)
def _run_iteration(self, ignite, iteration):
    """
    One iteration of clients PME benchmark is as follows:

    1. start transactional loading at `loading_clients_hosts`,
       sleep `warmup_clients_delay` so load stabilize
    2. start `num_clients_to_kill` clients at `clients_hosts`
       (different from `loading_clients_hosts`), measure JOIN exchange
       time, sleep `stabilization_delay`
    3. stop started additional clients, measure LEAVE exchange time,
       sleep `cooldown_delay`

    :param ignite: Ignite app under benchmark
    :param iteration: iteration number (used in log/graph names)
    :return: dict with 'Exchange Client Join' / 'Exchange Client Leave' times
    """
    log_print("===> PME {} Clients(s) Left-Join Benchmark iteration {}/{} artifact started ".format(
        self.config['num_clients_to_kill'],
        iteration,
        self.config['iterations']
    ), color='green')
    loading_client_hosts = self._get_loading_client_hosts()
    client_hosts = self._get_client_hosts(loading_client_hosts)
    num_clients = self.config['num_clients_to_kill']
    metrics = None
    ex = None
    x1_join_time = None
    x1_leave_time = None
    try:
        # start loading clients
        with PiClient(
                ignite,
                self.test_class.client_config,
                client_hosts=loading_client_hosts,
                clients_per_host=self.config.get('loading_clients_per_host', 1)
        ):
            # initiate transactional loading
            with TransactionalLoading(
                    self.test_class,
                    ignite=ignite,
                    kill_transactions_on_exit=self.config['kill_transactions_on_exit'],
                    cross_cache_batch=self.config['cross_cache_batch'],
                    skip_atomic=self.config['skip_atomic'],
                    skip_consistency_check=not self.config['consistency_check_enabled'],
                    loading_profile=LoadingProfile(
                        delay=self.config['tx_delay'],
                        commit_possibility=self.config['commit_possibility'],
                        start_key=1,
                        end_key=self.config['load_factor'] - 1,
                        transaction_timeout=self.config['transaction_timeout']
                    ),
                    tx_metrics=['txCreated', 'txCommit', 'txFailed', 'txRollback']
            ) as tx_loading:
                # keep a reference so the graph can be built even on failure
                metrics = tx_loading.metrics
                util_sleep_for_a_while(self.config['warmup_clients_delay'], "Before JOIN")
                current_clients_num = ignite.get_nodes_num('client')
                expected_total_clients_num = current_clients_num + num_clients
                self.test_class._prepare_before_test(ignite,
                                                     tx_loading,
                                                     'JOIN %d client(s)' % num_clients)
                # start num_clients client nodes on 'flaky' hosts
                with PiClient(
                        ignite,
                        self.test_class.client_config,
                        client_hosts=client_hosts,
                        clients_per_host=self.config.get('clients_per_host', 1),
                        nodes_num=num_clients,
                        new_instance=True,
                ):
                    ignite.wait_for_topology_snapshot(client_num=expected_total_clients_num,
                                                      timeout=600,
                                                      check_only_servers=True,
                                                      exclude_nodes_from_check=[])
                    tx_loading.metrics_thread.add_custom_event('%d client(s) joined' % num_clients)
                    new_topVer = self.test_class._get_new_top_after_test(ignite)
                    self.test_class._wait_exchange_finished(ignite, new_topVer)
                    x1_join_time, x2_time = self.test_class._measurements_after_test(
                        'JOIN %d client(s)' % num_clients, skip_exch=1)
                    util_sleep_for_a_while(self.config['stabilization_delay'])
                    # upon exit from with block, num_clients client nodes will be killed
                    self.test_class._prepare_before_test(ignite,
                                                         tx_loading,
                                                         'LEAVE %d client(s)' % num_clients)
                ignite.wait_for_topology_snapshot(client_num=current_clients_num,
                                                  timeout=600,
                                                  check_only_servers=True,
                                                  exclude_nodes_from_check=[])
                tx_loading.metrics_thread.add_custom_event('%d client(s) left' % num_clients)
                new_topVer = self.test_class._get_new_top_after_test(ignite)
                self.test_class._wait_exchange_finished(ignite, new_topVer)
                x1_leave_time, x2_time = self.test_class._measurements_after_test(
                    'LEAVE %d client(s)' % num_clients, skip_exch=1)
                util_sleep_for_a_while(self.config['cooldown_delay'])
        ignite.wait_for_topology_snapshot(client_num=0)
    except Exception as e:
        # defer the failure so the metrics graph is still produced below
        ex = e
    if metrics:
        self.test_class.create_loading_metrics_graph(
            'pme_%d_clients_left_join_%s_%d' % (num_clients, self.run_id, iteration),
            metrics,
            dpi_factor=0.75
        )
    if ex:
        raise ex
    return {
        'Exchange Client Join': x1_join_time,
        'Exchange Client Leave': x1_leave_time,
    }
def test_indexes_rebuilded(self):
    """
    https://ggsystems.atlassian.net/browse/GG-17428

    1. Start cluster.
    2. Start transactional loading.
    3. Stop one node and remove index.bin files for the caches.
    4. Start node and let it finish rebalance.
    5. Check indexes are not broken after rebalance.

    :return:
    """
    self.need_delete_lfs_on_teardown = True
    # flip to True locally to dump idle_verify state at every step
    debug = False
    with PiClient(self.ignite, self.get_client_config()) as piclient:
        self.wait_for_running_clients_num(piclient.nodes_num, 90)
        with ExitStack() as stack:
            # FIX: removed the unreachable `if False:` TransactionalLoading block
            # (flagged "todo unreachable code") — loading is started per-iteration below.
            if is_enabled(self.config.get('zookeeper_enabled')) and \
                    is_enabled(self.config.get('zookeeper_nodes_restart')):
                stack.enter_context(ZkNodesRestart(self.zoo, 2))
            for iteration in range(0, self.iterations):
                log_print("Iteration {}/{}".format(str(iteration + 1), str(self.iterations)),
                          color='blue')
                self.assert_nodes_alive()
                with TransactionalLoading(self, cross_cache_batch=2, skip_atomic=True):
                    util_sleep(20)
                    self.ignite.kill_node(2)
                    if debug:
                        self.cu.control_utility('--cache idle_verify --dump --skip-zeros')
                    # drop index.bin so node 2 must rebuild indexes on restart
                    self.remove_index_bin_files(2)
                    util_sleep(10)
                    if debug:
                        self.cu.control_utility('--cache idle_verify --dump --skip-zeros')
                    self.start_node(2)
                    self.ignite.jmx.wait_for_finish_rebalance(
                        self.rebalance_timeout, self.group_names)
                    util_sleep(30)
                log_print("Check indexes")
                try:
                    if debug:
                        self.cu.control_utility('--cache idle_verify --dump --skip-zeros')
                    self.idle_verify_action(None)
                except TidenException:
                    if debug:
                        self.cu.control_utility('--cache idle_verify --dump --skip-zeros')
                    raise TidenException('validate_index failed')
    tiden_assert_equal(
        0,
        self.ignite.find_exception_in_logs('java.lang.AssertionError'),
        "# of AssertionError")
def run(self, artifact_name):
    """
    Run rebalance scenario for defined artifact

    Scenario is very simple
    1. start cluster
    2. load data to one cache with backups until size reached 'data_size_kb' from config (5GB is optimal)
    3. start or skip loading
    4. kill node with cache, clean lfs, start node again
    5. using JMX utility wait until LocalNodeMovingPartitionsCount for cache will be 0
    6. save this value and divide by spent time

    Also netstat metrics collected while running this scenario
    (In this case we don't need separate probe to collect more precise metrics)

    :param artifact_name: name from artifact configuration file
    """
    super().run(artifact_name)

    log_print("Running rebalance benchmark with config: %s" % self.config, color='green')

    version = self.test_class.tiden.config['artifacts'][artifact_name]['ignite_version']
    ignite = None
    try:
        # collect properties from config
        self.initialize_config()
        in_memory = self.config.get('in_memory', False)
        xml_config_set_name = 'caches_%s.xml' % REBALANCE_CONFIG_SET \
            if 'single_cache' not in self.config else 'single_cache_%s.xml' % REBALANCE_CONFIG_SET
        self.test_class.create_app_config_set(
            Ignite, REBALANCE_CONFIG_SET,
            deploy=True,
            caches_list_file=xml_config_set_name,
            snapshots_enabled=True,
            logger=True,
            wal_segment_size=self.test_class.consumption_config.get('wal_segment_size', 64 * 1024 * 1024),
            logger_path='%s/ignite-log4j2.xml' % self.test_class.tiden.config['rt']['remote']['test_module_dir'],
            disabled_cache_configs=False,
            zookeeper_enabled=False,
            rebalance_pool_size=self.config.get('rebalance_pool_size', 8),
            system_pool_size=self.config.get('rebalance_pool_size', 8) + 8,
            checkpoint_read_lock_timeout=self.read_lock_property_value(version),
            wal_compaction_enabled=self.artifact_config_variables.get('wal_compaction_enabled', False),
            # caches related variables
            additional_configs=['caches.tmpl.xml', ] if 'single_cache' not in self.config
            else ['single_cache.tmpl.xml', ],
            partitions=5 if self.parts_distribution else 1024,
            part_32=self.test_class.consumption_config.get('part_32', 32),
            # see cache.tmpl.xml for more details
            part_64=self.test_class.consumption_config.get('part_64', 64),
            # BUG FIX: previously read the 'part_64' config key here (copy-paste),
            # so a custom 'part_128' setting was silently ignored. Default (128)
            # is unchanged, so behavior is identical unless 'part_128' is set.
            part_128=self.test_class.consumption_config.get('part_128', 128),
            in_memory=in_memory,
            backups=self.config.get('backups', 0),
            load_type=self.load_type,
        )

        # run ignite app
        keys_to_load = int(self.config.get('keys_to_load'))
        ignite, last_end_key, version = self.start_cluster_with_data(keys_to_load, False)
        ignite.set_snapshot_timeout(600)

        # wait for checkpoint
        sleep(CHECKPOINT_SLEEP)

        # dump idle_verify if need and no loading
        dump_before = None
        if self.idle_verify and not self.with_loading:
            dump_before = ignite.cu.idle_verify_dump()

        self.start_probes(artifact_name)

        warmup_runs, prod_runs = self._get_number_of_runs()

        # run rebalance calculation
        if self.with_loading:
            client_config = Ignite.config_builder.get_config('client', config_set_name=REBALANCE_CONFIG_SET)
            with PiClient(ignite, client_config,
                          jvm_options=['-DPICLIENT_OPERATIONS_POOL_SIZE=64']) as piclient:
                if self.parts_distribution:
                    # for partition distribution we need to pass config_loading_dict
                    cache_load_map = {
                        CACHE_NAME: {
                            # following keys are key generator builder arguments
                            # we build java object later, when we will knew exact gateway
                            'key_generator_builder': AffinityPartitionKeyGeneratorBuilder(
                                CACHE_NAME,
                                self.parts_distribution,
                                1,
                                keys_to_load,
                            ).set_collision_possibility(0.5),
                            # this is metrics postfix (need to separate different caches in plot)
                            'metric_postfix': 'rebalance',
                        },
                    }
                else:
                    cache_load_map = {
                        CACHE_NAME: {
                            # following keys are key generator builder arguments
                            # we build java object later, when we will knew exact gateway
                            'key_generator_builder': AffinityCountKeyGeneratorBuilder(
                                CACHE_NAME,
                                ignite.get_node_consistent_id(NODE_TO_REBALANCE),
                                1,
                                keys_to_load,
                                True
                            ).set_collision_possibility(0.5),
                            # this is metrics postfix (need to separate different caches in plot)
                            'metric_postfix': 'rebalance',
                        },
                    }
                caches_to_load = [CACHE_NAME, ]

                # define tx_metrics for TransactionalLoading
                tx_metrics = [
                    'txCreated_rebalance',
                    'txFailed_rebalance',
                ]

                if self.with_no_rebalance_cache:
                    # this cache will not be on NODE_TO_REBALANCE but will be under transactionalLoading
                    cache_load_map[CACHE_NAME_NOT_IN_REBALANCE] = {
                        # following keys are key generator builder arguments
                        # we build java object later, when we will knew exact gateway
                        'key_generator_builder': AffinityCountKeyGeneratorBuilder(
                            CACHE_NAME_NOT_IN_REBALANCE,
                            ignite.get_node_consistent_id(NODE_TO_REBALANCE),
                            1,
                            keys_to_load,
                            False
                        ).set_collision_possibility(0.5),
                        # this is metrics postfix (need to separate different caches in plot)
                        'metric_postfix': 'no_rebalance',
                    }
                    caches_to_load.append(CACHE_NAME_NOT_IN_REBALANCE)

                    # mutate tx_metrics for TransactionalLoading
                    tx_metrics.append('txCreated_no_rebalance')
                    tx_metrics.append('txFailed_no_rebalance')

                with TransactionalLoading(self.test_class,
                                          ignite=ignite,
                                          cu=ignite.cu,
                                          config_file=client_config,
                                          caches_to_run=caches_to_load,
                                          skip_consistency_check=True,
                                          cross_cache_batch=1,
                                          cache_load_map=cache_load_map,
                                          keys_count=keys_to_load,
                                          # multiply execution operations, because we load only in 1 or 2 caches
                                          load_threads=16 * piclient.nodes_num if self.single_cache else None,
                                          collect_timeout=5000,
                                          collision_possibility=0.5,
                                          with_exception=False,
                                          loading_profile=LoadingProfile(commit_possibility=0.8,
                                                                         end_key=last_end_key),
                                          tx_metrics=tx_metrics
                                          ) as tx_loading:
                    LoadingUtils.sleep_and_custom_event(tx_loading, 'Sleep before test', self.metrics_idle)

                    # define snapshot timeout for rebalance on loading
                    ignite.snapshot_timeout = 600

                    rebalance_speed = \
                        self.calculate_rebalance_speed(
                            ignite, prod_runs, warmup_runs, last_end_key, keys_to_load,
                            tx_loading=tx_loading,
                        )

                    metrics = tx_loading.metrics
                    LoadingUtils.create_loading_metrics_graph(
                        self.test_class.config['suite_var_dir'],
                        'rebalance_%s_%s' % (version, 'loading' if self.with_loading else ''),
                        metrics)
        else:
            rebalance_speed = self.calculate_rebalance_speed(
                ignite, prod_runs, warmup_runs, last_end_key, keys_to_load,
                version=version,
            )

        # dump idle_verify if need and no loading
        if self.idle_verify and not self.with_loading:
            dump_after = ignite.cu.idle_verify_dump()
            if dump_after != dump_before:
                log_print('Failed idle_verify additional check', color='red')

        ignite.cu.deactivate()

        self.stop_probes(speed=rebalance_speed)

        self.results['evaluated'] = True
    finally:
        # cluster and the generated config set are cleaned up even on failure
        if ignite:
            self.kill_cluster(ignite)

        # remove config set
        self.test_class.remove_app_config_set(Ignite, REBALANCE_CONFIG_SET)
def test_during_loading(self):
    """
    Destroy caches from clients while TX loading is running.

    Should be fully fixed in 8.5.8-p1

    Scenario:

    1. Start 3 server nodes
    2. Load 1000 keys into 120 TX caches
    3. Start 3 client node and start TX loading
       (PESSIMISTIC/REPEATABLE_READ, OPTIMISTIC/SERIALIZABLE)
       (12 transfer operations, 10 caches in each operation,
       1000ms between each transaction i.e. ~ 4 tx per second from each client))
    4. In clients try to destroy caches
    5. Interesting things happens

    Fixed in 8.5.8-p1
    https://ggsystems.atlassian.net/browse/GG-19179

    Issues that was found during this test:
    https://ggsystems.atlassian.net/browse/GG-19411
    https://ggsystems.atlassian.net/browse/GG-19383
    :return:
    """
    # Cache destroy under load can stall client operations; extend read timeout.
    PiClient.read_timeout = 600

    ignite = self.start_ignite_grid(self.ignite_name)

    ignite.cu.activate(activate_on_particular_node=1)

    PiClientIgniteUtils.load_data_with_putall(ignite, self.client_config, )

    def get_dumps():
        # Collect jstack thread dumps from every node when loading exits,
        # so hangs can be diagnosed post-mortem.
        for node_id in ignite.nodes.keys():
            self.util_get_threads_from_jstack(ignite, node_id, 'END')

    try:
        with PiClient(ignite, self.client_config) as piclient:
            with TransactionalLoading(self,
                                      ignite=ignite,
                                      config_file=self.client_config,
                                      on_exit_action=get_dumps,
                                      kill_transactions_on_exit=True,
                                      with_exception=False,  # do interrupt loading operation if smth happens?
                                      skip_consistency_check=True,  # we are destroying caches here if you notice
                                      loading_profile=LoadingProfile(
                                          delay=1000,
                                          allowed_transactions=(
                                              TxDescriptor(concurrency='OPTIMISTIC',
                                                           isolation='SERIALIZABLE', ),)
                                      )):
                # allowed_transactions=(TxDescriptor(), ))):
                # )):
                node_id = piclient.get_node_id()
                client_ignite = piclient.get_ignite(node_id)
                cache_names = client_ignite.cacheNames().toArray()

                # Destroy up to 50 caches, each from a (possibly different) client node.
                caches_to_kill_num = 50
                frags = 0

                for cache in cache_names:
                    node_id = piclient.get_node_id()
                    log_print('Destroying cache %s on node %s' % (cache, node_id), color='red')
                    piclient.get_ignite(node_id).cache(cache).destroy()
                    frags += 1
                    if frags >= caches_to_kill_num:
                        break
    finally:
        # Regardless of outcome, fail the test if NPE/AssertionError leaked into logs.
        npe_errors = ignite.find_exception_in_logs(".*java.lang.NullPointerException.*")
        assertion_errors = ignite.find_exception_in_logs(".*java.lang.AssertionError.*")
        if npe_errors != 0 or assertion_errors != 0:
            assert False, "There are errors in logs: NPE - %s, AE - %s" % (npe_errors, assertion_errors)
def _run_iteration(self, ignite, iteration):
    """
    One iteration of server PME benchmark is as follows:

    1. start transactional loading, sleep `warmup_servers_delay` so that load stabilize
    2. kill random N nodes, measure LEAVE exchange time, sleep `stabilization_delay`
    3. restart killed nodes, measure JOIN exchange time, sleep `cooldown_delay`
    4. stop load

    :param ignite: Ignite app under test
    :param iteration: 1-based iteration number (used for logging / graph names)
    :return: dict with 'Exchange Server Join' / 'Exchange Server Leave' times
             (either may be None if the measurement did not run)
    """
    log_print(
        "===> PME {} Server(s) Left-Join Benchmark iteration {}/{} started "
        .format(self.config['num_servers_to_kill'], iteration, self.config['iterations']),
        color='green')

    # NOTE(review): an offline-replay debug scaffold (reading recorded exchange
    # YAML dumps instead of running the cluster) used to live here; see VCS history.

    loading_client_hosts = self._get_loading_client_hosts()
    num_servers = self._get_num_server_nodes()
    num_servers_to_kill = self.config['num_servers_to_kill']
    kill_coordinator = self.config['kill_coordinator']

    metrics = None
    ex = None
    x1_join_time = None
    x1_leave_time = None
    try:
        # start loading clients ...
        with PiClient(ignite,
                      self.test_class.client_config,
                      client_hosts=loading_client_hosts,
                      clients_per_host=self.config.get('loading_clients_per_host', 1)):
            # ... and initiate transactional load
            with TransactionalLoading(
                    self.test_class,
                    ignite=ignite,
                    kill_transactions_on_exit=self.config['kill_transactions_on_exit'],
                    cross_cache_batch=self.config['cross_cache_batch'],
                    skip_atomic=self.config['skip_atomic'],
                    skip_consistency_check=not self.config['consistency_check_enabled'],
                    loading_profile=LoadingProfile(
                        delay=self.config['tx_delay'],
                        commit_possibility=self.config['commit_possibility'],
                        start_key=1,
                        end_key=self.config['load_factor'] - 1,
                        transaction_timeout=self.config['transaction_timeout']),
                    tx_metrics=[
                        'txCreated', 'txCommit', 'txFailed', 'txRollback'
                    ]) as tx_loading:
                metrics = tx_loading.metrics

                # pick random server nodes
                node_ids = ignite.get_random_server_nodes(
                    num_servers_to_kill,
                    use_coordinator=kill_coordinator,
                    node_ids=self.test_class.server_node_ids,
                )
                expected_total_server_num = num_servers - len(node_ids)

                # ... wait load stabilize
                util_sleep_for_a_while(self.config['warmup_servers_delay'], "Before LEAVE")

                if is_enabled(self.config.get('jfr_enabled', False)):
                    ignite.make_cluster_jfr(60)
                util_sleep_for_a_while(2)

                self.test_class._prepare_before_test(
                    ignite, tx_loading, 'LEAVE %d server(s)' % len(node_ids))

                # ... kill selected random nodes
                ignite.kill_nodes(*node_ids)

                ignite.wait_for_topology_snapshot(
                    server_num=expected_total_server_num)
                tx_loading.metrics_thread.add_custom_event(
                    '%d server(s) left' % len(node_ids))

                new_topVer = self.test_class._get_new_top_after_test(ignite)
                self.test_class._wait_exchange_finished(ignite, new_topVer)

                x1_leave_time, x2_time = self.test_class._measurements_after_test(
                    'LEAVE %d server(s)' % len(node_ids), skip_exch=1)

                if is_enabled(self.config.get('heapdump_enabled', False)):
                    ignite.make_cluster_heapdump(
                        [1], 'after_%d_server_leave' % len(node_ids))

                # ... wait exchange stabilize
                util_sleep_for_a_while(self.config['stabilization_delay'],
                                       "After LEAVE, before JOIN")

                if self.config['measure_restart_nodes']:
                    self.test_class._prepare_before_test(
                        ignite, tx_loading, 'JOIN %d server(s)' % len(node_ids))

                # ... restart killed nodes
                ignite.start_nodes(*node_ids)
                ignite.wait_for_topology_snapshot(
                    server_num=expected_total_server_num + len(node_ids))

                if self.config['measure_restart_nodes']:
                    tx_loading.metrics_thread.add_custom_event(
                        '%d server(s) joined' % len(node_ids))
                    new_topVer = self.test_class._get_new_top_after_test(ignite)
                    self.test_class._wait_exchange_finished(ignite, new_topVer)
                    x1_join_time, x2_time = self.test_class._measurements_after_test(
                        'JOIN %d server(s)' % len(node_ids), skip_exch=1)
                    # if is_enabled(self.config.get('heapdump_enabled', False)):
                    #     ignite.make_cluster_heapdump([1], 'after_%d_server_join' % len(node_ids))

                # ... wait exchange cooldown
                util_sleep_for_a_while(self.config['cooldown_delay'], "After JOIN")

        ignite.wait_for_topology_snapshot(client_num=0)
    except Exception as e:
        # Defer the re-raise so the metrics graph is still produced on failure.
        ex = e
    if metrics:
        self.test_class.create_loading_metrics_graph(
            'pme_%d_servers_left_join_%s_%d' %
            (num_servers_to_kill, self.run_id, iteration),
            metrics,
            dpi_factor=0.75)
    if ex:
        raise ex
    return {
        'Exchange Server Join': x1_join_time,
        'Exchange Server Leave': x1_leave_time,
    }
def do_pme_server_bench(self, num_servers, kill_coordinator=False):
    """
    PME benchmark: kill `num_servers` server nodes under TX load, then restart
    them, measuring the LEAVE and JOIN exchange times; heap dumps and JFR
    recordings are collected along the way.

    :param num_servers: number of random server nodes to kill/restart
    :param kill_coordinator: include the coordinator among the killed nodes
    :raises: re-raises any exception from the run after the metrics graph is built
    """
    metrics = None
    ex = None
    self.ignite.ignite_srvs.make_cluster_heapdump([1], 'before_load')
    try:
        with PiClient(self.ignite.ignite_cli_load, self.get_client_config()):
            with TransactionalLoading(
                    self,
                    kill_transactions_on_exit=True,
                    cross_cache_batch=self.cross_cache_batch,
                    skip_atomic=True,
                    skip_consistency_check=not self.consistency_check,
                    loading_profile=LoadingProfile(
                        delay=self.tx_delay,
                        commit_possibility=0.97,
                        start_key=1,
                        end_key=self.LOAD_FACTOR - 1,
                        transaction_timeout=1000),
                    tx_metrics=[
                        'txCreated', 'txCommit', 'txRollback', 'txFailed'
                    ]) as tx_loading:
                metrics = tx_loading.metrics

                node_ids = self.ignite.ignite_srvs.get_random_server_nodes(
                    num_servers, use_coordinator=kill_coordinator)

                expected_total_server_num = len(
                    self.ignite.get_all_default_nodes()) - len(node_ids)

                self._prepare_before_test(
                    tx_loading, 'LEAVE %d server(s)' % len(node_ids))

                # Record a 60s JFR around the LEAVE event.
                self.ignite.ignite_srvs.make_cluster_jfr(60)
                util_sleep_for_a_while(2)

                self.ignite.ignite_srvs.kill_nodes(*node_ids)
                self.ignite.ignite_srvs.wait_for_topology_snapshot(
                    server_num=expected_total_server_num)
                tx_loading.metrics_thread.add_custom_event(
                    '%d server(s) left' % len(node_ids))

                self._measurements_after_test('LEAVE %d server(s)' % len(node_ids),
                                              skip_exch=1)

                # self.ssh.exec_on_host(self.ignite.ignite_srvs.nodes[1]['host'], [
                #     'jmap -dump:format=b,file={testdir}/heapdump.{pid}.hprof {pid}'.format(
                #         testdir=self.config['rt']['remote']['test_dir'],
                #         pid=self.ignite.ignite_srvs.nodes[1]['PID'],
                #     )
                # ])
                self.ignite.ignite_srvs.make_cluster_heapdump(
                    [1], 'after_server_leave')

                util_sleep_for_a_while(self.stabilization_time)

                self._prepare_before_test(
                    tx_loading, 'JOIN %d server(s)' % len(node_ids))

                self.ignite.ignite_srvs.start_nodes(*node_ids)
                self.ignite.ignite_srvs.wait_for_topology_snapshot(
                    server_num=expected_total_server_num + len(node_ids))
                tx_loading.metrics_thread.add_custom_event(
                    '%d server(s) joined' % len(node_ids))

                # Allow time proportional to data volume before measuring JOIN.
                util_sleep_for_a_while(int(3 * self.LOAD_FACTOR / 1000))
                self._measurements_after_test('JOIN %d server(s)' % len(node_ids),
                                              skip_exch=1)

                self.ignite.ignite_srvs.make_cluster_heapdump(
                    [1], 'after_server_join')

                util_sleep_for_a_while(self.stabilization_time)

        self.ignite.ignite_srvs.wait_for_topology_snapshot(client_num=0)
    except Exception as e:
        # Defer the re-raise so the metrics graph is still produced on failure.
        ex = e
    if metrics:
        self.create_loading_metrics_graph('pme_%d_servers_left_join' % num_servers,
                                          metrics,
                                          dpi_factor=0.75)
    if ex:
        raise ex
def do_pme_client_bench(self, num_clients):
    """
    PME benchmark: join `num_clients` extra client nodes (on the 'flaky' hosts)
    under TX load, then let them leave, measuring the JOIN and LEAVE exchanges.

    :param num_clients: number of extra client nodes to start and then stop
    :raises: re-raises any exception from the run after the metrics graph is built
    """
    metrics = None
    ex = None
    try:
        with PiClient(self.ignite.ignite_cli_load, self.get_client_config()):
            with TransactionalLoading(
                    self,
                    kill_transactions_on_exit=True,
                    cross_cache_batch=self.cross_cache_batch,
                    skip_atomic=True,
                    skip_consistency_check=not self.consistency_check,
                    loading_profile=LoadingProfile(
                        delay=self.tx_delay,
                        commit_possibility=0.97,
                        start_key=1,
                        end_key=self.LOAD_FACTOR - 1,
                        transaction_timeout=1000),
                    tx_metrics=[
                        'txCreated', 'txCommit', 'txRollback', 'txFailed'
                    ]) as tx_loading:
                metrics = tx_loading.metrics
                expected_total_num_clients = len(
                    self.ignite.get_all_client_nodes() +
                    self.ignite.get_all_common_nodes())
                self._prepare_before_test(
                    tx_loading, 'JOIN %d client(s)' % num_clients)

                # start num_clients client nodes on 'flaky' hosts
                with PiClient(self.ignite.ignite_cli_flaky,
                              self.get_client_config(),
                              nodes_num=num_clients,
                              new_instance=True):
                    self.ignite.ignite_srvs.wait_for_topology_snapshot(
                        client_num=expected_total_num_clients + num_clients)
                    tx_loading.metrics_thread.add_custom_event(
                        '%d client(s) joined' % num_clients)
                    self._measurements_after_test('JOIN %d client(s)' % num_clients,
                                                  skip_exch=1)
                    util_sleep_for_a_while(self.stabilization_time)
                    # upon exit from with block, num_clients client nodes will be killed
                    self._prepare_before_test(
                        tx_loading, 'LEAVE %d client(s)' % num_clients)

                self.ignite.ignite_srvs.wait_for_topology_snapshot(
                    client_num=expected_total_num_clients)
                tx_loading.metrics_thread.add_custom_event(
                    '%d client(s) left' % num_clients)
                self._measurements_after_test('LEAVE %d client(s)' % num_clients,
                                              skip_exch=1)
                util_sleep_for_a_while(self.stabilization_time)

        self.ignite.ignite_srvs.wait_for_topology_snapshot(client_num=0)
    except Exception as e:
        # Defer the re-raise so the metrics graph is still produced on failure.
        ex = e
    if metrics:
        self.create_loading_metrics_graph('pme_%d_clients_join_left' % num_clients,
                                          metrics,
                                          dpi_factor=0.75)
    if ex:
        raise ex
def test_full_cluster_blinking(self):
    """
    Enable indexes

    Start servers with PDS, start clients, start some light tx loading.
    In loop try to blink with all cluster at the same time.

    Logically there should be no data loss:
    full cluster blinking - so there shouldn't be any data loss
    :return:
    """
    # Restarts may stall client calls; give PiClient a generous read timeout.
    PiClient.read_timeout = 240

    self.set_current_context('indexed_types')

    self.util_copy_piclient_model_to_libs()
    self.ignite.set_activation_timeout(240)
    self.ignite.set_snapshot_timeout(240)
    # Run server JVMs with assertions enabled to surface internal errors.
    self.ignite.set_node_option('*', 'jvm_options', ['-ea'])
    self.su.clear_snapshots_list()
    self.start_grid(skip_activation=True)

    self.ignite.cu.activate(activate_on_particular_node=1)

    PiClientIgniteUtils.load_data_with_streamer(self.ignite,
                                                self.get_client_config(),
                                                end_key=100000)

    nodes_before = self.ignite.get_alive_default_nodes()

    iterations = 50

    with PiClient(self.ignite, self.get_client_config()):
        with TransactionalLoading(
                self, loading_profile=LoadingProfile(delay=1000)):
            for i in range(0, iterations):
                log_print('Current iteration %s from %s' % (i, iterations), color='debug')

                # Kill every server node with a small random (or configured) pause
                # between kills, then restart them all the same way.
                for node_id in nodes_before:
                    self.ignite.kill_node(node_id)

                    sleep(
                        float(self.the_glue_timeout) if self.the_glue_timeout
                        else round(random.uniform(0.1, 0.5), 1))

                for node_id in nodes_before:
                    self.ignite.start_node(node_id, skip_topology_check=True)

                    sleep(
                        float(self.the_glue_timeout) if self.the_glue_timeout
                        else round(random.uniform(0.1, 0.5), 1))

                self.ignite.wait_for_topology_snapshot(
                    server_num=len(nodes_before))

                # Refresh framework-side node status after the manual restarts.
                for node_id in self.ignite.get_all_default_nodes():
                    self.ignite.update_started_node_status(node_id)

                sleep(10)

                # Indexes must stay consistent after every blink.
                self.cu.control_utility('--cache validate_indexes',
                                        all_required='no issues found.')

    self.verify_cluster(nodes_before, 0)
def test_24_fitness_rolling_upgrade(self):
    """
    This test checks the main rolling upgrade scenario under the load:

    1. Old cluster up and running (consistent_id's are not set).
    2. First cycle (upgrade to new version and set property
       GG_DISABLE_SNAPSHOT_ON_BASELINE_CHANGE_WITH_ENABLED_PITR):
    3. Second cycle (set correct consistent_id with adding to baseline topology).
    """
    created_caches = []
    self.ignite_old_version.cu.activate()

    # Pre-create dynamic caches and seed them with data through a new-version client.
    with PiClient(self.ignite_new_version, self.client_config, nodes_num=1) as piclient:
        dynamic_caches_factory = DynamicCachesFactory()
        async_ops = []
        for method in dynamic_caches_factory.dynamic_cache_configs:
            cache_name = "cache_group_%s" % method
            log_print('Loading {}...'.format(cache_name), color='green')
            piclient.get_ignite().getOrCreateCache(
                getattr(dynamic_caches_factory, method)(cache_name))

            async_operation = create_async_operation(
                create_put_all_operation,
                cache_name, 1, 1001, 10,
                value_type=self.data_model)
            async_ops.append(async_operation)
            async_operation.evaluate()
            created_caches.append(cache_name)

        log_print('Waiting async results...', color='debug')
        # wait for streamer to complete
        for async_op in async_ops:
            async_op.getResult()

    util_sleep_for_a_while(60)

    with PiClient(self.ignite_old_version, self.client_config, nodes_num=4) as piclient:
        cache_names = piclient.get_ignite().cacheNames()

        # Start transaction loading for TTL caches
        with TransactionalLoading(MixedTestLoadingAdapter(self),
                                  config_file=self.client_config,
                                  loading_profile=LoadingProfile(
                                      delay=0,
                                      transaction_timeout=100000,
                                      run_for_seconds=600)):
            util_sleep_for_a_while(20)
            log_print('Rolling upgrade', color='green')

            # Kick off background putAll streams into the M2_PRODUCT caches
            # so data keeps flowing during both upgrade cycles.
            async_ops = []
            for cache_name in [
                cache_name for cache_name in cache_names.toArray()
                if cache_name.startswith("M2_PRODUCT")
            ]:
                async_operation = create_async_operation(
                    create_put_all_operation,
                    cache_name, 1001, 400001, 10,
                    value_type=ModelTypes.VALUE_ALL_TYPES.value)
                async_ops.append(async_operation)
                async_operation.evaluate()

            # First cycle: upgrade version and set property.
            for i in range(1, 5):
                self.ignite_old_version.cu.control_utility('--baseline')
                log_print('Stopping node {}'.format(i), color='green')
                self.ignite_old_version.kill_nodes(i)
                self.ignite_new_version.cleanup_work_dir(i)
                # Reuse the old node's work dir so the upgraded node keeps its data.
                folder = self.ignite_old_version.get_work_dir(i)
                log_print(folder, color='debug')
                self.ignite_new_version.copy_work_dir_from(i, folder)

                jvm_options = self.ignite_new_version.get_jvm_options(i)
                jvm_options.append(
                    '-DGG_DISABLE_SNAPSHOT_ON_BASELINE_CHANGE_WITH_ENABLED_PITR=true'
                )

                util_sleep_for_a_while(10)
                self.ignite_new_version.start_nodes(i,
                                                    already_nodes=(4 - i),
                                                    other_nodes=(4 - i),
                                                    timeout=240)
                self.ignite_new_version.cu.control_utility('--baseline')

            for async_op in async_ops:
                async_op.getResult()

            util_sleep_for_a_while(30)
            log_print('Change consistent ID', color='green')

            self.ignite_new_version.set_node_option(
                '*', 'config',
                Ignite.config_builder.get_config(
                    'server', config_set_name='24_fit_with_consist_id'))

            # Second cycle - change consistent_id and add to baseline topology.
            for i in range(1, 5):
                self.ignite_new_version.cu.control_utility('--baseline')
                log_print('Stopping node {}'.format(i), color='green')
                self.ignite_new_version.kill_nodes(i)

                log_print("Starting node {} with new consistent id".format(i),
                          color='debug')
                self.ignite_new_version.start_nodes(i, timeout=240)

                log_print("Changing baseline", color='debug')
                self.ignite_new_version.cu.set_current_topology_as_baseline()
                util_sleep_for_a_while(60, msg='Wait for rebalance to completed')

            log_print('Transactional loading done', color='green')

        # Just to check client node still can interact with cluster -
        # calculate checksum from client node.
        sorted_cache_names = []
        for cache_name in piclient.get_ignite().cacheNames().toArray():
            sorted_cache_names.append(cache_name)

        sorted_cache_names.sort()

        async_operations = []
        cache_operation = {}
        for cache_name in sorted_cache_names:
            async_operation = create_async_operation(
                create_checksum_operation, cache_name, 1, 10000)
            async_operations.append(async_operation)
            cache_operation[async_operation] = cache_name
            async_operation.evaluate()

        checksums = ''
        cache_checksum = {}
        for async_operation in async_operations:
            result = str(async_operation.getResult())
            cache_checksum[cache_operation.get(async_operation)] = result
            checksums += result

        log_print('Calculating checksums done')