def test_mixed_cluster_load_caches_old_server(self):
    """
    1. start mixed cluster (new version servers + old version servers)
    2. activate from new version control.sh
    3. start old version server
    4. add it to baseline
    5. smoke check:
        5.1. create dynamic caches from old server node
        5.2. do some load from old server node
    """
    self.ignite_new_version.cu.activate()
    created_caches = []
    self.server_config = Ignite.config_builder.get_config('server', config_set_name='base')
    ignite = self.ignite_old_version
    with PiClient(ignite, self.server_config, nodes_num=1) as piclient:
        ignite.cu.add_node_to_baseline(ignite.get_node_consistent_id(piclient.node_ids[0]))

        dynamic_caches_factory = DynamicCachesFactory()
        async_ops = []
        for method in dynamic_caches_factory.dynamic_cache_configs:
            cache_name = "cache_group_%s" % method
            log_print('Loading {}...'.format(cache_name), color='green')

            ignite = piclient.get_ignite()
            ignite.getOrCreateCache(getattr(dynamic_caches_factory, method)(cache_name))

            async_operation = create_async_operation(create_put_all_operation,
                                                     cache_name, 1, 1001, 10,
                                                     value_type=self.data_model)
            async_ops.append(async_operation)
            async_operation.evaluate()
            created_caches.append(cache_name)

        log_print('Waiting async results...', color='debug')
        # wait for putAll operations to complete
        for async_op in async_ops:
            async_op.getResult()

        with TransactionalLoading(MixedTestLoadingAdapter(self), config_file=self.server_config,
                                  loading_profile=LoadingProfile(delay=1, transaction_timeout=100000)):
            sleep(60)
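# A minimal helper sketch (hypothetical, not framework API) factoring out the
# "create dynamic caches + async putAll" pattern the mixed-version tests in this
# section repeat. It reuses only calls already used above: DynamicCachesFactory,
# create_async_operation, create_put_all_operation and log_print.
def load_dynamic_caches(piclient, data_model, start_key=1, end_key=1001, batch_size=10):
    created_caches = []
    dynamic_caches_factory = DynamicCachesFactory()
    async_ops = []
    for method in dynamic_caches_factory.dynamic_cache_configs:
        cache_name = "cache_group_%s" % method
        log_print('Loading {}...'.format(cache_name), color='green')

        # create the cache from its factory-built configuration
        piclient.get_ignite().getOrCreateCache(getattr(dynamic_caches_factory, method)(cache_name))

        # kick off an asynchronous putAll into the new cache
        async_operation = create_async_operation(create_put_all_operation,
                                                 cache_name, start_key, end_key, batch_size,
                                                 value_type=data_model)
        async_ops.append(async_operation)
        async_operation.evaluate()
        created_caches.append(cache_name)

    # wait for all putAll operations to complete
    for async_op in async_ops:
        async_op.getResult()

    return created_caches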
def test_baseline_auto_activation(self):
    with PiClient(self.ignite, self.get_client_config()) as piclient:
        cache_names = piclient.get_ignite().cacheNames()

        async_operations = []
        for cache_name in cache_names.toArray():
            async_operation = create_async_operation(create_cross_cache_account_runner_operation,
                                                     cache_name, 1, 1000, 0.5,
                                                     delay=1, run_for_seconds=20)
            async_operations.append(async_operation)
            async_operation.evaluate()

        # wait for operations to complete
        for async_op in async_operations:
            print(async_op.getResult())

    alive_default_nodes = self.ignite.get_alive_default_nodes()
    for node_id in alive_default_nodes:
        self.ignite.kill_node(node_id)

    util_sleep_for_a_while(5)

    self.ignite.start_nodes()
    self.ignite.add_additional_nodes(self.get_server_config(), 1)
    self.ignite.start_additional_nodes(self.ignite.get_all_additional_nodes())

    print_red("AssertExceptions: %s" % str(self.ignite.find_exception_in_logs("java.lang.AssertionError")))

    self.cu.control_utility('--baseline')
    assert 'Cluster state: active' in self.cu.latest_utility_output, 'Cluster is inactive'

    util_sleep_for_a_while(self.rebalance_timeout)

    self.util_verify(save_lfs_on_exception=True)

    tiden_assert_equal(0, self.ignite.find_exception_in_logs('java.lang.AssertionError'),
                       "No AssertionError in logs")
def calc_checksums_on_client(piclient, start_key=0, end_key=1000, dict_mode=False):
    """
    Calculate checksums over all caches using cache.get() from a piclient

    :param piclient: piclient instance to run checksum operations on
    :param start_key: start key
    :param end_key: end key
    :param dict_mode: if True, return a dict {cache_name: checksum};
                      otherwise return one concatenated checksum string
    :return: checksums (dict or string, depending on dict_mode)
    """
    log_print("Calculating checksums using cache.get() from client")
    cache_operation = {}
    cache_checksum = {}

    sorted_cache_names = []
    for cache_name in piclient.get_ignite().cacheNames().toArray():
        sorted_cache_names.append(cache_name)
    sorted_cache_names.sort()

    async_operations = []
    for cache_name in sorted_cache_names:
        async_operation = create_async_operation(create_checksum_operation,
                                                 cache_name, start_key, end_key,
                                                 gateway=piclient.get_gateway())
        async_operations.append(async_operation)
        cache_operation[async_operation] = cache_name
        async_operation.evaluate()

    checksums = ''
    for async_operation in async_operations:
        result = str(async_operation.getResult())
        cache_checksum[cache_operation.get(async_operation)] = result
        checksums += result

    log_print('Calculating checksums done')

    if dict_mode:
        return cache_checksum
    else:
        return checksums
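# Usage sketch (hypothetical test body): comparing per-cache checksums before and
# after a restart. dict_mode=True makes a mismatch attributable to a specific cache
# instead of one opaque concatenated string. restart_cluster is a stand-in for
# whatever restart the calling test performs.
def example_compare_checksums_after_restart(ignite, client_config, restart_cluster):
    with PiClient(ignite, client_config) as piclient:
        before = calc_checksums_on_client(piclient, start_key=0, end_key=1000, dict_mode=True)

    restart_cluster()

    with PiClient(ignite, client_config, new_instance=True) as piclient:
        after = calc_checksums_on_client(piclient, start_key=0, end_key=1000, dict_mode=True)

    for cache_name, checksum in before.items():
        assert after.get(cache_name) == checksum, \
            'Checksum mismatch for cache %s' % cache_name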
def test_old_cluster_load_caches_new_client(self):
    """
    1. start old version grid
    2. activate from old version control.sh
    3. start new version client
    4. smoke check:
        4.1. create dynamic caches
        4.2. do some load
    """
    created_caches = []
    self.ignite_old_version.cu.activate()
    with PiClient(self.ignite_new_version, self.client_config, nodes_num=1) as piclient:
        dynamic_caches_factory = DynamicCachesFactory()
        async_ops = []
        for method in dynamic_caches_factory.dynamic_cache_configs:
            cache_name = "cache_group_%s" % method
            log_print('Loading {}...'.format(cache_name), color='green')

            piclient.get_ignite().getOrCreateCache(getattr(dynamic_caches_factory, method)(cache_name))

            async_operation = create_async_operation(create_put_all_operation,
                                                     cache_name, 1, 1001, 10,
                                                     value_type=self.data_model)
            async_ops.append(async_operation)
            async_operation.evaluate()
            created_caches.append(cache_name)

        log_print('Waiting async results...', color='debug')
        # wait for putAll operations to complete
        for async_op in async_ops:
            async_op.getResult()

        with TransactionalLoading(MixedTestLoadingAdapter(self), config_file=self.client_config,
                                  loading_profile=LoadingProfile(delay=1, transaction_timeout=100000)):
            sleep(60)
def was_snapshots(self):
    """
    Check the previous run for any snapshot tasks.
    Wait until snapshot creation or restore ends.
    """
    end_time = time() + 60 * 4
    if len(self.context['history']) < 2:
        return
    if [o for o in self.context['history'][-2]['operations'] if 'snapshot' in o]:
        while True:
            try:
                with PiClient(self.cluster, self.client_config,
                              new_instance=True, name='snapshot_wait',
                              exception_print=False, read_timeout=60 * 4) as piclient:
                    op = create_async_operation(wait_snapshot_operation, 100,
                                                gateway=piclient.get_gateway())
                    op.evaluate()
                    while True:
                        if op.getStatus().toString() == "FINISHED":
                            if bool(op.getResult()):
                                return
                        if time() > end_time:
                            log_print('failed to wait for snapshot wait operation to finish', color='red')
                            return
            except:
                log_print('snapshot wait failed', color='red')
                sleep(5)
                if time() > end_time:
                    log_print('failed to wait for piclient to start waiting for snapshot execution', color='red')
                    return
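# Sketch of the status-polling idiom used above, extracted as a standalone helper.
# Assumes an async piclient operation exposing getStatus()/getResult() as in
# was_snapshots(); the helper name and the poll_interval (the original inner loop
# polls without sleeping) are illustrative, not framework API.
def wait_operation_finished(op, timeout=60 * 4, poll_interval=1):
    end_time = time() + timeout
    while True:
        if op.getStatus().toString() == "FINISHED":
            return bool(op.getResult())
        if time() > end_time:
            log_print('operation did not finish in %s seconds' % timeout, color='red')
            return False
        sleep(poll_interval)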
def _create_dynamic_caches_with_data(self, with_index=False):
    log_print("Create dynamic caches and load data")
    data_model = ModelTypes.VALUE_ALL_TYPES.value
    created_caches = []
    with PiClient(self.ignite, self.get_client_config(), nodes_num=1) as piclient:
        dynamic_caches_factory = DynamicCachesFactory()
        async_ops = []
        for method in dynamic_caches_factory.dynamic_cache_configs:
            cache_name = "cache_group_%s" % method
            print_green('Loading %s...' % cache_name)

            gateway = piclient.get_gateway()
            ignite = piclient.get_ignite()

            ignite.getOrCreateCache(getattr(dynamic_caches_factory, method)(cache_name, gateway=gateway))

            if with_index:
                data_model = ModelTypes.VALUE_ALL_TYPES.value

            async_operation = create_async_operation(create_streamer_operation,
                                                     cache_name, 1, self.max_key + 2,
                                                     value_type=data_model)
            async_ops.append(async_operation)
            async_operation.evaluate()
            created_caches.append(cache_name)

        log_print('Waiting async results...', color='blue')
        # wait for streamer to complete
        for async_op in async_ops:
            async_op.getResult()

    log_print("Dynamic caches with data created")
    return created_caches
def util_run_money_transfer_task(self, time_to_run=10):
    log_print("Starting money transfer task", color='green')
    with PiClient(self.ignite, self.get_client_config()) as piclient:
        cache_names = piclient.get_ignite().cacheNames()

        async_operations = []
        for cache_name in cache_names.toArray():
            async_operation = create_async_operation(create_account_runner_operation,
                                                     cache_name, 1, self.max_key, 0.5,
                                                     delay=1, run_for_seconds=time_to_run)
            async_operations.append(async_operation)
            async_operation.evaluate()

        # wait for operations to complete
        for async_op in async_operations:
            async_op.getResult()
    log_print("Money transfer is done", color='green')
def _calc_checksums_over_dynamic_caches(self):
    log_print("Calculating checksums")
    with PiClient(self.ignite, self.get_client_config()):
        dynamic_caches_factory = DynamicCachesFactory()

        async_operations = []
        for method in dynamic_caches_factory.dynamic_cache_configs:
            cache_name = "cache_group_%s" % method
            async_operation = create_async_operation(create_checksum_operation,
                                                     cache_name, 1, 1000)
            async_operations.append(async_operation)
            async_operation.evaluate()

        checksums = ''
        for async_operation in async_operations:
            checksums += str(async_operation.getResult())

    log_print("Calculating checksums done")
    return checksums
def test_24_fitness_two_clients_with_snapshot(self):
    """
    Smoke check for mixed-version clients with snapshot operations:
    load dynamic caches from a new-version client, take a full snapshot,
    calculate checksums from an old-version client, then restore the snapshot.
    """
    created_caches = []
    self.ignite_new_version.cu.activate()
    with PiClient(self.ignite_new_version, self.client_config, nodes_num=1) as piclient_new:
        dynamic_caches_factory = DynamicCachesFactory()
        async_ops = []
        for method in dynamic_caches_factory.dynamic_cache_configs:
            cache_name = "cache_group_%s" % method
            log_print('Loading {}...'.format(cache_name), color='green')

            piclient_new.get_ignite().getOrCreateCache(getattr(dynamic_caches_factory, method)(cache_name))

            async_operation = create_async_operation(create_put_all_operation,
                                                     cache_name, 1, 1001, 10,
                                                     value_type=self.data_model)
            async_ops.append(async_operation)
            async_operation.evaluate()
            created_caches.append(cache_name)

        log_print('Waiting async results...', color='debug')
        # wait for putAll operations to complete
        for async_op in async_ops:
            async_op.getResult()

        with PiClient(self.ignite_old_version, self.client_config, nodes_num=2, new_instance=True) as piclient_old:
            log_print('Rolling upgrade', color='green')
            self.ignite_new_version.su.snapshot_utility('snapshot', '-type=full')
            log_print('Snapshot completed', color='debug')

            sorted_cache_names = []
            for cache_name in piclient_old.get_ignite().cacheNames().toArray():
                sorted_cache_names.append(cache_name)
            sorted_cache_names.sort()

            async_operations = []
            cache_operation = {}
            cache_checksum = {}
            for cache_name in sorted_cache_names:
                async_operation = create_async_operation(create_checksum_operation,
                                                         cache_name, 1, 10000)
                async_operations.append(async_operation)
                cache_operation[async_operation] = cache_name
                async_operation.evaluate()

            checksums = ''
            for async_operation in async_operations:
                result = str(async_operation.getResult())
                cache_checksum[cache_operation.get(async_operation)] = result
                checksums += result
            log_print(checksums, color='debug')

            self.ignite_new_version.su.snapshot_utility(
                'restore', '-id={}'.format(self.ignite_new_version.su.get_created_snapshot_id(1)))

    log_print('Test completed', color='debug')
def test_24_fitness_set_baseline_with_properties(self):
    """
    This test checks the cluster behaviour with option
    GG_DISABLE_SNAPSHOT_ON_BASELINE_CHANGE_WITH_ENABLED_PITR that could be set in different ways:
    1. Set at one of the server nodes.
    2. Set on some client node/nodes.
    """
    created_caches = []
    self.ignite_old_version.cu.activate()

    # Preloading
    with PiClient(self.ignite_new_version, self.client_config, nodes_num=1) as piclient:
        dynamic_caches_factory = DynamicCachesFactory()
        async_ops = []
        for method in dynamic_caches_factory.dynamic_cache_configs:
            cache_name = "cache_group_%s" % method
            log_print('Loading {}...'.format(cache_name), color='green')

            piclient.get_ignite().getOrCreateCache(getattr(dynamic_caches_factory, method)(cache_name))

            async_operation = create_async_operation(create_put_all_operation,
                                                     cache_name, 1, 1001, 10,
                                                     value_type=self.data_model)
            async_ops.append(async_operation)
            async_operation.evaluate()
            created_caches.append(cache_name)

        log_print('Waiting async results...', color='debug')
        # wait for putAll operations to complete
        for async_op in async_ops:
            async_op.getResult()

    util_sleep_for_a_while(20)

    new_client_config = Ignite.config_builder.get_config('client', config_set_name='24_fit_with_consist_id')

    jvm_options = self.ignite_new_version.get_jvm_options(1)
    jvm_options.append('-DGG_DISABLE_SNAPSHOT_ON_BASELINE_CHANGE_WITH_ENABLED_PITR=true')

    # with PiClient(self.ignite_new_version, self.client_config, jvm_options=jvm_options, nodes_num=1) as piclient:
    with PiClient(self.ignite_new_version, self.client_config, nodes_num=1) as piclient:
        for i in range(1, 5):
            self.ignite_old_version.cu.control_utility('--baseline')
            log_print('Stopping node {}'.format(i), color='green')

            jvm_options = self.ignite_new_version.get_jvm_options(i)
            jvm_options.append('-DGG_DISABLE_SNAPSHOT_ON_BASELINE_CHANGE_WITH_ENABLED_PITR=false')

            self.ignite_new_version.set_node_option(
                '*', 'config',
                Ignite.config_builder.get_config('server', config_set_name='24_fit_with_consist_id'))

            log_print("Starting node {} with new consistent id".format(i), color='debug')
            self.ignite_new_version.start_nodes(i, already_nodes=4, other_nodes=4, timeout=240)

            log_print("Changing baseline", color='debug')
            self.ignite_old_version.cu.set_current_topology_as_baseline()
            util_sleep_for_a_while(60, msg='Wait for rebalance to complete')

    log_print('Test is done')
def test_24_fitness_rolling_upgrade(self):
    """
    This test checks the main rolling upgrade scenario under load:
    1. Old cluster up and running (consistent_id's are not set).
    2. First cycle: upgrade to the new version and set property
       GG_DISABLE_SNAPSHOT_ON_BASELINE_CHANGE_WITH_ENABLED_PITR.
    3. Second cycle: set the correct consistent_id and add nodes to the baseline topology.
    """
    created_caches = []
    self.ignite_old_version.cu.activate()

    with PiClient(self.ignite_new_version, self.client_config, nodes_num=1) as piclient:
        dynamic_caches_factory = DynamicCachesFactory()
        async_ops = []
        for method in dynamic_caches_factory.dynamic_cache_configs:
            cache_name = "cache_group_%s" % method
            log_print('Loading {}...'.format(cache_name), color='green')

            piclient.get_ignite().getOrCreateCache(getattr(dynamic_caches_factory, method)(cache_name))

            async_operation = create_async_operation(create_put_all_operation,
                                                     cache_name, 1, 1001, 10,
                                                     value_type=self.data_model)
            async_ops.append(async_operation)
            async_operation.evaluate()
            created_caches.append(cache_name)

        log_print('Waiting async results...', color='debug')
        # wait for putAll operations to complete
        for async_op in async_ops:
            async_op.getResult()

    util_sleep_for_a_while(60)

    with PiClient(self.ignite_old_version, self.client_config, nodes_num=4) as piclient:
        cache_names = piclient.get_ignite().cacheNames()

        # Start transactional loading for TTL caches
        with TransactionalLoading(MixedTestLoadingAdapter(self), config_file=self.client_config,
                                  loading_profile=LoadingProfile(delay=0,
                                                                 transaction_timeout=100000,
                                                                 run_for_seconds=600)):
            util_sleep_for_a_while(20)
            log_print('Rolling upgrade', color='green')

            async_ops = []
            for cache_name in [cache_name for cache_name in cache_names.toArray()
                               if cache_name.startswith("M2_PRODUCT")]:
                async_operation = create_async_operation(create_put_all_operation,
                                                         cache_name, 1001, 400001, 10,
                                                         value_type=ModelTypes.VALUE_ALL_TYPES.value)
                async_ops.append(async_operation)
                async_operation.evaluate()

            # First cycle: upgrade version and set property.
            for i in range(1, 5):
                self.ignite_old_version.cu.control_utility('--baseline')
                log_print('Stopping node {}'.format(i), color='green')
                self.ignite_old_version.kill_nodes(i)

                self.ignite_new_version.cleanup_work_dir(i)
                folder = self.ignite_old_version.get_work_dir(i)
                log_print(folder, color='debug')
                self.ignite_new_version.copy_work_dir_from(i, folder)

                jvm_options = self.ignite_new_version.get_jvm_options(i)
                jvm_options.append('-DGG_DISABLE_SNAPSHOT_ON_BASELINE_CHANGE_WITH_ENABLED_PITR=true')

                util_sleep_for_a_while(10)
                self.ignite_new_version.start_nodes(i, already_nodes=(4 - i), other_nodes=(4 - i), timeout=240)
                self.ignite_new_version.cu.control_utility('--baseline')

            for async_op in async_ops:
                async_op.getResult()

            util_sleep_for_a_while(30)
            log_print('Change consistent ID', color='green')
            self.ignite_new_version.set_node_option(
                '*', 'config',
                Ignite.config_builder.get_config('server', config_set_name='24_fit_with_consist_id'))

            # Second cycle: change consistent_id and add to baseline topology.
            for i in range(1, 5):
                self.ignite_new_version.cu.control_utility('--baseline')
                log_print('Stopping node {}'.format(i), color='green')
                self.ignite_new_version.kill_nodes(i)

                log_print("Starting node {} with new consistent id".format(i), color='debug')
                self.ignite_new_version.start_nodes(i, timeout=240)

                log_print("Changing baseline", color='debug')
                self.ignite_new_version.cu.set_current_topology_as_baseline()
                util_sleep_for_a_while(60, msg='Wait for rebalance to complete')

            log_print('Transactional loading done', color='green')

        # Just to check the client node can still interact with the cluster - calculate checksums from the client node.
        sorted_cache_names = []
        for cache_name in piclient.get_ignite().cacheNames().toArray():
            sorted_cache_names.append(cache_name)
        sorted_cache_names.sort()

        async_operations = []
        cache_operation = {}
        for cache_name in sorted_cache_names:
            async_operation = create_async_operation(create_checksum_operation,
                                                     cache_name, 1, 10000)
            async_operations.append(async_operation)
            cache_operation[async_operation] = cache_name
            async_operation.evaluate()

        checksums = ''
        cache_checksum = {}
        for async_operation in async_operations:
            result = str(async_operation.getResult())
            cache_checksum[cache_operation.get(async_operation)] = result
            checksums += result
        log_print('Calculating checksums done')
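# Illustrative sketch (hypothetical helper, not part of the suite) of a single
# node's upgrade step from the first cycle above: stop the old-version node, move
# its work dir under the new-version grid, set the PITR property and restart it.
# Only calls already used in the test above are reused.
def upgrade_node(old_grid, new_grid, i, total_nodes=4):
    old_grid.cu.control_utility('--baseline')
    log_print('Stopping node {}'.format(i), color='green')
    old_grid.kill_nodes(i)

    # reuse the old node's persistence files under the new version
    new_grid.cleanup_work_dir(i)
    folder = old_grid.get_work_dir(i)
    new_grid.copy_work_dir_from(i, folder)

    # disable snapshot-on-baseline-change while PITR is enabled
    jvm_options = new_grid.get_jvm_options(i)
    jvm_options.append('-DGG_DISABLE_SNAPSHOT_ON_BASELINE_CHANGE_WITH_ENABLED_PITR=true')

    new_grid.start_nodes(i, already_nodes=(total_nodes - i), other_nodes=(total_nodes - i), timeout=240)
    new_grid.cu.control_utility('--baseline')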
def run(self, artifact_name):
    """
    Run FST Data Replication scenario for the defined artifact

    1. start 2 clusters (master and replica) with replication on clients up and running.
    2. stop senders to prevent replication.
    3. load data to first N caches (check caches_amount variable in the test) with streamer on master.
    4. start senders, start replication and make FST operation.
    5. check the time used for caches to sync.

    :param artifact_name: name from artifact configuration file
    """
    super().run(artifact_name)

    log_print("Running Data Replication benchmark with config: %s" % self.config, color='green')
    caches_amount = 4
    try:
        # collect properties from config
        self.initialize_clusters()
        self.start_clusters(self.clusters)

        version = self.test_class.tiden.config['artifacts'][artifact_name]['ignite_version']
        self.new_behaviour = self.if_new_behaviour(version, self.min_version)

        # run ignite app
        keys_to_load = int(self.config.get('keys_to_load'))
        with PiClient(self.master, self.master_client_config, nodes_num=1) as piclient_master:
            cache_names = piclient_master.get_ignite().cacheNames().toArray()
            cache_names = [cache_name for cache_name in cache_names]

            with PiClient(self.replica, self.replica_client_config, nodes_num=1, new_instance=True) as piclient_replica:
                time_results = list()

                async_operations = []
                start_key, end_key, remove_probability = 0, keys_to_load, 0.0
                for cache_name in cache_names[:caches_amount]:
                    # kill senders to prevent replication
                    senders = self.clusters[0].get_sender_nodes()
                    for node in senders:
                        self.master.kill_node(node.id)

                    log_print(f'Uploading data into cache {cache_name}')
                    async_operation = create_async_operation(
                        create_streamer_operation,
                        cache_name, start_key, end_key,
                        value_type=ModelTypes.VALUE_ALL_TYPES.value,
                        gateway=piclient_master.get_gateway(),
                    )
                    async_operations.append(async_operation)
                    async_operation.evaluate()

                    for async_op in async_operations:
                        async_op.getResult()
                    log_print('Uploading is done', color='green')

                    master_sizes = self.get_caches_size(cache_mask=lambda x: cache_name in x,
                                                        piclient=piclient_master, debug=False)
                    replica_sizes = self.get_caches_size(cache_mask=lambda x: cache_name in x,
                                                         piclient=piclient_replica, debug=False)
                    log_print('Master size={}, replica size={}'.format(master_sizes, replica_sizes))

                    # start senders
                    senders_ids = [node.id for node in senders]
                    self.master.start_additional_nodes(senders_ids, client_nodes=True,
                                                       already_started=1, other_nodes=1)

                    log_print(f'Running State transfer for cache {cache_name}', color='blue')
                    status = piclient_master.get_ignite().plugin("GridGain").dr().senderCacheStatus(cache_name)
                    log_print(f'DR status {status} for cache {cache_name}')

                    # start probes
                    self.start_probes(artifact_name, self.master.name)
                    start_time = datetime.now()

                    # trigger FST (new behaviour uses startReplication(), old behaviour uses resume())
                    if self.new_behaviour:
                        piclient_master.get_ignite().plugin("GridGain").dr().startReplication(cache_name)
                    else:
                        piclient_master.get_ignite().plugin("GridGain").dr().resume(cache_name)
                    piclient_master.get_ignite().plugin("GridGain").dr().stateTransfer(cache_name, bytes([2]))

                    self._wait_for_same_caches_size(piclient_master, piclient_replica,
                                                    predicate=lambda x: cache_name in x)

                    replication_time = (datetime.now() - start_time).seconds
                    log_print(f'Replication time {replication_time}')
                    time_results.append(replication_time)

        self.stop_probes(time_results=time_results, seconds=True)
        log_print()

        self.results['evaluated'] = True
    finally:
        if self.clusters:
            for cluster in self.clusters:
                log_print('Teardown for cluster {}'.format(cluster))
                if cluster.grid:
                    cluster.grid.jmx.kill_utility()
                    cluster.grid.remove_additional_nodes()
                    cluster.grid.kill_nodes()
                    cluster.grid.delete_lfs()
                    cluster.grid = None
def run(self, artifact_name):
    """
    Run scenario for the defined artifact

    :param artifact_name: name from artifact configuration file
    """
    super().run(artifact_name)

    log_print("Running putAll() benchmark with config: %s" % self.config, color='green')

    caches_list_file = 'caches_%s.xml' % PUT_ALL_CONFIG_SET \
        if not self.config.get('many_parts') else 'caches_many_parts.xml'
    print_detailed_cache_info = self.config.get('print_detailed_cache_info')

    version = self.test_class.tiden.config['artifacts'][artifact_name]['ignite_version']
    try:
        self.test_class.create_app_config_set(
            Ignite, PUT_ALL_CONFIG_SET,
            caches_list_file=caches_list_file,
            deploy=True,
            logger=False,
            wal_segment_size=self.test_class.consumption_config.get('wal_segment_size', 64 * 1024 * 1024),
            logger_path='%s/ignite-log4j2.xml' % self.test_class.tiden.config['rt']['remote']['test_module_dir'],
            disabled_cache_configs=False,
            zookeeper_enabled=False,
            checkpoint_read_lock_timeout=self.read_lock_property_value(version),
            # caches related variables
            additional_configs=['caches.tmpl.xml', ],
            part_32=self.test_class.consumption_config.get('part_32', 32),
            part_64=self.test_class.consumption_config.get('part_64', 64),
            part_128=self.test_class.consumption_config.get('part_128', 128),
        )

        version, ignite = self.test_class.start_ignite_grid(artifact_name, activate=True,
                                                            config_set=PUT_ALL_CONFIG_SET)

        self.start_probes(artifact_name)

        warmup_runs, prod_runs = self._get_number_of_runs()

        time_results = list()
        per_cache_results = {}

        client_config = Ignite.config_builder.get_config('client', config_set_name=PUT_ALL_CONFIG_SET)
        with PiClient(ignite, client_config) as piclient:
            cache_names = piclient.get_ignite().cacheNames()
            data_size = int(self.config.get('data_size'))

            log_print("Running {} iterations".format(warmup_runs + prod_runs))
            for i in range(0, warmup_runs + prod_runs):
                # print message to all nodes' logs
                if i == warmup_runs:
                    create_message_operation('Checkpoint started PRODUCTION RUN STARTED').evaluate()

                self.write_time_event('iteration_%s start' % i)

                warmup_iteration = False if warmup_runs == 0 else i < warmup_runs

                log_print("Running iteration %s (%s)" % (i, 'warmup' if warmup_iteration else 'prod'))
                log_print("Loading %s values per cache into %s caches" %
                          (data_size * (i + 1) - data_size * i, cache_names.size()))

                async_operations = {}
                self.write_time_event('iteration_%s create putall' % i)
                for cache_name in cache_names.toArray():
                    async_operation = create_async_operation(
                        create_put_all_operation,
                        cache_name, data_size * i, data_size * (i + 1),
                        int(self.config.get('put_all_batch_size')),
                        value_type=ModelTypes.VALUE_ACCOUNT.value)
                    async_operations[cache_name] = async_operation
                    async_operation.evaluate()

                for cache_name, async_op in async_operations.items():
                    async_op.getResult()

                    # skip first operations as warmup
                    if not warmup_iteration:
                        loading_time = async_op.getOperation().getEndTime() - async_op.getOperation().getStartTime()
                        if cache_name in per_cache_results:
                            per_cache_results[cache_name] += loading_time
                        else:
                            per_cache_results[cache_name] = loading_time
                        time_results.append(loading_time)

                self.write_time_event('iteration_%s putall done' % i)

            log_print("Loading done")

            if print_detailed_cache_info:
                log_print("Per cache results:")
                for key in sorted(per_cache_results.keys()):
                    print("%s: %s" % (key, per_cache_results[key]))

        ignite.cu.deactivate()

        self.stop_probes(time_results=time_results)

        self.results['evaluated'] = True

        ignite.kill_nodes()
        ignite.delete_lfs()

        log_put("Cleanup Ignite LFS ... ")
        commands = {}
        for node_idx in ignite.nodes.keys():
            host = ignite.nodes[node_idx]['host']
            if commands.get(host) is None:
                commands[host] = ['rm -rf %s/work/*' % ignite.nodes[node_idx]['ignite_home']]
            else:
                commands[host].append('rm -rf %s/work/*' % ignite.nodes[node_idx]['ignite_home'])
        results = self.test_class.tiden.ssh.exec(commands)
        print(results)
        log_put("Ignite LFS deleted.")
        log_print()
    finally:
        # remove config set
        self.test_class.remove_app_config_set(Ignite, PUT_ALL_CONFIG_SET)
def run(self, artifact_name):
    """
    Run Data Replication scenario for the defined artifact

    Scenario is very simple:
    1. start 2 clusters (master and replica) with replication on clients up and running.
    2. load data to first N caches (check caches_amount variable in the test) with streamer on master.
    3. check the time used for caches to sync.

    :param artifact_name: name from artifact configuration file
    """
    super().run(artifact_name)

    log_print("Running Data Replication benchmark with config: %s" % self.config, color='green')
    caches_amount = 4
    try:
        # collect properties from config
        self.initialize_clusters()
        self.start_clusters(self.clusters)

        # run ignite app
        keys_to_load = int(self.config.get('keys_to_load'))
        with PiClient(self.master, self.master_client_config, nodes_num=1) as piclient_master:
            cache_names = piclient_master.get_ignite().cacheNames().toArray()
            cache_names = [cache_name for cache_name in cache_names]

            with PiClient(self.replica, self.replica_client_config, nodes_num=1, new_instance=True) as piclient_replica:
                time_results = list()

                self.start_probes(artifact_name, self.master.name)

                async_operations = []
                start_key, end_key, remove_probability = 0, keys_to_load, 0.0
                for cache_name in cache_names[:caches_amount]:
                    log_print(f'Uploading data into cache {cache_name}')
                    start_time = datetime.now()
                    async_operation = create_async_operation(
                        create_streamer_operation,
                        cache_name, start_key, end_key,
                        value_type=ModelTypes.VALUE_ALL_TYPES.value,
                        gateway=piclient_master.get_gateway(),
                    )
                    async_operations.append(async_operation)
                    async_operation.evaluate()

                    for async_op in async_operations:
                        async_op.getResult()
                    log_print('Uploading is done', color='green')

                    self._wait_for_same_caches_size(piclient_master, piclient_replica,
                                                    predicate=lambda x: cache_name in x)

                    replication_time = (datetime.now() - start_time).seconds
                    log_print(f'Replication time {replication_time}')
                    time_results.append(replication_time)

        self.stop_probes(time_results=time_results, seconds=True)
        log_print()

        self.results['evaluated'] = True
    finally:
        if self.clusters:
            for cluster in self.clusters:
                log_print('Teardown for cluster {}'.format(cluster))
                if cluster.grid:
                    cluster.grid.jmx.kill_utility()
                    cluster.grid.remove_additional_nodes()
                    cluster.grid.kill_nodes()
                    cluster.grid.delete_lfs()
                    cluster.grid = None
def do_blink_backups_under_load(self, initial_remove_probability):
    iteration_size = self.config.get('iteration_size', 80000)
    start = 0
    keep_coordinator_busy = True
    start_value = 0

    # temporarily save LFS even on test pass
    self.need_delete_lfs_on_teardown = False

    first_node = self.ignite.get_node_consistent_id(1)
    second_node = self.ignite.get_node_consistent_id(2)

    if keep_coordinator_busy:
        other_nodes = list(set(self.ignite.get_all_default_nodes()) - set([1]))
    else:
        other_nodes = list(set(self.ignite.get_all_default_nodes()) - set([1, 2]))

    current_server_num = self.ignite.get_nodes_num('server')

    tx_caches = []
    atomic_caches = []

    self.ignite.set_snapshot_timeout(600)

    with PiClient(self.ignite, self.get_client_config(), nodes_num=1) as piclient:
        gateway = piclient.get_gateway()
        ignite = piclient.get_ignite()

        cache_names = ignite.cacheNames().toArray()
        for cache_name in cache_names:
            # run cross cache transfer task only for transactional caches
            if ignite.getOrCreateCache(cache_name).getConfiguration(
                    gateway.jvm.org.apache.ignite.configuration.CacheConfiguration().getClass()
            ).getAtomicityMode().toString() == 'TRANSACTIONAL':
                tx_caches.append(cache_name)
            else:
                atomic_caches.append(cache_name)

    PiClientIgniteUtils.wait_for_running_clients_num(self.ignite, 0, 120)

    for iteration in range(0, self.iterations):
        log_print("Iteration {}/{}".format(str(iteration + 1), str(self.iterations)), color='blue')

        start_key = start + iteration * iteration_size
        end_key = start_key + iteration_size

        if initial_remove_probability > 0.0:
            remove_probability = initial_remove_probability + iteration / self.iterations / 2.0
        else:
            remove_probability = 0.0

        current_client_num = self.ignite.get_nodes_num('client')

        for i in range(0, 3):
            with PiClient(self.ignite, self.get_client_config()) as piclient:
                log_print("Loading (remove {probability}%) {load} values per cache into {n_caches} caches"
                          .format(probability=remove_probability,
                                  load=iteration_size,
                                  n_caches=len(tx_caches)))

                async_operations = []
                for cache_name in tx_caches:
                    node_id = piclient.get_node_id()
                    gateway = piclient.get_gateway(node_id)

                    tx_size = randint(1, 10)
                    log_print("Client {node_id} -> {cache_name}, tx size {tx_size}"
                              .format(node_id=node_id, cache_name=cache_name, tx_size=tx_size))

                    async_operation = create_async_operation(
                        create_put_with_optional_remove_operation,
                        cache_name, start_key, end_key, remove_probability,
                        gateway=gateway,
                        node_consistent_id=first_node if keep_coordinator_busy else second_node,
                        tx_description=TxDescriptor(concurrency='PESSIMISTIC',
                                                    isolation='REPEATABLE_READ',
                                                    size=tx_size),
                        use_monotonic_value=True,
                        monotonic_value_seed=start_value,
                    )
                    start_value = start_value + iteration_size
                    async_operations.append(async_operation)
                    async_operation.evaluate()

                # little warm up
                util_sleep(5)

                node_id = self.ignite.get_random_server_nodes(1, node_ids=other_nodes)[0]

                self.ignite.kill_node(node_id)
                self.ignite.wait_for_topology_snapshot(server_num=current_server_num - 1)

                # continue loading data while the node is offline
                util_sleep(15)

                self.ignite.start_node(node_id)
                self.ignite.wait_for_topology_snapshot(server_num=current_server_num)

            PiClientIgniteUtils.wait_for_running_clients_num(self.ignite, current_client_num, 120)

            self.wait_transactions_finish()

            self.ignite.jmx.wait_for_finish_rebalance(self.rebalance_timeout, self.group_names)

            self.idle_verify_check_conflicts_action()

            self.idle_verify_dump_action()
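# Sketch of the inline TRANSACTIONAL check above, extracted as a predicate for
# readability. Assumes the py4j gateway and ignite handles provided by PiClient as
# used in do_blink_backups_under_load; the helper name is illustrative.
def is_transactional_cache(ignite, gateway, cache_name):
    # the Java-side getConfiguration() call needs the CacheConfiguration class object
    config_class = gateway.jvm.org.apache.ignite.configuration.CacheConfiguration().getClass()
    atomicity = ignite.getOrCreateCache(cache_name).getConfiguration(config_class).getAtomicityMode()
    return atomicity.toString() == 'TRANSACTIONAL'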
def run(self, artifact_name):
    """
    Run scenario for defined artifact

    :param artifact_name: name from artifact configuration file
    """
    super().run(artifact_name)

    log_print("Running putAll() benchmark with config: %s" % self.config, color='green')

    version, ignite = self.test_class.start_ignite_grid(artifact_name, activate=True)

    self.start_probes(artifact_name)

    warmup_runs, prod_runs = self._get_number_of_runs()

    time_result = 0

    with PiClient(ignite, self.test_class.client_config) as piclient:
        cache_names = piclient.get_ignite().cacheNames()
        data_size = int(self.config.get('data_size'))

        log_print("Running {} iterations".format(warmup_runs + prod_runs))
        for i in range(0, warmup_runs + prod_runs):
            self.write_time_event('iteration_%s start' % i)

            warmup_iteration = False if warmup_runs == 0 else i < warmup_runs

            log_print("Running iteration %s (%s)" % (i, 'warmup' if warmup_iteration else 'prod'))
            log_print("Loading %s values per cache into %s caches" %
                      (data_size * (i + 1) - data_size * i, cache_names.size()))

            async_operations = []
            self.write_time_event('iteration_%s create putall' % i)
            for cache_name in cache_names.toArray():
                async_operation = create_async_operation(
                    create_put_all_operation,
                    cache_name, data_size * i, data_size * (i + 1),
                    int(self.config.get('put_all_batch_size')),
                    value_type=ModelTypes.VALUE_ACCOUNT.value)
                async_operations.append(async_operation)
                async_operation.evaluate()

            for async_op in async_operations:
                async_op.getResult()

                # skip first operations as warmup
                if not warmup_iteration:
                    time_result += async_op.getOperation().getEndTime() - async_op.getOperation().getStartTime()

            self.write_time_event('iteration_%s putall done' % i)

        log_print("Loading done")

    ignite.cu.deactivate()

    self.stop_probes(time_results=float(time_result) / prod_runs)

    self.results['evaluated'] = True

    ignite.kill_nodes()
    ignite.delete_lfs()

    log_put("Cleanup Ignite LFS ... ")
    commands = {}
    for node_idx in ignite.nodes.keys():
        host = ignite.nodes[node_idx]['host']
        if commands.get(host) is None:
            commands[host] = ['rm -rf %s/work/*' % ignite.nodes[node_idx]['ignite_home']]
        else:
            commands[host].append('rm -rf %s/work/*' % ignite.nodes[node_idx]['ignite_home'])
    results = self.test_class.tiden.ssh.exec(commands)
    print(results)
    log_put("Ignite LFS deleted.")
    log_print()
def run(self, artifact_name):
    """
    Run scenario for the defined artifact

    :param artifact_name: name from artifact configuration file
    """
    super().run(artifact_name)

    log_print("Running streamer() benchmark with config: %s" % self.config, color='green')

    version = self.test_class.tiden.config['artifacts'][artifact_name]['ignite_version']
    try:
        xml_config_set_name = 'caches_%s.xml' % STREAMER_CONFIG_SET
        self.test_class.create_app_config_set(
            Ignite, STREAMER_CONFIG_SET,
            caches_list_file=xml_config_set_name,
            deploy=True,
            logger=False,
            wal_segment_size=self.test_class.consumption_config.get('wal_segment_size', 64 * 1024 * 1024),
            logger_path='%s/ignite-log4j2.xml' % self.test_class.tiden.config['rt']['remote']['test_module_dir'],
            disabled_cache_configs=False,
            zookeeper_enabled=False,
            checkpoint_read_lock_timeout=self.read_lock_property_value(version),
            # caches related variables
            additional_configs=['caches.tmpl.xml', ],
            part_32=self.test_class.consumption_config.get('part_32', 32),
            part_64=self.test_class.consumption_config.get('part_64', 64),
            part_128=self.test_class.consumption_config.get('part_128', 128),
        )

        version, ignite = self.test_class.start_ignite_grid(artifact_name, activate=True,
                                                            config_set=STREAMER_CONFIG_SET)

        self.start_probes(artifact_name)

        # default times to run, plus warmup times, plus rerun times
        warmup_runs, prod_runs = self._get_number_of_runs()

        time_results = list()

        client_config = Ignite.config_builder.get_config('client', config_set_name=STREAMER_CONFIG_SET)
        with PiClient(ignite, client_config) as piclient:
            cache_names = piclient.get_ignite().cacheNames()
            data_size = int(self.config.get('data_size'))

            log_print("Running {} iterations".format(warmup_runs + prod_runs))
            for i in range(0, warmup_runs + prod_runs):
                self.write_time_event('iteration_%s start' % i)

                warmup_iteration = False if warmup_runs == 0 else i < warmup_runs

                log_print("Running iteration %s (%s)" % (i, 'warmup' if warmup_iteration else 'prod'))
                log_print("Loading %s values per cache into %s caches" %
                          (data_size * (i + 1) - data_size * i, cache_names.size()))

                async_operations = []
                self.write_time_event('iteration_%s create streamer' % i)
                for cache_name in cache_names.toArray():
                    async_operation = create_async_operation(
                        create_streamer_operation,
                        cache_name, data_size * i, data_size * (i + 1),
                        value_type=ModelTypes.VALUE_ACCOUNT.value)
                    async_operations.append(async_operation)
                    async_operation.evaluate()

                for async_op in async_operations:
                    async_op.getResult()

                    # skip first operations as warmup, otherwise write results
                    if not warmup_iteration:
                        time_results.append(async_op.getOperation().getEndTime() -
                                            async_op.getOperation().getStartTime())

                self.write_time_event('iteration_%s streamer done' % i)

            log_print("Loading done")

        ignite.cu.deactivate()

        self.stop_probes(time_results=time_results)

        self.results['evaluated'] = True

        ignite.kill_nodes()
        ignite.delete_lfs()

        log_put("Cleanup Ignite LFS ... ")
        commands = {}
        for node_idx in ignite.nodes.keys():
            host = ignite.nodes[node_idx]['host']
            if commands.get(host) is None:
                commands[host] = ['rm -rf %s/work/*' % ignite.nodes[node_idx]['ignite_home']]
            else:
                commands[host].append('rm -rf %s/work/*' % ignite.nodes[node_idx]['ignite_home'])
        results = self.test_class.tiden.ssh.exec(commands)
        print(results)
        log_put("Ignite LFS deleted.")
        log_print()
    finally:
        # remove config set
        self.test_class.remove_app_config_set(Ignite, STREAMER_CONFIG_SET)
def cluster_state_data(self, logs=True):
    """
    Save previous cluster state:
    restore if some nodes are killed, add some data, create tables.

    :param logs: log in console
    :return: setup state
                True - cluster started as new
                False - cluster restored
    """
    tables_count = 10
    tables_rows_count = 100
    put_count = 100
    batch_size = 100

    if logs:
        log_print('[cluster state] starting DATA', color='yellow')

    self.clients_go_away()
    self.was_snapshots()

    # for all alive nodes, try to restore the topology
    if [node for node in self.cluster.nodes.values() if node['status'] != NodeStatus.NEW] \
            and not self.context['clean_cluster_was_here']:
        data_changed = self._data_changed()
        try:
            # kill and delete all additional nodes
            nodes_to_kill = [node_id for node_id in self.cluster.nodes.keys() if node_id > 100]
            for node_id in nodes_to_kill:
                self.cluster.kill_node(node_id, ignore_exceptions=True)
                self.cluster.cleanup_work_dir(node_id)
                del self.cluster.nodes[node_id]
                log_print(f'delete {node_id}')

            if nodes_to_kill:
                active_nodes = [node for node_id, node in self.cluster.nodes.items()
                                if node_id < 100 and node['status'] == NodeStatus.STARTED]
                self.cluster.wait_for_topology_snapshot(server_num=len(active_nodes),
                                                        timeout=80,
                                                        check_only_servers=True)

            nodes_started = 0
            for node_id, node in self.cluster.nodes.items():
                # start all killed nodes
                if node['status'] != NodeStatus.STARTED:
                    if data_changed:
                        # clean node data if data in the cluster changed without this node
                        self.cluster.cleanup_work_dir(node_id)
                    self.cluster.start_node(node_id, force=True)
                    nodes_started += 1

            if nodes_started == len(self.cluster.nodes):
                # if all nodes were killed already
                self.cluster.cu.activate()
                self.context['activate'] = True

            baseline_nodes = self.cluster.cu.get_current_baseline()

            # update BLT if changed
            if len(baseline_nodes) != len(self.cluster.nodes) or \
                    [stat for stat in baseline_nodes.values() if stat != 'ONLINE']:
                self.cluster.cu.set_current_topology_as_baseline(strict=True, **self.control_util_ssh_options)

            if logs:
                log_print('[cluster state] started DATA (restored)', color='yellow')

            # add tables
            with PiClient(self.cluster, self.client_config, new_instance=True,
                          name='setup', read_timeout=60 * 10) as piclient:
                assert create_tables_operation(tables_count, tables_rows_count,
                                               gateway=piclient.get_gateway()).evaluate(), \
                    'Restore tables data operation has failed'
            return False
        except:
            # clean the cluster if something went wrong and start from scratch
            stacktrace = format_exc()
            log_print(stacktrace, color='red')
            self.context['step_failed'] = stacktrace
            try:
                self.save_fail('restore')
            except:
                log_print('failed to save restore data')
                self.context['step_failed'] += f'\n\nFailed to save restore data\n{format_exc()}'
            self.clean_cluster()

    self.start_grid()
    with PiClient(self.cluster, self.client_config, new_instance=True,
                  name='setup', read_timeout=60 * 10) as piclient:
        # put data
        start_counter = self._set_key_iter(put_count)
        operations = []
        caches_names = list(piclient.get_ignite().cacheNames().toArray())
        log_print(f'[setup data] put {put_count} keys in each of {len(caches_names)} caches')
        for cache_name in caches_names:
            op = create_async_operation(create_put_all_operation,
                                        cache_name=cache_name,
                                        start=start_counter,
                                        end=start_counter + put_count,
                                        batch_size=batch_size)
            operations.append(op)
            op.evaluate()
        for op in operations:
            op.getResult()

        # create tables
        log_print(f'[setup data] create {tables_count} tables with {tables_rows_count} rows in each')
        assert create_tables_operation(tables_count, tables_rows_count,
                                       gateway=piclient.get_gateway()).evaluate(), \
            'Create tables operation has failed'

    if logs:
        log_print(f'[cluster state] started DATA (as new)', color='yellow')
    return True
def _run_operation(self, property_name, class_path):
    """
    Run basic operations
    1) Start operation
    2) Wait till the starting string appears in piclient logs
    3) Wait till the kill command comes
    4) Kill piclient

    :param property_name: operation name
    :param class_path: operation class name
    """
    # handle lock
    try:
        with PiClient(self.cluster, self.client_config, new_instance=True,
                      name=property_name, read_timeout=60 * 10) as piclient:
            # free lock
            self.operations[property_name]['started'] = False
            self.operations[property_name]['kill'] = False
            try:
                # start async
                log_print(f'[{property_name}] piclient started, begin operation')
                async_operation = create_async_operation(create_combine_operation,
                                                         class_path,
                                                         gateway=piclient.get_gateway())
                async_operation.evaluate()
                log_print(f'[{property_name}] operation started')

                if not self.cluster.wait_for_messages_in_log(
                        piclient.node_ids[0],
                        f'{property_name} operation started',
                        timeout=50,
                        fail_pattern=f'(\\[{property_name}\\] Failed|\(err\) Failed)'):
                    # kill the operation if the start message was not found,
                    # or some exception was thrown faster
                    log_print(f'Failed to wait for {property_name} operation start in logs', color='red')
                    log_print(f'[operation] {property_name.upper()} kill', color='yellow')
                    self.operations[property_name]['kill'] = True
                    self.operations[property_name]['killed'] = True
                    return

                if not self.operations[property_name]['started']:
                    log_print(f'[{property_name}] failed logs not found')
                self.operations[property_name]['started'] = True
            except:
                log_print(format_exc(), color='red')
                self.operations[property_name]['kill'] = True
                self.operations[property_name]['killed'] = True
                log_print(f'[operation] {property_name.upper()} kill', color='yellow')
                return

            end_time = time() + self.max_operation_timeout
            while True:
                # wait for kill
                if self.operations[property_name]['kill']:
                    self.operations[property_name]['killed'] = True
                    log_print(f'[operation] {property_name.upper()} kill', color='yellow')
                    return
                if time() > end_time:
                    if self.operations.get(property_name):
                        self.operations[property_name]['killed'] = True
                    log_print(f'[operation] {property_name.upper()} timeout kill', color='yellow')
                    return
                sleep(0.5)
    except:
        log_print(f'[{property_name}] fail happened, killing operation')
        if self.operations.get(property_name):
            self.operations[property_name]['kill'] = True
            self.operations[property_name]['killed'] = True
        raise
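# Illustrative usage of the self.operations kill/killed flags that _run_operation
# polls: a controller requests termination by setting 'kill' and waits for the
# worker to acknowledge with 'killed'. The helper name and timeout are hypothetical;
# the dict keys mirror those used above.
def request_operation_kill(operations, property_name, timeout=60):
    operations[property_name]['kill'] = True
    end_time = time() + timeout
    while not operations[property_name].get('killed'):
        if time() > end_time:
            log_print(f'[{property_name}] not killed within {timeout}s', color='red')
            return False
        sleep(0.5)
    return True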
def load_amount_of_data(self, ignite, node, start_key, keys_to_load):
    """
    Tricky method that loads data into a cluster.

    The main idea here is to load only data that will be rebalanced; other data
    will not be written into the cluster. For example: if we collect rebalance
    time by killing node_2, we load data that contains primary and backup
    partitions on node_2. Data that should not appear on node_2 is not loaded.

    There is also a case with a cache that shouldn't be rebalanced but is used in
    TransactionalLoading. This cache is needed to measure the rebalance influence
    on a cache that is not being rebalanced.

    :param ignite: ignite instance
    :param node: node to rebalance
    :param start_key: start key to load
    :param keys_to_load: number of keys that should be loaded (NOT END_KEY!)
    :return: last loaded key
    """
    client_config = Ignite.config_builder.get_config('client', config_set_name=REBALANCE_CONFIG_SET)

    log_print("Just load data from %s with %s affinity keys on node %s"
              % (start_key, keys_to_load, NODE_TO_REBALANCE))

    with PiClient(ignite, client_config, nodes_num=2 if self.with_no_rebalance_cache else 1) as piclient:
        async_operations = []

        # load cache that will not be rebalanced but is under loading
        if self.with_loading and self.with_no_rebalance_cache:
            gateway = piclient.get_gateway()

            log_print("Loading cache that will not be included into rebalance", color='red')
            operation_no_rebalance = create_async_operation(
                create_streamer_operation,
                CACHE_NAME_NOT_IN_REBALANCE, start_key, keys_to_load,
                value_type=self.load_type,
                gateway=gateway,
            )

            # here we define AffinityCountKeyGenerator for the loading operation -
            # this specific loading allows us to measure loading into a non-rebalanced cache
            operation_no_rebalance.getOperation().setKeyGenerator(
                AffinityCountKeyGeneratorBuilder(
                    CACHE_NAME_NOT_IN_REBALANCE,
                    ignite.get_node_consistent_id(node),
                    start_key,
                    keys_to_load,
                    False
                ).build()
            )
            operation_no_rebalance.evaluate()
            async_operations.append(operation_no_rebalance)

        gateway = piclient.get_gateway()

        # load cache that will be rebalanced
        log_print("Loading cache that will be rebalanced", color='red')
        operation = create_async_operation(
            create_streamer_operation,
            CACHE_NAME, start_key, keys_to_load,
            value_type=self.load_type,
            gateway=gateway,
        )
        if self.parts_distribution:
            # this is the specific "all data in one partition" case
            operation.getOperation().setKeyGenerator(
                AffinityPartitionKeyGeneratorBuilder(
                    CACHE_NAME,
                    self.parts_distribution,
                    start_key,
                    keys_to_load,
                ).build()
            )
        else:
            # default partition distribution
            operation.getOperation().setKeyGenerator(
                AffinityCountKeyGeneratorBuilder(
                    CACHE_NAME,
                    ignite.get_node_consistent_id(node),
                    start_key,
                    keys_to_load,
                    True
                ).build()
            )
        operation.evaluate()
        async_operations.append(operation)

        for async_operation in async_operations:
            async_operation.getResult()

        last_key = operation.getResult()

    return last_key
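# Usage sketch: load_amount_of_data() returns the last loaded key, so consecutive
# rebalance measurements can chain their key ranges without overlap (illustrative
# call sites; NODE_TO_REBALANCE and keys_to_load come from the test configuration):
#
#     last_key = self.load_amount_of_data(self.ignite, NODE_TO_REBALANCE, 0, keys_to_load)
#     ... kill NODE_TO_REBALANCE, measure rebalance ...
#     last_key = self.load_amount_of_data(self.ignite, NODE_TO_REBALANCE, last_key, keys_to_load)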
def test_cycling_restart_grid_dynamic_caches_with_atomic_on_restart(self):
    """
    Scenario "The Glue" (assertions should be enabled)

    1. Start grid, load some data
    2. In the loop:
        2.1 define node restart timeout (0.5 - 2.0 seconds)
        2.2 Load more data
        2.3 Restart each node with defined timeout (DOES NOT LOOK ON TOPOLOGY SNAPSHOT)
        2.4 Try to activate, check AssertionErrors
        2.5 Try to baseline (if 2 operations failed -> PME, kill all nodes, start new test iteration)
        2.6 Try to load data
        2.7 Try to calculate checksum

    :return:
    """
    import random

    PiClient.read_timeout = 240

    # sleep_for_time = float(random.randrange(1, 15, 1)) / 5

    self.set_current_context('in_memory')

    self.util_copy_piclient_model_to_libs()
    self.ignite.set_activation_timeout(240)
    self.ignite.set_snapshot_timeout(240)
    self.ignite.set_node_option('*', 'jvm_options', ['-ea'])
    self.su.clear_snapshots_list()
    self.start_grid(skip_activation=True)

    with PiClient(self.ignite, self.get_client_config(), jvm_options=['-ea']) as piclient:
        # ignite = piclient.get_ignite()

        self.start_dynamic_caches_with_node_filter()

        last_loaded_key = 1000
        PiClientIgniteUtils.load_data_with_streamer(self.ignite,
                                                    self.get_client_config(),
                                                    end_key=last_loaded_key,
                                                    jvm_options=['-ea'])

        nodes_before = self.ignite.get_alive_default_nodes()

        iterations = 50
        last_loaded_key += 1
        for i in range(0, iterations):
            log_print('Current iteration %s from %s' % (i, iterations), color='debug')

            # sleep_for_time = float(self.the_glue_timeout) if self.the_glue_timeout else random.choice([0.7, 0.9, 2.0])
            sleep_for_time = float(self.the_glue_timeout) if self.the_glue_timeout \
                else round(random.uniform(0.5, 2.5), 1)
            log_print("In this run we are going to sleep for {} seconds after each node restart"
                      .format(sleep_for_time), color='green')

            log_print('Trying to load data into created/existing caches', color='yellow')
            self.start_dynamic_caches_with_node_filter()

            PiClientIgniteUtils.load_data_with_streamer(self.ignite,
                                                        self.get_client_config(),
                                                        start_key=last_loaded_key,
                                                        end_key=last_loaded_key + 500,
                                                        jvm_options=['-ea'])
            last_loaded_key += 500

            log_print("Round restart")
            for node_id in self.ignite.get_alive_default_nodes():
                self.ignite.kill_node(node_id)
                self.ignite.start_node(node_id, skip_topology_check=True)
                sleep(sleep_for_time)

            try:
                log_print("Incrementing atomics using distributed compute")
                create_async_operation(create_distributed_atomic_long).evaluate()
            except Exception:
                log_print("Failed to increment atomics")
                # just print exception (https://issues.apache.org/jira/browse/IGNITE-11535)
                traceback.print_exc()

            log_print("Wait for topology messages")
            for node_id in self.ignite.get_all_default_nodes():
                self.ignite.update_started_node_status(node_id)

            sleep(15)

            log_print("Validating cluster")
            last_loaded_key = self.verify_cluster(nodes_before, last_loaded_key)
def test_ignite_8657(self):
    """
    This test is based on IGNITE-8657:

    1. start grid with EXCHANGE_HISTORY_SIZE smaller than N
    2. activate
    3. start simultaneously M > N clients
    4. all client nodes should start and be able to perform cache put/get operations and transactions

    NB: this test hangs with 2.5.1-p6, due to piclient waiting on Ignition.start() forever
    """
    self.start_grid()
    self.load_random_data_with_streamer(0, 1000, nodes_num=2)
    self.cu.set_current_topology_as_baseline()

    nodes_before = set(self.ignite.get_all_common_nodes())
    with PiClient(self.ignite, self.get_client_config(), nodes_num=10,
                  jvm_options=self.jvm_options, read_timeout=300) as piclient:
        nodes_after = set(self.ignite.get_all_common_nodes())
        nodes_started = list(nodes_after - nodes_before)
        node_ids = deque(nodes_started)

        node_id = node_ids[0]
        node_ids.rotate()

        for i in range(1, 5):
            gateway = piclient.get_gateway(node_id)
            ignite = piclient.get_ignite(node_id)

            tx = ignite.transactions().txStart()
            util_sleep_for_a_while(3)
            tx.commit()

        for concurrency in ['OPTIMISTIC', 'PESSIMISTIC']:
            for isolation in ['READ_COMMITTED', 'REPEATABLE_READ', 'SERIALIZABLE']:
                print_blue('Run transaction %s %s' % (concurrency, isolation))

                node_id = node_ids[0]
                node_ids.rotate()

                gateway = piclient.get_gateway(node_id)
                ignite = piclient.get_ignite(node_id)

                concurrency_isolation_map = self._get_tx_type_map(gateway)
                cache_names = ignite.cacheNames().toArray()

                tx = ignite.transactions().txStart(concurrency_isolation_map.get(concurrency),
                                                   concurrency_isolation_map.get(isolation))
                for cache_name in cache_names:
                    cache = ignite.getCache(cache_name)
                    val = cache.get(int(random() * 1000))
                    # log_print('got %s' % repr(val))
                    if val:
                        cache.put(int(random() * 1000), val)
                tx.commit()

        node_id = node_ids[0]
        node_ids.rotate()

        ignite = piclient.get_ignite(node_id)

        async_ops = []
        for cache_name in ignite.cacheNames().toArray():
            _async = create_async_operation(create_streamer_operation, cache_name, 1002, 2000)
            _async.evaluate()
            async_ops.append(_async)

        for async_op in async_ops:
            async_op.getResult()