def __run(self, ignite_version, trigger_event, backups, cache_count, entry_count, entry_size, preloaders,
          thread_pool_size, batch_size, batches_prefetch_count, throttle):
    """
    Test performs rebalance test which consists of following steps:
        * Start cluster.
        * Put data to it via IgniteClientApp.
        * Triggering a rebalance event and awaits for rebalance to finish.
    :param ignite_version: Ignite version.
    :param trigger_event: Trigger event.
    :param backups: Backup count.
    :param cache_count: Cache count.
    :param entry_count: Cache entry count.
    :param entry_size: Cache entry size.
    :param preloaders: Preload application nodes count.
    :param thread_pool_size: rebalanceThreadPoolSize config property.
    :param batch_size: rebalanceBatchSize config property.
    :param batches_prefetch_count: rebalanceBatchesPrefetchCount config property.
    :param throttle: rebalanceThrottle config property.
    :return: Rebalance and data preload stats.
    """
    # Bundle every rebalance-related knob into a single parameter object shared
    # by the cluster start and the data preload steps.
    reb_params = RebalanceParams(
        trigger_event=trigger_event, backups=backups, cache_count=cache_count,
        entry_count=entry_count, entry_size=entry_size, preloaders=preloaders,
        thread_pool_size=thread_pool_size, batch_size=batch_size,
        batches_prefetch_count=batches_prefetch_count, throttle=throttle)

    ignites = start_ignite(self.test_context, ignite_version, reb_params)

    # Preload via a client-mode copy of the server config joined to the running cluster.
    preload_time = preload_data(
        self.test_context,
        ignites.config._replace(client_mode=True, discovery_spi=from_ignite_cluster(ignites)),
        rebalance_params=reb_params)

    if trigger_event:
        # Node-left scenario: stop the last server node; the remaining nodes rebalance.
        # NOTE(review): this path does not call await_rebalance_start/await_rebalance —
        # confirm whether rebalance completion is awaited elsewhere (e.g. in get_result).
        ignites.stop_node(ignites.nodes[-1])
        rebalance_nodes = ignites.nodes[:-1]
    else:
        # Node-join scenario: start one extra node that joins the cluster and
        # becomes the rebalance target.
        ignite = IgniteService(
            self.test_context,
            ignites.config._replace(discovery_spi=from_ignite_cluster(ignites)),
            num_nodes=1)
        ignite.start()
        rebalance_nodes = ignite.nodes

        await_rebalance_start(ignite)
        ignite.await_rebalance()

    return get_result(rebalance_nodes, preload_time, cache_count, entry_count, entry_size)
def test_node_join(self, ignite_version, backups, cache_count, entry_count, entry_size, preloaders,
                   thread_pool_size, batch_size, batches_prefetch_count, throttle):
    """
    Tests rebalance on node join: starts a persistent cluster, preloads data,
    joins one new node to the baseline and measures rebalance onto it.
    """
    reb_params = RebalanceParams(
        trigger_event=TriggerEvent.NODE_JOIN, backups=backups, cache_count=cache_count,
        entry_count=entry_count, entry_size=entry_size, preloaders=preloaders,
        thread_pool_size=thread_pool_size, batch_size=batch_size,
        batches_prefetch_count=batches_prefetch_count, throttle=throttle, persistent=True)

    ignites = start_ignite(self.test_context, ignite_version, reb_params)

    # Persistent cluster starts INACTIVE; activate before loading data.
    control_utility = ControlUtility(ignites)
    control_utility.activate()

    preload_time = preload_data(
        self.test_context,
        ignites.config._replace(client_mode=True, discovery_spi=from_ignite_cluster(ignites)),
        rebalance_params=reb_params)

    # The joining node that will receive rebalanced partitions.
    new_node = IgniteService(
        self.test_context,
        ignites.config._replace(discovery_spi=from_ignite_cluster(ignites)),
        num_nodes=1)
    new_node.start()

    # With persistence, rebalance to the new node starts only after it enters the baseline.
    control_utility.add_to_baseline(new_node.nodes)

    await_and_check_rebalance(new_node)

    # Full topology (old servers + joined node) used only for the DB-size report below.
    nodes = ignites.nodes.copy()
    nodes.append(new_node.nodes[0])

    result = get_result(new_node.nodes, preload_time, cache_count, entry_count, entry_size)

    control_utility.deactivate()

    self.logger.debug(
        f'DB size after rebalance: {get_database_size_mb(nodes, ignites.database_dir)}'
    )

    return result
def start_cell_with_prepared_txs(self, version, cell_id, discovery_spi, modules,
                                 col_cnt=0, noncol_cnt=0, multi_cnt=0):
    """
    Starts cell with prepared transactions.

    Starts NODES_PER_CELL - 1 server nodes for the cell, then launches one extra
    application node (the last server of the cell) that prepares — but does not
    commit — the requested numbers of transactions.

    :param version: Ignite version to start.
    :param cell_id: Cell identifier; set as a node attribute via a JVM system property.
    :param discovery_spi: Discovery SPI to use; when None, discovery is derived from
        the cell's own nodes.
    :param modules: Extra Ignite modules to enable.
    :param col_cnt: Count of colocated prepared transactions.
    :param noncol_cnt: Count of non-colocated prepared transactions.
    :param multi_cnt: Count of multi-key prepared transactions.
    :return: Tuple of (cell server nodes, prepared-tx streamer application service).
    """
    nodes = self.start_cell(
        version, ['-D' + CellularAffinity.ATTRIBUTE + '=' + cell_id], discovery_spi, modules,
        CellularAffinity.NODES_PER_CELL - 1)

    # last server node at the cell.
    prepared_tx_streamer = IgniteApplicationService(
        self.test_context,
        IgniteConfiguration(version=IgniteVersion(version),
                            properties=self.properties(),
                            failure_detection_timeout=self.FAILURE_DETECTION_TIMEOUT,
                            discovery_spi=from_ignite_cluster(nodes) if discovery_spi is None
                            else discovery_spi),
        java_class_name="org.apache.ignite.internal.ducktest.tests.cellular_affinity_test."
                        "CellularPreparedTxStreamer",
        params={"cacheName": CellularAffinity.CACHE_NAME,
                "attr": CellularAffinity.ATTRIBUTE,
                "cell": cell_id,
                "colocatedTxCnt": col_cnt,
                "multiTxCnt": multi_cnt,
                "noncolocatedTxCnt": noncol_cnt},
        jvm_opts=['-D' + CellularAffinity.ATTRIBUTE + '=' + cell_id],
        modules=modules,
        startup_timeout_sec=180)

    prepared_tx_streamer.start_async()  # starts last server node and creates prepared txs on it.

    return nodes, prepared_tx_streamer
def __start_ignite_nodes(self, version, num_nodes, timeout_sec=60, join_cluster=None):
    """
    Start `num_nodes` Ignite server nodes in the INACTIVE cluster state with one
    persistent and one in-memory data region.

    :param version: Ignite version string.
    :param num_nodes: Number of server nodes to start.
    :param timeout_sec: Node startup timeout in seconds.
    :param join_cluster: Optional running cluster whose discovery the new nodes join;
        when None, the nodes form their own cluster.
    :return: Started IgniteService.
    """
    config = IgniteConfiguration(
        cluster_state="INACTIVE",
        version=IgniteVersion(version),
        data_storage=DataStorageConfiguration(
            default=DataRegionConfiguration(name='persistent', persistence_enabled=True),
            regions=[
                DataRegionConfiguration(name='in-memory', persistence_enabled=False,
                                        max_size=100 * 1024 * 1024)
            ]))

    if join_cluster:
        # Bug fix: `_replace` returns a NEW configuration instance; the original code
        # discarded it, so the discovery SPI of `join_cluster` was never applied and
        # the new nodes always formed an independent cluster.
        config = config._replace(discovery_spi=from_ignite_cluster(join_cluster))

    servers = IgniteService(self.test_context, config=config, num_nodes=num_nodes,
                            startup_timeout_sec=timeout_sec)
    servers.start()

    return servers
def test_simple_services_start_stop(self, ignite_version):
    """
    Tests plain services start and stop (termination vs self-termination).
    """
    # One server node the client applications connect to.
    ignites = IgniteService(
        self.test_context, IgniteConfiguration(version=IgniteVersion(ignite_version)), num_nodes=1)
    ignites.start()

    client = IgniteService(
        self.test_context, IgniteClientConfiguration(version=IgniteVersion(ignite_version)), num_nodes=1)
    client.start()

    # Application that keeps running until stopped externally.
    node1 = IgniteApplicationService(
        self.test_context,
        IgniteClientConfiguration(version=IgniteVersion(ignite_version),
                                  discovery_spi=from_ignite_cluster(ignites)),
        java_class_name="org.apache.ignite.internal.ducktest.tests.self_test.TestKillableApplication",
        startup_timeout_sec=180)

    # Application that terminates by itself (run() waits for completion).
    node2 = IgniteApplicationService(
        self.test_context,
        IgniteClientConfiguration(version=IgniteVersion(ignite_version),
                                  discovery_spi=from_ignite_cluster(ignites)),
        java_class_name="org.apache.ignite.internal.ducktest.tests.self_test.TestSelfKillableApplication",
        startup_timeout_sec=180)

    node1.start()

    node2.run()

    node1.stop()

    client.stop()

    ignites.stop()
def test_node_left(self, ignite_version, backups, cache_count, entry_count, entry_size, preloaders,
                   thread_pool_size, batch_size, batches_prefetch_count, throttle):
    """
    Tests rebalance on node left: starts a persistent cluster, preloads data,
    stops one node, removes it from the baseline and measures rebalance across
    the remaining nodes.
    """
    reb_params = RebalanceParams(
        trigger_event=TriggerEvent.NODE_LEFT, backups=backups, cache_count=cache_count,
        entry_count=entry_count, entry_size=entry_size, preloaders=preloaders,
        thread_pool_size=thread_pool_size, batch_size=batch_size,
        batches_prefetch_count=batches_prefetch_count, throttle=throttle, persistent=True)

    ignites = start_ignite(self.test_context, ignite_version, reb_params)

    # Persistent cluster starts INACTIVE; activate before loading data.
    control_utility = ControlUtility(ignites)
    control_utility.activate()

    preload_time = preload_data(
        self.test_context,
        ignites.config._replace(client_mode=True, discovery_spi=from_ignite_cluster(ignites)),
        rebalance_params=reb_params)

    self.logger.debug(
        f'DB size before rebalance: {get_database_size_mb(ignites.nodes, ignites.database_dir)}'
    )

    # Stop the last node and wait until the cluster registers its departure.
    node = ignites.nodes[-1]

    ignites.stop_node(node)
    assert ignites.wait_node(node)

    # Removing the node from the baseline triggers rebalance among survivors.
    control_utility.remove_from_baseline([node])

    await_and_check_rebalance(ignites)

    result = get_result(ignites.nodes[:-1], preload_time, cache_count, entry_count, entry_size)

    control_utility.deactivate()

    self.logger.debug(
        f'DB size after rebalance: {get_database_size_mb(ignites.nodes, ignites.database_dir)}'
    )

    return result
def test_distribution(self, ignite_version):
    """
    Tests Cellular Affinity scenario (partition distribution).

    Spins up three cells (the third with an extra random attribute), activates the
    cluster and runs a checker application that validates partition placement per cell.
    """
    first_cell = self.start_cell(ignite_version, ['-D' + CellularAffinity.ATTRIBUTE + '=1'])

    # Subsequent cells discover the cluster through the first cell's nodes.
    discovery_spi = from_ignite_cluster(first_cell)

    second_cell = self.start_cell(ignite_version, ['-D' + CellularAffinity.ATTRIBUTE + '=2'], discovery_spi)
    third_cell = self.start_cell(
        ignite_version, ['-D' + CellularAffinity.ATTRIBUTE + '=XXX', '-DRANDOM=42'], discovery_spi)

    for started_cell in (first_cell, second_cell, third_cell):
        started_cell.await_started()

    ControlUtility(first_cell).activate()

    distribution_checker = IgniteApplicationService(
        self.test_context,
        IgniteClientConfiguration(
            version=IgniteVersion(ignite_version),
            discovery_spi=from_ignite_cluster(first_cell)),
        java_class_name="org.apache.ignite.internal.ducktest.tests.cellular_affinity_test.DistributionChecker",
        params={"cacheName": CellularAffinity.CACHE_NAME,
                "attr": CellularAffinity.ATTRIBUTE,
                "nodesPerCell": self.NODES_PER_CELL})

    distribution_checker.run()
def __start_tx_app(self, version, servers, *, client_mode=True, **kwargs):
    """
    Start the long-running-transactions generator application against `servers`.

    :param version: Ignite version string.
    :param servers: Running cluster used for discovery.
    :param client_mode: Whether the application node joins as a client.
    :param kwargs: Passed through as application parameters.
    :return: Started IgniteApplicationService.
    """
    tx_app_config = IgniteConfiguration(
        version=IgniteVersion(version),
        client_mode=client_mode,
        discovery_spi=from_ignite_cluster(servers))

    tx_app = IgniteApplicationService(
        self.test_context,
        config=tx_app_config,
        java_class_name='org.apache.ignite.internal.ducktest.tests.control_utility'
                        '.LongRunningTransactionsGenerator',
        params=kwargs)

    tx_app.start()

    return tx_app
def snapshot_test(self, ignite_version):
    """
    Basic snapshot test.
    """
    version = IgniteVersion(ignite_version)

    # NOTE(review): other tests in this file build the default region with
    # `persistence_enabled=True`; confirm `persistent=True` is a valid
    # DataRegionConfiguration field and not a silently-ignored kwarg.
    ignite_config = IgniteConfiguration(
        version=version,
        data_storage=DataStorageConfiguration(
            default=DataRegionConfiguration(persistent=True)),
        metric_exporters={'org.apache.ignite.spi.metric.jmx.JmxMetricExporterSpi'})

    # Reserve one container for the loader application.
    nodes = IgniteService(self.test_context, ignite_config, num_nodes=self.available_cluster_size - 1)
    nodes.start()

    control_utility = ControlUtility(nodes)
    control_utility.activate()

    loader_config = IgniteConfiguration(client_mode=True, version=version,
                                        discovery_spi=from_ignite_cluster(nodes))

    # NOTE(review): the loader is constructed but never started/run in this block —
    # confirm the snapshot steps that consume it live elsewhere.
    loader = IgniteApplicationService(
        self.test_context,
        loader_config,
        java_class_name="org.apache.ignite.internal.ducktest.tests.snapshot_test.DataLoaderApplication",
        params={"start": 0,
                "cacheName": self.CACHE_NAME,
                "interval": 500_000,
                "valueSizeKb": 1})
def test_ignite_app_start_stop(self, ignite_version):
    """
    Test that IgniteService and IgniteApplicationService correctly start and stop
    """
    server_cfg = IgniteConfiguration(version=IgniteVersion(ignite_version))
    server = IgniteService(self.test_context, server_cfg, num_nodes=1)

    # The client application discovers the cluster through the (not yet started) server service.
    client_cfg = server_cfg._replace(client_mode=True, discovery_spi=from_ignite_cluster(server))
    app = IgniteApplicationService(
        self.test_context,
        client_cfg,
        java_class_name="org.apache.ignite.internal.ducktest.tests.smoke_test.SimpleApplication")

    # Server first, then the application; tear down in reverse order.
    server.start()
    app.start()

    app.stop()
    server.stop()
def node_join_historical_test(self, ignite_version, backups, cache_count, entry_count, entry_size, preloaders,
                              thread_pool_size, batch_size, batches_prefetch_count, throttle):
    """
    Test historical rebalance.

    Forces WAL-based (historical) rebalance via JVM options, preloads a small data
    set, stops a node, loads the main data set while it is away, then restarts the
    node and measures the historical rebalance onto it.
    """
    # Small warm-up data set written while all nodes are up.
    preload_entries = 10_000

    reb_params = RebalanceParams(
        trigger_event=TriggerEvent.NODE_JOIN, backups=backups, cache_count=cache_count,
        entry_count=entry_count, entry_size=entry_size, preloaders=preloaders,
        thread_pool_size=thread_pool_size, batch_size=batch_size,
        batches_prefetch_count=batches_prefetch_count, throttle=throttle, persistent=True,
        # Force historical (WAL) rebalance instead of full rebalance.
        jvm_opts=['-DIGNITE_PDS_WAL_REBALANCE_THRESHOLD=0', '-DIGNITE_PREFER_WAL_REBALANCE=true'])

    ignites = start_ignite(self.test_context, ignite_version, reb_params)

    control_utility = ControlUtility(ignites)
    control_utility.activate()

    preloader_config = ignites.config._replace(client_mode=True,
                                               discovery_spi=from_ignite_cluster(ignites))

    preloader = IgniteApplicationService(
        self.test_context,
        preloader_config,
        java_class_name="org.apache.ignite.internal.ducktest.tests.rebalance.DataGenerationApplication",
        params={"backups": 1,
                "cacheCount": 1,
                "entrySize": 1,
                "from": 0,
                "to": preload_entries})

    preloader.run()
    preloader.free()

    # NOTE(review): deactivate/activate cycles appear intended to force checkpoints
    # between load phases — confirm.
    control_utility.deactivate()
    control_utility.activate()

    # Take one node down; data loaded from now on must reach it via historical rebalance.
    node = ignites.nodes[-1]

    ignites.stop_node(node)
    assert ignites.wait_node(node)

    preload_time = preload_data(
        self.test_context,
        ignites.config._replace(client_mode=True, discovery_spi=from_ignite_cluster(ignites)),
        rebalance_params=reb_params)

    control_utility.deactivate()
    control_utility.activate()

    self.logger.debug(
        f'DB size before rebalance: {get_database_size_mb(ignites.nodes, ignites.database_dir)}'
    )

    # Restart the stopped node; it catches up from the WAL history.
    ignites.start_node(node)
    ignites.await_started()

    rebalance_nodes = [node]

    await_and_check_rebalance(ignites, rebalance_nodes, False)

    result = get_result(rebalance_nodes, preload_time, cache_count, entry_count, entry_size)

    control_utility.deactivate()

    self.logger.debug(
        f'DB size after rebalance: {get_database_size_mb(ignites.nodes, ignites.database_dir)}'
    )

    return result
def test_latency(self, ignite_version, stop_type, discovery_type, prep_type):
    """
    Tests Cellular switch tx latency.

    Starts several cells (optionally over ZooKeeper discovery), prepares transactions
    on each, kills the loader of one cell in the requested way, and records the
    worst streaming latency observed by every cell during the PME-free switch.

    :param ignite_version: Ignite version.
    :param stop_type: How the failed loader is stopped (SIGTERM/SIGKILL/network drop).
    :param discovery_type: TCP or ZooKeeper discovery.
    :param prep_type: Kind of prepared transactions to create.
    :return: Dict of per-cell latency results.
    """
    cluster_size = self.available_cluster_size

    # Each cell needs NODES_PER_CELL server containers plus one streamer container.
    cells_amount = math.floor((cluster_size - self.ZOOKEPER_CLUSTER_SIZE) / (self.NODES_PER_CELL + 1))

    assert cells_amount >= 2

    self.test_context.logger.info(
        "Cells amount calculated as %d at cluster with %d nodes in total" %
        (cells_amount, cluster_size))

    data = {}

    discovery_spi = None

    modules = []

    d_type = DiscoreryType.construct_from(discovery_type)

    if d_type is DiscoreryType.ZooKeeper:
        zk_settings = ZookeeperSettings(min_session_timeout=self.ZOOKEPER_SESSION_TIMEOUT)
        zk_quorum = ZookeeperService(self.test_context, self.ZOOKEPER_CLUSTER_SIZE, settings=zk_settings)
        zk_quorum.start()
        modules.append('zookeeper')
        discovery_spi = from_zookeeper_cluster(zk_quorum)

    cell0, prepared_tx_loader1 = \
        self.start_cell_with_prepared_txs(ignite_version, f'C{0}', discovery_spi, modules)

    # With TCP discovery the first cell becomes the discovery seed for the rest.
    if d_type is DiscoreryType.TCP:
        discovery_spi = from_ignite_cluster(cell0)

    assert discovery_spi is not None

    loaders = [prepared_tx_loader1]

    nodes = [cell0]

    failed_cell_id = 1

    for cell_id in range(1, cells_amount):
        # per cell
        coll_cnt = self.PREPARED_TX_CNT if prep_type == TxPrepType.CELL_COLOCATED else 0

        # should not affect switch speed dramatically, cause recovery but not waiting
        # avoiding C0 (as not affected) & C1
        noncoll_cnt = self.PREPARED_TX_CNT * (cells_amount - 2) \
            if cell_id == failed_cell_id and prep_type == TxPrepType.CELL_NONCOLOCATED else 0

        # cause waiting for txs with failed primary (~ 3/(cells-1) of prepared tx amount)
        # avoiding C0 (as not affected)
        multi_cnt = self.PREPARED_TX_CNT * (cells_amount - 1) \
            if cell_id == failed_cell_id and prep_type == TxPrepType.MULTIKEY else 0

        node, prepared_tx_loader = \
            self.start_cell_with_prepared_txs(
                ignite_version, f'C{cell_id}', discovery_spi, modules, coll_cnt, noncoll_cnt, multi_cnt)

        loaders.append(prepared_tx_loader)
        nodes.append(node)

    failed_loader = loaders[failed_cell_id]

    for node in [*nodes, *loaders]:
        node.await_started()

    streamers = []

    for cell in range(0, cells_amount):
        streamers.append(self.start_tx_streamer(ignite_version, "C%d" % cell, discovery_spi, modules))

    for streamer in streamers:  # starts tx streaming with latency record (with some warmup).
        streamer.start_async()

    for streamer in streamers:
        streamer.await_started()

    ControlUtility(cell0).disable_baseline_auto_adjust()  # baseline set.
    ControlUtility(cell0).activate()

    for loader in loaders:
        loader.await_event("ALL_TRANSACTIONS_PREPARED", 180, from_the_beginning=True)

    for streamer in streamers:
        streamer.await_event("WARMUP_FINISHED", 180, from_the_beginning=True)

    # node left with prepared txs.
    # NOTE(review): this relies on StopType supporting the context-manager
    # protocol (`with ... as`) — confirm; a plain assignment may be intended.
    with StopType.construct_from(stop_type) as s_type:
        if s_type is StopType.SIGTERM:
            failed_loader.stop_async()
        elif s_type is StopType.SIGKILL:
            failed_loader.kill()
        elif s_type is StopType.DROP_NETWORK:
            failed_loader.drop_network()

    for streamer in streamers:
        streamer.await_event("Node left topology\\|Node FAILED", 60, from_the_beginning=True)

    for streamer in streamers:  # just an assertion that we have PME-free switch.
        streamer.await_event("exchangeFreeSwitch=true", 60, from_the_beginning=True)

    for streamer in streamers:  # waiting for streaming continuation.
        streamer.await_event("APPLICATION_STREAMED", 60)

    for streamer in streamers:  # stops streaming and records results.
        streamer.stop_async()

    for streamer in streamers:
        streamer.await_stopped()

        cell = streamer.params["cell"]

        data["[%s cell %s]" % ("alive" if cell != failed_loader.params["cell"] else "broken", cell)] = \
            "worst_latency=%s, tx_streamed=%s, measure_duration=%s" % (
                streamer.extract_result("WORST_LATENCY"), streamer.extract_result("STREAMED"),
                streamer.extract_result("MEASURE_DURATION"))

    return data
def test(self, ignite_version, load_type):
    """
    Tests PME-free switch scenario (node stop).

    :param ignite_version: Ignite version.
    :param load_type: Load type (extra caches or long transactions).
    :return: Dict with latency and throughput results of the single-key streamer.
    """
    data = {}

    caches = [CacheConfiguration(name='test-cache', backups=2, atomicity_mode='TRANSACTIONAL')]

    l_type = LoadType.construct_from(load_type)

    # Checking PME (before 2.8) vs PME-free (2.8+) switch duration, but
    # focusing on switch duration (which depends on caches amount) when long_txs is false and
    # on waiting for previously started txs before the switch (which depends on txs duration)
    # when long_txs is true.
    if l_type is LoadType.EXTRA_CACHES:
        for idx in range(1, self.EXTRA_CACHES_AMOUNT):
            caches.append(CacheConfiguration(name="cache-%d" % idx, backups=2,
                                             atomicity_mode='TRANSACTIONAL'))

    config = IgniteConfiguration(version=IgniteVersion(ignite_version), caches=caches,
                                 cluster_state="INACTIVE")

    # Two containers are reserved for the streamer applications.
    num_nodes = len(self.test_context.cluster) - 2

    self.test_context.logger.info("Nodes amount calculated as %d." % num_nodes)

    ignites = IgniteService(self.test_context, config, num_nodes=num_nodes)

    ignites.start()

    # Baseline auto-adjust exists only since 2.8.
    if IgniteVersion(ignite_version) >= V_2_8_0:
        ControlUtility(ignites).disable_baseline_auto_adjust()

    ControlUtility(ignites).activate()

    # Clients discover through all servers except the one that will be stopped.
    client_config = config._replace(
        client_mode=True,
        discovery_spi=from_ignite_cluster(ignites, slice(0, num_nodes - 1)))

    long_tx_streamer = IgniteApplicationService(
        self.test_context,
        client_config,
        java_class_name="org.apache.ignite.internal.ducktest.tests.pme_free_switch_test."
                        "LongTxStreamerApplication",
        params={"cacheName": "test-cache"},
        startup_timeout_sec=180)

    if l_type is LoadType.LONG_TXS:
        long_tx_streamer.start()

    single_key_tx_streamer = IgniteApplicationService(
        self.test_context,
        client_config,
        java_class_name="org.apache.ignite.internal.ducktest.tests.pme_free_switch_test."
                        "SingleKeyTxStreamerApplication",
        params={"cacheName": "test-cache", "warmup": 1000},
        startup_timeout_sec=180)

    single_key_tx_streamer.start()

    # Trigger the switch by stopping the last server node.
    ignites.stop_node(ignites.nodes[num_nodes - 1])

    single_key_tx_streamer.await_event("Node left topology", 60, from_the_beginning=True)

    if l_type is LoadType.LONG_TXS:
        time.sleep(30)  # keeping txs alive for 30 seconds.

        long_tx_streamer.stop_async()

        # The long-tx client leaving the topology is observed by the other streamer.
        single_key_tx_streamer.await_event("Node left topology", 60, from_the_beginning=True)

    single_key_tx_streamer.await_event("APPLICATION_STREAMED", 60)  # waiting for streaming continuation.

    single_key_tx_streamer.stop()

    data["Worst latency (ms)"] = single_key_tx_streamer.extract_result("WORST_LATENCY")
    data["Streamed txs"] = single_key_tx_streamer.extract_result("STREAMED")
    data["Measure duration (ms)"] = single_key_tx_streamer.extract_result("MEASURE_DURATION")
    data["Server nodes"] = num_nodes

    return data
def _perform_node_fail_scenario(self, test_config):
    """
    Run one node-failure detection scenario described by `test_config`:
    start the cluster (optionally over ZooKeeper), optionally apply load,
    kill the chosen node(s) and measure failure-detection time.

    :param test_config: Scenario parameters (version, discovery, load, nodes to kill, ...).
    :return: Dict of scenario results.
    """
    failure_detection_timeout = self._global_int(self.GLOBAL_DETECTION_TIMEOUT,
                                                 self.DEFAULT_DETECTION_TIMEOUT)

    cluster_size = self.available_cluster_size

    # One node is required to detect the failure.
    assert cluster_size >= 1 + test_config.nodes_to_kill + (
        self.ZOOKEEPER_NODES if test_config.with_zk else 0), \
        f"Few required containers: {cluster_size}. Check the params."

    self.logger.info("Starting on " + str(cluster_size) + " maximal containers.")
    self.logger.info(f"{self.GLOBAL_DETECTION_TIMEOUT}: {failure_detection_timeout}")

    results = {}

    modules = ['zookeeper'] if test_config.with_zk else None

    if test_config.with_zk:
        zk_quorum = start_zookeeper(self.test_context, self.ZOOKEEPER_NODES, failure_detection_timeout)

        discovery_spi = from_zookeeper_cluster(zk_quorum)
    else:
        discovery_spi = TcpDiscoverySpi()

        # NOTE(review): so_linger workaround applies only to versions in (2.7, 2.9.0] — confirm.
        if LATEST_2_7 < test_config.version <= V_2_9_0:
            discovery_spi.so_linger = 0

        if test_config.disable_conn_recovery:
            discovery_spi.conn_recovery_timeout = 0

    ignite_config = IgniteConfiguration(
        version=test_config.version,
        discovery_spi=discovery_spi,
        failure_detection_timeout=failure_detection_timeout,
        caches=[CacheConfiguration(
            name='test-cache', backups=1,
            atomicity_mode='TRANSACTIONAL' if test_config.load_type == ClusterLoad.TRANSACTIONAL
            else 'ATOMIC')])

    # Start Ignite nodes in count less than max_nodes_in_use. One node is required for the
    # loader. Some nodes might be needed for ZooKeeper.
    servers, start_servers_sec = start_servers(
        self.test_context, cluster_size - self.ZOOKEEPER_NODES - 1, ignite_config, modules)

    results['Ignite cluster start time (s)'] = start_servers_sec

    failed_nodes = choose_node_to_kill(servers, test_config.nodes_to_kill,
                                       test_config.sequential_failure)

    if test_config.load_type is not ClusterLoad.NONE:
        # With ZooKeeper discovery the client reuses it; otherwise discover via the servers.
        load_config = ignite_config._replace(client_mode=True) if test_config.with_zk else \
            ignite_config._replace(client_mode=True, discovery_spi=from_ignite_cluster(servers))

        # Transactional load targets the nodes that will be killed.
        tran_nodes = [servers.node_id(n) for n in failed_nodes] \
            if test_config.load_type == ClusterLoad.TRANSACTIONAL else None

        params = {"cacheName": "test-cache",
                  "range": self.DATA_AMOUNT,
                  "warmUpRange": self.WARMUP_DATA_AMOUNT,
                  "targetNodes": tran_nodes,
                  "transactional": bool(tran_nodes)}

        start_load_app(self.test_context, load_config, params, modules)

    # Detection timeout is 4 * failure_detection_timeout in seconds.
    detection_timeout_sec = 4 * ignite_config.failure_detection_timeout // 1000

    results.update(self._simulate_and_detect_failure(servers, failed_nodes,
                                                     detection_timeout_sec, test_config.net_part))

    return results