def test_info_event(self):
    event = InfoEvent(message="m1")
    event.event_id = "aff29bce-d75c-4f86-9890-c6d9c1c25d3e"
    self.assertEqual(
        str(event),
        "(InfoEvent Severity.NORMAL) period_type=not-set "
        "event_id=aff29bce-d75c-4f86-9890-c6d9c1c25d3e: message=m1")
    self.assertEqual(event, pickle.loads(pickle.dumps(event)))
def _run_stress(self, loader, loader_idx, cpu_idx):
    KclStressEvent.start(node=loader, stress_cmd=self.stress_cmd).publish()
    try:
        options_str = self.stress_cmd.replace('table_compare', '').strip()
        options = dict(item.strip().split("=") for item in options_str.split(";"))
        interval = int(options.get('interval', 20))
        src_table = options.get('src_table')
        dst_table = options.get('dst_table')

        while not self._stop_event.is_set():
            node: BaseNode = self.db_node_to_query(loader)
            node.run_nodetool('flush')
            src_size = node.get_cfstats(src_table)['Number of partitions (estimate)']
            dst_size = node.get_cfstats(dst_table)['Number of partitions (estimate)']
            status = f"== CompareTablesSizesThread: dst table/src table number of partitions: {dst_size}/{src_size} =="
            LOGGER.info(status)
            InfoEvent(status)
            if src_size == 0:
                continue
            if dst_size >= src_size:
                InfoEvent("== CompareTablesSizesThread: Done ==")
                break
            time.sleep(interval)
        return None
    except Exception as exc:  # pylint: disable=broad-except
        errors_str = format_stress_cmd_error(exc)
        KclStressEvent.failure(node=loader, stress_cmd=self.stress_cmd, errors=[errors_str, ]).publish()
        raise
    finally:
        KclStressEvent.finish(node=loader).publish()
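# A minimal, standalone sketch of the option string the parser above expects, derived
# from its split-on-';' / 'key=value' logic. The interval and table names here are
# hypothetical examples, not values taken from the original code.
_example_cmd = "table_compare interval=30; src_table=keyspace1.standard1; dst_table=keyspace1.standard1_copy"
_example_options = dict(item.strip().split("=")
                        for item in _example_cmd.replace('table_compare', '').strip().split(";"))
# _example_options == {'interval': '30',
#                      'src_table': 'keyspace1.standard1',
#                      'dst_table': 'keyspace1.standard1_copy'}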
def _create_repair_and_alter_it_with_repair_control(self):
    keyspace_to_be_repaired = "keyspace2"
    if not self.is_cred_file_configured:
        self.update_config_file()
    manager_tool = mgmt.get_scylla_manager_tool(manager_node=self.monitors.nodes[0])
    mgr_cluster = manager_tool.add_cluster(name=self.CLUSTER_NAME + '_repair_control',
                                           db_cluster=self.db_cluster,
                                           auth_token=self.monitors.mgmt_auth_token)
    # writing 292968720 rows, equal to the amount of data written in the prepare (around 100gb per node),
    # to create a large data fault and therefore a longer running repair
    self.create_missing_rows_in_cluster(create_missing_rows_in_multiple_nodes=True,
                                        keyspace_to_be_repaired=keyspace_to_be_repaired,
                                        total_num_of_rows=292968720)
    arg_list = [{"intensity": .0001},
                {"intensity": 0},
                {"parallel": 1},
                {"intensity": 2, "parallel": 1}]

    InfoEvent(message="Repair started")
    repair_task = mgr_cluster.create_repair_task(keyspace="keyspace2")
    next_percentage_block = 20
    repair_task.wait_for_percentage(next_percentage_block)
    for args in arg_list:
        next_percentage_block += 20
        InfoEvent(message=f"Changing repair args to: {args}")
        mgr_cluster.control_repair(**args)
        repair_task.wait_for_percentage(next_percentage_block)
    repair_task.wait_and_get_final_status(step=30)
    InfoEvent(message="Repair ended")
def _run_stress(self, loader, loader_idx, cpu_idx):
    KclStressEvent.start(node=loader, stress_cmd=self.stress_cmd).publish()
    try:
        options_str = self.stress_cmd.replace('table_compare', '').strip()
        options = dict(item.strip().split("=") for item in options_str.split(";"))
        interval = int(options.get('interval', 20))
        timeout = int(options.get('timeout', 28800))
        src_table = options.get('src_table')
        dst_table = options.get('dst_table')
        start_time = time.time()

        while not self._stop_event.is_set():
            node: BaseNode = self.db_node_to_query(loader)
            node.running_nemesis = "Compare tables size by cf-stats"
            node.run_nodetool('flush')
            dst_size = node.get_cfstats(dst_table)['Number of partitions (estimate)']
            src_size = node.get_cfstats(src_table)['Number of partitions (estimate)']
            node.running_nemesis = None

            elapsed_time = time.time() - start_time
            status = f"== CompareTablesSizesThread: dst table/src table number of partitions: {dst_size}/{src_size} =="
            LOGGER.info(status)
            status_msg = f'[{elapsed_time}/{timeout}] {status}'
            InfoEvent(status_msg).publish()
            if src_size == 0:
                continue
            if elapsed_time > timeout:
                InfoEvent(f"== CompareTablesSizesThread: exiting on timeout of {timeout}").publish()
                break
            time.sleep(interval)
        return None
    except Exception as exc:  # pylint: disable=broad-except
        errors_str = format_stress_cmd_error(exc)
        KclStressEvent.failure(node=loader, stress_cmd=self.stress_cmd, errors=[errors_str, ]).publish()
        raise
    finally:
        KclStressEvent.finish(node=loader).publish()
def test_repair_control(self):
    InfoEvent(message="Starting C-S write load")
    self.run_prepare_write_cmd()
    InfoEvent(message="Flushing")
    for node in self.db_cluster.nodes:
        node.run_nodetool("flush")
    InfoEvent(message="Waiting for compactions to end")
    self.wait_no_compactions_running(n=90, sleep_time=30)
    InfoEvent(message="Starting C-S read load")
    stress_read_thread = self.generate_background_read_load()
    time.sleep(600)  # So we will see the base load of the cluster
    InfoEvent(message="Sleep ended - Starting tests")
    self._create_repair_and_alter_it_with_repair_control()
    load_results = stress_read_thread.get_results()
    self.log.info(f'load={load_results}')
def test_events_analyzer(self):
    start_events_analyzer(_registry=self.events_processes_registry)
    events_analyzer = get_events_process(name=EVENTS_ANALYZER_ID, _registry=self.events_processes_registry)
    time.sleep(EVENTS_SUBSCRIBERS_START_DELAY)

    try:
        self.assertIsInstance(events_analyzer, EventsAnalyzer)
        self.assertTrue(events_analyzer.is_alive())
        self.assertEqual(events_analyzer._registry, self.events_main_device._registry)
        self.assertEqual(events_analyzer._registry, self.events_processes_registry)

        event1 = InfoEvent(message="m1")
        event2 = SpotTerminationEvent(node="n1", message="m2")

        with unittest.mock.patch("sdcm.sct_events.events_analyzer.EventsAnalyzer.kill_test") as mock:
            with self.wait_for_n_events(events_analyzer, count=2, timeout=1):
                self.events_main_device.publish_event(event1)
                self.events_main_device.publish_event(event2)
            self.assertEqual(self.events_main_device.events_counter, events_analyzer.events_counter)
            mock.assert_called_once()
    finally:
        events_analyzer.stop(timeout=1)
def _repair_intensity_feature(self, fault_multiple_nodes):
    InfoEvent(message="Starting C-S write load").publish()
    self.run_prepare_write_cmd()
    InfoEvent(message="Flushing").publish()
    for node in self.db_cluster.nodes:
        node.run_nodetool("flush")
    InfoEvent(message="Waiting for compactions to end").publish()
    self.wait_no_compactions_running(n=30, sleep_time=30)
    InfoEvent(message="Starting C-S read load").publish()
    stress_read_thread = self.generate_background_read_load()
    time.sleep(600)  # So we will see the base load of the cluster
    InfoEvent(message="Sleep ended - Starting tests").publish()
    with self.subTest('test_intensity_and_parallel'):
        self.test_intensity_and_parallel(fault_multiple_nodes=fault_multiple_nodes)
    load_results = stress_read_thread.get_results()
    self.log.info(f'load={load_results}')
def measure_nodes_space_amplification_after_write(self, dict_nodes_initial_capacity, written_data_size_gb,
                                                  start_time):
    self.log.info(f"Space amplification results after a write of: {written_data_size_gb} are:")
    dict_nodes_space_amplification = self._get_nodes_space_ampl_over_time_gb(
        dict_nodes_initial_capacity=dict_nodes_initial_capacity,
        written_data_size_gb=written_data_size_gb,
        start_time=start_time)
    InfoEvent(message=f"Space amplification results after a write of: {written_data_size_gb} are: "
                      f"{dict_nodes_space_amplification}").publish()
def test_intensity_and_parallel(self, fault_multiple_nodes):
    keyspace_to_be_repaired = "keyspace2"
    InfoEvent(message='starting test_intensity_and_parallel').publish()
    if not self.is_cred_file_configured:
        self.update_config_file()
    manager_tool = mgmt.get_scylla_manager_tool(manager_node=self.monitors.nodes[0])
    mgr_cluster = manager_tool.add_cluster(
        name=self.CLUSTER_NAME + '_intensity_and_parallel',
        db_cluster=self.db_cluster,
        auth_token=self.monitors.mgmt_auth_token,
    )

    InfoEvent(message="Starting faulty load (to be repaired)").publish()
    self.create_missing_rows_in_cluster(create_missing_rows_in_multiple_nodes=fault_multiple_nodes,
                                        keyspace_to_be_repaired=keyspace_to_be_repaired,
                                        total_num_of_rows=29296872)

    InfoEvent(message="Starting a repair with no intensity").publish()
    base_repair_task = mgr_cluster.create_repair_task(keyspace="keyspace*")
    base_repair_task.wait_and_get_final_status(step=30)
    assert base_repair_task.status == TaskStatus.DONE, "The base repair task did not end in the expected time"
    InfoEvent(message=f"The base repair, with no intensity argument, took {base_repair_task.duration}").publish()

    with self.db_cluster.cql_connection_patient(self.db_cluster.nodes[0]) as session:
        session.execute(f"DROP KEYSPACE IF EXISTS {keyspace_to_be_repaired}")

    arg_list = [{"intensity": .5},
                {"intensity": .25},
                {"intensity": .0001},
                {"intensity": 2},
                {"intensity": 4},
                {"parallel": 1},
                {"parallel": 2},
                {"intensity": 2, "parallel": 1},
                {"intensity": 100},
                {"intensity": 0}]

    for arg_dict in arg_list:
        InfoEvent(message="Starting faulty load (to be repaired)").publish()
        self.create_missing_rows_in_cluster(create_missing_rows_in_multiple_nodes=fault_multiple_nodes,
                                            keyspace_to_be_repaired=keyspace_to_be_repaired,
                                            total_num_of_rows=29296872)

        InfoEvent(message=f"Starting a repair with {arg_dict}").publish()
        repair_task = mgr_cluster.create_repair_task(**arg_dict, keyspace="keyspace*")
        repair_task.wait_and_get_final_status(step=30)
        InfoEvent(message=f"repair with {arg_dict} took {repair_task.duration}").publish()

        with self.db_cluster.cql_connection_patient(self.db_cluster.nodes[0]) as session:
            session.execute(f"DROP KEYSPACE IF EXISTS {keyspace_to_be_repaired}")
    InfoEvent(message='finishing test_intensity_and_parallel').publish()
def _run_stress(self, loader, loader_idx, cpu_idx):
    KclStressEvent.start(node=loader, stress_cmd=self.stress_cmd).publish()
    try:
        src_table = self._options.get('src_table')
        dst_table = self._options.get('dst_table')
        end_time = time.time() + self._timeout

        while not self._stop_event.is_set():
            node: BaseNode = self.db_node_to_query(loader)
            node.running_nemesis = "Compare tables size by cf-stats"
            node.run_nodetool('flush')
            dst_size = node.get_cfstats(dst_table)['Number of partitions (estimate)']
            src_size = node.get_cfstats(src_table)['Number of partitions (estimate)']
            node.running_nemesis = None

            status = f"== CompareTablesSizesThread: dst table/src table number of partitions: {dst_size}/{src_size} =="
            LOGGER.info(status)
            InfoEvent(f'[{time.time()}/{end_time}] {status}').publish()
            if src_size == 0:
                continue
            if time.time() > end_time:
                InfoEvent(f"== CompareTablesSizesThread: exiting on timeout of {self._timeout}").publish()
                break
            time.sleep(self._interval)
        return None
    except Exception as exc:  # pylint: disable=broad-except
        KclStressEvent.failure(node=loader,
                               stress_cmd=self.stress_cmd,
                               errors=[format_stress_cmd_error(exc), ]).publish()
        raise
    finally:
        KclStressEvent.finish(node=loader).publish()
def test_ics_space_amplification_goal(self):  # pylint: disable=too-many-locals
    """
    (1) writing new data. wait for compactions to finish.
    (2) over-writing existing data.
    (3) measure space amplification after over-writing with SAG=None,1.5,1.2,1.5,None
    """
    self._set_enforce_min_threshold_true()

    # (1) writing new data.
    prepare_write_cmd = self.params.get('prepare_write_cmd')
    InfoEvent(message=f"Starting C-S prepare load: {prepare_write_cmd}").publish()
    self.run_prepare_write_cmd()
    InfoEvent(message="Wait for compactions to finish after write is done.").publish()
    self.wait_no_compactions_running()

    stress_cmd = self.params.get('stress_cmd')
    sag_testing_values = [None, '1.5', '1.2', '1.5', None]
    column_size = 205
    num_of_columns = 5
    # the below number is 1TB (yaml stress cmd total write) in bytes / 205 (column_size) / 5 (num_of_columns)
    overwrite_ops_num = 1072694271
    total_data_to_overwrite_gb = round(overwrite_ops_num * column_size * num_of_columns / (1024 ** 3), 2)
    min_threshold = '4'

    # (2) over-writing existing data.
    for sag in sag_testing_values:
        dict_nodes_capacity_before_overwrite_data = self._get_nodes_used_capacity()
        InfoEvent(message=f"Nodes used capacity before start overwriting data:"
                          f" {dict_nodes_capacity_before_overwrite_data}").publish()
        additional_compaction_params = {'min_threshold': min_threshold}
        if sag:
            additional_compaction_params.update({'space_amplification_goal': sag})
        # (3) Altering compaction with SAG=None,1.5,1.2,1.5,None
        self._alter_table_compaction(additional_compaction_params=additional_compaction_params)

        stress_queue = list()
        InfoEvent(message=f"Starting C-S over-write load: {stress_cmd}").publish()
        start_time = time.time()
        params = {'keyspace_num': 1, 'stress_cmd': stress_cmd, 'round_robin': self.params.get('round_robin')}
        self._run_all_stress_cmds(stress_queue, params)
        for stress in stress_queue:
            self.verify_stress_thread(cs_thread_pool=stress)
        InfoEvent(message="Wait for compactions to finish after over-write is done.").publish()
        self.wait_no_compactions_running()

        # (3) measure space amplification for the re-written data
        self.measure_nodes_space_amplification_after_write(
            dict_nodes_initial_capacity=dict_nodes_capacity_before_overwrite_data,
            written_data_size_gb=total_data_to_overwrite_gb,
            start_time=start_time)

    InfoEvent(message="Space-amplification-goal testing cycles are done.").publish()
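# A worked check of the overwrite_ops_num constant used above, following the inline
# comment's own formula (1TB of total write / 205-byte column size / 5 columns per row).
# Illustrative only, not part of the test:
assert 1024 ** 4 // (205 * 5) == 1072694271
# and converting the ops count back to data size lands on ~1024 GB, i.e. the 1TB total write:
assert round(1072694271 * 205 * 5 / (1024 ** 3), 2) == 1024.0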
def _alter_table_compaction(self, compaction_strategy=CompactionStrategy.INCREMENTAL,
                            table_name='standard1', keyspace_name='keyspace1',
                            additional_compaction_params: dict = None):
    """
    Alters table compaction like:
    ALTER TABLE mykeyspace.mytable WITH compaction = {'class' : 'IncrementalCompactionStrategy'}
    """
    base_query = f"ALTER TABLE {keyspace_name}.{table_name} WITH compaction = "
    dict_requested_compaction = {'class': compaction_strategy.value}
    if additional_compaction_params:
        dict_requested_compaction.update(additional_compaction_params)

    full_alter_query = base_query + str(dict_requested_compaction)
    LOGGER.debug(f"Alter table query is: {full_alter_query}")
    node1: BaseNode = self.db_cluster.nodes[0]
    node1.run_cqlsh(cmd=full_alter_query)
    InfoEvent(message=f"Altered table by: {full_alter_query}").publish()
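# A minimal sketch of the query string _alter_table_compaction builds with the defaults
# above, assuming CompactionStrategy.INCREMENTAL.value is 'IncrementalCompactionStrategy'
# and using hypothetical additional parameters (the ones the SAG test above passes in):
_compaction_params = {'class': 'IncrementalCompactionStrategy'}
_compaction_params.update({'min_threshold': '4', 'space_amplification_goal': '1.5'})
_example_alter_query = "ALTER TABLE keyspace1.standard1 WITH compaction = " + str(_compaction_params)
# -> ALTER TABLE keyspace1.standard1 WITH compaction = {'class': 'IncrementalCompactionStrategy',
#    'min_threshold': '4', 'space_amplification_goal': '1.5'}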
def test_latency(self):
    """
    Test steps:
    1. Prepare the cluster with data (reach a steady state of compactions and ~x10 the RAM capacity).
       With round_robin and a list of stress_cmd, the data will load several times faster.
    2. Run a WRITE workload with gauss population.
    """
    self.run_pre_create_keyspace()
    self.run_fstrim_on_all_db_nodes()
    self.preload_data()
    for workload in self.ycsb_workloads:
        self.wait_no_compactions_running()
        self.run_fstrim_on_all_db_nodes()
        InfoEvent(message="Starting YCSB %s (%s)" % (workload.name, workload.detailed_name)).publish()
        self.run_workload(stress_cmd=self._create_stress_cmd(workload), sub_type=workload.sub_type)
def info_event(self) -> Generator[InfoEvent, None, None]:
    yield InfoEvent(message="This is a mock InfoEvent")
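# A hypothetical usage sketch, assuming the generator above is registered as a pytest
# fixture; the test name and assertion are illustrative, not taken from the original:
def test_mock_info_event_message(info_event):
    assert info_event.message == "This is a mock InfoEvent"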
def test_info_event(self):
    event = InfoEvent(message="m1")
    self.assertEqual(str(event), "(InfoEvent Severity.NORMAL): message=m1")
    self.assertEqual(event, pickle.loads(pickle.dumps(event)))