def collect_latency(monitor_node, start, end, load_type, cluster, nodes_list):
    """Collect latency figures from Prometheus for the ``start``..``end`` window.

    Two groups of entries are put into the returned dict:

    * ``c-s P<percentile>`` / ``c-s P<percentile> max`` - average and maximum of the
      cassandra-stress latency gauge samples for ``load_type``;
    * ``Scylla P<percentile>_<load> - node-<idx>`` - per-node average of the coordinator
      latency histogram quantile, divided by 1000.

    :param monitor_node: node whose ``external_address`` hosts Prometheus
    :param start: start of the queried window (also used to compute the rate window)
    :param end: end of the queried window
    :param load_type: load name used in the metric names; ``'mixed'`` is expanded
        to ``read`` and ``write`` for the Scylla-side queries
    :param cluster: object providing ``get_node_by_ip``
    :param nodes_list: fallback nodes, matched by ``ip_address``
    :returns: dict mapping metric label to a float rounded to two decimals
    """

    def two_decimals(value):
        # Round through the '.2f' text representation
        return float(format(value, '.2f'))

    prometheus = PrometheusDBStats(host=monitor_node.external_address)
    window_sec = int(end - start)
    results = {}

    # ---- cassandra-stress side ----
    for percentile in ('99', '95'):  # in the future should include also 'max'
        if percentile == 'max':
            label = f'c-s {percentile}'
            lat_type = percentile
        else:
            label = f'c-s P{percentile}'
            lat_type = f'perc_{percentile}'

        cs_query = f'collectd_cassandra_stress_{load_type}_gauge{{type="lat_{lat_type}"}}'
        samples = []
        for series in prometheus.query(cs_query, start, end):
            raw_values = series['values']
            if not raw_values:
                continue
            points = [float(point[-1]) for point in raw_values if point[-1].lower() != 'nan']
            if not points:
                continue
            # A series whose samples are all equal is ignored
            if not any(point != points[0] for point in points):
                continue
            samples.extend(points)

        if samples:
            results[label] = two_decimals(avg(samples))
            results[f'{label} max'] = two_decimals(max(samples))

    # ---- Scylla side ----
    loads = ['read', 'write'] if load_type == 'mixed' else [load_type]
    for load in loads:
        for percentile in ('99',):  # in the future should include also '95', '5'
            scylla_query = (
                f'histogram_quantile(0.{percentile},sum(rate(scylla_storage_proxy_coordinator_{load}_'
                f'latency_bucket{{}}[{window_sec}s])) by (instance, le))'
            )
            for series in prometheus.query(scylla_query, start, end):
                node_ip = series['metric']['instance'].replace('[', '').replace(']', '')
                node = cluster.get_node_by_ip(node_ip)
                if not node:
                    # Fall back to the explicit node list; the last match wins
                    candidates = [db_node for db_node in nodes_list if db_node.ip_address == node_ip]
                    if candidates:
                        node = candidates[-1]
                if not node:
                    continue

                node_idx = node.name.rsplit('-', 1)[-1]
                label = f"Scylla P{percentile}_{load} - node-{node_idx}"
                if not series['values']:
                    continue
                points = [float(point[-1]) for point in series['values'] if point[-1].lower() != 'nan']
                if points:
                    results[label] = two_decimals(avg(points) / 1000)

    return results
def create_prometheus_snapshot(self, node):
    """Ask Prometheus on ``node`` to create a snapshot and return its directory.

    :param node: monitoring node; its ``external_address`` is used as the Prometheus host
    :returns: path ``<monitoring_data_dir>/snapshots/<snapshot name>``
    :raises PrometheusSnapshotErrorException: when the response is empty or its
        ``status`` does not contain ``"success"``
    """
    response = PrometheusDBStats(host=node.external_address).create_snapshot()
    if not response or "success" not in response['status']:
        raise PrometheusSnapshotErrorException(response)

    snapshot_name = response['data']['name']
    return os.path.join(self.monitoring_data_dir, "snapshots", snapshot_name)
def prepare_schema(self):
    """Set up the Prometheus client and a CQL connection, and return the CQL session.

    Stores the Prometheus client in ``self.prometheus_stats`` (pointing at the first
    monitor node) and the connection in ``self.connection_cql`` (to the first DB node,
    authenticated with the default user).

    :returns: ``session`` attribute of the created CQL connection
    """
    monitor_address = self.monitors.nodes[0].public_ip_address
    self.prometheus_stats = PrometheusDBStats(host=monitor_address)

    first_db_node = self.db_cluster.nodes[0]
    self.connection_cql = self.db_cluster.cql_connection_patient(
        node=first_db_node,
        user=self.DEFAULT_USER,
        password=self.DEFAULT_USER_PASSWORD)
    return self.connection_cql.session
def check_prometheus_metrics(self, start_time, now):
    """Report whether ``scylla_transport_requests_blocked_memory`` was above zero.

    Only the first sample of every returned series is inspected; an info line is
    logged for each series whose first sample is positive.

    :param start_time: start of the queried window
    :param now: end of the queried window
    :returns: True if at least one series had a positive first sample, False otherwise
    """
    prometheus = PrometheusDBStats(self.monitors.nodes[0].public_ip_address)
    blocked_series = prometheus.query('scylla_transport_requests_blocked_memory', start_time, now)

    triggered = False
    for series in blocked_series:
        first_sample_value = int(series['values'][0][1])
        if first_sample_value <= 0:
            continue
        self.log.info('Admission control was triggered')
        triggered = True
    return triggered
def verify_prometheus_is_available():
    """Check that the local Prometheus container answers a query.

    Requests throughput data for the latest 10 minutes from Prometheus on
    ``localhost`` and reports whether the request went through.

    :returns: True if request is successful, False otherwise
    :rtype: {bool}
    """
    from sdcm.db_stats import PrometheusDBStats

    window_end = time.time()
    window_start = window_end - 600

    is_available = False
    try:
        LOGGER.info("Send request to Prometheus")
        client = PrometheusDBStats("localhost", port=PROMETHEUS_DOCKER_PORT)
        client.get_throughput(window_start, window_end)
        LOGGER.info("Prometheus is up")
        is_available = True
    except Exception as details:  # pylint: disable=broad-except
        LOGGER.error("Error requesting prometheus %s", details)
    return is_available
class SlaPerUserTest(LongevityTest):
    """
    Test SLA per user feature using cassandra-stress.

    Every scenario creates two users with different service-level shares, runs
    cassandra-stress loads on their behalf and validates the resulting
    throughput / latency / scheduler run time against the expected ratio.
    """

    STRESS_WRITE_CMD = 'cassandra-stress write cl=QUORUM n={n} -schema \'replication(factor=3)\' -port jmx=6868 ' \
                       '-mode cql3 native user={user} password={password} -rate threads={threads}'
    STRESS_WRITE_DURATION_CMD = 'cassandra-stress write cl=ALL duration={duration} -schema \'replication(factor=3)\' ' \
                                '-port jmx=6868 -mode cql3 native user={user} password={password} -rate threads={threads} ' \
                                'throttle=10000/s -pop seq={pop}'
    STRESS_READ_CMD = 'cassandra-stress read cl=ALL duration={duration} -port jmx=6868 -mode cql3 native user={user} ' \
                      'password={password} -rate threads={threads} -pop {pop}'
    # The read part of the ratio is filled from "read_ratio" (it used to reuse "write_ratio")
    STRESS_MIXED_CMD = r"cassandra-stress mixed ratio\(write={write_ratio},read={read_ratio}\) cl=QUORUM " \
                       "duration={duration} -port jmx=6868 " \
                       "-mode cql3 native user={user} password={password} -rate threads={threads} -pop {pop} "

    DEFAULT_USER = '******'
    DEFAULT_USER_PASSWORD = '******'
    DEFAULT_USER_SLA = 'sla_cassandra'
    DEFAULT_SHARES = 1000

    VALID_DEVIATION_PRC = 10
    MIN_CPU_UTILIZATION = 97

    WORKLOAD_LATENCY = 'latency'
    WORKLOAD_THROUGHPUT = 'throughput'

    CACHE_ONLY_LOAD = 'cache_only'
    DISK_ONLY_LOAD = 'disk_only'
    MIXED_LOAD = 'mixed'

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.prometheus_stats = None
        self.num_of_partitions = 50000000
        self.backgroud_task = None
        self.class_users = {}
        self.connection_cql = None

    def prepare_schema(self):
        """Create the Prometheus client and the CQL connection; return the CQL session."""
        self.prometheus_stats = PrometheusDBStats(
            host=self.monitors.nodes[0].public_ip_address)
        self.connection_cql = self.db_cluster.cql_connection_patient(
            node=self.db_cluster.nodes[0],
            user=self.DEFAULT_USER,
            password=self.DEFAULT_USER_PASSWORD)
        session = self.connection_cql.session
        return session

    def create_test_data_and_wait_no_compaction(self, rows_amount=None):
        """Prefill data before tests and wait until no compactions are running.

        :param rows_amount: when given, overrides ``self.num_of_partitions``
        """
        if rows_amount is not None:
            self.num_of_partitions = rows_amount

        write_cmd = self.STRESS_WRITE_CMD.format(
            n=self.num_of_partitions,
            user=self.DEFAULT_USER,
            password=self.DEFAULT_USER_PASSWORD,
            threads=250)
        self.run_stress_and_verify_threads(
            params={
                'stress_cmd': write_cmd,
                'prefix': 'preload-',
                'stats_aggregate_cmds': False
            })

        self.wait_no_compactions_running(n=120)

    @staticmethod
    def user_to_scheduler_group(test_users, scheduler_shares):
        """Append to every user's list the first scheduler group that has the user's shares.

        NOTE: ``test_users`` is modified in place and also returned.

        :param test_users: dict ``{user name: [shares, ...]}``
        :param scheduler_shares: dict ``{scheduler group: collection of shares}``
        """
        for user, shares in test_users.items():
            for scheduler_group, sg_shares in scheduler_shares.items():
                if shares[0] in sg_shares:
                    test_users[user].append(scheduler_group)
                    break
        return test_users

    @staticmethod
    def _define_read_users(session, shares):
        """Build (without creating in the DB) user/role/service level objects, one set per share."""
        read_users = []
        for share in shares:
            user_name = 'user%d' % share
            read_users.append({
                # The password equals the user name: define_read_cassandra_stress_command()
                # passes the user name as the cassandra-stress password.
                'user': User(session=session, name=user_name, password=user_name),
                'role': Role(session=session, name='role%d' % share),
                'service_level': ServiceLevel(session=session, name='sla%d' % share, service_shares=share)
            })
        return read_users

    def validate_scheduler_runtime(self, start_time, end_time, read_users, expected_ratio):
        """Validate, node by node, that the scheduler run time ratio of two users is as expected.

        Nodes for which Prometheus returns no scheduler data are reported with a
        WARNING event and skipped.
        """
        users_with_shares = {
            user['user'].name: [user['service_level'].service_shares]
            for user in read_users
        }
        for node_ip in self.db_cluster.get_node_private_ips():
            # Temporary solution
            scheduler_shares = self.prometheus_stats.get_scylla_scheduler_shares_per_sla(
                start_time, end_time, node_ip)
            self.log.debug('SCHEDULERS SHARES FROM PROMETHEUS: {}'.format(scheduler_shares))
            if 'service_level_sg_0' in scheduler_shares:
                scheduler_shares.pop('service_level_sg_0')

            # user_to_scheduler_group() mutates its input, so every node gets a fresh copy:
            # otherwise groups found on previous nodes would pile up in the shared lists.
            test_users_to_sg = self.user_to_scheduler_group(
                test_users={name: list(shares) for name, shares in users_with_shares.items()},
                scheduler_shares=scheduler_shares)
            self.log.debug('USER - SERVICE LEVEL - SCHEDULER: {}'.format(test_users_to_sg))
            # End Temporary solution

            shards_time_per_sla = self.prometheus_stats.get_scylla_scheduler_runtime_ms(
                start_time, end_time, node_ip)
            if not (shards_time_per_sla and scheduler_shares):
                # Set this message as WARNING because I found that prometheus return empty answer despite the data
                # exists (I run this request manually and got data). Prometheus request doesn't fail, it succeeded but
                # empty, like:
                # {'status': 'success', 'data': {'resultType': 'matrix', 'result': []}}
                WorkloadPrioritisationEvent.EmptyPrometheusData(
                    message=f'Failed to get scheduler_runtime data from '
                            f'Prometheus for node {node_ip}',
                    severity=Severity.WARNING).publish()
                continue

            node_runtime = shards_time_per_sla[node_ip]
            runtime_per_user = {}
            for username, val in test_users_to_sg.items():
                # val is [shares] or [shares, scheduler group]; the group is absent
                # when no scheduler group with the user's shares was found
                scheduler_group = val[1] if len(val) > 1 else None
                samples = node_runtime[scheduler_group] if scheduler_group in node_runtime else None
                runtime_per_user[username] = sum(samples) / len(samples) if samples else 0

            self.log.debug('RUN TIME PER USER: {}'.format(runtime_per_user))
            actual_shares_ratio = self.calculate_metrics_ratio_per_user(
                two_users_list=read_users, metrics=runtime_per_user)
            self.validate_deviation(
                expected_ratio=expected_ratio,
                actual_ratio=actual_shares_ratio,
                msg=f'Validate scheduler CPU runtime on the node {node_ip}. '
                    f'Run time per user: {runtime_per_user}')

    @staticmethod
    def create_auths(entities_list_of_dict):
        """
        Create service levels, roles and users and wire them together.

        :param entities_list_of_dict: Expected structure:
                    [{'user': User(), 'role': Role(), 'service_level': ServiceLevel()},
                    OR
                    {'user': User(), 'service_level': ServiceLevel()},
                    OR
                    {'role': Role(), 'service_level': ServiceLevel()}
                    ]
        """
        for entity in entities_list_of_dict:
            service_level = entity.get('service_level')
            role = entity.get('role')
            user = entity.get('user')
            if service_level:
                service_level.create()
            if role:
                role.create()
                role.attach_service_level(service_level=service_level)
            if user:
                user.create()
                if role:
                    role.grant_me_to(grant_to=user)
                else:
                    user.attach_service_level(service_level=service_level)

    @staticmethod
    def validate_ratio(expected_ratio, actual_ratio, msg):
        """Publish NORMAL when ``actual_ratio`` >= ``expected_ratio``, ERROR otherwise.

        When either ratio is missing (falsy) a single ERROR event is published
        and no comparison is attempted.
        """
        if not (expected_ratio and actual_ratio):
            WorkloadPrioritisationEvent.RatioValidationEvent(
                message=f'Can\'t compare expected and actual shares ratio. Expected: {expected_ratio}. '
                        f'Actual: {actual_ratio}',
                severity=Severity.ERROR).publish()
            # Nothing to compare: "None <= number" would raise TypeError
            return

        if expected_ratio <= actual_ratio:
            WorkloadPrioritisationEvent.RatioValidationEvent(
                message=f'{msg}. Actual ratio ({actual_ratio}) is as expected (more or equal then expected ratio '
                        f'{expected_ratio})',
                severity=Severity.NORMAL).publish()
        else:
            WorkloadPrioritisationEvent.RatioValidationEvent(
                message=f'{msg}. Actual ratio ({actual_ratio}) is less then expected ratio ({expected_ratio})',
                severity=Severity.ERROR).publish()

    def validate_deviation(self, expected_ratio, actual_ratio, msg):
        """Publish NORMAL when the ratios deviate by at most ``VALID_DEVIATION_PRC`` percent.

        When the deviation cannot be calculated a single ERROR event is published.
        """
        dev = self.calculate_deviation(expected_ratio, actual_ratio)
        # "is None" and not "not dev": a deviation of 0.0 is a perfect match, not a failure
        if dev is None:
            WorkloadPrioritisationEvent.RatioValidationEvent(
                message=f'Can\'t compare expected and actual shares ratio. Expected: {expected_ratio}. '
                        f'Actual: {actual_ratio}',
                severity=Severity.ERROR).publish()
            return

        if dev > self.VALID_DEVIATION_PRC:
            WorkloadPrioritisationEvent.RatioValidationEvent(
                message=f'{msg}. Actual ratio ({actual_ratio}) is not as expected ({expected_ratio})',
                severity=Severity.ERROR).publish()
        else:
            WorkloadPrioritisationEvent.RatioValidationEvent(
                message=f'{msg}. Actual ratio ({actual_ratio}) is as expected ({expected_ratio})',
                severity=Severity.NORMAL).publish()

    @staticmethod
    def calculate_deviation(first, second):
        """Return the difference of two values in percent of the smaller one.

        :returns: float percentage, or None when either value is falsy (None or 0)
        """
        if first and second:
            _first, _second = (first, second) if first > second else (second, first)
            dev = float(abs(_first - _second) * 100 / _second)
            return dev
        return None

    @staticmethod
    def calculate_metrics_ratio_per_user(two_users_list, metrics=None):  # pylint: disable=invalid-name
        """
        Return the high-shares user's value divided by the low-shares user's value.

        :param two_users_list: two dicts with 'user' and 'service_level' entries
        :param metrics: calculate ratio for specific Scylla or cassandra-stress metrics (ops, scheduler_runtime etc..).
                        If metrics name is not defined - ration will be calculated for service_shares
        :returns: float ratio, or None when one of the two values is falsy
        """
        if two_users_list[0]['service_level'].service_shares > two_users_list[1]['service_level'].service_shares:
            high_shares_user = two_users_list[0]
            low_shares_user = two_users_list[1]
        else:
            high_shares_user = two_users_list[1]
            low_shares_user = two_users_list[0]

        if metrics:
            high_shares_metrics = metrics[high_shares_user['user'].name]
            low_shares_metrics = metrics[low_shares_user['user'].name]
        else:
            high_shares_metrics = high_shares_user['service_level'].service_shares
            low_shares_metrics = low_shares_user['service_level'].service_shares

        if not high_shares_metrics or not low_shares_metrics:
            return None
        return float(high_shares_metrics) / float(low_shares_metrics)

    def run_stress_and_verify_threads(self, params=None):
        """Run all stress commands from ``params``, verify every stress thread, return the queues."""
        read_queue = []
        self._run_all_stress_cmds(read_queue, params=params)
        for queue in read_queue:
            self.verify_stress_thread(cs_thread_pool=queue)
        return read_queue

    def one_run_c_s_stats(self, read_run, user_name, statistic_name):
        """Fetch one statistic of a single cassandra-stress run.

        :param read_run: stress queue to take the results from
        :param user_name: name used in the error message only
        :param statistic_name: key of the wanted value in the stress results
        :returns: ``(statistic value, user name reported by the stress results)``;
                  ``(None, None)`` when either of them is missing
        """
        res = self.get_stress_results(queue=read_run, store_results=False)
        stat_rate, username = None, None
        if res:
            stat_rate = res[0].get(statistic_name)
            username = res[0].get('username')

        if not (stat_rate and username):
            self.log.error(
                f'Stress statistics are not received for user {user_name}. Can\'t complete the test'
            )
            # Always a pair: every caller unpacks the result into two names
            return None, None
        return stat_rate, username

    def get_c_s_stats(self, read_queue, users, statistic_name):
        """Collect one statistic per user from the given stress queues.

        The i-th queue is expected to belong to the i-th user.

        :returns: dict ``{user name: float value}``, or None as soon as one run has no statistics
        """
        users_names = [user['user'].name for user in users]

        results = {}
        for i, read in enumerate(read_queue):
            stat_rate, username = self.one_run_c_s_stats(
                read_run=read,
                user_name=users_names[i],
                statistic_name=statistic_name)
            if stat_rate is None:
                return stat_rate
            self.assertEqual(
                username,
                users_names[i],
                msg=f'Expected that stress was run with user "{users_names[i]}" but it was "{username}"'
            )
            results[username] = float(stat_rate)
        return results

    def validate_if_scylla_load_high_enough(self, start_time, wait_cpu_utilization):  # pylint: disable=invalid-name
        """Publish an ERROR event when reactor utilization since ``start_time`` is below the threshold."""
        end_time = int(time.time())
        scylla_load = self.prometheus_stats.get_scylla_reactor_utilization(
            start_time=start_time, end_time=end_time)

        if scylla_load < wait_cpu_utilization:
            # The event has to be published explicitly, as everywhere else in this class
            WorkloadPrioritisationEvent.CpuNotHighEnough(
                f"Load {scylla_load} isn\'t high enough(expected at least {wait_cpu_utilization}). "
                f"The test results may be not correct.",
                severity=Severity.ERROR).publish()

    def clean_auth(self, entities_list_of_dict):
        """Drop users, roles and service levels and shut down the CQL connection."""
        for entity in entities_list_of_dict:
            service_level = entity.get('service_level')
            role = entity.get('role')
            user = entity.get('user')
            if user:
                user.drop()
            if role:
                role.drop()
            if service_level:
                service_level.drop()

        self.backgroud_task = None
        self.connection_cql.cluster.shutdown()

    def warm_up_cache_before_test(self, max_key_for_read, stress_duration):
        """Read keys ``1..max_key_for_read`` as the default user for ``stress_duration`` minutes."""
        read_cmds = [
            self.STRESS_READ_CMD.format(n=self.num_of_partitions,
                                        user=self.DEFAULT_USER,
                                        password=self.DEFAULT_USER_PASSWORD,
                                        pop="seq=1..%d" % max_key_for_read,
                                        duration='%dm' % stress_duration,
                                        threads=200)
        ]
        self.run_stress_and_verify_threads(params={'stress_cmd': read_cmds})

    # pylint: disable=too-many-arguments, too-many-locals
    def define_read_cassandra_stress_command(self,
                                             user,
                                             load_type,
                                             workload_type,
                                             threads,
                                             stress_duration_min,
                                             max_rows_for_read=None,
                                             stress_command=STRESS_READ_CMD,
                                             throttle=20000,
                                             **kwargs):
        """
        Build a cassandra-stress command line for the given user.

        :param user: dict with User/Role/ServiceLevel objects
        :param load_type: cache_only/disk_only/mixed
        :param workload_type: latency: with ops restriction - using throttle
                              or
                              throughput: no restriction
        :param threads: number of cassandra-stress threads
        :param stress_duration_min: duration of the load in minutes
        :param max_rows_for_read: upper key bound (lower bound for disk_only) of the read range;
                                  a default derived from ``num_of_partitions`` is used when falsy
        :param stress_command: command template to fill in
        :param throttle: ops/s limit of the latency workload
        :param kwargs: extra template values, given either as ``kwargs={...}`` or as plain keywords
        :returns: formatted command string
        :raises KeyError: on unknown ``load_type`` / ``workload_type`` or a missing template value
        """

        def latency():
            return '%d throttle=%d/s' % (threads, throttle)

        def throughput():
            return threads

        def cache_only(max_rows_for_read):
            if not max_rows_for_read:
                max_rows_for_read = int(self.num_of_partitions * 0.3)
            return 'seq=1..%d' % max_rows_for_read

        # Read from cache and disk
        def mixed(max_rows_for_read):
            if not max_rows_for_read:
                max_rows_for_read = self.num_of_partitions
            return "'dist=gauss(1..%d, %d, %d)'" % (
                max_rows_for_read, int(max_rows_for_read / 2), int(max_rows_for_read * 0.05))

        def disk_only(max_rows_for_read):
            if not max_rows_for_read:
                max_rows_for_read = int(self.num_of_partitions * 0.3)
            return 'seq=%d..%d' % (max_rows_for_read,
                                   max_rows_for_read + int(self.num_of_partitions * 0.25))

        # Explicit dispatch tables instead of looking the helpers up in locals()
        rate_builders = {'latency': latency, 'throughput': throughput}
        pop_builders = {'cache_only': cache_only, 'mixed': mixed, 'disk_only': disk_only}

        user_name = user['user'].name
        rate = rate_builders[workload_type]()  # define -rate for c-s command depend on workload type
        pop = pop_builders[load_type](max_rows_for_read)  # define -pop for c-s command depend on load type

        params = {
            'n': self.num_of_partitions,
            'user': user_name,
            'password': user_name,
            'pop': pop,
            'duration': '%dm' % stress_duration_min,
            'threads': rate
        }
        # Existing callers pass the extra values as kwargs={...}; plain keywords are accepted as well
        extra_params = dict(kwargs)
        nested_params = extra_params.pop('kwargs', None)
        params.update(extra_params)
        if nested_params:
            params.update(nested_params)

        c_s_cmd = stress_command.format(**params)
        return c_s_cmd

    def test_read_throughput_1to5_ratio(self):
        """
        Basic test
        - Add SLA and grant to user (before any load)
        - user190 with 190 shares
        - user950 with 950 shares
        - Each user runs load from own loader (round robin)
        - Expect OPS ratio between two loads is 1:5 (e.g. 190:950)
        - Expect scheduler run time between two loads is 1:5 (e.g. 190:950)

        The load type is "mixed" (read from cache and disk).
        """
        self._two_users_load_throughput_workload(shares=[190, 950], load=self.MIXED_LOAD)

    def _two_users_load_throughput_workload(self, shares, load):
        """Run a throughput load per user in parallel and validate ops and scheduler run time ratios."""
        session = self.prepare_schema()
        self.create_test_data_and_wait_no_compaction()

        # Define Service Levels/Roles/Users
        read_users = self._define_read_users(session=session, shares=shares)

        # expected_shares_ratio = self.calculate_metrics_ratio_per_user(two_users_list=read_users)
        # According to Eliran Sinvani
        expected_shares_ratio = 4.0

        # Create Service Levels/Roles/Users
        self.create_auths(entities_list_of_dict=read_users)

        stress_duration = 10  # minutes
        read_cmds = [
            self.define_read_cassandra_stress_command(
                user=read_users[0],
                load_type=load,
                workload_type=self.WORKLOAD_THROUGHPUT,
                threads=1000,
                stress_duration_min=stress_duration),
            self.define_read_cassandra_stress_command(
                user=read_users[1],
                load_type=load,
                workload_type=self.WORKLOAD_THROUGHPUT,
                threads=1000,
                stress_duration_min=stress_duration)
        ]

        try:
            start_time = time.time()

            read_queue = self.run_stress_and_verify_threads(params={
                'stress_cmd': read_cmds,
                'round_robin': True
            })

            results = self.get_c_s_stats(read_queue=read_queue,
                                         users=read_users,
                                         statistic_name='op rate')
            self.validate_if_scylla_load_high_enough(
                start_time=start_time,
                wait_cpu_utilization=self.MIN_CPU_UTILIZATION)
            end_time = time.time()

            self.validate_scheduler_runtime(
                start_time=start_time,
                end_time=end_time,
                read_users=read_users,
                expected_ratio=expected_shares_ratio)

            self.assertTrue(results, msg='Not received cassandra-stress results')

            self.log.debug('Validate cassandra-stress ops deviation')
            actual_shares_ratio = self.calculate_metrics_ratio_per_user(
                two_users_list=read_users, metrics=results)
            self.validate_ratio(expected_ratio=expected_shares_ratio,
                                actual_ratio=actual_shares_ratio,
                                msg='Validate cassandra-stress ops')
        finally:
            self.clean_auth(entities_list_of_dict=read_users)

    def test_read_throughput_vs_latency_cache_and_disk(self):  # pylint: disable=invalid-name
        """
        Test when one user run load with high latency and another - with high throughput
        The load is run on the full data set (that is read from both the cache and the disk)
        Throughput - latency test:
        - Add SLA and grant to user (before any load)
        - user190 with 190 shares
        - user950 with 950 shares
        - Each user runs load from own loader (round robin):
           - user950 runs load with throttle
           - user190 runs load with high throughput

        Expected results: latency 99th of user950 workload when it runs in parallel with workload of user190 is not
                          significant increased relatively to latency of run alone user950 workload
        """
        stress_duration = 10  # minutes
        shares = [190, 950]

        session = self.prepare_schema()
        self.create_test_data_and_wait_no_compaction()

        # Define Service Levels/Roles/Users
        read_users = self._define_read_users(session=session, shares=shares)

        # Create Service Levels/Roles/Users
        self.create_auths(entities_list_of_dict=read_users)

        # Define stress commands
        read_cmds = {
            'throughput': self.define_read_cassandra_stress_command(
                user=read_users[0],
                load_type=self.MIXED_LOAD,
                workload_type=self.WORKLOAD_THROUGHPUT,
                threads=200,
                stress_duration_min=stress_duration),
            'latency': self.define_read_cassandra_stress_command(
                user=read_users[1],
                load_type=self.MIXED_LOAD,
                workload_type=self.WORKLOAD_LATENCY,
                threads=250,
                stress_duration_min=stress_duration)
        }

        # TODO: improvement_expected number and calculation of actual improvement was set by Eliran for chache only
        # TODO: test. Should be adjusted for this test
        improvement_expected = 1.8

        self._throughput_latency_tests_run(
            read_users=read_users,
            read_cmds=read_cmds,
            latency_user=read_users[1],
            improvement_expected=improvement_expected)

    def test_read_throughput_vs_latency_cache_only(self):  # pylint: disable=invalid-name
        """
        Test when one user run load with high latency and another - with high throughput
        The load is run on the data set that fully exists in the cache
        Throughput - latency test:
        - Add SLA and grant to user (before any load)
        - user190 with 190 shares
        - user950 with 950 shares
        - Each user runs load from own loader (round robin):
           - user950 runs load with throttle
           - user190 runs load with high throughput

        Expected results: latency 99th of user950 workload when it runs in parallel with workload of user190 is not
                          significant increased relatively to latency of run alone user950 workload
        """
        stress_duration = 5  # minutes
        shares = [190, 950]
        # Select part of the record to warm the cache (all this data will be in the cache).
        # This amount of data will be read during the test from cache
        max_key_for_read = int(self.num_of_partitions * 0.5)

        session = self.prepare_schema()
        self.create_test_data_and_wait_no_compaction()

        # Warm up the cache to guarantee the read will be from cache
        self.warm_up_cache_before_test(max_key_for_read=max_key_for_read, stress_duration=30)

        # Define Service Levels/Roles/Users
        read_users = self._define_read_users(session=session, shares=shares)

        # Create Service Levels/Roles/Users
        self.create_auths(entities_list_of_dict=read_users)

        read_cmds = {
            'throughput': self.define_read_cassandra_stress_command(
                user=read_users[0],
                load_type=self.CACHE_ONLY_LOAD,
                workload_type=self.WORKLOAD_THROUGHPUT,
                threads=950,
                stress_duration_min=stress_duration,
                max_rows_for_read=max_key_for_read),
            'latency': self.define_read_cassandra_stress_command(
                user=read_users[1],
                load_type=self.CACHE_ONLY_LOAD,
                workload_type=self.WORKLOAD_LATENCY,
                threads=1000,
                stress_duration_min=stress_duration,
                max_rows_for_read=max_key_for_read),
            'latency_throughput': self.define_read_cassandra_stress_command(
                user=read_users[1],
                load_type=self.CACHE_ONLY_LOAD,
                workload_type=self.WORKLOAD_THROUGHPUT,
                threads=1000,
                stress_duration_min=stress_duration,
                max_rows_for_read=max_key_for_read)
        }

        # improvement_expected number and calculation of actual improvement was set by Eliran
        improvement_expected = 1.8

        self._throughput_latency_tests_run(
            read_users=read_users,
            read_cmds=read_cmds,
            latency_user=read_users[1],
            improvement_expected=improvement_expected)

    def test_read_throughput_vs_latency_disk_only(self):  # pylint: disable=invalid-name
        """
        Test when one user run load with high latency and another - with high throughput
        The load is run on a key range above the range used to warm up the cache
        (so it is meant to be read from the disk)
        Throughput - latency test:
        - Add SLA and grant to user (before any load)
        - user190 with 190 shares
        - user950 with 950 shares
        - Each user runs load from own loader (round robin):
           - user950 runs load with throttle
           - user190 runs load with high throughput

        Expected results: latency 99th of user950 workload when it runs in parallel with workload of user190 is not
                          significant increased relatively to latency of run alone user950 workload
        """
        stress_duration = 5  # minutes

        session = self.prepare_schema()
        self.create_test_data_and_wait_no_compaction()

        # Restart Scylla on every node, one node at a time
        for node in self.db_cluster.nodes:
            node.stop_scylla_server(verify_up=False, verify_down=True)
            node.start_scylla_server(verify_up=True, verify_down=False)

        # Select part of the record to warm the cache (all this data will be in the cache).
        # cassandra-stress "-pop" parameter will start from more then "max_key_for_cache" row number
        # (for read from the disk)
        max_key_for_cache = int(self.num_of_partitions * 0.25)
        # Warm up the cache with the keys below "max_key_for_cache", so that the keys
        # requested by the test itself are read from the disk
        self.warm_up_cache_before_test(max_key_for_read=max_key_for_cache, stress_duration=30)

        # Define Service Levels/Roles/Users
        shares = [190, 950]
        read_users = self._define_read_users(session=session, shares=shares)

        # Create Service Levels/Roles/Users
        self.create_auths(entities_list_of_dict=read_users)

        read_cmds = {
            'throughput': self.define_read_cassandra_stress_command(
                user=read_users[0],
                load_type=self.DISK_ONLY_LOAD,
                workload_type=self.WORKLOAD_THROUGHPUT,
                threads=200,
                stress_duration_min=stress_duration,
                max_rows_for_read=max_key_for_cache * 2),
            'latency': self.define_read_cassandra_stress_command(
                user=read_users[1],
                load_type=self.DISK_ONLY_LOAD,
                workload_type=self.WORKLOAD_LATENCY,
                threads=250,
                stress_duration_min=stress_duration,
                max_rows_for_read=max_key_for_cache * 3),
            'latency_only': self.define_read_cassandra_stress_command(
                user=read_users[1],
                load_type=self.DISK_ONLY_LOAD,
                workload_type=self.WORKLOAD_LATENCY,
                threads=250,
                stress_duration_min=stress_duration,
                max_rows_for_read=max_key_for_cache)
        }

        # TODO: improvement_expected number and calculation of actual improvement was set by Eliran for chache only
        # TODO: test. Should be adjusted for this test
        improvement_expected = 1.8

        self._throughput_latency_tests_run(
            read_users=read_users,
            read_cmds=read_cmds,
            latency_user=read_users[1],
            improvement_expected=improvement_expected)

    def test_read_50perc_write_50perc_load(self):
        """
        Test scenario:
        - Add SLA and grant to user (before any load)
        - user190 with 190 shares
        - user950 with 950 shares
        - Each user runs load from own loader (round robin)
        - Expect OPS ratio between two loads is 1:5 (e.g. 190:950)
        - Expect scheduler run time between two loads is 1:5 (e.g. 190:950)
        """
        session = self.prepare_schema()
        self.create_test_data_and_wait_no_compaction()

        stress_duration_min = 10

        # Define Service Levels/Roles/Users
        shares = [190, 950]
        read_users = self._define_read_users(session=session, shares=shares)

        # Create Service Levels/Roles/Users
        self.create_auths(entities_list_of_dict=read_users)

        read_cmds = {
            'throughput': self.define_read_cassandra_stress_command(
                user=read_users[0],
                load_type=self.MIXED_LOAD,
                workload_type=self.WORKLOAD_THROUGHPUT,
                threads=120,
                stress_duration_min=stress_duration_min,
                stress_command=self.STRESS_MIXED_CMD,
                kwargs={
                    'write_ratio': 1,
                    'read_ratio': 1
                }),
            'latency': self.define_read_cassandra_stress_command(
                user=read_users[1],
                load_type=self.MIXED_LOAD,
                workload_type=self.WORKLOAD_LATENCY,
                threads=120,
                stress_duration_min=stress_duration_min,
                stress_command=self.STRESS_MIXED_CMD,
                kwargs={
                    'write_ratio': 1,
                    'read_ratio': 1
                })
        }

        # TODO: improvement_expected number and calculation of actual improvement was set by Eliran for chache only
        # TODO: test. Should be adjusted for this test
        improvement_expected = 1.8

        self._throughput_latency_tests_run(
            read_users=read_users,
            read_cmds=read_cmds,
            latency_user=read_users[1],
            improvement_expected=improvement_expected)

    def _throughput_latency_tests_run(self, read_cmds, read_users, latency_user, improvement_expected):
        # pylint: disable=too-many-locals
        """Run latency-only and two parallel latency+throughput loads and validate the improvement.

        The created users/roles/service levels are dropped at the end, also when a step fails.

        :param read_cmds: dict of stress commands; 'latency' and 'throughput' are used,
                          'latency_only' is optional (falls back to 'latency')
        :param read_users: all users created for the test
        :param latency_user: the user that runs the latency workload
        :param improvement_expected: minimal expected value of the calculated improvement
        """
        # NOTE(review): the first parallel run below reads read_cmds['latency_throughput'];
        # among the tests in this class only the cache_only one defines that key - confirm
        # whether the other tests are expected to provide it as well.
        try:
            # Run latency workload
            test_start_time = time.time()
            self.log.debug('Start latency only workload')
            read_queue = self.run_stress_and_verify_threads(
                params={
                    'stress_cmd': [read_cmds.get('latency_only') or read_cmds['latency']],
                    'round_robin': True
                })

            latency_99_for_latency_workload = self.get_c_s_stats(
                read_queue=read_queue,
                users=[latency_user],
                statistic_name='latency 99th percentile')

            self.assertTrue(
                latency_99_for_latency_workload,
                msg='Not received cassandra-stress results for latency '
                    'workload')

            # Each parallel run returns (latency-only P99, P99 of the latency load in the
            # parallel run, printable summary) - all numbers here are latencies.

            # Run throughput (user950) and latency (user950) workloads
            latency_workload_same_user, throughput_user950_workload, user950_result_print_str = \
                self._throughput_latency_parallel_run(read_cmds=read_cmds,
                                                      test_start_time=test_start_time,
                                                      latency_99_for_latency_workload=latency_99_for_latency_workload,
                                                      latency_user=latency_user,
                                                      throughput_user=latency_user,
                                                      throughput_cmd_name='latency_throughput',
                                                      latency_cmd_name='latency')

            # Run throughput (user150) and latency (user950) workloads
            latency_workload_mixed_users, throughput_user150_workload, user150_result_print_str = \
                self._throughput_latency_parallel_run(read_cmds=read_cmds,
                                                      test_start_time=test_start_time,
                                                      latency_99_for_latency_workload=latency_99_for_latency_workload,
                                                      latency_user=latency_user,
                                                      throughput_user=read_users[0],
                                                      throughput_cmd_name='throughput',
                                                      latency_cmd_name='latency')

            self.log.info(
                f"Result of run with user950 throughput and user950 latency workloads: "
                f"{user950_result_print_str}")
            self.log.info(
                f"Result of run with user150 throughput and user950 latency workloads: "
                f"{user150_result_print_str}")

            improvement_actual = (throughput_user950_workload * latency_workload_mixed_users) / \
                                 (throughput_user150_workload * latency_workload_same_user)

            if improvement_actual > improvement_expected:
                WorkloadPrioritisationEvent.SlaTestResult(
                    message=f'Actual improvement is {improvement_actual} more then {improvement_expected} as expected.',
                    severity=Severity.NORMAL).publish()
            else:
                WorkloadPrioritisationEvent.SlaTestResult(
                    message=f'Actual improvement is {improvement_actual} less then expected {improvement_expected}',
                    severity=Severity.ERROR).publish()
        finally:
            self.clean_auth(entities_list_of_dict=read_users)

    def _throughput_latency_parallel_run(self, read_cmds, test_start_time, latency_99_for_latency_workload,
                                         latency_user, throughput_user, throughput_cmd_name, latency_cmd_name):
        """Run a throughput and a latency load in parallel and compare the latency with the latency-only run.

        :returns: tuple ``(latency-only P99, P99 of the latency load in the parallel run, printable summary)``
        """
        # Results are keyed by command name (not by user name) to handle the case
        # when both loads (latency and throughput) are run for the same user
        latency_99_for_mixed_workload = {}

        def _get_stat_for_user(read, user_name):
            stat_rate, _ = self.one_run_c_s_stats(
                read_run=read,
                user_name=user_name,
                statistic_name='latency 99th percentile')
            if stat_rate:
                latency_99_for_mixed_workload[user_name] = float(stat_rate)

        self.log.debug(
            f'Start latency workload (user {latency_user}) in parallel with throughput workload '
            f'(user {throughput_user})')
        read_queue = self.run_stress_and_verify_threads(
            params={
                'stress_cmd': [read_cmds[throughput_cmd_name], read_cmds[latency_cmd_name]],
                'round_robin': True
            })

        # Get stats for throughput user load
        _get_stat_for_user(read=read_queue[0], user_name=throughput_cmd_name)
        # Get stats for latency user load
        _get_stat_for_user(read=read_queue[1], user_name=latency_cmd_name)

        self.assertTrue(latency_99_for_mixed_workload,
                        msg='Not received cassandra-stress for mixed workload')
        # The latency load result is read below; fail clearly when only the throughput one arrived
        self.assertIn(latency_cmd_name, latency_99_for_mixed_workload,
                      msg='Not received cassandra-stress for mixed workload')

        grafana_dataset = self.monitors.get_grafana_screenshot_and_snapshot(
            test_start_time=test_start_time)

        grafana_screenshots = grafana_dataset.get('screenshots', [])
        grafana_snapshots = grafana_dataset.get('snapshots', [])

        self.log.debug('GRAFANA SCREENSHOTS: {}'.format(grafana_screenshots))
        self.log.debug('GRAFANA SNAPSHOTS: {}'.format(grafana_snapshots))

        # Compare latency of two runs
        self.log.debug('Test results:\n---------------------\n')
        latency_99_latency_workload = latency_99_for_latency_workload[latency_user['user'].name]
        latency_99_mixed_workload = latency_99_for_mixed_workload[latency_cmd_name]
        deviation = self.calculate_deviation(latency_99_latency_workload, latency_99_mixed_workload)
        latency_change = 'increased' if latency_99_mixed_workload > latency_99_latency_workload else 'decreased'

        result_print_str = '\nTest results:\n---------------------\n'
        result_print_str += '\nWorkload                  | Latency 99%'
        result_print_str += '\n========================= | ================='
        result_print_str += '\nLatency only              | {}'.format(latency_99_latency_workload)
        result_print_str += '\nLatency and throughput    | {}'.format(latency_99_mixed_workload)
        result_print_str += '\n------------------------- | -----------------'
        result_print_str += '\nLatency 99 is {} in {}%'.format(latency_change, deviation)

        return latency_99_latency_workload, latency_99_mixed_workload, result_print_str
class SlaPerUserTest(LongevityTest):
    """
    Test SLA per user feature using cassandra-stress.

    Every test creates users/roles with service levels of different shares, runs cassandra-stress
    loads on their behalf and compares the measured metrics (op rate, latency, scheduler runtime).
    """
    STRESS_WRITE_CMD = 'cassandra-stress write cl=QUORUM n={n} -schema \'replication(factor=3)\' ' \
                       '-mode cql3 native user={user} password={password} -rate threads={threads}'
    STRESS_WRITE_DURATION_CMD = 'cassandra-stress write cl=ALL duration={duration} -schema \'replication(factor=3)\' ' \
                                '-mode cql3 native user={user} password={password} -rate threads={threads} ' \
                                'throttle=10000/s -pop seq={pop}'
    STRESS_READ_CMD = 'cassandra-stress read cl=ALL duration={duration} -mode cql3 native user={user} ' \
                      'password={password} -rate threads={threads} -pop {pop}'
    # The read side uses its own ``read_ratio`` field (it used to repeat ``write_ratio``,
    # silently ignoring the ``read_ratio`` value passed by the callers)
    STRESS_MIXED_CMD = r"cassandra-stress mixed ratio\(write={write_ratio},read={read_ratio}\) cl=QUORUM " \
                       "duration={duration} " \
                       "-mode cql3 native user={user} password={password} -rate threads={threads} -pop {pop} "
    DEFAULT_USER = '******'
    DEFAULT_USER_PASSWORD = '******'
    DEFAULT_USER_SLA = 'sla_cassandra'
    DEFAULT_SHARES = 1000
    # Maximal accepted deviation (in percents) between expected and actual ratio
    VALID_DEVIATION_PRC = 10
    # Minimal reactor utilization for the results to be considered valid
    MIN_CPU_UTILIZATION = 97
    WORKLOAD_LATENCY = 'latency'
    WORKLOAD_THROUGHPUT = 'throughput'
    CACHE_ONLY_LOAD = 'cache_only'
    DISK_ONLY_LOAD = 'disk_only'
    MIXED_LOAD = 'mixed'
    # NOTE(review): not referenced inside this class; upload_c_s_comparison_to_es() hard-codes
    # "workload_types" - confirm which index name is the intended one
    WORKLOAD_TYPES_INDEX = "workload_tests"

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.prometheus_stats = None
        self.num_of_partitions = 200000000
        self.backgroud_task = None
        self.class_users = {}
        self.connection_cql = None
        self._comparison_results = {}
        self._es = ES()

    def prepare_schema(self):
        """
        Create the Prometheus client and open a CQL connection as the default user.

        :return: CQL session of the opened connection
        """
        self.prometheus_stats = PrometheusDBStats(host=self.monitors.nodes[0].public_ip_address)
        self.connection_cql = self.db_cluster.cql_connection_patient(
            node=self.db_cluster.nodes[0], user=self.DEFAULT_USER, password=self.DEFAULT_USER_PASSWORD)
        return self.connection_cql.session

    def create_test_data(self, rows_amount=None):
        """
        Prefill data before tests.

        :param rows_amount: amount of partitions to write; when given it also replaces
            ``self.num_of_partitions`` for the rest of the test
        """
        if rows_amount is not None:
            self.num_of_partitions = rows_amount
        write_cmd = self.STRESS_WRITE_CMD.format(n=self.num_of_partitions,
                                                 user=self.DEFAULT_USER,
                                                 password=self.DEFAULT_USER_PASSWORD,
                                                 threads=250)
        self.run_stress_and_verify_threads(params={'stress_cmd': write_cmd,
                                                   'prefix': 'preload-',
                                                   'stats_aggregate_cmds': False})

    @staticmethod
    def _define_read_users(session, shares):
        """
        Build (without creating in the DB) one User/Role/ServiceLevel entity dict per shares value.

        :param session: CQL session the entities will use
        :param shares: iterable of service level shares, e.g. [190, 950]
        :return: list of {'user': User, 'role': Role, 'service_level': ServiceLevel}
        """
        # NOTE(review): the password literal holds no %-conversion, so formatting it with the share
        # (as the tests used to do) raised TypeError; it is passed as is. Confirm the intended value.
        return [{
            'user': User(session=session, name='user%d' % share, password='******'),
            'role': Role(session=session, name='role%d' % share),
            'service_level': ServiceLevel(session=session, name='sla%d' % share, shares=share)
        } for share in shares]

    @staticmethod
    def user_to_scheduler_group(test_users, scheduler_shares):
        """
        Append to every user's list the first scheduler group that reports the user's shares.

        :param test_users: {user name: [shares]}; the lists are extended in place
        :param scheduler_shares: {scheduler group: collection of shares values}
        :return: the same ``test_users`` mapping; users without a matching group stay unchanged
        """
        for user, shares in test_users.items():
            for scheduler_group, sg_shares in scheduler_shares.items():
                if shares[0] in sg_shares:
                    test_users[user].append(scheduler_group)
                    break
        return test_users

    def validate_scheduler_runtime(self, start_time, end_time, read_users, expected_ratio):
        """
        Validate per node that the scheduler runtime ratio between two users matches the shares ratio.

        :param start_time: start of the Prometheus query range
        :param end_time: end of the Prometheus query range
        :param read_users: list of two entity dicts (with 'user' and 'service_level')
        :param expected_ratio: expected high/low ratio
        """
        for node_ip in self.db_cluster.get_node_private_ips():
            # Temporary solution
            scheduler_shares = self.prometheus_stats.get_scylla_scheduler_shares_per_sla(
                start_time, end_time, node_ip)
            self.log.debug('SCHEDULERS SHARES FROM PROMETHEUS: {}'.format(scheduler_shares))
            if 'service_level_sg_0' in scheduler_shares:
                scheduler_shares.pop('service_level_sg_0')

            # The mapping is rebuilt for every node: user_to_scheduler_group() appends to the lists,
            # so a shared mapping would keep pointing at the scheduler group of the first node
            users_with_shares = {user['user'].name: [user['service_level'].service_shares]
                                 for user in read_users}
            test_users_to_sg = self.user_to_scheduler_group(test_users=users_with_shares,
                                                            scheduler_shares=scheduler_shares)
            self.log.debug('USER - SERVICE LEVEL - SCHEDULER: {}'.format(test_users_to_sg))
            # End Temporary solution

            shards_time_per_sla = self.prometheus_stats.get_scylla_scheduler_runtime_ms(
                start_time, end_time, node_ip)
            if not (shards_time_per_sla and scheduler_shares):
                continue

            node_runtimes = shards_time_per_sla[node_ip]
            runtime_per_user = {}
            for username, val in test_users_to_sg.items():
                # val is [shares] or [shares, scheduler group]; a user without a group gets runtime 0
                scheduler_group = val[1] if len(val) > 1 else None
                if scheduler_group is not None and scheduler_group in node_runtimes:
                    samples = node_runtimes[scheduler_group]
                    runtime_per_user[username] = sum(samples) / len(samples)
                else:
                    runtime_per_user[username] = 0
            self.log.debug('RUN TIME PER USER: {}'.format(runtime_per_user))

            actual_shares_ratio = self.calculate_metrics_ratio_per_user(two_users_list=read_users,
                                                                        metrics=runtime_per_user)
            self.validate_deviation(expected_ratio=expected_ratio,
                                    actual_ratio=actual_shares_ratio,
                                    msg='Validate scheduler CPU runtime on the node %s' % node_ip)

    @staticmethod
    def create_auths(entities_list_of_dict):
        """
        Create the service levels, roles and users and attach/grant them to each other.

        :param entities_list_of_dict: Expected structure:
                    [{'user': User(), 'role': Role(), 'service_level': ServiceLevel()},
                    OR
                    {'user': User(), 'service_level': ServiceLevel()},
                    OR
                    {'role': Role(), 'service_level': ServiceLevel()}
                    ]
        """
        for entity in entities_list_of_dict:
            service_level = entity.get('service_level')
            role = entity.get('role')
            user = entity.get('user')
            if service_level:
                service_level.create()
            if role:
                role.create()
                role.attach_service_level(service_level=service_level)
            if user:
                user.create()
                # With a role the user gets the service level through the role
                if role:
                    role.grant_me_to(grant_to=user)
                else:
                    user.attach_service_level(service_level=service_level)

    def validate_deviation(self, expected_ratio, actual_ratio, msg):
        """
        Fail the test when the ratios can't be compared or differ by more than VALID_DEVIATION_PRC.

        :param expected_ratio: expected ratio
        :param actual_ratio: measured ratio
        :param msg: prefix of the failure message
        """
        dev = self.calculate_deviation(expected_ratio, actual_ratio)
        self.assertIsNotNone(
            dev, 'Can\'t compare expected and actual shares ratio. Expected: '
                 '{expected_ratio}. Actual: {actual_ratio}'.format(expected_ratio=expected_ratio,
                                                                   actual_ratio=actual_ratio))
        # TODO: formulate error message
        self.assertTrue(
            dev <= self.VALID_DEVIATION_PRC,
            '{msg}. Actual shares ratio ({actual_ratio}) is not '
            'as expected ({expected_ratio})'.format(msg=msg,
                                                    actual_ratio=actual_ratio,
                                                    expected_ratio=expected_ratio))

    @staticmethod
    def calculate_deviation(first, second):
        """
        Return the difference between two values in percents of the smaller one.

        :return: deviation as float, or None when any of the values is falsy (None, 0)
        """
        if first and second:
            _first, _second = (first, second) if first > second else (second, first)
            return float(abs(_first - _second) * 100 / _second)
        return None

    @staticmethod
    def calculate_metrics_ratio_per_user(two_users_list, metrics=None):  # pylint: disable=invalid-name
        """
        Return the ratio between the user with more shares and the user with fewer shares.

        :param two_users_list: list of two entity dicts (with 'user' and 'service_level')
        :param metrics: {user name: value} - calculate ratio for specific Scylla or cassandra-stress
                        metrics (ops, scheduler_runtime etc..).
                        If metrics is not defined - ratio will be calculated for service_shares
        :return: high/low ratio as float, or None when one of the values is falsy
        """
        first, second = two_users_list[0], two_users_list[1]
        if first['service_level'].service_shares > second['service_level'].service_shares:
            high_shares_user, low_shares_user = first, second
        else:
            high_shares_user, low_shares_user = second, first

        if metrics:
            high_shares_metrics = metrics[high_shares_user['user'].name]
            low_shares_metrics = metrics[low_shares_user['user'].name]
        else:
            high_shares_metrics = high_shares_user['service_level'].service_shares
            low_shares_metrics = low_shares_user['service_level'].service_shares

        if not high_shares_metrics or not low_shares_metrics:
            return None
        return float(high_shares_metrics) / float(low_shares_metrics)

    def run_stress_and_verify_threads(self, params=None):
        """
        Run the stress commands described by ``params`` and verify every started stress thread.

        :return: list of the started stress queues
        """
        read_queue = []
        self._run_all_stress_cmds(read_queue, params=params)
        for queue in read_queue:
            self.verify_stress_thread(cs_thread_pool=queue)
        return read_queue

    def get_c_s_stats(self, read_queue, users, statistic_name):
        """
        Collect one cassandra-stress statistic per stress queue.

        The i-th queue is expected to have been run by the i-th user.

        :param read_queue: list of stress queues
        :param users: list of entity dicts, in the same order as ``read_queue``
        :param statistic_name: name of the statistic in the stress results, e.g. 'op rate'
        :return: {user name: value as float}, or None when a statistic is missing
        """
        users_names = [user['user'].name for user in users]
        results = {}
        for i, read in enumerate(read_queue):
            res = self.get_stress_results(queue=read, store_results=False)
            stat_rate, username = None, None
            if res:
                stat_rate = res[0].get(statistic_name)
                username = res[0].get('username')
            if not (stat_rate and username):
                self.log.error('Stress statistics are not received for user {}. Can\'t complete the test'
                               .format(users_names[i]))
                return None
            self.assertEqual(username, users_names[i],
                             msg='Expected that stress was run with user "{}" but it was "{}"'
                             .format(users_names[i], username))
            results[username] = float(stat_rate)
        return results

    def validate_if_scylla_load_high_enough(self, start_time, wait_cpu_utilization):  # pylint: disable=invalid-name
        """
        Fail the test when the reactor utilization since ``start_time`` is below the given threshold.
        """
        end_time = int(time.time())
        scylla_load = self.prometheus_stats.get_scylla_reactor_utilization(start_time=start_time,
                                                                           end_time=end_time)
        self.assertTrue(scylla_load >= wait_cpu_utilization,
                        msg='Load isn\'t high enough. The test results may be not correct')

    def clean_auth(self, entities_list_of_dict):
        """
        Drop the users, roles and service levels and shut the CQL connection down.

        :param entities_list_of_dict: same structure as for create_auths()
        """
        for entity in entities_list_of_dict:
            service_level = entity.get('service_level')
            role = entity.get('role')
            user = entity.get('user')
            if user:
                user.drop()
            if role:
                role.drop()
            if service_level:
                service_level.drop()
        self.backgroud_task = None
        # The connection only exists after prepare_schema() was called
        if self.connection_cql:
            self.connection_cql.cluster.shutdown()

    def warm_up_cache_before_test(self, max_key_for_read, stress_duration):
        """
        Read the keys 1..max_key_for_read as the default user for ``stress_duration`` minutes.
        """
        read_cmds = [self.STRESS_READ_CMD.format(n=self.num_of_partitions,
                                                 user=self.DEFAULT_USER,
                                                 # was self.DEFAULT_USER, i.e. the user name was sent as password
                                                 password=self.DEFAULT_USER_PASSWORD,
                                                 pop="seq=1..%d" % max_key_for_read,
                                                 duration='%dm' % stress_duration,
                                                 threads=200)]
        self.run_stress_and_verify_threads(params={'stress_cmd': read_cmds})

    # pylint: disable=too-many-arguments, too-many-locals
    def define_read_cassandra_stress_command(self,
                                             role: Role,
                                             load_type: str,
                                             c_s_workload_type: str,
                                             threads: int,
                                             stress_duration_min: int,
                                             max_rows_for_read: int = None,
                                             stress_command: str = STRESS_READ_CMD,
                                             throttle: int = 20000,
                                             **kwargs):
        """
        Build a cassandra-stress command line from a command template.

        :param role: Role object; its name and password are used as the c-s credentials
        :param load_type: cache_only/disk_only/mixed - defines the "-pop" value
        :param c_s_workload_type: latency: with ops restriction - using throttle
                                  or
                                  throughput: no restriction
        :param threads: amount of c-s threads
        :param stress_duration_min: duration of the load in minutes
        :param max_rows_for_read: key boundary for the "-pop" value; a default derived from
                                  ``self.num_of_partitions`` is used when not set
        :param stress_command: command template (str.format style)
        :param throttle: ops/s limit of the latency workload
        :param kwargs: extra template fields, passed wrapped as ``kwargs={...}``
        :return: the formatted command
        :raises KeyError: for an unknown ``load_type`` or ``c_s_workload_type``
        """

        def latency():
            return '%d throttle=%d/s' % (threads, throttle)

        def throughput():
            return threads

        def cache_only(max_rows_for_read):
            if not max_rows_for_read:
                max_rows_for_read = int(self.num_of_partitions * 0.3)
            return 'seq=1..%d' % max_rows_for_read

        # Read from cache and disk
        def mixed(max_rows_for_read):
            if not max_rows_for_read:
                max_rows_for_read = self.num_of_partitions
            return "'dist=gauss(1..%d, %d, %d)'" % (max_rows_for_read,
                                                    int(max_rows_for_read / 2),
                                                    int(max_rows_for_read * 0.05))

        def disk_only(max_rows_for_read):
            if not max_rows_for_read:
                max_rows_for_read = int(self.num_of_partitions * 0.3)
            return 'seq=%d..%d' % (max_rows_for_read,
                                   max_rows_for_read + int(self.num_of_partitions * 0.25))

        # Explicit dispatch tables instead of a lookup in locals(): only the supported names are
        # accepted and nothing depends on the names of the nested functions
        rate_builders = {self.WORKLOAD_LATENCY: latency,
                         self.WORKLOAD_THROUGHPUT: throughput}
        pop_builders = {self.CACHE_ONLY_LOAD: cache_only,
                        self.DISK_ONLY_LOAD: disk_only,
                        self.MIXED_LOAD: mixed}

        rate = rate_builders[c_s_workload_type]()  # define -rate for c-s command depend on workload type
        pop = pop_builders[load_type](max_rows_for_read)  # define -pop for c-s command depend on load type

        params = {
            'n': self.num_of_partitions,
            'user': role.name,
            'password': role.password,
            'pop': pop,
            'duration': '%dm' % stress_duration_min,
            'threads': rate
        }
        # Don't fail with KeyError when other keywords are given without the 'kwargs' wrapper
        params.update(kwargs.get('kwargs') or {})

        c_s_cmd = stress_command.format(**params)
        self.log.info("Created cassandra-stress command: %s", c_s_cmd)
        return c_s_cmd

    def test_read_throughput_1to5_ratio(self):
        """
        Basic test
        - Add SLA and grant to user (before any load)
        - user190 with 190 shares
        - user950 with 950 shares
        - Each user runs load from own loader (round robin)
        - Expect OPS ratio between two loads is 1:5 (e.g. 190:950)
        - Expect scheduler run time between two loads is 1:5 (e.g. 190:950)

        Load from both cache and disk
        """
        self._two_users_load_througput_workload(shares=[190, 950], load=self.MIXED_LOAD)

    def _two_users_load_througput_workload(self, shares, load):
        """
        Run a throughput load for two users and validate the ops and the scheduler runtime ratios.

        :param shares: shares of the two service levels
        :param load: load type, see define_read_cassandra_stress_command()
        """
        session = self.prepare_schema()
        self.create_test_data()

        # Define Service Levels/Roles/Users
        read_users = self._define_read_users(session=session, shares=shares)
        expected_shares_ratio = self.calculate_metrics_ratio_per_user(two_users_list=read_users)

        # Create Service Levels/Roles/Users
        self.create_auths(entities_list_of_dict=read_users)

        stress_duration = 10  # minutes
        read_cmds = [
            self.define_read_cassandra_stress_command(role=read_users[0]["role"],
                                                      load_type=load,
                                                      c_s_workload_type=self.WORKLOAD_THROUGHPUT,
                                                      threads=250,
                                                      stress_duration_min=stress_duration),
            self.define_read_cassandra_stress_command(role=read_users[1]["role"],
                                                      load_type=load,
                                                      c_s_workload_type=self.WORKLOAD_THROUGHPUT,
                                                      threads=250,
                                                      stress_duration_min=stress_duration)
        ]

        try:
            start_time = time.time()
            read_queue = self.run_stress_and_verify_threads(params={'stress_cmd': read_cmds,
                                                                    'round_robin': True})
            results = self.get_c_s_stats(read_queue=read_queue, users=read_users,
                                         statistic_name='op rate')
            self.validate_if_scylla_load_high_enough(start_time=start_time,
                                                     wait_cpu_utilization=self.MIN_CPU_UTILIZATION)
            end_time = time.time()

            self.validate_scheduler_runtime(start_time=start_time,
                                            end_time=end_time,
                                            read_users=read_users,
                                            expected_ratio=expected_shares_ratio)

            self.assertTrue(results, msg='Not received cassandra-stress results')

            self.log.debug('Validate cassandra-stress ops deviation')
            actual_shares_ratio = self.calculate_metrics_ratio_per_user(two_users_list=read_users,
                                                                        metrics=results)
            self.validate_deviation(expected_ratio=expected_shares_ratio,
                                    actual_ratio=actual_shares_ratio,
                                    msg='Validate cassandra-stress ops.')
        finally:
            self.clean_auth(entities_list_of_dict=read_users)

    def test_read_throughput_vs_latency_cache_and_disk(self):  # pylint: disable=invalid-name
        """
        Test when one user run load with high latency and another - with high throughput
        The load is run on the full data set (that is read from both the cache and the disk)
        Throughput - latency test:
        - Add SLA and grant to user (before any load)
        - user190 with 190 shares
        - user950 with 950 shares
        - Each user runs load from own loader (round robin):
           - user950 runs load with throttle
           - user190 runs load with high throughput

        Expected results: latency 99th of user950 workload when it runs in parallel with workload of user190 is not
                          significantly increased relatively to latency of user950 workload that runs alone
        """
        stress_duration = 10  # minutes
        shares = [190, 950]

        session = self.prepare_schema()
        self.create_test_data()

        # Define Service Levels/Roles/Users
        read_users = self._define_read_users(session=session, shares=shares)

        # Create Service Levels/Roles/Users
        self.create_auths(entities_list_of_dict=read_users)

        # Define stress commands
        read_cmds = {
            'troughput': self.define_read_cassandra_stress_command(
                role=read_users[0]["role"],
                load_type=self.MIXED_LOAD,
                c_s_workload_type=self.WORKLOAD_THROUGHPUT,
                threads=200,
                stress_duration_min=stress_duration),
            'latency': self.define_read_cassandra_stress_command(
                role=read_users[1]["role"],
                load_type=self.MIXED_LOAD,
                c_s_workload_type=self.WORKLOAD_LATENCY,
                threads=250,
                stress_duration_min=stress_duration)
        }

        self._throughput_latency_tests_run(read_users=read_users, read_cmds=read_cmds,
                                           latency_user=read_users[1])

    def test_read_throughput_vs_latency_cache_only(self):  # pylint: disable=invalid-name
        """
        Test when one user run load with high latency and another - with high throughput
        The load is run on the data set that fully exists in the cache
        Throughput - latency test:
        - Add SLA and grant to user (before any load)
        - user190 with 190 shares
        - user950 with 950 shares
        - Each user runs load from own loader (round robin):
           - user950 runs load with throttle
           - user190 runs load with high throughput

        Expected results: latency 99th of user950 workload when it runs in parallel with workload of user190 is not
                          significantly increased relatively to latency of user950 workload that runs alone
        """
        stress_duration = 5  # minutes
        shares = [190, 950]
        # Select part of the record to warm the cache (all this data will be in the cache).
        # This amount of data will be read during the test from cache
        max_key_for_read = int(self.num_of_partitions * 0.5)

        session = self.prepare_schema()
        self.create_test_data()

        # Warm up the cache so that the test reads (the same key range) are served from the cache
        self.warm_up_cache_before_test(max_key_for_read=max_key_for_read, stress_duration=30)

        # Define Service Levels/Roles/Users
        read_users = self._define_read_users(session=session, shares=shares)

        # Create Service Levels/Roles/Users
        self.create_auths(entities_list_of_dict=read_users)

        read_cmds = {
            'troughput': self.define_read_cassandra_stress_command(
                role=read_users[0]["role"],
                load_type=self.CACHE_ONLY_LOAD,
                c_s_workload_type=self.WORKLOAD_THROUGHPUT,
                threads=200,
                stress_duration_min=stress_duration,
                max_rows_for_read=max_key_for_read),
            'latency': self.define_read_cassandra_stress_command(
                role=read_users[1]["role"],
                load_type=self.CACHE_ONLY_LOAD,
                c_s_workload_type=self.WORKLOAD_LATENCY,
                threads=250,
                stress_duration_min=stress_duration,
                max_rows_for_read=max_key_for_read)
        }

        self._throughput_latency_tests_run(read_users=read_users, read_cmds=read_cmds,
                                           latency_user=read_users[1])

    def test_read_throughput_vs_latency_disk_only(self):  # pylint: disable=invalid-name
        """
        Test when one user run load with high latency and another - with high throughput
        The load is run on key ranges above the warmed-up part of the data set (reads are
        intended to be served from the disk)
        Throughput - latency test:
        - Add SLA and grant to user (before any load)
        - user190 with 190 shares
        - user950 with 950 shares
        - Each user runs load from own loader (round robin):
           - user950 runs load with throttle
           - user190 runs load with high throughput

        Expected results: latency 99th of user950 workload when it runs in parallel with workload of user190 is not
                          significantly increased relatively to latency of user950 workload that runs alone
        """
        stress_duration = 5  # minutes

        session = self.prepare_schema()
        self.create_test_data()

        for node in self.db_cluster.nodes:
            node.stop_scylla_server(verify_up=False, verify_down=True)
            node.start_scylla_server(verify_up=True, verify_down=False)

        # Select part of the record to warm the cache (all this data will be in the cache).
        # cassandra-stress "-pop" parameter will start from more then "max_key_for_cache" row number
        # (for read from the disk)
        max_key_for_cache = int(self.num_of_partitions * 0.25)
        # Warm up the cache with the first keys only; the test commands read the keys above them
        self.warm_up_cache_before_test(max_key_for_read=max_key_for_cache, stress_duration=30)

        # Define Service Levels/Roles/Users
        shares = [190, 950]
        read_users = self._define_read_users(session=session, shares=shares)

        # Create Service Levels/Roles/Users
        self.create_auths(entities_list_of_dict=read_users)

        read_cmds = {
            'troughput': self.define_read_cassandra_stress_command(
                role=read_users[0]["role"],
                load_type=self.DISK_ONLY_LOAD,
                c_s_workload_type=self.WORKLOAD_THROUGHPUT,
                threads=200,
                stress_duration_min=stress_duration,
                max_rows_for_read=max_key_for_cache * 2),
            'latency': self.define_read_cassandra_stress_command(
                role=read_users[1]["role"],
                load_type=self.DISK_ONLY_LOAD,
                c_s_workload_type=self.WORKLOAD_LATENCY,
                threads=250,
                stress_duration_min=stress_duration,
                max_rows_for_read=max_key_for_cache * 3),
            'latency_only': self.define_read_cassandra_stress_command(
                role=read_users[1]["role"],
                load_type=self.DISK_ONLY_LOAD,
                c_s_workload_type=self.WORKLOAD_LATENCY,
                threads=250,
                stress_duration_min=stress_duration,
                max_rows_for_read=max_key_for_cache)
        }

        self._throughput_latency_tests_run(read_users=read_users, read_cmds=read_cmds,
                                           latency_user=read_users[1])

    def test_read_50perc_write_50perc_load(self):
        """
        Test scenario:
        - Add SLA and grant to user (before any load)
        - user190 with 190 shares
        - user950 with 950 shares
        - Each user runs load from own loader (round robin)
        - Expect OPS ratio between two loads is 1:5 (e.g. 190:950)
        - Expect scheduler run time between two loads is 1:5 (e.g. 190:950)
        """
        session = self.prepare_schema()
        self.create_test_data()

        stress_duration_min = 10

        # Define Service Levels/Roles/Users
        shares = [190, 950]
        read_users = self._define_read_users(session=session, shares=shares)

        # Create Service Levels/Roles/Users
        self.create_auths(entities_list_of_dict=read_users)

        read_cmds = {
            'troughput': self.define_read_cassandra_stress_command(
                role=read_users[0]["role"],
                load_type=self.MIXED_LOAD,
                c_s_workload_type=self.WORKLOAD_THROUGHPUT,
                threads=120,
                stress_duration_min=stress_duration_min,
                stress_command=self.STRESS_MIXED_CMD,
                kwargs={'write_ratio': 1, 'read_ratio': 1}),
            'latency': self.define_read_cassandra_stress_command(
                role=read_users[1]["role"],
                load_type=self.MIXED_LOAD,
                c_s_workload_type=self.WORKLOAD_LATENCY,
                threads=120,
                stress_duration_min=stress_duration_min,
                stress_command=self.STRESS_MIXED_CMD,
                kwargs={'write_ratio': 1, 'read_ratio': 1})
        }

        self._throughput_latency_tests_run(read_users=read_users, read_cmds=read_cmds,
                                           latency_user=read_users[1])

    def test_workload_types(self):
        """
        Test scenario: run 2 workload types (batch, interactive) using
        Roles with relevant ServiceLevel objects attached to them.
        Validate that the metrics differ and that the difference is
        within the expected margins.
        """
        session = self.prepare_schema()
        self.create_test_data(rows_amount=100_000)
        stress_duration_min = 180

        # Define Service Levels/Roles/Users
        interactive_role = Role(session=session, name="interactive", password="******",
                                login=True, verbose=True).create()
        batch_role = Role(session=session, name="batch", password="******",
                          login=True, verbose=True).create()
        interactive_sla = ServiceLevel(session=session, name="interactive", shares=None,
                                       workload_type="interactive").create()
        batch_sla = ServiceLevel(session=session, name="batch", shares=None,
                                 workload_type="batch").create()
        interactive_role.attach_service_level(interactive_sla)
        batch_role.attach_service_level(batch_sla)

        read_cmds = {
            'throughput_interactive': self.define_read_cassandra_stress_command(
                role=interactive_role,
                load_type=self.MIXED_LOAD,
                c_s_workload_type=self.WORKLOAD_THROUGHPUT,
                threads=120,
                stress_duration_min=stress_duration_min,
                stress_command=self.STRESS_MIXED_CMD,
                kwargs={'write_ratio': 1, 'read_ratio': 1}),
            'throughput_batch': self.define_read_cassandra_stress_command(
                role=batch_role,
                load_type=self.MIXED_LOAD,
                c_s_workload_type=self.WORKLOAD_THROUGHPUT,
                threads=120,
                stress_duration_min=stress_duration_min,
                stress_command=self.STRESS_MIXED_CMD,
                kwargs={'write_ratio': 1, 'read_ratio': 1}),
        }

        # The former "try: ... finally: pass" wrapper did nothing and was removed
        self.log.debug('Running interactive and batch workloads in sequence...')
        workloads_queue = self.run_stress_and_verify_threads(
            params={'stress_cmd': [read_cmds['throughput_interactive'],
                                   read_cmds["throughput_batch"]],
                    'round_robin': True})
        self._comparison_results = self._compare_workloads_c_s_metrics(workloads_queue)
        self.log.info("C-S comparison results:\n%s", self._comparison_results)
        self.upload_c_s_comparison_to_es()

    def _throughput_latency_tests_run(self, read_cmds, read_users, latency_user):
        # pylint: disable=too-many-locals
        """
        Run the latency load alone, then together with the throughput load, and log both
        99th percentile latencies of the latency user. The auth entities are always dropped at the end.

        :param read_cmds: {'troughput': cmd, 'latency': cmd[, 'latency_only': cmd]};
            'latency_only' (when present) is used for the first, latency-only run
        :param read_users: entity dicts, in the order [throughput user, latency user]
        :param latency_user: entity dict of the user that runs the latency load
        """
        try:
            # Run latency workload
            test_start_time = time.time()
            self.log.debug('Start latency only workload')
            read_queue = self.run_stress_and_verify_threads(
                params={'stress_cmd': [read_cmds.get('latency_only') or read_cmds['latency']],
                        'round_robin': True})

            latency_99_for_latency_workload = self.get_c_s_stats(read_queue=read_queue,
                                                                 users=[latency_user],
                                                                 statistic_name='latency 99th percentile')
            self.assertTrue(latency_99_for_latency_workload,
                            msg='Not received cassandra-stress results for latency '
                                'workload')

            # Run throughput and latency workloads
            self.log.debug('Start latency workload in parallel with throughput workload')
            read_queue = self.run_stress_and_verify_threads(
                params={'stress_cmd': [read_cmds['troughput'], read_cmds['latency']],
                        'round_robin': True})
            latency_99_for_mixed_workload = self.get_c_s_stats(read_queue=read_queue,
                                                               users=read_users,
                                                               statistic_name='latency 99th percentile')
            self.assertTrue(latency_99_for_mixed_workload,
                            msg='Not received cassandra-stress for latency workload')

            grafana_dataset = self.monitors.get_grafana_screenshot_and_snapshot(
                test_start_time=test_start_time)
            grafana_screenshots = grafana_dataset.get('screenshots', [])
            grafana_snapshots = grafana_dataset.get('snapshots', [])

            self.log.debug('GRAFANA SCREENSHOTS: {}'.format(grafana_screenshots))
            self.log.debug('GRAFANA SNAPSHOTS: {}'.format(grafana_snapshots))

            # Compare latency of two runs
            self.log.debug('Test results:\n---------------------\n')
            latency_user_name = latency_user['user'].name
            latency_99_latency_workload = latency_99_for_latency_workload[latency_user_name]
            latency_99_mixed_workload = latency_99_for_mixed_workload[latency_user_name]
            deviation = self.calculate_deviation(latency_99_latency_workload,
                                                 latency_99_mixed_workload)
            latency_change = 'increased' if latency_99_mixed_workload > latency_99_latency_workload else 'decreased'

            result_print_str = '\nTest results:\n---------------------\n'
            result_print_str += '\nWorkload | Latency 99%'
            result_print_str += '\n========================= | ================='
            result_print_str += '\nLatency only | {}'.format(latency_99_latency_workload)
            result_print_str += '\nLatency and throughput | {}'.format(latency_99_mixed_workload)
            result_print_str += '\n------------------------- | -----------------'
            result_print_str += '\nLatency 99 is {} in {}%'.format(latency_change, deviation)
            self.log.info(result_print_str)
        finally:
            self.clean_auth(entities_list_of_dict=read_users)

    def _compare_workloads_c_s_metrics(self, workloads_queue: list) -> dict:
        """
        Compare the cassandra-stress results of the "interactive" and the "batch" users.

        For the latency percentiles the ratio is batch / interactive, for the op rate it is
        interactive / batch; an axis is "within_margin" when its ratio reaches the target value.

        :param workloads_queue: stress queues of the two workloads
        :return: {axis: {"interactive", "batch", "diff", "ratio", "within_margin"}}
        """
        comparison_axis = {"latency 95th percentile": 2.0,
                           "latency 99th percentile": 2.0,
                           "op rate": 2.0}
        workloads_results = {}
        for workload in workloads_queue:
            result = self.get_stress_results(queue=workload, store_results=False)
            workloads_results.update({result[0].get("username"): result[0]})

        assert len(workloads_results) == 2, \
            "Expected workload_results length to be 2, got: %s. workload_results: %s" % (
                len(workloads_results), workloads_results)

        comparison_results = {}
        try:
            for item, target_margin in comparison_axis.items():
                interactive = float(workloads_results["interactive"][item])
                batch = float(workloads_results["batch"][item])
                ratio = interactive / batch if item == "op rate" else batch / interactive

                comparison_results.update({
                    item: {
                        "interactive": interactive,
                        "batch": batch,
                        "diff": batch - interactive,
                        "ratio": ratio,
                        "within_margin": ratio >= target_margin
                    }
                })
            return comparison_results
        except Exception:
            # The space between the two literals was missing ("interactiveworkloads")
            self.log.info("Failed to compare c-s results for batch and interactive "
                          "workloads.")
            raise

    def upload_c_s_comparison_to_es(self) -> None:
        """Upload the stored workload comparison results to Elasticsearch."""
        self.log.info("Uploading c-s comparison to ES...")
        es_body = {
            self.db_cluster.get_node().db_node_instance_type: {
                "test_id": self.test_id,
                "backend": self.db_cluster.params.get("cluster_backend"),
                "scylla_version": self.get_scylla_versions(),
                **self._comparison_results
            }
        }
        # NOTE(review): the index name differs from WORKLOAD_TYPES_INDEX - confirm which one is intended
        self._es.create_doc(index="workload_types",
                            doc_type="test_stats",
                            doc_id=self.test_id,
                            body=es_body)
        self.log.info("C-s comparison uploaded to ES.")

    def get_email_data(self):
        """
        Collect the data for the email report.

        Failures of the common data or of the Grafana collection are logged and don't stop
        the report from being built.

        :return: dict with the email data
        """
        self.log.info("Prepare data for email")
        email_data = {}
        grafana_dataset = {}

        try:
            email_data = self._get_common_email_data()
        except Exception as error:  # pylint: disable=broad-except
            self.log.error("Error in gathering common email data: Error:\n%s", error)

        try:
            grafana_dataset = self.monitors.get_grafana_screenshot_and_snapshot(
                self.start_time) if self.monitors else {}
        except Exception as error:  # pylint: disable=broad-except
            self.log.error("Error in gathering Grafana screenshots and snapshots. Error:\n%s", error)

        email_data.update({
            "grafana_screenshots": grafana_dataset.get("screenshots", []),
            "grafana_snapshots": grafana_dataset.get("snapshots", []),
            "scylla_ami_id": self.params.get("ami_id_db_scylla") or "-",
            "region": self.params.get("region_name") or "-",
            "workload_comparison": self._comparison_results if self._comparison_results else {}
        })

        return email_data

    # pylint: disable=inconsistent-return-statements
    def get_test_status(self) -> str:
        """
        Return "SUCCESS"/"FAILED" according to the workload comparison results, when there are any.
        Without comparison results (or when they are malformed) the parent's status is returned.
        """
        if self._comparison_results:
            try:
                if all(item["within_margin"] for item in self._comparison_results.values()):
                    return "SUCCESS"
                return "FAILED"
            except KeyError as exc:
                self.log.error("Exception on attempting to check workload comparison results:\n%s", exc)
        return super().get_test_status()