def _run_stress(self, node, loader_idx, cpu_idx, keyspace_idx):  # pylint: disable=too-many-locals
    """Run a single cassandra-stress process on `node` and publish its lifecycle events.

    Returns a ``(node, result, cs_stress_event)`` tuple; ``result`` stays ``None``
    when the remote command raised, with the failure recorded on ``cs_stress_event``.
    """
    stress_cmd = self.create_stress_cmd(node, loader_idx, keyspace_idx)
    if self.profile:
        # Log the profile content locally, then push the file to the loader
        # so the cassandra-stress command can reference it under /tmp.
        with open(self.profile, encoding="utf-8") as profile_file:
            LOGGER.info('Profile content:\n%s', profile_file.read())
        node.remoter.send_files(self.profile, os.path.join('/tmp', os.path.basename(self.profile)), delete_dst=True)
    # Get next word after `cassandra-stress' in stress_cmd.
    # Do it this way because stress_cmd can contain env variables before `cassandra-stress'.
    stress_cmd_opt = stress_cmd.split("cassandra-stress", 1)[1].split(None, 1)[0]
    LOGGER.info('Stress command:\n%s', stress_cmd)
    os.makedirs(node.logdir, exist_ok=True)
    log_file_name = \
        os.path.join(node.logdir, f'cassandra-stress-l{loader_idx}-c{cpu_idx}-k{keyspace_idx}-{uuid.uuid4()}.log')
    LOGGER.debug('cassandra-stress local log: %s', log_file_name)
    # This tag will be output in the header of c-stress result,
    # we parse it to know the loader & cpu info in _parse_cs_summary().
    tag = f'TAG: loader_idx:{loader_idx}-cpu_idx:{cpu_idx}-keyspace_idx:{keyspace_idx}'
    if self.stress_num > 1:
        # Pin the process to one CPU when several stress processes share the loader.
        node_cmd = f'STRESS_TEST_MARKER={self.shell_marker}; taskset -c {cpu_idx} {stress_cmd}'
    else:
        node_cmd = f'STRESS_TEST_MARKER={self.shell_marker}; {stress_cmd}'
    node_cmd = f'echo {tag}; {node_cmd}'
    result = None
    # disable logging for cassandra stress
    node.remoter.run("cp /etc/scylla/cassandra/logback-tools.xml .", ignore_status=True)
    with CassandraStressExporter(instance_name=node.cql_ip_address,
                                 metrics=nemesis_metrics_obj(),
                                 stress_operation=stress_cmd_opt,
                                 stress_log_filename=log_file_name,
                                 loader_idx=loader_idx, cpu_idx=cpu_idx), \
            CassandraStressEventsPublisher(node=node, cs_log_filename=log_file_name) as publisher, \
            CassandraStressEvent(node=node, stress_cmd=self.stress_cmd,
                                 log_file_name=log_file_name) as cs_stress_event:
        # Tie log-derived events to this particular stress run.
        publisher.event_id = cs_stress_event.event_id
        try:
            result = node.remoter.run(cmd=node_cmd, timeout=self.timeout, log_file=log_file_name)
        except Exception as exc:  # pylint: disable=broad-except
            # Escalate only when the run is configured to stop the test on failure.
            cs_stress_event.severity = Severity.CRITICAL if self.stop_test_on_failure else Severity.ERROR
            cs_stress_event.add_error(errors=[format_stress_cmd_error(exc)])
    return node, result, cs_stress_event
def __init__(self, loader_set, stress_cmd, timeout, stress_num=1, node_list=None, round_robin=False,  # pylint: disable=too-many-arguments
             params=None):
    """Store the stress-run configuration and ensure the per-operation gauges exist."""
    self.loader_set = loader_set
    self.stress_cmd = stress_cmd
    self.timeout = timeout
    self.stress_num = stress_num
    self.node_list = node_list or []
    self.round_robin = round_robin
    self.params = params or {}
    self.executor = None
    self.results_futures = []
    self.max_workers = 0
    # Create each missing gauge exactly once; METRICS is shared class state.
    for operation in self.collectible_ops:
        gauge_name = self.gauge_name(operation)
        if gauge_name in self.METRICS:
            continue
        self.METRICS[gauge_name] = nemesis_metrics_obj().create_gauge(
            gauge_name, 'Gauge for ycsb metrics', ['instance', 'loader_idx', 'type'])
def _run_stress_bench(self, node, loader_idx, stress_cmd, node_list):
    """Run one scylla-bench process on `node` against the first node of `node_list`.

    For the timeseries workload the write side records its start timestamp on the
    parent cluster, and the read side waits for that timestamp and substitutes it
    into its own command line. Returns a ``(node, result)`` tuple; ``result`` is
    ``None`` when the remote command raised, with the failure recorded on the event.
    """
    if self.sb_mode == ScyllaBenchModes.WRITE and self.sb_workload == ScyllaBenchWorkloads.TIMESERIES:
        # Publish the write start time so a concurrent read-timeseries run can pick it up.
        node.parent_cluster.sb_write_timeseries_ts = write_timestamp = time.time_ns()
        LOGGER.debug("Set start-time: %s", write_timestamp)
        stress_cmd = re.sub(r"SET_WRITE_TIMESTAMP", f"{write_timestamp}", stress_cmd)
        LOGGER.debug("Replaced stress command: %s", stress_cmd)
    elif self.sb_mode == ScyllaBenchModes.READ and self.sb_workload == ScyllaBenchWorkloads.TIMESERIES:
        # Block (up to 30s) until the write-timeseries run has published its timestamp.
        write_timestamp = wait_for(lambda: node.parent_cluster.sb_write_timeseries_ts,
                                   step=5,
                                   timeout=30,
                                   text='Waiting for "scylla-bench -workload=timeseries -mode=write" been started, to pick up timestamp')
        LOGGER.debug("Found write timestamp %s", write_timestamp)
        stress_cmd = re.sub(r"GET_WRITE_TIMESTAMP", f"{write_timestamp}", stress_cmd)
        LOGGER.debug("replaced stress command %s", stress_cmd)
    else:
        LOGGER.debug("Scylla bench command: %s", stress_cmd)
    os.makedirs(node.logdir, exist_ok=True)
    log_file_name = os.path.join(node.logdir, f'scylla-bench-l{loader_idx}-{uuid.uuid4()}.log')
    # Select first seed node to send the scylla-bench cmds
    ips = node_list[0].cql_ip_address
    with ScyllaBenchStressExporter(instance_name=node.cql_ip_address,
                                   metrics=nemesis_metrics_obj(),
                                   stress_operation=self.sb_mode,
                                   stress_log_filename=log_file_name,
                                   loader_idx=loader_idx), \
            ScyllaBenchStressEventsPublisher(node=node, sb_log_filename=log_file_name) as publisher, \
            ScyllaBenchEvent(node=node, stress_cmd=self.stress_cmd,
                             log_file_name=log_file_name) as scylla_bench_event:
        # Tie log-derived events to this particular stress run.
        publisher.event_id = scylla_bench_event.event_id
        result = None
        try:
            result = node.remoter.run(cmd="/$HOME/go/bin/{name} -nodes {ips}".format(name=stress_cmd.strip(), ips=ips),
                                      timeout=self.timeout,
                                      log_file=log_file_name)
        except Exception as exc:  # pylint: disable=broad-except
            errors_str = format_stress_cmd_error(exc)
            # Truncate RPC timeouts are downgraded to ERROR; otherwise escalate to
            # CRITICAL only when configured to stop the test on failure.
            if "truncate: seastar::rpc::timeout_error" in errors_str:
                scylla_bench_event.severity = Severity.ERROR
            elif self.stop_test_on_failure:
                scylla_bench_event.severity = Severity.CRITICAL
            else:
                scylla_bench_event.severity = Severity.ERROR
            scylla_bench_event.add_error([errors_str])
    return node, result
def _run_stress(self, node, loader_idx, cpu_idx, keyspace_idx):
    """Run one cassandra-stress process on `node`, emitting start/finish events.

    Returns a ``(node, result)`` tuple; the remote command is run with
    ``ignore_status=True``, so failures land in ``result`` instead of raising.
    """
    stress_cmd = self.create_stress_cmd(node, loader_idx, keyspace_idx)
    if self.profile:
        # Explicit encoding instead of the locale-dependent default.
        with open(self.profile, encoding="utf-8") as profile_file:
            LOGGER.info('Profile content:\n%s', profile_file.read())
        node.remoter.send_files(self.profile, os.path.join('/tmp', os.path.basename(self.profile)), delete_dst=True)
    # Second whitespace-separated token of the command is the c-s operation.
    stress_cmd_opt = stress_cmd.split()[1]
    LOGGER.info('Stress command:\n%s', stress_cmd)
    log_dir = os.path.join(self.output_dir, self.loader_set.name)
    # exist_ok=True removes the check-then-create race between concurrent stress threads
    # (and the dependency on a bare `makedirs` import).
    os.makedirs(log_dir, exist_ok=True)
    log_file_name = os.path.join(
        log_dir, f'cassandra-stress-l{loader_idx}-c{cpu_idx}-k{keyspace_idx}-{uuid.uuid4()}.log')
    LOGGER.debug('cassandra-stress local log: %s', log_file_name)
    # This tag will be output in the header of c-stress result,
    # we parse it to know the loader & cpu info in _parse_cs_summary().
    tag = f'TAG: loader_idx:{loader_idx}-cpu_idx:{cpu_idx}-keyspace_idx:{keyspace_idx}'
    if self.stress_num > 1:
        # Pin the process to one CPU when several stress processes share the loader.
        node_cmd = f'STRESS_TEST_MARKER={self.shell_marker}; taskset -c {cpu_idx} {stress_cmd}'
    else:
        node_cmd = f'STRESS_TEST_MARKER={self.shell_marker}; {stress_cmd}'
    node_cmd = f'echo {tag}; {node_cmd}'
    CassandraStressEvent(type='start', node=str(node), stress_cmd=stress_cmd)
    with CassandraStressExporter(instance_name=node.ip_address,
                                 metrics=nemesis_metrics_obj(),
                                 cs_operation=stress_cmd_opt,
                                 cs_log_filename=log_file_name,
                                 loader_idx=loader_idx, cpu_idx=cpu_idx), \
            CassandraStressEventsPublisher(node=node, cs_log_filename=log_file_name):
        result = node.remoter.run(cmd=node_cmd, timeout=self.timeout,
                                  ignore_status=True, log_file=log_file_name)
    CassandraStressEvent(type='finish', node=str(node), stress_cmd=stress_cmd, log_file_name=log_file_name)
    return node, result
def _run_stress(self, node, loader_idx, cpu_idx, keyspace_idx):  # pylint: disable=too-many-locals
    """Run a single cassandra-stress process on `node`, publishing start/finish
    (and error/failure) events around the remote run.

    Returns a ``(node, result)`` tuple; ``result`` stays ``None`` if the remote
    command raised, with the failure reported through an event instead.
    """
    stress_cmd = self.create_stress_cmd(node, loader_idx, keyspace_idx)
    if self.profile:
        # Explicit encoding instead of the locale-dependent default.
        with open(self.profile, encoding="utf-8") as profile_file:
            LOGGER.info('Profile content:\n%s', profile_file.read())
        node.remoter.send_files(self.profile, os.path.join('/tmp', os.path.basename(self.profile)), delete_dst=True)
    # Second whitespace-separated token of the command is the c-s operation.
    stress_cmd_opt = stress_cmd.split()[1]
    LOGGER.info('Stress command:\n%s', stress_cmd)
    os.makedirs(node.logdir, exist_ok=True)
    log_file_name = \
        os.path.join(node.logdir, f'cassandra-stress-l{loader_idx}-c{cpu_idx}-k{keyspace_idx}-{uuid.uuid4()}.log')
    LOGGER.debug('cassandra-stress local log: %s', log_file_name)
    # This tag will be output in the header of c-stress result,
    # we parse it to know the loader & cpu info in _parse_cs_summary().
    tag = f'TAG: loader_idx:{loader_idx}-cpu_idx:{cpu_idx}-keyspace_idx:{keyspace_idx}'
    if self.stress_num > 1:
        # Pin the process to one CPU when several stress processes share the loader.
        node_cmd = f'STRESS_TEST_MARKER={self.shell_marker}; taskset -c {cpu_idx} {stress_cmd}'
    else:
        node_cmd = f'STRESS_TEST_MARKER={self.shell_marker}; {stress_cmd}'
    node_cmd = f'echo {tag}; {node_cmd}'
    result = None
    CassandraStressEvent.start(node=node, stress_cmd=stress_cmd).publish()
    with CassandraStressExporter(instance_name=node.ip_address,
                                 metrics=nemesis_metrics_obj(),
                                 stress_operation=stress_cmd_opt,
                                 stress_log_filename=log_file_name,
                                 loader_idx=loader_idx, cpu_idx=cpu_idx), \
            CassandraStressEventsPublisher(node=node, cs_log_filename=log_file_name):
        try:
            result = node.remoter.run(cmd=node_cmd, timeout=self.timeout, log_file=log_file_name)
        except Exception as exc:  # pylint: disable=broad-except
            # Escalate to a failure event only when configured to stop the test on failure.
            event_type = CassandraStressEvent.failure if self.stop_test_on_failure else CassandraStressEvent.error
            event_type(node=node, stress_cmd=stress_cmd, log_file_name=log_file_name,
                       errors=[format_stress_cmd_error(exc), ]).publish()
    CassandraStressEvent.finish(node=node, stress_cmd=stress_cmd, log_file_name=log_file_name).publish()
    return node, result
def _run_stress(self, node, loader_idx, cpu_idx, keyspace_idx):
    """Run one cassandra-stress process on `node` and return ``(node, result)``.

    The remote command is run with ``ignore_status=True``, so failures show up
    in ``result`` rather than raising.
    """
    stress_cmd = self.create_stress_cmd(node, loader_idx, keyspace_idx)
    if self.profile:
        # Explicit encoding; pass message arguments to the logger lazily
        # instead of pre-formatting with `%`.
        with open(self.profile, encoding="utf-8") as fp:
            LOGGER.info('Profile content:\n%s', fp.read())
        node.remoter.send_files(self.profile, os.path.join('/tmp', os.path.basename(self.profile)))
    # Second whitespace-separated token of the command is the c-s operation.
    stress_cmd_opt = stress_cmd.split()[1]
    LOGGER.info('Stress command:\n%s', stress_cmd)
    log_dir = os.path.join(self.output_dir, self.loader_set.name)
    # exist_ok=True removes the check-then-create race between concurrent stress threads.
    os.makedirs(log_dir, exist_ok=True)
    log_file_name = os.path.join(log_dir, 'cassandra-stress-l%s-c%s-k%s-%s.log' %
                                 (loader_idx, cpu_idx, keyspace_idx, uuid.uuid4()))
    LOGGER.debug('cassandra-stress local log: %s', log_file_name)
    # This tag will be output in the header of c-stress result,
    # we parse it to know the loader & cpu info in _parse_cs_summary().
    tag = 'TAG: loader_idx:%s-cpu_idx:%s-keyspace_idx:%s' % (loader_idx, cpu_idx, keyspace_idx)
    if self.stress_num > 1:
        # Pin the process to one CPU when several stress processes share the loader.
        node_cmd = 'taskset -c %s bash -c "%s"' % (cpu_idx, stress_cmd)
    else:
        node_cmd = stress_cmd
    node_cmd = 'echo %s; %s' % (tag, node_cmd)
    CassandraStressEvent(type='start', node=str(node), stress_cmd=stress_cmd)
    with CassandraStressExporter(instance_name=node.ip_address,
                                 metrics=nemesis_metrics_obj(),
                                 cs_operation=stress_cmd_opt,
                                 cs_log_filename=log_file_name,
                                 loader_idx=loader_idx, cpu_idx=cpu_idx), \
            CassandraStressEventsPublisher(node=node, cs_log_filename=log_file_name):
        result = node.remoter.run(cmd=node_cmd, timeout=self.timeout,
                                  ignore_status=True, log_file=log_file_name)
    CassandraStressEvent(type='finish', node=str(node), stress_cmd=stress_cmd, log_file_name=log_file_name)
    return node, result
def run(self):
    """Mirror every event from the main events device into a Prometheus gauge."""
    gauge = nemesis_metrics_obj().create_gauge('sct_events_gauge',
                                               'Gauge for sct events',
                                               ['event_type', 'type', 'severity', 'node'])
    subscription = EVENTS_PROCESSES['MainDevice'].subscribe_events(stop_event=self.stop_event)
    for event_type, message_data in subscription:
        labeled = gauge.labels(event_type,
                               getattr(message_data, 'type', ''),
                               message_data.severity,
                               getattr(message_data, 'node', ''))
        labeled.set(message_data.timestamp)
def _run_stress_bench(self, node, loader_idx, stress_cmd, node_list):
    """Run one scylla-bench process on `node` against the first node of `node_list`,
    publishing start and finish (or timeout/failure/error) events.

    Returns a ``(node, result)`` tuple; ``result`` stays ``None`` when the remote
    command raised, with the failure reported through an event instead.
    """
    read_gap = 480  # reads starts after write, read can look before start read time to current time using several sstables
    stress_cmd = re.sub(r"SCT_TIME", f"{int(time.time()) - read_gap}", stress_cmd)
    # Lazy %-style args instead of an eagerly-formatted f-string for the logger call.
    LOGGER.debug("replaced stress command %s", stress_cmd)
    ScyllaBenchEvent.start(node=node, stress_cmd=stress_cmd).publish()
    os.makedirs(node.logdir, exist_ok=True)
    log_file_name = os.path.join(node.logdir, f'scylla-bench-l{loader_idx}-{uuid.uuid4()}.log')
    # Select first seed node to send the scylla-bench cmds
    ips = node_list[0].ip_address
    # Find stress mode:
    # "scylla-bench -workload=sequential -mode=write -replication-factor=3 -partition-count=100"
    # "scylla-bench -workload=uniform -mode=read -replication-factor=3 -partition-count=100"
    found = re.search(r"-mode=(.+?) ", stress_cmd)
    stress_cmd_opt = found.group(1)
    with ScyllaBenchStressExporter(instance_name=node.ip_address,
                                   metrics=nemesis_metrics_obj(),
                                   stress_operation=stress_cmd_opt,
                                   stress_log_filename=log_file_name,
                                   loader_idx=loader_idx), \
            ScyllaBenchStressEventsPublisher(node=node, sb_log_filename=log_file_name):
        result = None
        try:
            result = node.remoter.run(cmd="/$HOME/go/bin/{name} -nodes {ips}".format(name=stress_cmd.strip(), ips=ips),
                                      timeout=self.timeout,
                                      log_file=log_file_name)
        except Exception as exc:  # pylint: disable=broad-except
            errors_str = format_stress_cmd_error(exc)
            # Truncate RPC timeouts get a dedicated event; otherwise escalate to a
            # failure event only when configured to stop the test on failure.
            if "truncate: seastar::rpc::timeout_error" in errors_str:
                event_type = ScyllaBenchEvent.timeout
            elif self.stop_test_on_failure:
                event_type = ScyllaBenchEvent.failure
            else:
                event_type = ScyllaBenchEvent.error
            event_type(node=node, stress_cmd=stress_cmd, log_file_name=log_file_name,
                       errors=[errors_str, ],).publish()
        else:
            # `finish` is published only when the remote run did not raise.
            ScyllaBenchEvent.finish(node=node, stress_cmd=stress_cmd, log_file_name=log_file_name).publish()
    return node, result
def __init__(self, loader_node, loader_idx, ycsb_log_filename):
    """Remember the loader/log identifiers and make sure the YCSB gauges exist."""
    super().__init__()
    self.loader_node = loader_node
    self.loader_idx = loader_idx
    self.ycsb_log_filename = ycsb_log_filename
    self.uuid = generate_random_string(10)
    # Create each missing gauge exactly once; METRICS is shared class state.
    for operation in self.collectible_ops:
        gauge_name = self.gauge_name(operation)
        if gauge_name in self.METRICS:
            continue
        self.METRICS[gauge_name] = nemesis_metrics_obj().create_gauge(
            gauge_name, 'Gauge for ycsb metrics', ['instance', 'loader_idx', 'uuid', 'type'])
def __init__(self, loader_node, loader_idx, ndbench_log_filename):
    """Remember the loader/log identifiers and make sure the ndbench gauges exist."""
    super().__init__()
    self.loader_node = loader_node
    self.loader_idx = loader_idx
    self.ndbench_log_filename = ndbench_log_filename
    # Create each missing gauge exactly once; METRICS is shared class state.
    for operation in self.collectible_ops:
        gauge_name = self.gauge_name(operation)
        if gauge_name in self.METRICS:
            continue
        self.METRICS[gauge_name] = nemesis_metrics_obj().create_gauge(
            gauge_name, 'Gauge for ndbench metrics', ['instance', 'loader_idx', 'type'])
def run(self) -> None:
    """Mirror every inbound event into the SCT Prometheus gauge, one label set per event."""
    gauge = nemesis_metrics_obj().create_gauge("sct_events_gauge",
                                               "Gauge for SCT events",
                                               ["event_type", "type", "subtype", "severity", "node", ])
    for event_tuple in self.inbound_events():
        # A single bad event must not kill the dumper loop.
        with verbose_suppress("PrometheusDumper failed to process %s", event_tuple):
            event_class, event = event_tuple  # try to unpack event from EventsDevice
            labeled = gauge.labels(event_class,  # pylint: disable=no-member
                                   getattr(event, "type", ""),
                                   getattr(event, "subtype", ""),
                                   event.severity,
                                   getattr(event, "node", ""))
            labeled.set(event.timestamp)
def _run_stress_harry(self, node, loader_idx, stress_cmd, node_list):
    """Run one cassandra-harry process on `node` against the first node of `node_list`,
    publishing start and finish (or timeout/failure/error) events.

    Returns a ``(node, result)`` tuple; ``result`` stays ``None`` when the remote
    command raised, with the failure reported through an event instead.
    """
    CassandraHarryEvent.start(node=node, stress_cmd=stress_cmd).publish()
    os.makedirs(node.logdir, exist_ok=True)
    log_file_name = os.path.join(node.logdir, f'cassandra-harry-l{loader_idx}-{uuid.uuid4()}.log')
    # Select first seed node to send the scylla-harry cmds
    ip = node_list[0].private_ip_address
    with CassandraHarryStressExporter(instance_name=node.ip_address,
                                      metrics=nemesis_metrics_obj(),
                                      stress_operation='write',
                                      stress_log_filename=log_file_name,
                                      loader_idx=loader_idx), \
            CassandraHarryStressEventsPublisher(node=node, harry_log_filename=log_file_name):
        result = None
        try:
            result = node.remoter.run(cmd=f"{stress_cmd} -node {ip}", timeout=self.timeout, log_file=log_file_name)
        except Exception as exc:  # pylint: disable=broad-except
            errors_str = format_stress_cmd_error(exc)
            # Timeouts get a dedicated event; otherwise escalate to a failure event
            # only when configured to stop the test on failure.
            if "timeout" in errors_str:
                event_type = CassandraHarryEvent.timeout
            elif self.stop_test_on_failure:
                event_type = CassandraHarryEvent.failure
            else:
                event_type = CassandraHarryEvent.error
            event_type(node=node, stress_cmd=stress_cmd, log_file_name=log_file_name,
                       errors=[errors_str, ],).publish()
        else:
            # `finish` is published only when the remote run did not raise.
            CassandraHarryEvent.finish(node=node, stress_cmd=stress_cmd, log_file_name=log_file_name).publish()
    return node, result
def setUpClass(cls):
    """Start the metrics server once for the class and keep the shared metrics handle."""
    address = start_metrics_server()
    metrics = nemesis_metrics_obj()
    cls.prom_address = address
    cls.metrics = metrics
def __init__(self, *args, **kwargs):
    """Read starting/target cluster sizes from params and grab the metrics server."""
    super(GrowClusterTest, self).__init__(*args, **kwargs)
    params = self.params
    self._cluster_starting_size = params.get('n_db_nodes')
    self._cluster_target_size = params.get('cluster_target_size')
    self.metrics_srv = prometheus.nemesis_metrics_obj()
def __init__(self, *args, **kwargs):
    """Read starting/target cluster sizes (with defaults) and grab the metrics server."""
    super(GrowClusterTest, self).__init__(*args, **kwargs)
    params = self.params
    self._cluster_starting_size = params.get('n_db_nodes', default=3)
    self._cluster_target_size = params.get('cluster_target_size', default=5)
    self.metrics_srv = prometheus.nemesis_metrics_obj()