class CommandRunner:
    def __init__(self, hostname, user='******', password=''):
        self.hostname = hostname
        self.user = user
        self.password = password
        self.log = SDCMAdapter(LOGGER, extra={'prefix': str(self)})
        self.connection = self._create_connection()

    def __str__(self):
        return '{} [{}@{}]'.format(self.__class__.__name__, self.user, self.hostname)

    def run(self, cmd, timeout=None, ignore_status=False,  # pylint: disable=too-many-arguments
            connect_timeout=300, verbose=True, log_file=None, retry=0):
        raise NotImplementedError("Should be implemented in subclasses")

    def _create_connection(self):
        raise NotImplementedError("_create_connection should be implemented")

    def _print_command_results(self, result, verbose, ignore_status):
        """When verbose=True and ignore_status=True, nothing is printed if the command fails"""
        if verbose and not result.failed:
            if result.stderr:
                self.log.info('STDERR: {}'.format(result.stderr))
            self.log.info('Command "{}" finished with status {}'.format(result.command, result.exited))
            return
        if verbose and result.failed and not ignore_status:
            self.log.error('Error executing command: "{}"; Exit status: {}'.format(result.command, result.exited))
            if result.stdout:
                self.log.debug('STDOUT: {}'.format(result.stdout[-240:]))
            if result.stderr:
                self.log.debug('STDERR: {}'.format(result.stderr))
            return
class CommandRunner(metaclass=ABCMeta):
    _params = None

    def __init__(self, hostname='', user='******', password=''):
        self.hostname = hostname
        self.user = user
        self.password = password
        self.log = SDCMAdapter(LOGGER, extra={'prefix': str(self)})
        self.connection = self._create_connection()

    @abstractmethod
    def get_init_arguments(self) -> dict:
        """Return instance parameters required to rebuild instance"""

    @abstractmethod
    def is_up(self, timeout=None) -> bool:
        """Return True if the host is up and the connection is usable"""

    def __str__(self):
        return '{} [{}@{}]'.format(self.__class__.__name__, self.user, self.hostname)

    def _setup_watchers(self, verbose, log_file, additional_watchers):
        watchers = additional_watchers if additional_watchers else []
        if verbose:
            watchers.append(OutputWatcher(self.log))
        if log_file:
            watchers.append(LogWriteWatcher(log_file))
        return watchers

    @abstractmethod
    def run(self, cmd, timeout=None, ignore_status=False,  # pylint: disable=too-many-arguments
            verbose=True, new_session=False, log_file=None, retry=0, watchers=None):
        pass

    @abstractmethod
    def _create_connection(self):
        pass

    def _print_command_results(self, result, verbose, ignore_status):
        """When verbose=True and ignore_status=True, nothing is printed if the command fails"""
        if verbose and not result.failed:
            if result.stderr:
                self.log.info('STDERR: {}'.format(result.stderr))
            self.log.info('Command "{}" finished with status {}'.format(result.command, result.exited))
            return
        if verbose and result.failed and not ignore_status:
            self.log.error('Error executing command: "{}"; Exit status: {}'.format(result.command, result.exited))
            if result.stdout:
                self.log.debug('STDOUT: {}'.format(result.stdout[-240:]))
            if result.stderr:
                self.log.debug('STDERR: {}'.format(result.stderr))
            return
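# A minimal sketch of a concrete CommandRunner, assuming only the abstract
# contract above. "LocalProcessRunner" and its subprocess-based run() are
# illustrative stand-ins, not the project's real implementation; the
# _FakeResult dataclass merely mimics the attributes _print_command_results
# reads (command, exited, failed, stdout, stderr).
import subprocess
from dataclasses import dataclass


@dataclass
class _FakeResult:  # hypothetical result object for illustration
    command: str
    exited: int
    stdout: str
    stderr: str

    @property
    def failed(self) -> bool:
        return self.exited != 0


class LocalProcessRunner(CommandRunner):  # hypothetical subclass
    def get_init_arguments(self) -> dict:
        return {'hostname': self.hostname, 'user': self.user, 'password': self.password}

    def is_up(self, timeout=None) -> bool:
        return True  # a local shell is always reachable

    def _create_connection(self):
        return None  # no persistent connection is needed for local commands

    def run(self, cmd, timeout=None, ignore_status=False,  # pylint: disable=too-many-arguments
            verbose=True, new_session=False, log_file=None, retry=0, watchers=None):
        proc = subprocess.run(cmd, shell=True, capture_output=True, text=True,
                              timeout=timeout, check=False)
        result = _FakeResult(command=cmd, exited=proc.returncode,
                             stdout=proc.stdout, stderr=proc.stderr)
        self._print_command_results(result, verbose, ignore_status)
        return result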
def __init__(self, name, node_prefix=None, parent_cluster=None, base_logdir=None):  # pylint: disable=too-many-arguments,super-init-not-called
    self.name = name
    self.node_prefix = node_prefix
    self.remoter = LOCALRUNNER
    self.remoter.receive_files = types.MethodType(send_receive_files, self)
    self.remoter.send_files = types.MethodType(send_receive_files, self)
    self.parent_cluster = parent_cluster
    self.is_seed = False
    self._distro = None
    self.logdir = os.path.join(base_logdir, self.name)
    makedirs(self.logdir)
    self.log = SDCMAdapter(LOGGER, extra={'prefix': str(self)})
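# The two MethodType assignments above graft the free function
# send_receive_files onto the shared LOCALRUNNER object while binding it to
# *this node*, so the node (not the remoter) arrives as `self`. A standalone
# sketch of the same pattern; all names below are illustrative only:
import types


class _Remoter:  # hypothetical stand-in
    pass


def _send_receive_files(self, src, dst):  # hypothetical signature
    print(f'{self.name}: transferring {src} -> {dst}')


class _Node:  # hypothetical stand-in
    def __init__(self, name):
        self.name = name
        self.remoter = _Remoter()
        # Bound to the node, attached to the remoter:
        self.remoter.receive_files = types.MethodType(_send_receive_files, self)


_Node('node-1').remoter.receive_files('/tmp/a.log', './a.log')  # node-1: transferring /tmp/a.log -> ./a.log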
def __init__(self,  # pylint: disable=too-many-arguments
             name,
             parent_cluster,
             base_logdir=None,
             ssh_login_info=None,
             node_prefix=None,
             dc_idx=None):
    super(DockerMonitoringNode, self).__init__(name=name,
                                               parent_cluster=parent_cluster,
                                               base_logdir=base_logdir,
                                               ssh_login_info=ssh_login_info,
                                               node_prefix=node_prefix,
                                               dc_idx=dc_idx)
    self.log = SDCMAdapter(LOGGER, extra={'prefix': str(self)})
    self._grafana_address = None
class CoredumpThreadBase(Thread):  # pylint: disable=too-many-instance-attributes
    lookup_period = 30
    upload_retry_limit = 3
    max_coredump_thread_exceptions = 10

    def __init__(self, node: 'BaseNode', max_core_upload_limit: int):
        self.node = node
        self.log = SDCMAdapter(node.log, extra={"prefix": self.__class__.__name__})
        self.max_core_upload_limit = max_core_upload_limit
        self.found: List[CoreDumpInfo] = []
        self.in_progress: List[CoreDumpInfo] = []
        self.completed: List[CoreDumpInfo] = []
        self.uploaded: List[CoreDumpInfo] = []
        self.termination_event = Event()
        self.exception = None
        super().__init__(daemon=True)

    def stop(self):
        self.termination_event.set()

    @raise_event_on_failure
    def run(self):
        """Keep reporting new coredumps found, every 30 seconds."""
        exceptions_count = 0
        while not self.termination_event.wait(self.lookup_period) or self.in_progress:
            try:
                self.main_cycle_body()
                exceptions_count = 0
            except Exception as exc:  # pylint: disable=broad-except
                self.log.error("Following error occurred: %s", exc)
                exceptions_count += 1
                if exceptions_count == self.max_coredump_thread_exceptions:
                    self.exception = exc
                    raise

    def main_cycle_body(self):
        if not self.node.remoter.is_up(timeout=60):
            return
        self._process_coredumps(self.in_progress, self.completed, self.uploaded)
        new_cores = self.extract_info_from_core_pids(self.get_list_of_cores(), exclude_cores=self.found)
        self.push_new_cores_to_process(new_cores)

    def push_new_cores_to_process(self, new_cores: List[CoreDumpInfo]):
        self.found.extend(new_cores)
        for core_dump in new_cores:
            if 'bash' in core_dump.executable:
                continue
            self.log_coredump(core_dump)
            if not self.is_limit_reached():
                self.in_progress.append(core_dump)

    def is_limit_reached(self):
        return len(self.uploaded) >= self.max_core_upload_limit

    def process_coredumps(self):
        self._process_coredumps(self.in_progress, self.completed, self.uploaded)

    def _process_coredumps(self, in_progress: List[CoreDumpInfo], completed: List[CoreDumpInfo],
                           uploaded: List[CoreDumpInfo]):
        """Get core files from node and report them"""
        if not in_progress:
            return
        for core_info in in_progress.copy():
            if self.is_limit_reached():
                in_progress.remove(core_info)
                continue
            try:
                core_info.process_retry += 1
                if self.upload_retry_limit < core_info.process_retry:
                    self.log.error(f"Maximum retry uploading is reached for core {str(core_info)}")
                    in_progress.remove(core_info)
                    completed.append(core_info)
                    continue
                self.update_coredump_info_with_more_information(core_info)
                result = self.upload_coredump(core_info)
                completed.append(core_info)
                in_progress.remove(core_info)
                if result:
                    uploaded.append(core_info)
                    self.publish_event(core_info)
            except:  # pylint: disable=bare-except
                pass

    @abstractmethod
    def get_list_of_cores(self) -> Optional[List[CoreDumpInfo]]:
        ...
    def publish_event(self, core_info: CoreDumpInfo):
        try:
            core_info.publish_event()
        except Exception as exc:  # pylint: disable=broad-except
            self.log.error(f"Failed to publish coredump event due to the: {str(exc)}")

    def extract_info_from_core_pids(self, new_cores: Optional[List[CoreDumpInfo]],
                                    exclude_cores: List[CoreDumpInfo]) -> List[CoreDumpInfo]:
        output = []
        for new_core_info in new_cores:
            found = False
            for e_core_info in exclude_cores:
                if e_core_info.pid == new_core_info.pid:
                    found = True
                    break
            if found:
                continue
            self.publish_event(new_core_info)
            output.append(new_core_info)
        return output

    # @retrying(n=10, sleep_time=20, allowed_exceptions=NETWORK_EXCEPTIONS, message="Retrying on uploading coredump")
    def _upload_coredump(self, core_info: CoreDumpInfo):
        coredump = core_info.corefile
        coredump = self._pack_coredump(coredump)
        base_upload_url = 'upload.scylladb.com/%s/%s'
        coredump_id = os.path.basename(coredump)[:-3]
        upload_url = base_upload_url % (coredump_id, os.path.basename(coredump))
        self.log.info('Uploading coredump %s to %s' % (coredump, upload_url))
        self.node.remoter.run("sudo curl --request PUT --upload-file "
                              "'%s' '%s'" % (coredump, upload_url))
        download_url = 'https://storage.cloud.google.com/%s' % upload_url
        self.log.info("You can download it by %s (available for ScyllaDB employee)", download_url)
        download_instructions = 'gsutil cp gs://%s .\ngunzip %s' % (upload_url, coredump)
        core_info.download_url, core_info.download_instructions = download_url, download_instructions

    def upload_coredump(self, core_info: CoreDumpInfo):
        if core_info.download_url:
            return False
        if not core_info.corefile:
            self.log.error(f"{str(core_info)} has inaccessible corefile, can't upload it")
            return False
        try:
            self.log.debug(f'Start uploading file: {core_info.corefile}')
            core_info.download_instructions = 'Coredump upload in progress'
            self._upload_coredump(core_info)
            return True
        except Exception as exc:  # pylint: disable=broad-except
            core_info.download_instructions = 'failed to upload core'
            self.log.error(f"Following error occurred during uploading coredump {core_info.corefile}: {str(exc)}")
            raise

    @cached_property
    def _is_pigz_installed(self):
        if self.node.is_rhel_like():
            return self.node.remoter.run('yum list installed | grep pigz', ignore_status=True).ok
        if self.node.is_ubuntu() or self.node.is_debian():
            return self.node.remoter.run('apt list --installed | grep pigz', ignore_status=True).ok
        raise RuntimeError("Distro is not supported")

    def _install_pigz(self):
        if self.node.is_rhel_like():
            self.node.remoter.sudo('yum install -y pigz')
            self.__dict__['_is_pigz_installed'] = True  # prime the cached_property under its attribute name
        elif self.node.is_ubuntu() or self.node.is_debian():
            self.node.remoter.sudo('apt install -y pigz')
            self.__dict__['_is_pigz_installed'] = True  # prime the cached_property under its attribute name
        else:
            raise RuntimeError("Distro is not supported")

    def _pack_coredump(self, coredump: str) -> str:
        extensions = ['.lz4', '.zip', '.gz', '.gzip']
        for extension in extensions:
            if coredump.endswith(extension):
                return coredump
        if not self._is_pigz_installed:
            self._install_pigz()
        try:  # pylint: disable=unreachable
            if not self.node.remoter.run(f'sudo ls {coredump}.gz', verbose=False, ignore_status=True).ok:
                self.node.remoter.run(f'sudo pigz --fast --keep {coredump}')
            coredump += '.gz'
        except NETWORK_EXCEPTIONS:  # pylint: disable=try-except-raise
            raise
        except Exception as ex:  # pylint: disable=broad-except
            self.log.warning("Failed to compress coredump '%s': %s", coredump, ex)
        return coredump

    def log_coredump(self, core_info: CoreDumpInfo):
        if not core_info.coredump_info:
            return
        log_file = os.path.join(self.node.logdir, 'coredump.log')
        with open(log_file, 'a') as log_file_obj:
            log_file_obj.write(core_info.coredump_info)
        for line in core_info.coredump_info.splitlines():
            self.log.error(line)

    @property
    def n_coredumps(self) -> int:
        return len(self.found)

    @abstractmethod
    def update_coredump_info_with_more_information(self, core_info: CoreDumpInfo):
        pass
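# A minimal sketch of a concrete coredump collector built on the abstract base
# above, assuming a systemd node; "SystemdCoredumpThread" and the coredumpctl
# commands are illustrative, not necessarily the project's real subclass.
class SystemdCoredumpThread(CoredumpThreadBase):  # hypothetical subclass
    def get_list_of_cores(self) -> Optional[List[CoreDumpInfo]]:
        result = self.node.remoter.run('sudo coredumpctl --no-pager --no-legend list',
                                       verbose=False, ignore_status=True)
        if not result.ok:
            return []
        # A real implementation would parse one CoreDumpInfo per output line.
        return []

    def update_coredump_info_with_more_information(self, core_info: CoreDumpInfo):
        # A real implementation would fill corefile/coredump_info here,
        # e.g. from `coredumpctl info <pid>` output.
        pass


# Typical lifecycle, using the Thread/Event machinery defined above:
#   collector = SystemdCoredumpThread(node=some_node, max_core_upload_limit=5)
#   collector.start()  # polls every `lookup_period` seconds
#   ...
#   collector.stop()   # sets termination_event; run() drains in_progress, then exits
#   collector.join()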