def archive(ctx, config):
    """
    Handle the creation and deletion of the archive directory.

    Runs ``install -d`` for the archive directory through ``ctx.cluster``,
    yields to the caller, and afterwards pulls the directory from each remote
    into ``<ctx.archive>/remote/<shortname>`` before removing it remotely.

    The pull is skipped when ``ctx.archive`` is None, or when the
    ``archive-on-error`` config option is set and the run status is "pass".

    :param ctx: run context; ``cluster``, ``archive``, ``config`` and
                ``summary`` are used
    :param config: task configuration; not used by this function
    """
    log.info("Creating archive directory...")
    archive_dir = misc.get_archive_dir(ctx)
    run.wait(
        ctx.cluster.run(
            args=["install", "-d", "-m0755", "--", archive_dir],
            wait=False,
        )
    )

    try:
        yield
    except Exception:
        # we need to know this below
        set_status(ctx.summary, "fail")
        raise
    finally:
        passed = get_status(ctx.summary) == "pass"
        if ctx.archive is not None and not (ctx.config.get("archive-on-error") and passed):
            log.info("Transferring archived files...")
            logdir = os.path.join(ctx.archive, "remote")
            if not os.path.exists(logdir):
                os.mkdir(logdir)
            # keys() works on Python 2 and 3; iterkeys() is Python 2 only.
            for rem in ctx.cluster.remotes.keys():
                path = os.path.join(logdir, rem.shortname)
                misc.pull_directory(rem, archive_dir, path)
                # Check for coredumps and pull binaries
                fetch_binaries_for_coredumps(path, rem)

        log.info("Removing archive directory...")
        run.wait(
            ctx.cluster.run(
                args=["rm", "-rf", "--", archive_dir],
                wait=False,
            )
        )
def task(ctx, config):
    """
    Go through filesystem creation with a synthetic failure in an MDS
    in its 'up:creating' state, to exercise the retry behaviour.

    :param ctx: run context; ``ctx.cluster`` is used to locate the MDS remote
    :param config: task configuration; not used by this function
    :raises RuntimeError: if the cluster does not have exactly one MDS role
    """
    # Grab handles to the teuthology objects of interest
    mdslist = list(misc.all_roles_of_type(ctx.cluster, 'mds'))
    if len(mdslist) != 1:
        # Require exactly one MDS, the code path for creation failure when
        # a standby is available is different
        raise RuntimeError("This task requires exactly one MDS")

    mds_id = mdslist[0]
    # keys() works on Python 2 and 3; iterkeys() is Python 2 only.
    (mds_remote,) = ctx.cluster.only(
        'mds.{_id}'.format(_id=mds_id)).remotes.keys()
    manager = ceph_manager.CephManager(
        mds_remote, ctx=ctx, logger=log.getChild('ceph_manager'),
    )

    # NOTE(review): `self` and `mds` are not bound anywhere in this function;
    # confirm they exist at module scope, otherwise the calls below raise
    # NameError.

    # Stop MDS
    self.fs.set_max_mds(0)
    self.fs.mds_stop(mds_id)
    self.fs.mds_fail(mds_id)

    # Reset the filesystem so that next start will go into CREATING
    manager.raw_cluster_cmd('fs', 'rm', "default", "--yes-i-really-mean-it")
    manager.raw_cluster_cmd('fs', 'new', "default", "metadata", "data")

    # Start the MDS with mds_kill_create_at set, it will crash during creation
    mds.restart_with_args(["--mds_kill_create_at=1"])
    try:
        mds.wait_for_exit()
    except CommandFailedError as e:
        if e.exitstatus == 1:
            log.info("MDS creation killed as expected")
        else:
            log.error("Unexpected status code %s" % e.exitstatus)
            raise

    # Since I have intentionally caused a crash, I will clean up the resulting core
    # file to avoid task.internal.coredump seeing it as a failure.
    log.info("Removing core file from synthetic MDS failure")
    mds_remote.run(args=[
        'rm', '-f',
        Raw("{archive}/coredump/*.core".format(archive=misc.get_archive_dir(ctx))),
    ])

    # It should have left the MDS map state still in CREATING
    status = self.fs.status().get_mds(mds_id)
    assert status['state'] == 'up:creating'

    # Start the MDS again without the kill flag set, it should proceed with creation successfully
    mds.restart()

    # Wait for state ACTIVE
    self.fs.wait_for_state("up:active", timeout=120, mds_id=mds_id)

    # The system should be back up in a happy healthy state, go ahead and run any further tasks
    # inside this context.
    yield
def coredump(ctx, config):
    """
    Stash a coredump of this system if an error occurs.

    While the task runs, ``kernel.core_pattern`` points into
    ``<archive_dir>/coredump``.  On exit the pattern is reset to ``core`` and
    the directory is removed if it is empty; every remote on which it still
    exists gets the run flagged as failed.

    :param ctx: run context; ``cluster`` and ``summary`` are used
    :param config: task configuration; not used by this function
    """
    log.info('Enabling coredump saving...')
    archive_dir = misc.get_archive_dir(ctx)
    coredump_dir = '{adir}/coredump'.format(adir=archive_dir)
    core_pattern = 'kernel.core_pattern={adir}/coredump/%t.%p.core'.format(
        adir=archive_dir)

    enable_args = [
        'install', '-d', '-m0755', '--', coredump_dir,
        run.Raw('&&'),
        'sudo', 'sysctl', '-w', core_pattern,
    ]
    run.wait(ctx.cluster.run(args=enable_args, wait=False))

    try:
        yield
    finally:
        disable_args = [
            'sudo', 'sysctl', '-w', 'kernel.core_pattern=core',
            run.Raw('&&'),
            # don't litter the archive dir if there were no cores dumped
            'rmdir', '--ignore-fail-on-non-empty', '--', coredump_dir,
        ]
        run.wait(ctx.cluster.run(args=disable_args, wait=False))

        # A directory that survived the rmdir above means cores were dumped:
        # mark the run as failed.
        for remote in ctx.cluster.remotes.keys():
            try:
                remote.sh("test -e " + coredump_dir)
            except run.CommandFailedError:
                # Directory is gone, so nothing was dumped on this remote.
                continue
            log.warning('Found coredumps on %s, flagging run as failed', remote)
            set_status(ctx.summary, 'fail')
            if 'failure_reason' not in ctx.summary:
                reason = 'Found coredumps on {rem}'.format(rem=remote)
                ctx.summary['failure_reason'] = reason
def archive_log(self):
    """
    Copy /var/log/audit/audit.log into ``<archive_dir>/audit`` and gzip it.

    Does nothing when ``self.ctx`` has no ``archive`` attribute or it is
    falsy.  Otherwise a single ``&&``-chained shell command (mkdir, cp,
    chown, gzip) is handed to ``self.cluster.run``.
    """
    if not getattr(self.ctx, 'archive', None):
        return

    audit_archive = os.path.join(get_archive_dir(self.ctx), 'audit')
    steps = (
        "mkdir {audit_archive}",
        "sudo cp /var/log/audit/audit.log {audit_archive}",
        "sudo chown $USER {audit_archive}/audit.log",
        "gzip {audit_archive}/audit.log",
    )
    # Each step only runs if the previous one succeeded.
    command = " && ".join(steps).format(audit_archive=audit_archive)
    self.cluster.run(args=command)
def archive(ctx, config):
    """
    Handle the creation and deletion of the archive directory.

    Runs ``install -d`` for the archive directory through ``ctx.cluster``,
    registers it with ``misc.add_remote_path``, yields to the caller, and
    afterwards pulls the directory from each remote into
    ``<ctx.archive>/remote/<shortname>`` before removing it remotely.

    The pull is skipped when ``ctx.archive`` is None, or when the
    ``archive-on-error`` config option is set and the run status is "pass".
    ``gzip_if_too_large`` is passed to the pull, bound to the threshold from
    the ``log-compress-min-size`` option (default ``128MB``).

    :param ctx: run context; ``cluster``, ``archive``, ``config`` and
                ``summary`` are used
    :param config: task configuration; not used by this function
    :raises ConfigError: if ``log-compress-min-size`` cannot be parsed
    """
    log.info('Creating archive directory...')
    archive_dir = misc.get_archive_dir(ctx)
    create_procs = ctx.cluster.run(
        args=['install', '-d', '-m0755', '--', archive_dir],
        wait=False,
    )
    run.wait(create_procs)

    # Add logs directory to job's info log file
    misc.add_remote_path(ctx, 'init', archive_dir)

    try:
        yield
    except Exception:
        # we need to know this below
        set_status(ctx.summary, 'fail')
        raise
    finally:
        passed = get_status(ctx.summary) == 'pass'
        should_transfer = ctx.archive is not None and \
            not (ctx.config.get('archive-on-error') and passed)
        if should_transfer:
            log.info('Transferring archived files...')
            logdir = os.path.join(ctx.archive, 'remote')
            if not os.path.exists(logdir):
                os.mkdir(logdir)
            for remote in ctx.cluster.remotes.keys():
                local_dir = os.path.join(logdir, remote.shortname)
                min_size_option = ctx.config.get('log-compress-min-size',
                                                 '128MB')
                try:
                    threshold = humanfriendly.parse_size(min_size_option)
                except humanfriendly.InvalidSize:
                    msg = 'invalid "log-compress-min-size": {}'.format(
                        min_size_option)
                    log.error(msg)
                    raise ConfigError(msg)
                compressor = functools.partial(gzip_if_too_large, threshold)
                misc.pull_directory(remote, archive_dir, local_dir, compressor)
                # Check for coredumps and pull binaries
                fetch_binaries_for_coredumps(local_dir, remote)

        log.info('Removing archive directory...')
        remove_procs = ctx.cluster.run(
            args=['rm', '-rf', '--', archive_dir],
            wait=False,
        )
        run.wait(remove_procs)
def archive_log(self):
    """
    Copy /var/log/audit/audit.log into ``<archive_dir>/audit`` and gzip it.

    Returns early when ``self.ctx`` has no ``archive`` attribute or it is
    falsy.  Otherwise the mkdir, cp, chown and gzip steps are chained with
    ``&&`` into one shell command and passed to ``self.cluster.run``.
    """
    if not getattr(self.ctx, "archive", None):
        return

    audit_archive = os.path.join(get_archive_dir(self.ctx), "audit")
    command_template = " && ".join([
        "mkdir {audit_archive}",
        "sudo cp /var/log/audit/audit.log {audit_archive}",
        "sudo chown $USER {audit_archive}/audit.log",
        "gzip {audit_archive}/audit.log",
    ])
    self.cluster.run(
        args=command_template.format(audit_archive=audit_archive),
    )
def setup(self):
    """
    Prepare the cluster for a CBT run.

    After the parent class's setup, this picks the first monitor's remote,
    generates the CBT configuration, creates ``<archive_dir>/cbt`` through
    ``self.ctx.cluster``, writes ``cbt_config.yaml`` there on the first
    monitor, then checks out CBT and installs its dependencies.
    """
    super(CBT, self).setup()
    mon_cluster = self.ctx.cluster.only(
        misc.get_first_mon(self.ctx, self.config))
    # dict views are not subscriptable on Python 3, so materialise the keys
    # before taking the first one.
    self.first_mon = list(mon_cluster.remotes.keys())[0]
    self.cbt_config = self.generate_cbt_config()
    self.log.info('cbt configuration is %s', self.cbt_config)
    self.cbt_dir = os.path.join(misc.get_archive_dir(self.ctx), 'cbt')
    self.ctx.cluster.run(args=['mkdir', '-p', '-m0755', '--', self.cbt_dir])
    misc.write_file(
        self.first_mon,
        os.path.join(self.cbt_dir, 'cbt_config.yaml'),
        yaml.safe_dump(self.cbt_config, default_flow_style=False),
    )
    self.checkout_cbt()
    self.install_dependencies()
def coredump(ctx, config):
    """
    Stash a coredump of this system if an error occurs.

    While the task runs, ``kernel.core_pattern`` points into
    ``<archive_dir>/coredump``.  On exit the pattern is reset to ``core`` and
    the directory is removed if it is empty; every remote on which it still
    exists gets the run flagged as failed.

    :param ctx: run context; ``cluster`` and ``summary`` are used
    :param config: task configuration; not used by this function
    """
    log.info('Enabling coredump saving...')
    archive_dir = misc.get_archive_dir(ctx)
    run.wait(
        ctx.cluster.run(
            args=[
                'install', '-d', '-m0755', '--',
                '{adir}/coredump'.format(adir=archive_dir),
                run.Raw('&&'),
                'sudo', 'sysctl', '-w',
                'kernel.core_pattern={adir}/coredump/%t.%p.core'.format(adir=archive_dir),
            ],
            wait=False,
        )
    )

    try:
        yield
    finally:
        run.wait(
            ctx.cluster.run(
                args=[
                    'sudo', 'sysctl', '-w', 'kernel.core_pattern=core',
                    run.Raw('&&'),
                    # don't litter the archive dir if there were no cores dumped
                    'rmdir', '--ignore-fail-on-non-empty', '--',
                    '{adir}/coredump'.format(adir=archive_dir),
                ],
                wait=False,
            )
        )

        # set status = 'fail' if the dir is still there = coredumps were
        # seen
        # keys() works on Python 2 and 3; iterkeys() is Python 2 only.
        for rem in ctx.cluster.remotes.keys():
            # Prints "OK" only when the coredump directory no longer exists.
            r = rem.run(
                args=[
                    'if', 'test', '!', '-e',
                    '{adir}/coredump'.format(adir=archive_dir),
                    run.Raw(';'), 'then',
                    'echo', 'OK',
                    run.Raw(';'),
                    'fi',
                ],
                stdout=StringIO(),
            )
            if r.stdout.getvalue() != 'OK\n':
                log.warning('Found coredumps on %s, flagging run as failed', rem)
                set_status(ctx.summary, 'fail')
                if 'failure_reason' not in ctx.summary:
                    ctx.summary['failure_reason'] = \
                        'Found coredumps on {rem}'.format(rem=rem)
def archive(ctx, config):
    """
    Handle the creation and deletion of the archive directory.

    Runs ``install -d`` for the archive directory through ``ctx.cluster``,
    yields to the caller, and afterwards pulls the directory from each remote
    into ``<ctx.archive>/remote/<shortname>`` before removing it remotely.

    The pull is skipped when ``ctx.archive`` is None, or when the
    ``archive-on-error`` config option is set and the run status is "pass".

    :param ctx: run context; ``cluster``, ``archive``, ``config`` and
                ``summary`` are used
    :param config: task configuration; not used by this function
    """
    log.info('Creating archive directory...')
    archive_dir = misc.get_archive_dir(ctx)
    run.wait(
        ctx.cluster.run(
            args=['install', '-d', '-m0755', '--', archive_dir],
            wait=False,
        )
    )

    try:
        yield
    except Exception:
        # we need to know this below
        set_status(ctx.summary, 'fail')
        raise
    finally:
        passed = get_status(ctx.summary) == 'pass'
        if ctx.archive is not None and \
                not (ctx.config.get('archive-on-error') and passed):
            log.info('Transferring archived files...')
            logdir = os.path.join(ctx.archive, 'remote')
            if not os.path.exists(logdir):
                os.mkdir(logdir)
            # keys() works on Python 2 and 3; iterkeys() is Python 2 only.
            for rem in ctx.cluster.remotes.keys():
                path = os.path.join(logdir, rem.shortname)
                misc.pull_directory(rem, archive_dir, path)
                # Check for coredumps and pull binaries
                fetch_binaries_for_coredumps(path, rem)

        log.info('Removing archive directory...')
        run.wait(
            ctx.cluster.run(
                args=['rm', '-rf', '--', archive_dir],
                wait=False,
            ),
        )
def archive(ctx, config):
    """
    Handle the creation and deletion of the archive directory.

    Runs ``install -d`` for the archive directory through ``ctx.cluster``,
    yields to the caller, and afterwards pulls the directory from each remote
    into ``<ctx.archive>/remote/<shortname>`` before removing it remotely.

    The pull is skipped when ``ctx.archive`` is None, or when the
    ``archive-on-error`` config option is set and the run status is "pass".

    :param ctx: run context; ``cluster``, ``archive``, ``config`` and
                ``summary`` are used
    :param config: task configuration; not used by this function
    """
    log.info('Creating archive directory...')
    archive_dir = misc.get_archive_dir(ctx)
    run.wait(
        ctx.cluster.run(
            args=['install', '-d', '-m0755', '--', archive_dir],
            wait=False,
        )
    )

    try:
        yield
    except Exception:
        # we need to know this below
        set_status(ctx.summary, 'fail')
        raise
    finally:
        passed = get_status(ctx.summary) == 'pass'
        if ctx.archive is not None and \
                not (ctx.config.get('archive-on-error') and passed):
            log.info('Transferring archived files...')
            logdir = os.path.join(ctx.archive, 'remote')
            if not os.path.exists(logdir):
                os.mkdir(logdir)
            # keys() works on Python 2 and 3; iterkeys() is Python 2 only.
            for rem in ctx.cluster.remotes.keys():
                path = os.path.join(logdir, rem.shortname)
                misc.pull_directory(rem, archive_dir, path)
                # Check for coredumps and pull binaries
                fetch_binaries_for_coredumps(path, rem)

        log.info('Removing archive directory...')
        run.wait(
            ctx.cluster.run(
                args=['rm', '-rf', '--', archive_dir],
                wait=False,
            ),
        )
def archive(ctx, config):
    """
    Handle the creation and deletion of the archive directory.

    Runs ``install -d`` for the archive directory through ``ctx.cluster``,
    yields to the caller, and afterwards pulls the directory from each remote
    into ``<ctx.archive>/remote/<shortname>`` before removing it remotely.

    The pull is skipped when ``ctx.archive`` is None, or when the
    ``archive-on-error`` config option is set and ``ctx.summary['success']``
    is truthy.

    :param ctx: run context; ``cluster``, ``archive``, ``config`` and
                ``summary`` are used
    :param config: task configuration; not used by this function
    """
    log.info('Creating archive directory...')
    archive_dir = teuthology.get_archive_dir(ctx)
    run.wait(
        ctx.cluster.run(
            args=['install', '-d', '-m0755', '--', archive_dir],
            wait=False,
        )
    )

    try:
        yield
    except Exception:
        # we need to know this below
        ctx.summary['success'] = False
        raise
    finally:
        if ctx.archive is not None and \
                not (ctx.config.get('archive-on-error') and ctx.summary['success']):
            log.info('Transferring archived files...')
            logdir = os.path.join(ctx.archive, 'remote')
            if not os.path.exists(logdir):
                os.mkdir(logdir)
            # keys() works on Python 2 and 3; iterkeys() is Python 2 only.
            for remote in ctx.cluster.remotes.keys():
                path = os.path.join(logdir, remote.shortname)
                teuthology.pull_directory(remote, archive_dir, path)

        log.info('Removing archive directory...')
        run.wait(
            ctx.cluster.run(
                args=['rm', '-rf', '--', archive_dir],
                wait=False,
            ),
        )
def archive(ctx, config):
    """
    Handle the creation and deletion of the archive directory.

    Runs ``install -d`` for the archive directory through ``ctx.cluster``,
    yields to the caller, and afterwards pulls the directory from each remote
    into ``<ctx.archive>/remote/<shortname>`` before removing it remotely.

    The pull is skipped when ``ctx.archive`` is None, or when the
    ``archive-on-error`` config option is set and ``ctx.summary['success']``
    is truthy.

    :param ctx: run context; ``cluster``, ``archive``, ``config`` and
                ``summary`` are used
    :param config: task configuration; not used by this function
    """
    log.info('Creating archive directory...')
    archive_dir = teuthology.get_archive_dir(ctx)
    run.wait(
        ctx.cluster.run(
            args=['install', '-d', '-m0755', '--', archive_dir],
            wait=False,
        )
    )

    try:
        yield
    except Exception:
        # we need to know this below
        ctx.summary['success'] = False
        raise
    finally:
        if ctx.archive is not None and \
                not (ctx.config.get('archive-on-error') and ctx.summary['success']):
            log.info('Transferring archived files...')
            logdir = os.path.join(ctx.archive, 'remote')
            if not os.path.exists(logdir):
                os.mkdir(logdir)
            # keys() works on Python 2 and 3; iterkeys() is Python 2 only.
            for remote in ctx.cluster.remotes.keys():
                path = os.path.join(logdir, remote.shortname)
                teuthology.pull_directory(remote, archive_dir, path)

        log.info('Removing archive directory...')
        run.wait(
            ctx.cluster.run(
                args=['rm', '-rf', '--', archive_dir],
                wait=False,
            ),
        )
def archive(ctx, config):
    """
    Handle the creation and deletion of the archive directory.

    Runs ``install -d`` for the archive directory through ``ctx.cluster``,
    yields to the caller, and afterwards pulls the directory from each remote
    into ``<ctx.archive>/remote/<shortname>`` before removing it remotely.

    The pull is skipped when ``ctx.archive`` is None, or when the
    ``archive-on-error`` config option is set and ``ctx.summary["success"]``
    is truthy.

    :param ctx: run context; ``cluster``, ``archive``, ``config`` and
                ``summary`` are used
    :param config: task configuration; not used by this function
    """
    log.info("Creating archive directory...")
    archive_dir = teuthology.get_archive_dir(ctx)
    run.wait(
        ctx.cluster.run(
            args=["install", "-d", "-m0755", "--", archive_dir],
            wait=False,
        )
    )

    try:
        yield
    except Exception:
        # we need to know this below
        ctx.summary["success"] = False
        raise
    finally:
        if ctx.archive is not None and not (ctx.config.get("archive-on-error") and ctx.summary["success"]):
            log.info("Transferring archived files...")
            logdir = os.path.join(ctx.archive, "remote")
            if not os.path.exists(logdir):
                os.mkdir(logdir)
            # keys() works on Python 2 and 3; iterkeys() is Python 2 only.
            for remote in ctx.cluster.remotes.keys():
                path = os.path.join(logdir, remote.shortname)
                teuthology.pull_directory(remote, archive_dir, path)

        log.info("Removing archive directory...")
        run.wait(
            ctx.cluster.run(
                args=["rm", "-rf", "--", archive_dir],
                wait=False,
            )
        )
def syslog(ctx, config):
    """
    start syslog / stop syslog on exit.

    Writes an rsyslog config that sends kernel and other messages to
    ``<archive_dir>/syslog/{kern,misc}.log`` and restarts rsyslog.  On exit
    the config is removed, the logs are grepped for BUG/INFO/DEADLOCK lines
    (minus a list of known-noise patterns), the run is flagged as failed if
    any line remains, and the logs are gzipped.

    Yields immediately and does nothing else when ``ctx.archive`` is None.

    :param ctx: run context; ``cluster``, ``archive`` and ``summary`` are used
    :param config: task configuration; not used by this function
    """
    if ctx.archive is None:
        # disable this whole feature if we're not going to archive the data
        # anyway
        yield
        return

    log.info('Starting syslog monitoring...')

    archive_dir = misc.get_archive_dir(ctx)
    log_dir = '{adir}/syslog'.format(adir=archive_dir)
    run.wait(
        ctx.cluster.run(
            args=['mkdir', '-p', '-m0755', '--', log_dir],
            wait=False,
        )
    )

    CONF = '/etc/rsyslog.d/80-cephtest.conf'
    kern_log = '{log_dir}/kern.log'.format(log_dir=log_dir)
    misc_log = '{log_dir}/misc.log'.format(log_dir=log_dir)
    conf_lines = [
        'kern.* -{kern_log};RSYSLOG_FileFormat'.format(kern_log=kern_log),
        '*.*;kern.none -{misc_log};RSYSLOG_FileFormat'.format(
            misc_log=misc_log),
    ]
    conf_fp = StringIO('\n'.join(conf_lines))
    try:
        # keys() works on Python 2 and 3; iterkeys() is Python 2 only.
        for rem in ctx.cluster.remotes.keys():
            log_context = 'system_u:object_r:var_log_t:s0'
            for log_path in (kern_log, misc_log):
                rem.run(args=['install', '-m', '666', '/dev/null', log_path])
                rem.chcon(log_path, log_context)
            misc.sudo_write_file(
                remote=rem,
                path=CONF,
                data=conf_fp,
            )
            # rewind so the next remote gets the full config
            conf_fp.seek(0)
        run.wait(
            ctx.cluster.run(
                args=[
                    'sudo',
                    'service',
                    # a mere reload (SIGHUP) doesn't seem to make
                    # rsyslog open the files
                    'rsyslog',
                    'restart',
                ],
                wait=False,
            ),
        )

        yield
    finally:
        log.info('Shutting down syslog monitoring...')

        run.wait(
            ctx.cluster.run(
                args=[
                    'sudo', 'rm', '-f', '--', CONF,
                    run.Raw('&&'),
                    'sudo', 'service', 'rsyslog', 'restart',
                ],
                wait=False,
            ),
        )
        # race condition: nothing actually says rsyslog had time to
        # flush the file fully. oh well.

        log.info('Checking logs for errors...')
        # Lines matching any of these are filtered out with `grep -v`.
        ignored_patterns = [
            'task .* blocked for more than .* seconds',
            'lockdep is turned off',
            'trying to register non-static key',
            'DEBUG: fsize',  # xfs_fsr
            'CRON',  # ignore cron noise
            'BUG: bad unlock balance detected',  # #6097
            'inconsistent lock state',  # FIXME see #2523
            '*** DEADLOCK ***',  # part of lockdep output
            # FIXME see #2590 and #147
            'INFO: possible irq lock inversion dependency detected',
            'INFO: NMI handler (perf_event_nmi_handler) took too long to run',
            'INFO: recovery required on readonly',
            'ceph-create-keys: INFO',
        ]
        check_args = [
            'egrep', '--binary-files=text',
            '\\bBUG\\b|\\bINFO\\b|\\bDEADLOCK\\b',
            run.Raw('{adir}/syslog/*.log'.format(adir=archive_dir)),
        ]
        for pattern in ignored_patterns:
            check_args.extend([run.Raw('|'), 'grep', '-v', pattern])
        # Only the first surviving line is reported.
        check_args.extend([run.Raw('|'), 'head', '-n', '1'])

        for rem in ctx.cluster.remotes.keys():
            log.debug('Checking %s', rem.name)
            r = rem.run(args=check_args, stdout=StringIO())
            stdout = r.stdout.getvalue()
            if stdout != '':
                log.error('Error in syslog on %s: %s', rem.name, stdout)
                set_status(ctx.summary, 'fail')
                if 'failure_reason' not in ctx.summary:
                    ctx.summary['failure_reason'] = \
                        "'{error}' in syslog".format(error=stdout)

        log.info('Compressing syslogs...')
        run.wait(
            ctx.cluster.run(
                args=[
                    'find',
                    '{adir}/syslog'.format(adir=archive_dir),
                    '-name', '*.log',
                    '-print0',
                    run.Raw('|'),
                    'sudo', 'xargs', '-0', '--no-run-if-empty', '--',
                    'gzip', '--',
                ],
                wait=False,
            ),
        )
def syslog(ctx, config):
    """
    start syslog / stop syslog on exit.

    Writes an rsyslog config that sends kernel and other messages to
    ``<archive_dir>/syslog/{kern,misc}.log`` and restarts rsyslog.  On exit
    the config is removed, the logs are grepped for BUG/INFO/DEADLOCK lines
    (minus a list of known-noise patterns), the run is flagged as failed if
    any line remains, and the logs are gzipped.

    Yields immediately and does nothing else when ``ctx.archive`` is None.

    :param ctx: run context; ``cluster``, ``archive`` and ``summary`` are used
    :param config: task configuration; not used by this function
    """
    if ctx.archive is None:
        # disable this whole feature if we're not going to archive the data anyway
        yield
        return

    log.info("Starting syslog monitoring...")

    archive_dir = misc.get_archive_dir(ctx)
    log_dir = "{adir}/syslog".format(adir=archive_dir)
    run.wait(
        ctx.cluster.run(
            args=["mkdir", "-p", "-m0755", "--", log_dir],
            wait=False,
        )
    )

    CONF = "/etc/rsyslog.d/80-cephtest.conf"
    kern_log = "{log_dir}/kern.log".format(log_dir=log_dir)
    misc_log = "{log_dir}/misc.log".format(log_dir=log_dir)
    conf_lines = [
        "kern.* -{kern_log};RSYSLOG_FileFormat".format(kern_log=kern_log),
        "*.*;kern.none -{misc_log};RSYSLOG_FileFormat".format(misc_log=misc_log),
    ]
    conf_fp = StringIO("\n".join(conf_lines))
    try:
        # keys() works on Python 2 and 3; iterkeys() is Python 2 only.
        for rem in ctx.cluster.remotes.keys():
            log_context = "system_u:object_r:var_log_t:s0"
            for log_path in (kern_log, misc_log):
                rem.run(args="touch %s" % log_path)
                rem.chcon(log_path, log_context)
            misc.sudo_write_file(remote=rem, path=CONF, data=conf_fp)
            # rewind so the next remote gets the full config
            conf_fp.seek(0)
        run.wait(
            ctx.cluster.run(
                args=[
                    "sudo",
                    "service",
                    # a mere reload (SIGHUP) doesn't seem to make
                    # rsyslog open the files
                    "rsyslog",
                    "restart",
                ],
                wait=False,
            )
        )

        yield
    finally:
        log.info("Shutting down syslog monitoring...")

        run.wait(
            ctx.cluster.run(
                args=[
                    "sudo", "rm", "-f", "--", CONF,
                    run.Raw("&&"),
                    "sudo", "service", "rsyslog", "restart",
                ],
                wait=False,
            )
        )
        # race condition: nothing actually says rsyslog had time to
        # flush the file fully. oh well.

        log.info("Checking logs for errors...")
        # Lines matching any of these are filtered out with `grep -v`.
        ignored_patterns = [
            "task .* blocked for more than .* seconds",
            "lockdep is turned off",
            "trying to register non-static key",
            "DEBUG: fsize",  # xfs_fsr
            "CRON",  # ignore cron noise
            "BUG: bad unlock balance detected",  # #6097
            "inconsistent lock state",  # FIXME see #2523
            "*** DEADLOCK ***",  # part of lockdep output
            # FIXME see #2590 and #147
            "INFO: possible irq lock inversion dependency detected",
            "INFO: NMI handler (perf_event_nmi_handler) took too long to run",
            "INFO: recovery required on readonly",
            "ceph-create-keys: INFO",
        ]
        check_args = [
            "egrep", "--binary-files=text",
            "\\bBUG\\b|\\bINFO\\b|\\bDEADLOCK\\b",
            run.Raw("{adir}/syslog/*.log".format(adir=archive_dir)),
        ]
        for pattern in ignored_patterns:
            check_args.extend([run.Raw("|"), "grep", "-v", pattern])
        # Only the first surviving line is reported.
        check_args.extend([run.Raw("|"), "head", "-n", "1"])

        for rem in ctx.cluster.remotes.keys():
            log.debug("Checking %s", rem.name)
            r = rem.run(args=check_args, stdout=StringIO())
            stdout = r.stdout.getvalue()
            if stdout != "":
                log.error("Error in syslog on %s: %s", rem.name, stdout)
                set_status(ctx.summary, "fail")
                if "failure_reason" not in ctx.summary:
                    ctx.summary["failure_reason"] = "'{error}' in syslog".format(error=stdout)

        log.info("Compressing syslogs...")
        run.wait(
            ctx.cluster.run(
                args=[
                    "find",
                    "{adir}/syslog".format(adir=archive_dir),
                    "-name", "*.log",
                    "-print0",
                    run.Raw("|"),
                    "sudo", "xargs", "-0", "--no-run-if-empty", "--",
                    "gzip", "--",
                ],
                wait=False,
            )
        )
def coredump(ctx, config):
    """
    Stash a coredump of this system if an error occurs.

    While the task runs, ``kernel.core_pattern`` points into
    ``<archive_dir>/coredump``.  On exit the pattern is reset to ``core`` and
    the directory is removed if it is empty; every remote on which it still
    exists gets the run flagged as failed.

    :param ctx: run context; ``cluster`` and ``summary`` are used
    :param config: task configuration; not used by this function
    """
    log.info("Enabling coredump saving...")
    archive_dir = misc.get_archive_dir(ctx)
    run.wait(
        ctx.cluster.run(
            args=[
                "install", "-d", "-m0755", "--",
                "{adir}/coredump".format(adir=archive_dir),
                run.Raw("&&"),
                "sudo", "sysctl", "-w",
                "kernel.core_pattern={adir}/coredump/%t.%p.core".format(adir=archive_dir),
            ],
            wait=False,
        )
    )

    try:
        yield
    finally:
        run.wait(
            ctx.cluster.run(
                args=[
                    "sudo", "sysctl", "-w", "kernel.core_pattern=core",
                    run.Raw("&&"),
                    # don't litter the archive dir if there were no cores dumped
                    "rmdir", "--ignore-fail-on-non-empty", "--",
                    "{adir}/coredump".format(adir=archive_dir),
                ],
                wait=False,
            )
        )

        # set status = 'fail' if the dir is still there = coredumps were
        # seen
        # keys() works on Python 2 and 3; iterkeys() is Python 2 only.
        for rem in ctx.cluster.remotes.keys():
            # Prints "OK" only when the coredump directory no longer exists.
            r = rem.run(
                args=[
                    "if", "test", "!", "-e",
                    "{adir}/coredump".format(adir=archive_dir),
                    run.Raw(";"), "then",
                    "echo", "OK",
                    run.Raw(";"),
                    "fi",
                ],
                stdout=StringIO(),
            )
            if r.stdout.getvalue() != "OK\n":
                log.warning("Found coredumps on %s, flagging run as failed", rem)
                set_status(ctx.summary, "fail")
                if "failure_reason" not in ctx.summary:
                    ctx.summary["failure_reason"] = "Found coredumps on {rem}".format(rem=rem)
def task(ctx, config):
    """
    Go through filesystem creation with a synthetic failure in an MDS
    in its 'up:creating' state, to exercise the retry behaviour.

    :param ctx: run context; ``ctx.cluster`` is used to locate the MDS remote
    :param config: task configuration; not used by this function
    :raises RuntimeError: if the cluster does not have exactly one MDS role
    """
    # Grab handles to the teuthology objects of interest
    mds_roles = list(misc.all_roles_of_type(ctx.cluster, 'mds'))
    if len(mds_roles) != 1:
        # Require exactly one MDS, the code path for creation failure when
        # a standby is available is different
        raise RuntimeError("This task requires exactly one MDS")

    mds_id = mds_roles[0]
    mds_cluster = ctx.cluster.only('mds.{_id}'.format(_id=mds_id))
    (mds_remote,) = mds_cluster.remotes.keys()
    manager = ceph_manager.CephManager(
        mds_remote,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
    )

    # NOTE(review): `self` and `mds` are not bound anywhere in this function;
    # confirm they exist at module scope, otherwise the calls below raise
    # NameError.

    # Stop MDS
    self.fs.set_max_mds(0)
    self.fs.mds_stop(mds_id)
    self.fs.mds_fail(mds_id)

    # Reset the filesystem so that next start will go into CREATING
    manager.raw_cluster_cmd('fs', 'rm', "default", "--yes-i-really-mean-it")
    manager.raw_cluster_cmd('fs', 'new', "default", "metadata", "data")

    # Start the MDS with mds_kill_create_at set, it will crash during creation
    mds.restart_with_args(["--mds_kill_create_at=1"])
    try:
        mds.wait_for_exit()
    except CommandFailedError as e:
        if e.exitstatus != 1:
            log.error("Unexpected status code %s" % e.exitstatus)
            raise
        log.info("MDS creation killed as expected")

    # Since I have intentionally caused a crash, I will clean up the resulting core
    # file to avoid task.internal.coredump seeing it as a failure.
    log.info("Removing core file from synthetic MDS failure")
    core_glob = "{archive}/coredump/*.core".format(
        archive=misc.get_archive_dir(ctx))
    mds_remote.run(args=['rm', '-f', Raw(core_glob)])

    # It should have left the MDS map state still in CREATING
    mds_status = self.fs.status().get_mds(mds_id)
    assert mds_status['state'] == 'up:creating'

    # Start the MDS again without the kill flag set, it should proceed with creation successfully
    mds.restart()

    # Wait for state ACTIVE
    self.fs.wait_for_state("up:active", timeout=120, mds_id=mds_id)

    # The system should be back up in a happy healthy state, go ahead and run any further tasks
    # inside this context.
    yield
def syslog(ctx, config):
    """
    start syslog / stop syslog on exit.

    Writes an rsyslog config that sends kernel and other messages to
    ``<archive_dir>/syslog/{kern,misc}.log`` and restarts rsyslog.  On exit
    the config is removed, the logs are grepped for BUG/INFO/DEADLOCK lines
    (minus a list of known-noise patterns), the run is flagged as failed if
    any line remains, and the logs are gzipped.

    Yields immediately and does nothing else when ``ctx.archive`` is None.

    :param ctx: run context; ``cluster``, ``archive`` and ``summary`` are used
    :param config: task configuration; not used by this function
    """
    if ctx.archive is None:
        # disable this whole feature if we're not going to archive the data anyway
        yield
        return

    log.info('Starting syslog monitoring...')

    archive_dir = misc.get_archive_dir(ctx)
    log_dir = '{adir}/syslog'.format(adir=archive_dir)
    run.wait(
        ctx.cluster.run(
            args=['mkdir', '-p', '-m0755', '--', log_dir],
            wait=False,
        )
    )

    CONF = '/etc/rsyslog.d/80-cephtest.conf'
    kern_log = '{log_dir}/kern.log'.format(log_dir=log_dir)
    misc_log = '{log_dir}/misc.log'.format(log_dir=log_dir)
    conf_lines = [
        'kern.* -{kern_log};RSYSLOG_FileFormat'.format(kern_log=kern_log),
        '*.*;kern.none -{misc_log};RSYSLOG_FileFormat'.format(
            misc_log=misc_log),
    ]
    conf_fp = StringIO('\n'.join(conf_lines))
    try:
        # keys() works on Python 2 and 3; iterkeys() is Python 2 only.
        for rem in ctx.cluster.remotes.keys():
            # Exclude downburst VMs for now; they have SELinux disabled
            if rem.os.package_type == 'rpm' and not misc.is_vm(rem.shortname):
                log_context = 'system_u:object_r:var_log_t:s0'
                for log_path in (kern_log, misc_log):
                    rem.run(
                        args="touch {log} && sudo chcon {con} {log}".format(
                            log=log_path, con=log_context),
                    )
            misc.sudo_write_file(
                remote=rem,
                path=CONF,
                data=conf_fp,
            )
            # rewind so the next remote gets the full config
            conf_fp.seek(0)
        run.wait(
            ctx.cluster.run(
                args=[
                    'sudo',
                    'service',
                    # a mere reload (SIGHUP) doesn't seem to make
                    # rsyslog open the files
                    'rsyslog',
                    'restart',
                ],
                wait=False,
            ),
        )

        yield
    finally:
        log.info('Shutting down syslog monitoring...')

        run.wait(
            ctx.cluster.run(
                args=[
                    'sudo', 'rm', '-f', '--', CONF,
                    run.Raw('&&'),
                    'sudo', 'service', 'rsyslog', 'restart',
                ],
                wait=False,
            ),
        )
        # race condition: nothing actually says rsyslog had time to
        # flush the file fully. oh well.

        log.info('Checking logs for errors...')
        # Lines matching any of these are filtered out with `grep -v`.
        ignored_patterns = [
            'task .* blocked for more than .* seconds',
            'lockdep is turned off',
            'trying to register non-static key',
            'DEBUG: fsize',  # xfs_fsr
            'CRON',  # ignore cron noise
            'BUG: bad unlock balance detected',  # #6097
            'inconsistent lock state',  # FIXME see #2523
            '*** DEADLOCK ***',  # part of lockdep output
            # FIXME see #2590 and #147
            'INFO: possible irq lock inversion dependency detected',
            'INFO: NMI handler (perf_event_nmi_handler) took too long to run',
            'INFO: recovery required on readonly',
        ]
        check_args = [
            'egrep', '--binary-files=text',
            '\\bBUG\\b|\\bINFO\\b|\\bDEADLOCK\\b',
            run.Raw('{adir}/syslog/*.log'.format(adir=archive_dir)),
        ]
        for pattern in ignored_patterns:
            check_args.extend([run.Raw('|'), 'grep', '-v', pattern])
        # Only the first surviving line is reported.
        check_args.extend([run.Raw('|'), 'head', '-n', '1'])

        for rem in ctx.cluster.remotes.keys():
            log.debug('Checking %s', rem.name)
            r = rem.run(args=check_args, stdout=StringIO())
            stdout = r.stdout.getvalue()
            if stdout != '':
                log.error('Error in syslog on %s: %s', rem.name, stdout)
                set_status(ctx.summary, 'fail')
                if 'failure_reason' not in ctx.summary:
                    ctx.summary['failure_reason'] = \
                        "'{error}' in syslog".format(error=stdout)

        log.info('Compressing syslogs...')
        run.wait(
            ctx.cluster.run(
                args=[
                    'find',
                    '{adir}/syslog'.format(adir=archive_dir),
                    '-name', '*.log',
                    '-print0',
                    run.Raw('|'),
                    'sudo', 'xargs', '-0', '--no-run-if-empty', '--',
                    'gzip', '--',
                ],
                wait=False,
            ),
        )
def syslog(ctx, config):
    """
    start syslog / stop syslog on exit.

    Writes an rsyslog config that sends kernel and other messages to
    ``<archive_dir>/syslog/{kern,misc}.log`` and restarts rsyslog.  On exit
    the config is removed, the logs are grepped for BUG/INFO/DEADLOCK lines
    (minus a list of known-noise patterns), ``ctx.summary['success']`` is set
    to False if any line remains, and the logs are gzipped.

    Yields immediately and does nothing else when ``ctx.archive`` is None.

    :param ctx: run context; ``cluster``, ``archive`` and ``summary`` are used
    :param config: task configuration; not used by this function
    """
    if ctx.archive is None:
        # disable this whole feature if we're not going to archive the data anyway
        yield
        return

    log.info('Starting syslog monitoring...')

    archive_dir = teuthology.get_archive_dir(ctx)
    run.wait(
        ctx.cluster.run(
            args=[
                'mkdir', '-m0755', '--',
                '{adir}/syslog'.format(adir=archive_dir),
            ],
            wait=False,
        )
    )

    CONF = '/etc/rsyslog.d/80-cephtest.conf'
    # One rsyslog rule per line.
    conf_fp = StringIO('''
kern.* -{adir}/syslog/kern.log;RSYSLOG_FileFormat
*.*;kern.none -{adir}/syslog/misc.log;RSYSLOG_FileFormat
'''.format(adir=archive_dir))
    try:
        # keys() works on Python 2 and 3; iterkeys() is Python 2 only.
        for rem in ctx.cluster.remotes.keys():
            teuthology.sudo_write_file(
                remote=rem,
                path=CONF,
                data=conf_fp,
            )
            # rewind so the next remote gets the full config
            conf_fp.seek(0)
        run.wait(
            ctx.cluster.run(
                args=[
                    'sudo',
                    'service',
                    # a mere reload (SIGHUP) doesn't seem to make
                    # rsyslog open the files
                    'rsyslog',
                    'restart',
                ],
                wait=False,
            ),
        )

        yield
    finally:
        log.info('Shutting down syslog monitoring...')

        run.wait(
            ctx.cluster.run(
                args=[
                    'sudo', 'rm', '-f', '--', CONF,
                    run.Raw('&&'),
                    'sudo', 'service', 'rsyslog', 'restart',
                ],
                wait=False,
            ),
        )
        # race condition: nothing actually says rsyslog had time to
        # flush the file fully. oh well.

        log.info('Checking logs for errors...')
        # Lines matching any of these are filtered out with `grep -v`.
        ignored_patterns = [
            'task .* blocked for more than .* seconds',
            'lockdep is turned off',
            'trying to register non-static key',
            'DEBUG: fsize',  # xfs_fsr
            'CRON',  # ignore cron noise
            'BUG: bad unlock balance detected',  # #6097
            'inconsistent lock state',  # FIXME see #2523
            '*** DEADLOCK ***',  # part of lockdep output
            # FIXME see #2590 and #147
            'INFO: possible irq lock inversion dependency detected',
            'INFO: NMI handler (perf_event_nmi_handler) took too long to run',
            'INFO: recovery required on readonly',
        ]
        check_args = [
            'egrep', '--binary-files=text',
            '\\bBUG\\b|\\bINFO\\b|\\bDEADLOCK\\b',
            run.Raw('{adir}/syslog/*.log'.format(adir=archive_dir)),
        ]
        for pattern in ignored_patterns:
            check_args.extend([run.Raw('|'), 'grep', '-v', pattern])
        # Only the first surviving line is reported.
        check_args.extend([run.Raw('|'), 'head', '-n', '1'])

        for remote in ctx.cluster.remotes.keys():
            log.debug('Checking %s', remote.name)
            r = remote.run(args=check_args, stdout=StringIO())
            stdout = r.stdout.getvalue()
            if stdout != '':
                log.error('Error in syslog on %s: %s', remote.name, stdout)
                ctx.summary['success'] = False
                if 'failure_reason' not in ctx.summary:
                    ctx.summary['failure_reason'] = \
                        "'{error}' in syslog".format(error=stdout)

        log.info('Compressing syslogs...')
        run.wait(
            ctx.cluster.run(
                args=[
                    'find',
                    '{adir}/syslog'.format(adir=archive_dir),
                    '-name', '*.log',
                    '-print0',
                    run.Raw('|'),
                    'sudo', 'xargs', '-0', '--no-run-if-empty', '--',
                    'gzip', '--',
                ],
                wait=False,
            ),
        )
def coredump(ctx, config):
    """
    Stash a coredump of this system if an error occurs.

    Only remotes whose ``is_container`` attribute is falsy are touched.  While
    the task runs, ``kernel.core_pattern`` points into
    ``<archive_dir>/coredump`` (set via sysctl and also appended to
    /etc/sysctl.conf).  On exit the pattern is reset to ``core``, core files
    that ``file`` attributes to systemd-sysusers are deleted, and the
    directory is removed if empty; every remote on which it still exists gets
    the run flagged as failed.

    :param ctx: run context; ``cluster`` and ``summary`` are used
    :param config: task configuration; not used by this function
    """
    def _not_container(remote):
        return not remote.is_container

    log.info('Enabling coredump saving...')
    cluster = ctx.cluster.filter(_not_container)
    archive_dir = misc.get_archive_dir(ctx)
    coredump_dir = '{adir}/coredump'.format(adir=archive_dir)
    core_pattern = 'kernel.core_pattern={adir}/coredump/%t.%p.core'.format(
        adir=archive_dir)

    enable_args = [
        'install', '-d', '-m0755', '--', coredump_dir,
        run.Raw('&&'),
        'sudo', 'sysctl', '-w', core_pattern,
        run.Raw('&&'),
        'echo', core_pattern,
        run.Raw('|'),
        'sudo', 'tee', '-a', '/etc/sysctl.conf',
    ]
    run.wait(cluster.run(args=enable_args, wait=False))

    try:
        yield
    finally:
        # Filter again on the way out instead of reusing the earlier result.
        cluster = ctx.cluster.filter(_not_container)
        prune_sysusers_cores = (
            f'for f in `find {archive_dir}/coredump -type f`; do '
            'file $f | grep -q systemd-sysusers && rm $f || true ; '
            'done'
        )
        disable_args = [
            'sudo', 'sysctl', '-w', 'kernel.core_pattern=core',
            run.Raw('&&'),
            'sudo', 'bash', '-c', prune_sysusers_cores,
            run.Raw('&&'),
            # don't litter the archive dir if there were no cores dumped
            'rmdir', '--ignore-fail-on-non-empty', '--', coredump_dir,
        ]
        run.wait(cluster.run(args=disable_args, wait=False))

        # A directory that survived the rmdir above means cores were dumped:
        # mark the run as failed.
        for remote in cluster.remotes.keys():
            try:
                remote.sh("test -e " + coredump_dir)
            except run.CommandFailedError:
                # Directory is gone, so nothing was dumped on this remote.
                continue
            log.warning('Found coredumps on %s, flagging run as failed', remote)
            set_status(ctx.summary, 'fail')
            if 'failure_reason' not in ctx.summary:
                reason = 'Found coredumps on {rem}'.format(rem=remote)
                ctx.summary['failure_reason'] = reason
def syslog(ctx, config):
    """
    start syslog / stop syslog on exit.

    Writes an rsyslog config that sends kernel and other messages to
    ``<archive_dir>/syslog/{kern,misc}.log`` and restarts rsyslog.  On exit
    the config is removed, the logs are grepped for BUG/INFO/DEADLOCK lines
    (minus a list of known-noise patterns), ``ctx.summary['success']`` is set
    to False if any line remains, and the logs are gzipped.

    Yields immediately and does nothing else when ``ctx.archive`` is None.

    :param ctx: run context; ``cluster``, ``archive`` and ``summary`` are used
    :param config: task configuration; not used by this function
    """
    if ctx.archive is None:
        # disable this whole feature if we're not going to archive the data anyway
        yield
        return

    log.info('Starting syslog monitoring...')

    archive_dir = teuthology.get_archive_dir(ctx)
    run.wait(
        ctx.cluster.run(
            args=[
                'mkdir', '-m0755', '--',
                '{adir}/syslog'.format(adir=archive_dir),
            ],
            wait=False,
        )
    )

    CONF = '/etc/rsyslog.d/80-cephtest.conf'
    # One rsyslog rule per line.
    conf_fp = StringIO("""
kern.* -{adir}/syslog/kern.log;RSYSLOG_FileFormat
*.*;kern.none -{adir}/syslog/misc.log;RSYSLOG_FileFormat
""".format(adir=archive_dir))
    try:
        # keys() works on Python 2 and 3; iterkeys() is Python 2 only.
        for rem in ctx.cluster.remotes.keys():
            teuthology.sudo_write_file(
                remote=rem,
                path=CONF,
                data=conf_fp,
            )
            # rewind so the next remote gets the full config
            conf_fp.seek(0)
        run.wait(
            ctx.cluster.run(
                args=[
                    'sudo',
                    'service',
                    # a mere reload (SIGHUP) doesn't seem to make
                    # rsyslog open the files
                    'rsyslog',
                    'restart',
                ],
                wait=False,
            ),
        )

        yield
    finally:
        log.info('Shutting down syslog monitoring...')

        run.wait(
            ctx.cluster.run(
                args=[
                    'sudo', 'rm', '-f', '--', CONF,
                    run.Raw('&&'),
                    'sudo', 'service', 'rsyslog', 'restart',
                ],
                wait=False,
            ),
        )
        # race condition: nothing actually says rsyslog had time to
        # flush the file fully. oh well.

        log.info('Checking logs for errors...')
        # Lines matching any of these are filtered out with `grep -v`.
        ignored_patterns = [
            'task .* blocked for more than .* seconds',
            'lockdep is turned off',
            'trying to register non-static key',
            'DEBUG: fsize',  # xfs_fsr
            'CRON',  # ignore cron noise
            'BUG: bad unlock balance detected',  # #6097
            'inconsistent lock state',  # FIXME see #2523
            '*** DEADLOCK ***',  # part of lockdep output
            # FIXME see #2590 and #147
            'INFO: possible irq lock inversion dependency detected',
        ]
        check_args = [
            'egrep',
            '\\bBUG\\b|\\bINFO\\b|\\bDEADLOCK\\b',
            run.Raw('{adir}/syslog/*.log'.format(adir=archive_dir)),
        ]
        for pattern in ignored_patterns:
            check_args.extend([run.Raw('|'), 'grep', '-v', pattern])
        # Only the first surviving line is reported.
        check_args.extend([run.Raw('|'), 'head', '-n', '1'])

        for remote in ctx.cluster.remotes.keys():
            log.debug('Checking %s', remote.name)
            r = remote.run(args=check_args, stdout=StringIO())
            stdout = r.stdout.getvalue()
            if stdout != '':
                log.error('Error in syslog on %s: %s', remote.name, stdout)
                ctx.summary['success'] = False
                if 'failure_reason' not in ctx.summary:
                    ctx.summary['failure_reason'] = \
                        "'{error}' in syslog".format(error=stdout)

        log.info('Compressing syslogs...')
        run.wait(
            ctx.cluster.run(
                args=[
                    'find',
                    '{adir}/syslog'.format(adir=archive_dir),
                    '-name', '*.log',
                    '-print0',
                    run.Raw('|'),
                    'sudo', 'xargs', '-0', '--no-run-if-empty', '--',
                    'gzip', '--',
                ],
                wait=False,
            ),
        )
def setup_logging(ctx, cpar):
    """
    Set the ``log_dir`` and ``log_file`` options in the DEFAULT section.

    ``log_dir`` becomes the archive directory for ``ctx`` and ``log_file``
    is fixed to ``tempest.log``.

    :param ctx: run context passed to ``teuthology.get_archive_dir``
    :param cpar: object with a ``set(section, option, value)`` method
                 (presumably a ConfigParser; verify against the caller)
    """
    log_options = (
        ('log_dir', teuthology.get_archive_dir(ctx)),
        ('log_file', 'tempest.log'),
    )
    for option, value in log_options:
        cpar.set('DEFAULT', option, value)