def check_packages(ctx, config):
    """
    Checks gitbuilder to determine if there are missing packages for this
    job.

    If there are missing packages, fail the job.
    """
    for task in ctx.config["tasks"]:
        # dict.keys() returns a non-indexable view on Python 3, so
        # task.keys()[0] would raise TypeError; next(iter(task)) yields
        # the task's (single) name instead.
        if next(iter(task)) == "buildpackages":
            log.info("Checking packages skipped because "
                     "the task buildpackages was found.")
            return

    log.info("Checking packages...")
    os_type = ctx.config.get("os_type")
    sha1 = ctx.config.get("sha1")
    # We can only do this check if there are a defined sha1 and os_type
    # in the job config.
    if os_type and sha1:
        package = GitbuilderProject("ceph", ctx.config)
        template = "Checking packages for os_type '{os}', " \
                   "flavor '{flav}' and ceph hash '{ver}'"
        log.info(template.format(os=package.os_type,
                                 flav=package.flavor,
                                 ver=package.sha1))
        if package.version:
            log.info("Found packages for ceph version {ver}".format(
                ver=package.version))
        else:
            msg = "Packages for distro '{d}' and ceph hash '{ver}' not found"
            msg = msg.format(d=package.distro, ver=package.sha1)
            log.error(msg)
            # set the failure message and update paddles with the status
            ctx.summary["failure_reason"] = msg
            set_status(ctx.summary, "dead")
            report.try_push_job_info(ctx.config, dict(status="dead"))
            raise VersionNotFoundError(package.base_url)
    else:
        log.info(
            "Checking packages skipped, missing os_type '{os}' or "
            "ceph hash '{ver}'".format(os=os_type, ver=sha1))
def archive(ctx, config):
    """
    Handle the creation and deletion of the archive directory.
    """
    log.info("Creating archive directory...")
    archive_dir = misc.get_archive_dir(ctx)
    run.wait(ctx.cluster.run(args=["install", "-d", "-m0755", "--",
                                   archive_dir],
                             wait=False))
    try:
        yield
    except Exception:
        # we need to know this below
        set_status(ctx.summary, "fail")
        raise
    finally:
        passed = get_status(ctx.summary) == "pass"
        # Skip transferring logs only when archive-on-error is set AND the
        # job passed; always transfer on failure.
        if ctx.archive is not None and not (ctx.config.get("archive-on-error")
                                            and passed):
            log.info("Transferring archived files...")
            logdir = os.path.join(ctx.archive, "remote")
            if not os.path.exists(logdir):
                os.mkdir(logdir)
            # dict.iterkeys() was removed in Python 3; use keys()
            for rem in ctx.cluster.remotes.keys():
                path = os.path.join(logdir, rem.shortname)
                misc.pull_directory(rem, archive_dir, path)
                # Check for coredumps and pull binaries
                fetch_binaries_for_coredumps(path, rem)

        log.info("Removing archive directory...")
        run.wait(ctx.cluster.run(args=["rm", "-rf", "--", archive_dir],
                                 wait=False))
def coredump(ctx, config):
    """
    Stash a coredump of this system if an error occurs.
    """
    log.info('Enabling coredump saving...')
    archive_dir = misc.get_archive_dir(ctx)
    # Create <archive>/coredump on each remote and redirect the kernel's
    # core_pattern there so cores dumped during the run are captured.
    run.wait(
        ctx.cluster.run(
            args=[
                'install', '-d', '-m0755', '--',
                '{adir}/coredump'.format(adir=archive_dir),
                run.Raw('&&'),
                'sudo', 'sysctl', '-w',
                'kernel.core_pattern={adir}/coredump/%t.%p.core'.format(
                    adir=archive_dir),
            ],
            wait=False,
        ))
    try:
        yield
    finally:
        # Restore the default core_pattern, then remove the coredump dir
        # only if it is empty (no cores were dumped).
        run.wait(
            ctx.cluster.run(
                args=[
                    'sudo', 'sysctl', '-w', 'kernel.core_pattern=core',
                    run.Raw('&&'),
                    # don't litter the archive dir if there were no cores dumped
                    'rmdir',
                    '--ignore-fail-on-non-empty',
                    '--',
                    '{adir}/coredump'.format(adir=archive_dir),
                ],
                wait=False,
            ))

        # set status = 'fail' if the dir is still there = coredumps were
        # seen
        for rem in ctx.cluster.remotes.keys():
            try:
                rem.sh("test -e " + archive_dir + "/coredump")
            except run.CommandFailedError:
                # dir was removed above, i.e. no cores on this remote
                continue
            log.warning('Found coredumps on %s, flagging run as failed', rem)
            set_status(ctx.summary, 'fail')
            if 'failure_reason' not in ctx.summary:
                ctx.summary['failure_reason'] = \
                    'Found coredumps on {rem}'.format(rem=rem)
def archive(ctx, config):
    """
    Handle the creation and deletion of the archive directory.
    """
    log.info('Creating archive directory...')
    archive_dir = misc.get_archive_dir(ctx)
    run.wait(
        ctx.cluster.run(
            args=['install', '-d', '-m0755', '--', archive_dir],
            wait=False,
        )
    )

    # Add logs directory to job's info log file
    misc.add_remote_path(ctx, 'init', archive_dir)

    try:
        yield
    except Exception:
        # we need to know this below
        set_status(ctx.summary, 'fail')
        raise
    finally:
        passed = get_status(ctx.summary) == 'pass'
        # Transfer unless archive-on-error is set and the job passed.
        if ctx.archive is not None and \
                not (ctx.config.get('archive-on-error') and passed):
            log.info('Transferring archived files...')
            logdir = os.path.join(ctx.archive, 'remote')
            if (not os.path.exists(logdir)):
                os.mkdir(logdir)
            for rem in ctx.cluster.remotes.keys():
                path = os.path.join(logdir, rem.shortname)
                # 'log-compress-min-size' is a human-friendly size string
                # (default '128MB'); files at or above it get gzipped on pull.
                min_size_option = ctx.config.get('log-compress-min-size',
                                                 '128MB')
                try:
                    compress_min_size_bytes = \
                        humanfriendly.parse_size(min_size_option)
                except humanfriendly.InvalidSize:
                    # A malformed size is a config error, not a runtime one.
                    msg = 'invalid "log-compress-min-size": {}'.format(min_size_option)
                    log.error(msg)
                    raise ConfigError(msg)
                maybe_compress = functools.partial(gzip_if_too_large,
                                                   compress_min_size_bytes)
                misc.pull_directory(rem, archive_dir, path, maybe_compress)
                # Check for coredumps and pull binaries
                fetch_binaries_for_coredumps(path, rem)

        log.info('Removing archive directory...')
        run.wait(
            ctx.cluster.run(
                args=['rm', '-rf', '--', archive_dir],
                wait=False,
            ),
        )
def coredump(ctx, config):
    """
    Stash a coredump of this system if an error occurs.
    """
    log.info('Enabling coredump saving...')
    archive_dir = misc.get_archive_dir(ctx)
    # Point kernel.core_pattern into the archive so cores are captured.
    run.wait(
        ctx.cluster.run(
            args=[
                'install', '-d', '-m0755', '--',
                '{adir}/coredump'.format(adir=archive_dir),
                run.Raw('&&'),
                'sudo', 'sysctl', '-w',
                'kernel.core_pattern={adir}/coredump/%t.%p.core'.format(adir=archive_dir),
            ],
            wait=False,
        )
    )
    try:
        yield
    finally:
        run.wait(
            ctx.cluster.run(
                args=[
                    'sudo', 'sysctl', '-w', 'kernel.core_pattern=core',
                    run.Raw('&&'),
                    # don't litter the archive dir if there were no cores dumped
                    'rmdir', '--ignore-fail-on-non-empty', '--',
                    '{adir}/coredump'.format(adir=archive_dir),
                ],
                wait=False,
            )
        )

        # set status = 'fail' if the dir is still there = coredumps were
        # seen
        # dict.iterkeys() was removed in Python 3; use keys()
        for rem in ctx.cluster.remotes.keys():
            r = rem.run(
                args=[
                    'if', 'test', '!', '-e',
                    '{adir}/coredump'.format(adir=archive_dir),
                    run.Raw(';'),
                    'then', 'echo', 'OK',
                    run.Raw(';'),
                    'fi',
                ],
                stdout=StringIO(),
            )
            if r.stdout.getvalue() != 'OK\n':
                log.warning('Found coredumps on %s, flagging run as failed',
                            rem)
                set_status(ctx.summary, 'fail')
                if 'failure_reason' not in ctx.summary:
                    ctx.summary['failure_reason'] = \
                        'Found coredumps on {rem}'.format(rem=rem)
def report_job(self, run_name, job_id, job_info=None, dead=False):
    """
    Report a single job to the results server.

    :param run_name: The name of the run. The run must already exist.
    :param job_id:   The job's id
    :param job_info: The job's info dict. Optional - if not present, we look
                     at the archive.
    :param dead:     If True and the job has no status yet, mark it 'dead'
                     before reporting.
    :returns:        job_id
    """
    if job_info is not None and not isinstance(job_info, dict):
        raise TypeError("job_info must be a dict")
    run_uri = "{base}/runs/{name}/jobs/".format(
        base=self.base_uri, name=run_name,
    )
    if job_info is None:
        job_info = self.serializer.job_info(run_name, job_id)
    if dead and get_status(job_info) is None:
        set_status(job_info, 'dead')
    job_json = json.dumps(job_info)
    headers = {'content-type': 'application/json'}
    # First try to create the job with a POST; 200 means success.
    response = self.session.post(run_uri, data=job_json, headers=headers)

    if response.status_code == 200:
        return job_id

    # This call is wrapped in a try/except because of:
    #     http://tracker.ceph.com/issues/8166
    try:
        resp_json = response.json()
    except ValueError:
        resp_json = dict()

    if resp_json:
        msg = resp_json.get('message', '')
    else:
        msg = response.text

    # If the job already exists, retry with a PUT to update it in place.
    if msg and msg.endswith('already exists'):
        job_uri = os.path.join(run_uri, job_id, '')
        response = self.session.put(job_uri, data=job_json, headers=headers)
    elif msg:
        self.log.error(
            "POST to {uri} failed with status {status}: {msg}".format(
                uri=run_uri,
                status=response.status_code,
                msg=msg,
            ))
    # Raise for any remaining HTTP error (from the POST or the PUT).
    response.raise_for_status()

    return job_id
def check_packages(ctx, config):
    """
    Checks gitbuilder to determine if there are missing packages for this
    job.

    If there are missing packages, fail the job.
    """
    for task in ctx.config['tasks']:
        # dict.keys() returns a non-indexable view on Python 3;
        # next(iter(task)) retrieves the task's (single) name.
        if next(iter(task)) == 'buildpackages':
            log.info("Checking packages skipped because "
                     "the task buildpackages was found.")
            return

    log.info("Checking packages...")
    os_type = ctx.config.get("os_type")
    sha1 = ctx.config.get("sha1")
    # We can only do this check if there are a defined sha1 and os_type
    # in the job config.
    if os_type and sha1:
        package = get_builder_project()("ceph", ctx.config)
        template = "Checking packages for os_type '{os}', " \
                   "flavor '{flav}' and ceph hash '{ver}'"
        log.info(
            template.format(
                os=package.os_type,
                flav=package.flavor,
                ver=package.sha1,
            )
        )
        if package.version:
            log.info("Found packages for ceph version {ver}".format(
                ver=package.version
            ))
        else:
            msg = "Packages for distro '{d}' and ceph hash '{ver}' not found"
            msg = msg.format(
                d=package.distro,
                ver=package.sha1,
            )
            log.error(msg)
            # set the failure message and update paddles with the status
            ctx.summary["failure_reason"] = msg
            set_status(ctx.summary, "dead")
            report.try_push_job_info(ctx.config, dict(status='dead'))
            raise VersionNotFoundError(package.base_url)
    else:
        log.info(
            "Checking packages skipped, missing os_type '{os}' or "
            "ceph hash '{ver}'".format(
                os=os_type,
                ver=sha1,
            )
        )
def archive(ctx, config):
    """
    Handle the creation and deletion of the archive directory.
    """
    log.info('Creating archive directory...')
    archive_dir = misc.get_archive_dir(ctx)
    run.wait(
        ctx.cluster.run(
            args=['install', '-d', '-m0755', '--', archive_dir],
            wait=False,
        ))
    try:
        yield
    except Exception:
        # we need to know this below
        set_status(ctx.summary, 'fail')
        raise
    finally:
        passed = get_status(ctx.summary) == 'pass'
        # Transfer unless archive-on-error is set and the job passed.
        if ctx.archive is not None and \
                not (ctx.config.get('archive-on-error') and passed):
            log.info('Transferring archived files...')
            logdir = os.path.join(ctx.archive, 'remote')
            if not os.path.exists(logdir):
                os.mkdir(logdir)
            # dict.iterkeys() was removed in Python 3; use keys()
            for rem in ctx.cluster.remotes.keys():
                path = os.path.join(logdir, rem.shortname)
                misc.pull_directory(rem, archive_dir, path)
                # Check for coredumps and pull binaries
                fetch_binaries_for_coredumps(path, rem)

        log.info('Removing archive directory...')
        run.wait(
            ctx.cluster.run(
                args=['rm', '-rf', '--', archive_dir],
                wait=False,
            ),
        )
def archive(ctx, config):
    """
    Handle the creation and deletion of the archive directory.
    """
    log.info('Creating archive directory...')
    archive_dir = misc.get_archive_dir(ctx)
    run.wait(
        ctx.cluster.run(
            args=['install', '-d', '-m0755', '--', archive_dir],
            wait=False,
        )
    )
    try:
        yield
    except Exception:
        # we need to know this below
        set_status(ctx.summary, 'fail')
        raise
    finally:
        passed = get_status(ctx.summary) == 'pass'
        # Transfer unless archive-on-error is set and the job passed.
        if ctx.archive is not None and \
                not (ctx.config.get('archive-on-error') and passed):
            log.info('Transferring archived files...')
            logdir = os.path.join(ctx.archive, 'remote')
            if not os.path.exists(logdir):
                os.mkdir(logdir)
            # dict.iterkeys() was removed in Python 3; use keys()
            for rem in ctx.cluster.remotes.keys():
                path = os.path.join(logdir, rem.shortname)
                misc.pull_directory(rem, archive_dir, path)
                # Check for coredumps and pull binaries
                fetch_binaries_for_coredumps(path, rem)

        log.info('Removing archive directory...')
        run.wait(
            ctx.cluster.run(
                args=['rm', '-rf', '--', archive_dir],
                wait=False,
            ),
        )
def check_packages(ctx, config):
    """
    Checks gitbuilder to determine if there are missing packages for this
    job.

    If there are missing packages, fail the job.
    """
    log.info("Checking packages...")
    sha1 = ctx.config.get("sha1")
    os_type = ctx.config.get("os_type")
    flavor = get_install_task_flavor(ctx.config)
    # Both a sha1 and an os_type must be present in the job config for
    # the check to be possible; otherwise skip it.
    if not (os_type and sha1):
        log.info(
            "Checking packages skipped, missing os_type '{os}' or "
            "ceph hash '{ver}'".format(os=os_type, ver=sha1))
        return

    log.info(
        "Checking packages for os_type,'{os}' flavor '{flav}' and"
        " ceph hash '{ver}'".format(os=os_type, flav=flavor, ver=sha1))
    if has_packages_for_distro(sha1, os_type, flavor):
        return

    msg = "Packages for os_type '{os}' and ceph hash '{ver}' not found".format(
        os=os_type, ver=sha1)
    log.error(msg)
    # Record the failure reason and push the 'dead' status to paddles.
    ctx.summary["failure_reason"] = msg
    set_status(ctx.summary, "dead")
    report.try_push_job_info(ctx.config, dict(status='dead'))
    raise RuntimeError(msg)
def check_packages(ctx, config):
    """
    Checks gitbuilder to determine if there are missing packages for this
    job.

    If there are missing packages, fail the job.
    """
    log.info("Checking packages...")
    os_type = ctx.config.get("os_type", None)
    sha1 = ctx.config.get("sha1", None)
    # Both a sha1 and an os_type must be present for the check to run.
    if not (os_type and sha1):
        log.info(
            "Checking packages skipped, missing os_type '{os}' or "
            "ceph hash '{ver}'".format(os=os_type, ver=sha1))
        return

    log.info(
        "Checking packages for os_type '{os}' and ceph hash '{ver}'".format(
            os=os_type, ver=sha1))
    if has_packages_for_distro(sha1, os_type):
        return

    msg = "Packages for os_type '{os}' and ceph hash '{ver}' not found".format(
        os=os_type, ver=sha1)
    log.error(msg)
    # Record the failure reason and push the 'dead' status to paddles.
    ctx.summary["failure_reason"] = msg
    set_status(ctx.summary, "dead")
    report.try_push_job_info(ctx.config, dict(status='dead'))
    raise RuntimeError(msg)
def task(ctx, config):
    """
    Use pytest to recurse through this directory, finding any tests
    and then executing them with the teuthology ctx and config args.
    Your tests must follow standard pytest conventions to be discovered.
    """
    try:
        status = pytest.main(
            args=['-q', '--pyargs', __name__, 'teuthology.test'],
            plugins=[TeuthologyContextPlugin(ctx, config)],
        )
    except Exception:
        # A crash in pytest itself (not a test failure) kills the job.
        log.exception("Saw non-test failure!")
        set_status(ctx.summary, "dead")
        return

    if status == 0:
        log.info("OK. All tests passed!")
        set_status(ctx.summary, "pass")
    else:
        log.error("FAIL. Saw test failures...")
        set_status(ctx.summary, "fail")
def _set_status(self, status):
    # Record *status* (e.g. 'pass'/'fail'/'dead') in this run's summary
    # by delegating to the module-level set_status helper.
    set_status(self.ctx.summary, status)
def syslog(ctx, config):
    """
    start syslog / stop syslog on exit.
    """
    if ctx.archive is None:
        # disable this whole feature if we're not going to archive the data
        # anyway
        yield
        return

    log.info('Starting syslog monitoring...')

    archive_dir = misc.get_archive_dir(ctx)
    log_dir = '{adir}/syslog'.format(adir=archive_dir)
    run.wait(
        ctx.cluster.run(
            args=['mkdir', '-p', '-m0755', '--', log_dir],
            wait=False,
        ))

    CONF = '/etc/rsyslog.d/80-cephtest.conf'
    kern_log = '{log_dir}/kern.log'.format(log_dir=log_dir)
    misc_log = '{log_dir}/misc.log'.format(log_dir=log_dir)
    conf_lines = [
        'kern.* -{kern_log};RSYSLOG_FileFormat'.format(kern_log=kern_log),
        '*.*;kern.none -{misc_log};RSYSLOG_FileFormat'.format(
            misc_log=misc_log),
    ]
    conf_fp = StringIO('\n'.join(conf_lines))
    try:
        # dict.iterkeys() was removed in Python 3; use keys()
        for rem in ctx.cluster.remotes.keys():
            log_context = 'system_u:object_r:var_log_t:s0'
            for log_path in (kern_log, misc_log):
                rem.run(args=['install', '-m', '666', '/dev/null', log_path])
                rem.chcon(log_path, log_context)
            misc.sudo_write_file(
                remote=rem,
                path=CONF,
                data=conf_fp,
            )
            conf_fp.seek(0)
        run.wait(
            ctx.cluster.run(
                args=[
                    'sudo', 'service',
                    # a mere reload (SIGHUP) doesn't seem to make
                    # rsyslog open the files
                    'rsyslog', 'restart',
                ],
                wait=False,
            ),
        )

        yield
    finally:
        log.info('Shutting down syslog monitoring...')

        run.wait(
            ctx.cluster.run(
                args=[
                    'sudo', 'rm', '-f', '--', CONF,
                    run.Raw('&&'),
                    'sudo', 'service', 'rsyslog', 'restart',
                ],
                wait=False,
            ),
        )
        # race condition: nothing actually says rsyslog had time to
        # flush the file fully. oh well.

        log.info('Checking logs for errors...')
        # dict.iterkeys() was removed in Python 3; use keys()
        for rem in ctx.cluster.remotes.keys():
            log.debug('Checking %s', rem.name)
            r = rem.run(
                args=[
                    'egrep', '--binary-files=text',
                    '\\bBUG\\b|\\bINFO\\b|\\bDEADLOCK\\b',
                    run.Raw('{adir}/syslog/*.log'.format(adir=archive_dir)),
                    run.Raw('|'), 'grep', '-v', 'task .* blocked for more than .* seconds',
                    run.Raw('|'), 'grep', '-v', 'lockdep is turned off',
                    run.Raw('|'), 'grep', '-v', 'trying to register non-static key',
                    run.Raw('|'), 'grep', '-v', 'DEBUG: fsize',  # xfs_fsr
                    run.Raw('|'), 'grep', '-v', 'CRON',  # ignore cron noise
                    run.Raw('|'), 'grep', '-v', 'BUG: bad unlock balance detected',  # #6097
                    run.Raw('|'), 'grep', '-v', 'inconsistent lock state',  # FIXME see #2523
                    run.Raw('|'), 'grep', '-v', '*** DEADLOCK ***',  # part of lockdep output
                    run.Raw('|'), 'grep', '-v',
                    # FIXME see #2590 and #147
                    'INFO: possible irq lock inversion dependency detected',
                    run.Raw('|'), 'grep', '-v',
                    'INFO: NMI handler (perf_event_nmi_handler) took too long to run',  # noqa
                    run.Raw('|'), 'grep', '-v', 'INFO: recovery required on readonly',
                    run.Raw('|'), 'grep', '-v', 'ceph-create-keys: INFO',
                    run.Raw('|'), 'head', '-n', '1',
                ],
                stdout=StringIO(),
            )
            stdout = r.stdout.getvalue()
            if stdout != '':
                log.error('Error in syslog on %s: %s', rem.name, stdout)
                set_status(ctx.summary, 'fail')
                if 'failure_reason' not in ctx.summary:
                    ctx.summary['failure_reason'] = \
                        "'{error}' in syslog".format(error=stdout)

        log.info('Compressing syslogs...')
        run.wait(
            ctx.cluster.run(
                args=[
                    'find',
                    '{adir}/syslog'.format(adir=archive_dir),
                    '-name',
                    '*.log',
                    '-print0',
                    run.Raw('|'),
                    'sudo',
                    'xargs',
                    '-0',
                    '--no-run-if-empty',
                    '--',
                    'gzip',
                    '--',
                ],
                wait=False,
            ),
        )
def syslog(ctx, config):
    """
    start syslog / stop syslog on exit.
    """
    if ctx.archive is None:
        # disable this whole feature if we're not going to archive the data anyway
        yield
        return

    log.info("Starting syslog monitoring...")

    archive_dir = misc.get_archive_dir(ctx)
    log_dir = "{adir}/syslog".format(adir=archive_dir)
    run.wait(ctx.cluster.run(args=["mkdir", "-p", "-m0755", "--", log_dir],
                             wait=False))

    CONF = "/etc/rsyslog.d/80-cephtest.conf"
    kern_log = "{log_dir}/kern.log".format(log_dir=log_dir)
    misc_log = "{log_dir}/misc.log".format(log_dir=log_dir)
    conf_lines = [
        "kern.* -{kern_log};RSYSLOG_FileFormat".format(kern_log=kern_log),
        "*.*;kern.none -{misc_log};RSYSLOG_FileFormat".format(misc_log=misc_log),
    ]
    conf_fp = StringIO("\n".join(conf_lines))
    try:
        # dict.iterkeys() was removed in Python 3; use keys()
        for rem in ctx.cluster.remotes.keys():
            log_context = "system_u:object_r:var_log_t:s0"
            for log_path in (kern_log, misc_log):
                rem.run(args="touch %s" % log_path)
                rem.chcon(log_path, log_context)
            misc.sudo_write_file(remote=rem, path=CONF, data=conf_fp)
            conf_fp.seek(0)
        run.wait(
            ctx.cluster.run(
                args=[
                    "sudo", "service",
                    # a mere reload (SIGHUP) doesn't seem to make
                    # rsyslog open the files
                    "rsyslog", "restart",
                ],
                wait=False,
            )
        )

        yield
    finally:
        log.info("Shutting down syslog monitoring...")

        run.wait(
            ctx.cluster.run(
                args=["sudo", "rm", "-f", "--", CONF,
                      run.Raw("&&"),
                      "sudo", "service", "rsyslog", "restart"],
                wait=False,
            )
        )
        # race condition: nothing actually says rsyslog had time to
        # flush the file fully. oh well.

        log.info("Checking logs for errors...")
        # dict.iterkeys() was removed in Python 3; use keys()
        for rem in ctx.cluster.remotes.keys():
            log.debug("Checking %s", rem.name)
            r = rem.run(
                args=[
                    "egrep", "--binary-files=text",
                    "\\bBUG\\b|\\bINFO\\b|\\bDEADLOCK\\b",
                    run.Raw("{adir}/syslog/*.log".format(adir=archive_dir)),
                    run.Raw("|"), "grep", "-v", "task .* blocked for more than .* seconds",
                    run.Raw("|"), "grep", "-v", "lockdep is turned off",
                    run.Raw("|"), "grep", "-v", "trying to register non-static key",
                    run.Raw("|"), "grep", "-v", "DEBUG: fsize",  # xfs_fsr
                    run.Raw("|"), "grep", "-v", "CRON",  # ignore cron noise
                    run.Raw("|"), "grep", "-v", "BUG: bad unlock balance detected",  # #6097
                    run.Raw("|"), "grep", "-v", "inconsistent lock state",  # FIXME see #2523
                    run.Raw("|"), "grep", "-v", "*** DEADLOCK ***",  # part of lockdep output
                    run.Raw("|"), "grep", "-v",
                    # FIXME see #2590 and #147
                    "INFO: possible irq lock inversion dependency detected",
                    run.Raw("|"), "grep", "-v",
                    "INFO: NMI handler (perf_event_nmi_handler) took too long to run",
                    run.Raw("|"), "grep", "-v", "INFO: recovery required on readonly",
                    run.Raw("|"), "grep", "-v", "ceph-create-keys: INFO",
                    run.Raw("|"), "head", "-n", "1",
                ],
                stdout=StringIO(),
            )
            stdout = r.stdout.getvalue()
            if stdout != "":
                log.error("Error in syslog on %s: %s", rem.name, stdout)
                set_status(ctx.summary, "fail")
                if "failure_reason" not in ctx.summary:
                    ctx.summary["failure_reason"] = \
                        "'{error}' in syslog".format(error=stdout)

        log.info("Compressing syslogs...")
        run.wait(
            ctx.cluster.run(
                args=[
                    "find", "{adir}/syslog".format(adir=archive_dir),
                    "-name", "*.log", "-print0",
                    run.Raw("|"),
                    "sudo", "xargs", "-0", "--no-run-if-empty", "--",
                    "gzip", "--",
                ],
                wait=False,
            )
        )
def coredump(ctx, config):
    """
    Stash a coredump of this system if an error occurs.
    """
    log.info("Enabling coredump saving...")
    archive_dir = misc.get_archive_dir(ctx)
    # Point kernel.core_pattern into the archive so cores are captured.
    run.wait(
        ctx.cluster.run(
            args=[
                "install", "-d", "-m0755", "--",
                "{adir}/coredump".format(adir=archive_dir),
                run.Raw("&&"),
                "sudo", "sysctl", "-w",
                "kernel.core_pattern={adir}/coredump/%t.%p.core".format(adir=archive_dir),
            ],
            wait=False,
        )
    )
    try:
        yield
    finally:
        run.wait(
            ctx.cluster.run(
                args=[
                    "sudo", "sysctl", "-w", "kernel.core_pattern=core",
                    run.Raw("&&"),
                    # don't litter the archive dir if there were no cores dumped
                    "rmdir", "--ignore-fail-on-non-empty", "--",
                    "{adir}/coredump".format(adir=archive_dir),
                ],
                wait=False,
            )
        )

        # set status = 'fail' if the dir is still there = coredumps were
        # seen
        # dict.iterkeys() was removed in Python 3; use keys()
        for rem in ctx.cluster.remotes.keys():
            r = rem.run(
                args=[
                    "if", "test", "!", "-e",
                    "{adir}/coredump".format(adir=archive_dir),
                    run.Raw(";"),
                    "then", "echo", "OK",
                    run.Raw(";"),
                    "fi",
                ],
                stdout=StringIO(),
            )
            if r.stdout.getvalue() != "OK\n":
                log.warning("Found coredumps on %s, flagging run as failed",
                            rem)
                set_status(ctx.summary, "fail")
                if "failure_reason" not in ctx.summary:
                    ctx.summary["failure_reason"] = \
                        "Found coredumps on {rem}".format(rem=rem)
def coredump(ctx, config):
    """
    Stash a coredump of this system if an error occurs.
    """
    log.info('Enabling coredump saving...')
    # NOTE(review): containers are filtered out here, presumably because
    # sysctl changes apply to the shared host kernel — confirm.
    cluster = ctx.cluster.filter(lambda r: not r.is_container)
    archive_dir = misc.get_archive_dir(ctx)
    run.wait(
        cluster.run(
            args=[
                'install', '-d', '-m0755', '--',
                '{adir}/coredump'.format(adir=archive_dir),
                run.Raw('&&'),
                'sudo', 'sysctl', '-w',
                'kernel.core_pattern={adir}/coredump/%t.%p.core'.format(
                    adir=archive_dir),
                run.Raw('&&'),
                # Also append the pattern to /etc/sysctl.conf so it is
                # re-applied if the node reboots mid-run.
                'echo',
                'kernel.core_pattern={adir}/coredump/%t.%p.core'.format(
                    adir=archive_dir),
                run.Raw('|'),
                'sudo', 'tee', '-a', '/etc/sysctl.conf',
            ],
            wait=False,
        ))
    try:
        yield
    finally:
        cluster = ctx.cluster.filter(lambda r: not r.is_container)
        run.wait(
            cluster.run(
                args=[
                    'sudo', 'sysctl', '-w', 'kernel.core_pattern=core',
                    run.Raw('&&'),
                    # Delete cores identified as systemd-sysusers dumps
                    # before deciding whether any "real" cores remain.
                    'sudo', 'bash', '-c',
                    (f'for f in `find {archive_dir}/coredump -type f`; do '
                     'file $f | grep -q systemd-sysusers && rm $f || true ; '
                     'done'),
                    run.Raw('&&'),
                    # don't litter the archive dir if there were no cores dumped
                    'rmdir', '--ignore-fail-on-non-empty', '--',
                    '{adir}/coredump'.format(adir=archive_dir),
                ],
                wait=False,
            ))

        # set status = 'fail' if the dir is still there = coredumps were
        # seen
        for rem in cluster.remotes.keys():
            try:
                rem.sh("test -e " + archive_dir + "/coredump")
            except run.CommandFailedError:
                # dir was removed above, i.e. no cores on this remote
                continue
            log.warning('Found coredumps on %s, flagging run as failed', rem)
            set_status(ctx.summary, 'fail')
            if 'failure_reason' not in ctx.summary:
                ctx.summary['failure_reason'] = \
                    'Found coredumps on {rem}'.format(rem=rem)
def test_set_status_dead(self):
    # Setting 'dead' must also mark the job as unsuccessful.
    summary = {}
    job_status.set_status(summary, 'dead')
    assert summary == {'status': 'dead', 'success': False}
def test_set_status_pass(self):
    # Setting 'pass' must also mark the job as successful.
    summary = {}
    job_status.set_status(summary, 'pass')
    assert summary == {'status': 'pass', 'success': True}
def syslog(ctx, config):
    """
    start syslog / stop syslog on exit.
    """
    if ctx.archive is None:
        # disable this whole feature if we're not going to archive the data anyway
        yield
        return

    log.info('Starting syslog monitoring...')

    archive_dir = misc.get_archive_dir(ctx)
    run.wait(
        ctx.cluster.run(
            args=[
                'mkdir', '-p', '-m0755', '--',
                '{adir}/syslog'.format(adir=archive_dir),
            ],
            wait=False,
        )
    )

    CONF = '/etc/rsyslog.d/80-cephtest.conf'
    conf_fp = StringIO('''
kern.* -{adir}/syslog/kern.log;RSYSLOG_FileFormat
*.*;kern.none -{adir}/syslog/misc.log;RSYSLOG_FileFormat
'''.format(adir=archive_dir))
    try:
        # dict.iterkeys() was removed in Python 3; use keys()
        for rem in ctx.cluster.remotes.keys():
            misc.sudo_write_file(
                remote=rem,
                path=CONF,
                data=conf_fp,
            )
            conf_fp.seek(0)
        run.wait(
            ctx.cluster.run(
                args=[
                    'sudo', 'service',
                    # a mere reload (SIGHUP) doesn't seem to make
                    # rsyslog open the files
                    'rsyslog', 'restart',
                ],
                wait=False,
            ),
        )

        yield
    finally:
        log.info('Shutting down syslog monitoring...')

        run.wait(
            ctx.cluster.run(
                args=[
                    'sudo', 'rm', '-f', '--', CONF,
                    run.Raw('&&'),
                    'sudo', 'service', 'rsyslog', 'restart',
                ],
                wait=False,
            ),
        )
        # race condition: nothing actually says rsyslog had time to
        # flush the file fully. oh well.

        log.info('Checking logs for errors...')
        # dict.iterkeys() was removed in Python 3; use keys()
        for rem in ctx.cluster.remotes.keys():
            log.debug('Checking %s', rem.name)
            r = rem.run(
                args=[
                    'egrep', '--binary-files=text',
                    '\\bBUG\\b|\\bINFO\\b|\\bDEADLOCK\\b',
                    run.Raw('{adir}/syslog/*.log'.format(adir=archive_dir)),
                    run.Raw('|'), 'grep', '-v', 'task .* blocked for more than .* seconds',
                    run.Raw('|'), 'grep', '-v', 'lockdep is turned off',
                    run.Raw('|'), 'grep', '-v', 'trying to register non-static key',
                    run.Raw('|'), 'grep', '-v', 'DEBUG: fsize',  # xfs_fsr
                    run.Raw('|'), 'grep', '-v', 'CRON',  # ignore cron noise
                    run.Raw('|'), 'grep', '-v', 'BUG: bad unlock balance detected',  # #6097
                    run.Raw('|'), 'grep', '-v', 'inconsistent lock state',  # FIXME see #2523
                    run.Raw('|'), 'grep', '-v', '*** DEADLOCK ***',  # part of lockdep output
                    run.Raw('|'), 'grep', '-v',
                    # FIXME see #2590 and #147
                    'INFO: possible irq lock inversion dependency detected',
                    run.Raw('|'), 'grep', '-v',
                    'INFO: NMI handler (perf_event_nmi_handler) took too long to run',
                    run.Raw('|'), 'grep', '-v', 'INFO: recovery required on readonly',
                    run.Raw('|'), 'head', '-n', '1',
                ],
                stdout=StringIO(),
            )
            stdout = r.stdout.getvalue()
            if stdout != '':
                log.error('Error in syslog on %s: %s', rem.name, stdout)
                set_status(ctx.summary, 'fail')
                if 'failure_reason' not in ctx.summary:
                    ctx.summary['failure_reason'] = \
                        "'{error}' in syslog".format(error=stdout)

        log.info('Compressing syslogs...')
        run.wait(
            ctx.cluster.run(
                args=[
                    'find',
                    '{adir}/syslog'.format(adir=archive_dir),
                    '-name',
                    '*.log',
                    '-print0',
                    run.Raw('|'),
                    'sudo',
                    'xargs',
                    '-0',
                    '--no-run-if-empty',
                    '--',
                    'gzip',
                    '--',
                ],
                wait=False,
            ),
        )
def test_set_then_get_status_dead(self):
    # get_status must round-trip the value that set_status stored.
    summary = {}
    job_status.set_status(summary, 'dead')
    assert job_status.get_status(summary) == 'dead'
def test_set_status_none(self):
    # Setting a None status is a no-op: the summary stays empty.
    summary = {}
    job_status.set_status(summary, None)
    assert summary == {}
def lock_machines(ctx, config):
    """
    Lock machines.  Called when the teuthology run finds and locks
    new machines.  This is not called if the one has teuthology-locked
    machines and placed those keys in the Targets section of a yaml file.
    """
    # It's OK for os_type and os_version to be None here.  If we're trying
    # to lock a bare metal machine, we'll take whatever is available.  If
    # we want a vps, defaults will be provided by misc.get_distro and
    # misc.get_distro_version in provision.create_if_vm
    os_type = ctx.config.get("os_type")
    os_version = ctx.config.get("os_version")
    arch = ctx.config.get('arch')
    log.info('Locking machines...')
    assert isinstance(config[0], int), 'config[0] must be an integer'
    machine_type = config[1]
    total_requested = config[0]
    # We want to make sure there are always this many machines available
    reserved = teuth_config.reserve_machines
    assert isinstance(reserved, int), 'reserve_machines must be integer'
    assert (reserved >= 0), 'reserve_machines should >= 0'

    # change the status during the locking process
    report.try_push_job_info(ctx.config, dict(status='waiting'))

    all_locked = dict()
    requested = total_requested
    while True:
        # get a candidate list of machines
        machines = lock.list_locks(machine_type=machine_type, up=True,
                                   locked=False,
                                   count=requested + reserved)
        if machines is None:
            if ctx.block:
                log.error('Error listing machines, trying again')
                time.sleep(20)
                continue
            else:
                raise RuntimeError('Error listing machines')

        # make sure there are machines for non-automated jobs to run
        if len(machines) < reserved + requested \
                and ctx.owner.startswith('scheduled'):
            if ctx.block:
                log.info(
                    'waiting for more %s machines to be free (need %s + %s, have %s)...',
                    machine_type,
                    reserved,
                    requested,
                    len(machines),
                )
                time.sleep(10)
                continue
            else:
                assert 0, ('not enough machines free; need %s + %s, have %s' %
                           (reserved, requested, len(machines)))

        try:
            newly_locked = lock.lock_many(ctx, requested, machine_type,
                                          ctx.owner, ctx.archive, os_type,
                                          os_version, arch)
        except Exception:
            # Lock failures should map to the 'dead' status instead of 'fail'
            set_status(ctx.summary, 'dead')
            raise
        all_locked.update(newly_locked)
        log.info(
            '{newly_locked} {mtype} machines locked this try, '
            '{total_locked}/{total_requested} locked so far'.format(
                newly_locked=len(newly_locked),
                mtype=machine_type,
                total_locked=len(all_locked),
                total_requested=total_requested,
            )
        )
        if len(all_locked) == total_requested:
            vmlist = []
            for lmach in all_locked:
                if misc.is_vm(lmach):
                    vmlist.append(lmach)
            if vmlist:
                log.info('Waiting for virtual machines to come up')
                keys_dict = dict()
                loopcount = 0
                while len(keys_dict) != len(vmlist):
                    loopcount += 1
                    time.sleep(10)
                    keys_dict = misc.ssh_keyscan(vmlist)
                    log.info('virtual machine is still unavailable')
                    if loopcount == 40:
                        loopcount = 0
                        log.info('virtual machine(s) still not up, ' +
                                 'recreating unresponsive ones.')
                        for guest in vmlist:
                            if guest not in keys_dict.keys():
                                log.info('recreating: ' + guest)
                                full_name = misc.canonicalize_hostname(guest)
                                provision.destroy_if_vm(ctx, full_name)
                                provision.create_if_vm(ctx, full_name)
                if lock.do_update_keys(keys_dict):
                    log.info("Error in virtual machine keys")
                newscandict = {}
                # dict.iterkeys() was removed in Python 3; use keys()
                for dkey in all_locked.keys():
                    stats = lockstatus.get_status(dkey)
                    newscandict[dkey] = stats['ssh_pub_key']
                ctx.config['targets'] = newscandict
            else:
                ctx.config['targets'] = all_locked
            locked_targets = yaml.safe_dump(
                ctx.config['targets'],
                default_flow_style=False
            ).splitlines()
            log.info('\n '.join(['Locked targets:', ] + locked_targets))
            # successfully locked machines, change status back to running
            report.try_push_job_info(ctx.config, dict(status='running'))
            break
        elif not ctx.block:
            assert 0, 'not enough machines are available'
        else:
            requested = requested - len(newly_locked)
            # Fixed: the original split string read "went""negative" with
            # no separating space.
            assert requested > 0, "lock_machines: requested counter went " \
                "negative, this shouldn't happen"
            log.info(
                "{total} machines locked ({new} new); need {more} more".format(
                    total=len(all_locked), new=len(newly_locked),
                    more=requested)
            )
            # logging.warn is deprecated; use warning()
            log.warning('Could not lock enough machines, waiting...')
            time.sleep(10)
    try:
        yield
    finally:
        # If both unlock_on_failure and nuke-on-error are set, don't unlock now
        # because we're just going to nuke (and unlock) later.
        unlock_on_failure = (
            ctx.config.get('unlock_on_failure', False)
            and not ctx.config.get('nuke-on-error', False)
        )
        if get_status(ctx.summary) == 'pass' or unlock_on_failure:
            log.info('Unlocking machines...')
            # dict.iterkeys() was removed in Python 3; use keys()
            for machine in ctx.config['targets'].keys():
                lock.unlock_one(ctx, machine, ctx.owner, ctx.archive)
def lock_machines(ctx, config):
    """
    Lock machines.  Called when the teuthology run finds and locks
    new machines.  This is not called if the one has teuthology-locked
    machines and placed those keys in the Targets section of a yaml file.

    :param ctx:    teuthology run context; reads ``ctx.config``,
                   ``ctx.block``, ``ctx.owner``, ``ctx.archive`` and writes
                   ``ctx.config['targets']`` and ``ctx.summary``.
    :param config: two-element sequence: ``config[0]`` is the number of
                   machines to lock (int), ``config[1]`` is the machine type.

    Yields once all machines are locked; on exit, unlocks them unless the
    run failed and unlock-on-failure/nuke-on-error settings say otherwise.
    """
    # It's OK for os_type and os_version to be None here.  If we're trying
    # to lock a bare metal machine, we'll take whatever is available.  If
    # we want a vps, defaults will be provided by misc.get_distro and
    # misc.get_distro_version in provision.create_if_vm
    os_type = ctx.config.get("os_type")
    os_version = ctx.config.get("os_version")
    arch = ctx.config.get('arch')
    log.info('Locking machines...')
    assert isinstance(config[0], int), 'config[0] must be an integer'
    machine_type = config[1]
    total_requested = config[0]
    # We want to make sure there are always this many machines available
    reserved = teuth_config.reserve_machines
    assert isinstance(reserved, int), 'reserve_machines must be integer'
    assert (reserved >= 0), 'reserve_machines should >= 0'

    # change the status during the locking process
    report.try_push_job_info(ctx.config, dict(status='waiting'))

    all_locked = dict()
    requested = total_requested
    while True:
        # get a candidate list of machines
        machines = teuthology.lock.query.list_locks(
            machine_type=machine_type, up=True,
            locked=False, count=requested + reserved)
        if machines is None:
            if ctx.block:
                log.error('Error listing machines, trying again')
                time.sleep(20)
                continue
            else:
                raise RuntimeError('Error listing machines')

        # make sure there are machines for non-automated jobs to run
        if len(machines) < reserved + requested and ctx.owner.startswith(
                'scheduled'):
            if ctx.block:
                log.info(
                    'waiting for more %s machines to be free (need %s + %s, have %s)...',
                    machine_type,
                    reserved,
                    requested,
                    len(machines),
                )
                time.sleep(10)
                continue
            else:
                assert 0, ('not enough machines free; need %s + %s, have %s' %
                           (reserved, requested, len(machines)))

        try:
            newly_locked = teuthology.lock.ops.lock_many(
                ctx, requested, machine_type, ctx.owner, ctx.archive,
                os_type, os_version, arch)
        except Exception:
            # Lock failures should map to the 'dead' status instead of 'fail'
            set_status(ctx.summary, 'dead')
            raise
        all_locked.update(newly_locked)
        log.info('{newly_locked} {mtype} machines locked this try, '
                 '{total_locked}/{total_requested} locked so far'.format(
                     newly_locked=len(newly_locked),
                     mtype=machine_type,
                     total_locked=len(all_locked),
                     total_requested=total_requested,
                 ))
        if len(all_locked) == total_requested:
            vmlist = []
            for lmach in all_locked:
                if teuthology.lock.query.is_vm(lmach):
                    vmlist.append(lmach)
            if vmlist:
                log.info('Waiting for virtual machines to come up')
                keys_dict = dict()
                loopcount = 0
                # poll until every VM answers an ssh keyscan; every 40
                # iterations (~400s) recreate the ones still unresponsive
                while len(keys_dict) != len(vmlist):
                    loopcount += 1
                    time.sleep(10)
                    keys_dict = misc.ssh_keyscan(vmlist)
                    log.info('virtual machine is still unavailable')
                    if loopcount == 40:
                        loopcount = 0
                        log.info('virtual machine(s) still not up, ' +
                                 'recreating unresponsive ones.')
                        for guest in vmlist:
                            if guest not in keys_dict.keys():
                                log.info('recreating: ' + guest)
                                full_name = misc.canonicalize_hostname(guest)
                                provision.destroy_if_vm(ctx, full_name)
                                provision.create_if_vm(ctx, full_name)
                if teuthology.lock.ops.do_update_keys(keys_dict)[0]:
                    log.info("Error in virtual machine keys")
                # re-read the freshly updated public keys for the targets
                newscandict = {}
                for dkey in all_locked.keys():
                    stats = teuthology.lock.query.get_status(dkey)
                    newscandict[dkey] = stats['ssh_pub_key']
                ctx.config['targets'] = newscandict
            else:
                ctx.config['targets'] = all_locked
            locked_targets = yaml.safe_dump(
                ctx.config['targets'],
                default_flow_style=False).splitlines()
            log.info('\n '.join(['Locked targets:', ] + locked_targets))
            # successfully locked machines, change status back to running
            report.try_push_job_info(ctx.config, dict(status='running'))
            break
        elif not ctx.block:
            assert 0, 'not enough machines are available'
        else:
            requested = requested - len(newly_locked)
            # BUGFIX: the two implicitly-concatenated literals previously
            # produced "wentnegative" (missing space between them)
            assert requested > 0, "lock_machines: requested counter went " \
                "negative, this shouldn't happen"
            log.info(
                "{total} machines locked ({new} new); need {more} more".format(
                    total=len(all_locked), new=len(newly_locked),
                    more=requested)
            )
        # log.warning: 'warn' is a deprecated alias on Python 3
        log.warning('Could not lock enough machines, waiting...')
        time.sleep(10)
    try:
        yield
    finally:
        # If both unlock_on_failure and nuke-on-error are set, don't unlock now
        # because we're just going to nuke (and unlock) later.
        unlock_on_failure = (
            ctx.config.get('unlock_on_failure', False)
            and not ctx.config.get('nuke-on-error', False)
        )
        if get_status(ctx.summary) == 'pass' or unlock_on_failure:
            log.info('Unlocking machines...')
            for machine in ctx.config['targets'].keys():
                teuthology.lock.ops.unlock_one(ctx, machine, ctx.owner,
                                               ctx.archive)
def run_tasks(tasks, ctx):
    """
    Run the job's tasks in order, then unwind them in reverse order.

    :param tasks: list of single-key dicts, each mapping a task name to its
                  config (the yaml ``tasks:`` section).
    :param ctx:   teuthology run context; ``ctx.summary`` is updated with
                  status/failure information, ``ctx.config`` is read for
                  archive/job/sentry/interactive settings.

    Each task's context manager is entered as it is encountered; entered
    managers are pushed onto ``stack`` so the ``finally`` clause can call
    ``__exit__`` on them in reverse order even if a later task fails.
    Timing for each enter/exit is recorded via ``timer``.
    """
    archive_path = ctx.config.get('archive_path')
    if archive_path:
        timer = Timer(
            path=os.path.join(archive_path, 'timing.yaml'),
            sync=True,
        )
    else:
        timer = Timer()
    stack = []
    try:
        for taskdict in tasks:
            try:
                # each task entry must be a dict with exactly one key
                ((taskname, config), ) = taskdict.items()
            except (ValueError, AttributeError):
                raise RuntimeError('Invalid task definition: %s' % taskdict)
            log.info('Running task %s...', taskname)
            timer.mark('%s enter' % taskname)
            manager = run_one_task(taskname, ctx=ctx, config=config)
            # only context-manager-style tasks need explicit unwinding
            if hasattr(manager, '__enter__'):
                stack.append((taskname, manager))
                manager.__enter__()
    # BaseException: also catch KeyboardInterrupt/SystemExit so status
    # and failure_reason are recorded before unwinding
    except BaseException as e:
        if isinstance(e, ConnectionLostError):
            # Prevent connection issues being flagged as failures
            set_status(ctx.summary, 'dead')
        else:
            # the status may have been set to dead, leave it as-is if so
            if not ctx.summary.get('status', '') == 'dead':
                set_status(ctx.summary, 'fail')
        if 'failure_reason' not in ctx.summary:
            ctx.summary['failure_reason'] = str(e)
        log.exception('Saw exception from tasks.')

        # optionally report the failure to Sentry, with ssh keys redacted
        if teuth_config.sentry_dsn:
            sentry_sdk.init(teuth_config.sentry_dsn)
            config = deepcopy(ctx.config)

            tags = {
                'task': taskname,
                'owner': ctx.owner,
            }
            optional_tags = ('teuthology_branch', 'branch', 'suite',
                             'machine_type', 'os_type', 'os_version')
            for tag in optional_tags:
                if tag in config:
                    tags[tag] = config[tag]

            # Remove ssh keys from reported config
            if 'targets' in config:
                targets = config['targets']
                for host in targets.keys():
                    targets[host] = '<redacted>'

            job_id = ctx.config.get('job_id')
            archive_path = ctx.config.get('archive_path')
            extras = dict(config=config, )
            if job_id:
                extras['logs'] = get_http_log_path(archive_path, job_id)

            # exceptions may provide a custom Sentry fingerprint for grouping
            fingerprint = e.fingerprint() if hasattr(e, 'fingerprint') else None
            exc_id = sentry_sdk.capture_exception(
                error=e,
                tags=tags,
                extras=extras,
                fingerprint=fingerprint,
            )
            event_url = "{server}/?query={id}".format(
                server=teuth_config.sentry_server.strip('/'), id=exc_id)
            log.exception(" Sentry event: %s" % event_url)
            ctx.summary['sentry_event'] = event_url

        if ctx.config.get('interactive-on-error'):
            # disable so the unwind path below doesn't drop into the
            # interactive shell a second time
            ctx.config['interactive-on-error'] = False
            from teuthology.task import interactive
            log.warning(
                'Saw failure during task execution, going into interactive mode...'
            )
            interactive.task(ctx=ctx, config=None)
        # Throughout teuthology, (x,) = y has been used to assign values
        # from yaml files where only one entry of type y is correct.  This
        # causes failures with 'too many values to unpack.'  We want to
        # fail as before, but with easier to understand error indicators.
        if isinstance(e, ValueError):
            if str(e) == 'too many values to unpack':
                emsg = 'Possible configuration error in yaml file'
                log.error(emsg)
                ctx.summary['failure_info'] = emsg
    finally:
        try:
            # capture the task exception (if any) so it can be passed to
            # each manager's __exit__ and re-raised afterwards
            exc_info = sys.exc_info()
            sleep_before_teardown = ctx.config.get('sleep_before_teardown')
            if sleep_before_teardown:
                log.info('Sleeping for {} seconds before unwinding because'
                         ' --sleep-before-teardown was given...'.format(
                             sleep_before_teardown))
                notify_sleep_before_teardown(ctx, stack,
                                             sleep_before_teardown)
                time.sleep(sleep_before_teardown)
            # unwind entered managers in reverse (LIFO) order
            while stack:
                taskname, manager = stack.pop()
                log.debug('Unwinding manager %s', taskname)
                timer.mark('%s exit' % taskname)
                try:
                    suppress = manager.__exit__(*exc_info)
                except Exception as e:
                    if isinstance(e, ConnectionLostError):
                        # Prevent connection issues being flagged as failures
                        set_status(ctx.summary, 'dead')
                    else:
                        set_status(ctx.summary, 'fail')
                    if 'failure_reason' not in ctx.summary:
                        ctx.summary['failure_reason'] = str(e)
                    log.exception('Manager failed: %s', taskname)

                    if exc_info == (None, None, None):
                        # if first failure is in an __exit__, we don't
                        # have exc_info set yet
                        exc_info = sys.exc_info()

                    if ctx.config.get('interactive-on-error'):
                        from teuthology.task import interactive
                        log.warning(
                            'Saw failure during task cleanup, going into interactive mode...'
                        )
                        interactive.task(ctx=ctx, config=None)
                else:
                    # a truthy return from __exit__ suppresses the exception
                    if suppress:
                        exc_info = (None, None, None)
            if exc_info != (None, None, None):
                log.debug('Exception was not quenched, exiting: %s: %s',
                          exc_info[0].__name__, exc_info[1])
                raise SystemExit(1)
        finally:
            # be careful about cyclic references
            del exc_info
            timer.mark("tasks complete")