def check_packages(ctx, config):
    """
    Checks gitbuilder to determine if there are missing packages for this
    job.

    If there are missing packages, fail the job.
    """
    for task in ctx.config["tasks"]:
        # dict.keys() returns a non-indexable view on Python 3, so
        # task.keys()[0] would raise TypeError; next(iter(task)) yields
        # the task's (single) name instead.
        if next(iter(task)) == "buildpackages":
            log.info("Checking packages skipped because "
                     "the task buildpackages was found.")
            return

    log.info("Checking packages...")
    os_type = ctx.config.get("os_type")
    sha1 = ctx.config.get("sha1")
    # We can only do this check if there are a defined sha1 and os_type
    # in the job config.
    if os_type and sha1:
        package = GitbuilderProject("ceph", ctx.config)
        template = "Checking packages for os_type '{os}', " \
                   "flavor '{flav}' and ceph hash '{ver}'"
        log.info(template.format(os=package.os_type,
                                 flav=package.flavor,
                                 ver=package.sha1))
        if package.version:
            log.info("Found packages for ceph version {ver}".format(
                ver=package.version))
        else:
            msg = "Packages for distro '{d}' and ceph hash '{ver}' not found"
            msg = msg.format(d=package.distro, ver=package.sha1)
            log.error(msg)
            # set the failure message and update paddles with the status
            ctx.summary["failure_reason"] = msg
            set_status(ctx.summary, "dead")
            report.try_push_job_info(ctx.config, dict(status="dead"))
            raise VersionNotFoundError(package.base_url)
    else:
        log.info(
            "Checking packages skipped, missing os_type '{os}' or "
            "ceph hash '{ver}'".format(os=os_type, ver=sha1))
def archive(ctx, config):
    """
    Handle the creation and deletion of the archive directory.
    """
    log.info("Creating archive directory...")
    archive_dir = misc.get_archive_dir(ctx)
    run.wait(ctx.cluster.run(args=["install", "-d", "-m0755", "--",
                                   archive_dir],
                             wait=False))
    try:
        yield
    except Exception:
        # we need to know this below
        set_status(ctx.summary, "fail")
        raise
    finally:
        passed = get_status(ctx.summary) == "pass"
        # Skip transferring logs only when archive-on-error is set AND the
        # job passed; always transfer on failure.
        if ctx.archive is not None and not (ctx.config.get("archive-on-error")
                                            and passed):
            log.info("Transferring archived files...")
            logdir = os.path.join(ctx.archive, "remote")
            if not os.path.exists(logdir):
                os.mkdir(logdir)
            # dict.iterkeys() was removed in Python 3; use keys()
            for rem in ctx.cluster.remotes.keys():
                path = os.path.join(logdir, rem.shortname)
                misc.pull_directory(rem, archive_dir, path)
                # Check for coredumps and pull binaries
                fetch_binaries_for_coredumps(path, rem)

        log.info("Removing archive directory...")
        run.wait(ctx.cluster.run(args=["rm", "-rf", "--", archive_dir],
                                 wait=False))
def coredump(ctx, config):
    """
    Stash a coredump of this system if an error occurs.
    """
    log.info('Enabling coredump saving...')
    archive_dir = misc.get_archive_dir(ctx)
    # Create <archive>/coredump on each remote and redirect the kernel's
    # core_pattern there so cores dumped during the run are captured.
    run.wait(
        ctx.cluster.run(
            args=[
                'install', '-d', '-m0755', '--',
                '{adir}/coredump'.format(adir=archive_dir),
                run.Raw('&&'),
                'sudo', 'sysctl', '-w',
                'kernel.core_pattern={adir}/coredump/%t.%p.core'.format(
                    adir=archive_dir),
            ],
            wait=False,
        ))
    try:
        yield
    finally:
        # Restore the default core_pattern, then remove the coredump dir
        # only if it is empty (no cores were dumped).
        run.wait(
            ctx.cluster.run(
                args=[
                    'sudo', 'sysctl', '-w', 'kernel.core_pattern=core',
                    run.Raw('&&'),
                    # don't litter the archive dir if there were no cores dumped
                    'rmdir',
                    '--ignore-fail-on-non-empty',
                    '--',
                    '{adir}/coredump'.format(adir=archive_dir),
                ],
                wait=False,
            ))

        # set status = 'fail' if the dir is still there = coredumps were
        # seen
        for rem in ctx.cluster.remotes.keys():
            try:
                rem.sh("test -e " + archive_dir + "/coredump")
            except run.CommandFailedError:
                # dir was removed above, i.e. no cores on this remote
                continue
            log.warning('Found coredumps on %s, flagging run as failed', rem)
            set_status(ctx.summary, 'fail')
            if 'failure_reason' not in ctx.summary:
                ctx.summary['failure_reason'] = \
                    'Found coredumps on {rem}'.format(rem=rem)
def archive(ctx, config):
    """
    Handle the creation and deletion of the archive directory.
    """
    log.info('Creating archive directory...')
    archive_dir = misc.get_archive_dir(ctx)
    run.wait(
        ctx.cluster.run(
            args=['install', '-d', '-m0755', '--', archive_dir],
            wait=False,
        )
    )

    # Add logs directory to job's info log file
    misc.add_remote_path(ctx, 'init', archive_dir)

    try:
        yield
    except Exception:
        # we need to know this below
        set_status(ctx.summary, 'fail')
        raise
    finally:
        passed = get_status(ctx.summary) == 'pass'
        # Transfer unless archive-on-error is set and the job passed.
        if ctx.archive is not None and \
                not (ctx.config.get('archive-on-error') and passed):
            log.info('Transferring archived files...')
            logdir = os.path.join(ctx.archive, 'remote')
            if (not os.path.exists(logdir)):
                os.mkdir(logdir)
            for rem in ctx.cluster.remotes.keys():
                path = os.path.join(logdir, rem.shortname)
                # 'log-compress-min-size' is a human-friendly size string
                # (default '128MB'); files at or above it get gzipped on pull.
                min_size_option = ctx.config.get('log-compress-min-size',
                                                 '128MB')
                try:
                    compress_min_size_bytes = \
                        humanfriendly.parse_size(min_size_option)
                except humanfriendly.InvalidSize:
                    # A malformed size is a config error, not a runtime one.
                    msg = 'invalid "log-compress-min-size": {}'.format(min_size_option)
                    log.error(msg)
                    raise ConfigError(msg)
                maybe_compress = functools.partial(gzip_if_too_large,
                                                   compress_min_size_bytes)
                misc.pull_directory(rem, archive_dir, path, maybe_compress)
                # Check for coredumps and pull binaries
                fetch_binaries_for_coredumps(path, rem)

        log.info('Removing archive directory...')
        run.wait(
            ctx.cluster.run(
                args=['rm', '-rf', '--', archive_dir],
                wait=False,
            ),
        )
def coredump(ctx, config):
    """
    Stash a coredump of this system if an error occurs.
    """
    log.info('Enabling coredump saving...')
    archive_dir = misc.get_archive_dir(ctx)
    # Point kernel.core_pattern into the archive so cores are captured.
    run.wait(
        ctx.cluster.run(
            args=[
                'install', '-d', '-m0755', '--',
                '{adir}/coredump'.format(adir=archive_dir),
                run.Raw('&&'),
                'sudo', 'sysctl', '-w',
                'kernel.core_pattern={adir}/coredump/%t.%p.core'.format(adir=archive_dir),
            ],
            wait=False,
        )
    )
    try:
        yield
    finally:
        run.wait(
            ctx.cluster.run(
                args=[
                    'sudo', 'sysctl', '-w', 'kernel.core_pattern=core',
                    run.Raw('&&'),
                    # don't litter the archive dir if there were no cores dumped
                    'rmdir', '--ignore-fail-on-non-empty', '--',
                    '{adir}/coredump'.format(adir=archive_dir),
                ],
                wait=False,
            )
        )

        # set status = 'fail' if the dir is still there = coredumps were
        # seen
        # dict.iterkeys() was removed in Python 3; use keys()
        for rem in ctx.cluster.remotes.keys():
            r = rem.run(
                args=[
                    'if', 'test', '!', '-e',
                    '{adir}/coredump'.format(adir=archive_dir),
                    run.Raw(';'),
                    'then', 'echo', 'OK',
                    run.Raw(';'),
                    'fi',
                ],
                stdout=StringIO(),
            )
            if r.stdout.getvalue() != 'OK\n':
                log.warning('Found coredumps on %s, flagging run as failed',
                            rem)
                set_status(ctx.summary, 'fail')
                if 'failure_reason' not in ctx.summary:
                    ctx.summary['failure_reason'] = \
                        'Found coredumps on {rem}'.format(rem=rem)
def report_job(self, run_name, job_id, job_info=None, dead=False):
    """
    Report a single job to the results server.

    :param run_name: The name of the run. The run must already exist.
    :param job_id:   The job's id
    :param job_info: The job's info dict. Optional - if not present, we look
                     at the archive.
    :param dead:     If True and the job has no status yet, mark it 'dead'
                     before reporting.
    :returns:        job_id
    """
    if job_info is not None and not isinstance(job_info, dict):
        raise TypeError("job_info must be a dict")
    run_uri = "{base}/runs/{name}/jobs/".format(
        base=self.base_uri, name=run_name,
    )
    if job_info is None:
        job_info = self.serializer.job_info(run_name, job_id)
    if dead and get_status(job_info) is None:
        set_status(job_info, 'dead')
    job_json = json.dumps(job_info)
    headers = {'content-type': 'application/json'}
    # First try to create the job with a POST; 200 means success.
    response = self.session.post(run_uri, data=job_json, headers=headers)

    if response.status_code == 200:
        return job_id

    # This call is wrapped in a try/except because of:
    #     http://tracker.ceph.com/issues/8166
    try:
        resp_json = response.json()
    except ValueError:
        resp_json = dict()

    if resp_json:
        msg = resp_json.get('message', '')
    else:
        msg = response.text

    # If the job already exists, retry with a PUT to update it in place.
    if msg and msg.endswith('already exists'):
        job_uri = os.path.join(run_uri, job_id, '')
        response = self.session.put(job_uri, data=job_json, headers=headers)
    elif msg:
        self.log.error(
            "POST to {uri} failed with status {status}: {msg}".format(
                uri=run_uri,
                status=response.status_code,
                msg=msg,
            ))
    # Raise for any remaining HTTP error (from the POST or the PUT).
    response.raise_for_status()

    return job_id
def check_packages(ctx, config):
    """
    Checks gitbuilder to determine if there are missing packages for this
    job.

    If there are missing packages, fail the job.
    """
    for task in ctx.config['tasks']:
        # dict.keys() returns a non-indexable view on Python 3;
        # next(iter(task)) retrieves the task's (single) name.
        if next(iter(task)) == 'buildpackages':
            log.info("Checking packages skipped because "
                     "the task buildpackages was found.")
            return

    log.info("Checking packages...")
    os_type = ctx.config.get("os_type")
    sha1 = ctx.config.get("sha1")
    # We can only do this check if there are a defined sha1 and os_type
    # in the job config.
    if os_type and sha1:
        package = get_builder_project()("ceph", ctx.config)
        template = "Checking packages for os_type '{os}', " \
                   "flavor '{flav}' and ceph hash '{ver}'"
        log.info(
            template.format(
                os=package.os_type,
                flav=package.flavor,
                ver=package.sha1,
            )
        )
        if package.version:
            log.info("Found packages for ceph version {ver}".format(
                ver=package.version
            ))
        else:
            msg = "Packages for distro '{d}' and ceph hash '{ver}' not found"
            msg = msg.format(
                d=package.distro,
                ver=package.sha1,
            )
            log.error(msg)
            # set the failure message and update paddles with the status
            ctx.summary["failure_reason"] = msg
            set_status(ctx.summary, "dead")
            report.try_push_job_info(ctx.config, dict(status='dead'))
            raise VersionNotFoundError(package.base_url)
    else:
        log.info(
            "Checking packages skipped, missing os_type '{os}' or "
            "ceph hash '{ver}'".format(
                os=os_type,
                ver=sha1,
            )
        )
def archive(ctx, config):
    """
    Handle the creation and deletion of the archive directory.
    """
    log.info('Creating archive directory...')
    archive_dir = misc.get_archive_dir(ctx)
    run.wait(
        ctx.cluster.run(
            args=['install', '-d', '-m0755', '--', archive_dir],
            wait=False,
        ))
    try:
        yield
    except Exception:
        # we need to know this below
        set_status(ctx.summary, 'fail')
        raise
    finally:
        passed = get_status(ctx.summary) == 'pass'
        # Transfer unless archive-on-error is set and the job passed.
        if ctx.archive is not None and \
                not (ctx.config.get('archive-on-error') and passed):
            log.info('Transferring archived files...')
            logdir = os.path.join(ctx.archive, 'remote')
            if not os.path.exists(logdir):
                os.mkdir(logdir)
            # dict.iterkeys() was removed in Python 3; use keys()
            for rem in ctx.cluster.remotes.keys():
                path = os.path.join(logdir, rem.shortname)
                misc.pull_directory(rem, archive_dir, path)
                # Check for coredumps and pull binaries
                fetch_binaries_for_coredumps(path, rem)

        log.info('Removing archive directory...')
        run.wait(
            ctx.cluster.run(
                args=['rm', '-rf', '--', archive_dir],
                wait=False,
            ),
        )
def archive(ctx, config):
    """
    Handle the creation and deletion of the archive directory.
    """
    log.info('Creating archive directory...')
    archive_dir = misc.get_archive_dir(ctx)
    run.wait(
        ctx.cluster.run(
            args=['install', '-d', '-m0755', '--', archive_dir],
            wait=False,
        )
    )
    try:
        yield
    except Exception:
        # we need to know this below
        set_status(ctx.summary, 'fail')
        raise
    finally:
        passed = get_status(ctx.summary) == 'pass'
        # Transfer unless archive-on-error is set and the job passed.
        if ctx.archive is not None and \
                not (ctx.config.get('archive-on-error') and passed):
            log.info('Transferring archived files...')
            logdir = os.path.join(ctx.archive, 'remote')
            if not os.path.exists(logdir):
                os.mkdir(logdir)
            # dict.iterkeys() was removed in Python 3; use keys()
            for rem in ctx.cluster.remotes.keys():
                path = os.path.join(logdir, rem.shortname)
                misc.pull_directory(rem, archive_dir, path)
                # Check for coredumps and pull binaries
                fetch_binaries_for_coredumps(path, rem)

        log.info('Removing archive directory...')
        run.wait(
            ctx.cluster.run(
                args=['rm', '-rf', '--', archive_dir],
                wait=False,
            ),
        )
def check_packages(ctx, config):
    """
    Checks gitbuilder to determine if there are missing packages for this
    job.

    If there are missing packages, fail the job.
    """
    log.info("Checking packages...")
    sha1 = ctx.config.get("sha1")
    os_type = ctx.config.get("os_type")
    flavor = get_install_task_flavor(ctx.config)
    # Both a sha1 and an os_type must be present in the job config for
    # the check to be possible; otherwise skip it.
    if not (os_type and sha1):
        log.info(
            "Checking packages skipped, missing os_type '{os}' or "
            "ceph hash '{ver}'".format(os=os_type, ver=sha1))
        return

    log.info(
        "Checking packages for os_type,'{os}' flavor '{flav}' and"
        " ceph hash '{ver}'".format(os=os_type, flav=flavor, ver=sha1))
    if has_packages_for_distro(sha1, os_type, flavor):
        return

    msg = "Packages for os_type '{os}' and ceph hash '{ver}' not found".format(
        os=os_type, ver=sha1)
    log.error(msg)
    # Record the failure reason and push the 'dead' status to paddles.
    ctx.summary["failure_reason"] = msg
    set_status(ctx.summary, "dead")
    report.try_push_job_info(ctx.config, dict(status='dead'))
    raise RuntimeError(msg)
def check_packages(ctx, config):
    """
    Checks gitbuilder to determine if there are missing packages for this
    job.

    If there are missing packages, fail the job.
    """
    log.info("Checking packages...")
    os_type = ctx.config.get("os_type", None)
    sha1 = ctx.config.get("sha1", None)
    # Both a sha1 and an os_type must be present for the check to run.
    if not (os_type and sha1):
        log.info(
            "Checking packages skipped, missing os_type '{os}' or "
            "ceph hash '{ver}'".format(os=os_type, ver=sha1))
        return

    log.info(
        "Checking packages for os_type '{os}' and ceph hash '{ver}'".format(
            os=os_type, ver=sha1))
    if has_packages_for_distro(sha1, os_type):
        return

    msg = "Packages for os_type '{os}' and ceph hash '{ver}' not found".format(
        os=os_type, ver=sha1)
    log.error(msg)
    # Record the failure reason and push the 'dead' status to paddles.
    ctx.summary["failure_reason"] = msg
    set_status(ctx.summary, "dead")
    report.try_push_job_info(ctx.config, dict(status='dead'))
    raise RuntimeError(msg)
def task(ctx, config):
    """
    Use pytest to recurse through this directory, finding any tests
    and then executing them with the teuthology ctx and config args.
    Your tests must follow standard pytest conventions to be discovered.
    """
    try:
        status = pytest.main(
            args=['-q', '--pyargs', __name__, 'teuthology.test'],
            plugins=[TeuthologyContextPlugin(ctx, config)],
        )
    except Exception:
        # A crash in pytest itself (not a test failure) kills the job.
        log.exception("Saw non-test failure!")
        set_status(ctx.summary, "dead")
        return

    if status == 0:
        log.info("OK. All tests passed!")
        set_status(ctx.summary, "pass")
    else:
        log.error("FAIL. Saw test failures...")
        set_status(ctx.summary, "fail")
def _set_status(self, status):
    # Record *status* (e.g. 'pass'/'fail'/'dead') in this run's summary
    # by delegating to the module-level set_status helper.
    set_status(self.ctx.summary, status)
def syslog(ctx, config):
    """
    start syslog / stop syslog on exit.
    """
    if ctx.archive is None:
        # disable this whole feature if we're not going to archive the data
        # anyway
        yield
        return

    log.info('Starting syslog monitoring...')

    archive_dir = misc.get_archive_dir(ctx)
    log_dir = '{adir}/syslog'.format(adir=archive_dir)
    run.wait(
        ctx.cluster.run(
            args=['mkdir', '-p', '-m0755', '--', log_dir],
            wait=False,
        ))

    CONF = '/etc/rsyslog.d/80-cephtest.conf'
    kern_log = '{log_dir}/kern.log'.format(log_dir=log_dir)
    misc_log = '{log_dir}/misc.log'.format(log_dir=log_dir)
    conf_lines = [
        'kern.* -{kern_log};RSYSLOG_FileFormat'.format(kern_log=kern_log),
        '*.*;kern.none -{misc_log};RSYSLOG_FileFormat'.format(
            misc_log=misc_log),
    ]
    conf_fp = StringIO('\n'.join(conf_lines))
    try:
        # dict.iterkeys() was removed in Python 3; use keys()
        for rem in ctx.cluster.remotes.keys():
            log_context = 'system_u:object_r:var_log_t:s0'
            for log_path in (kern_log, misc_log):
                rem.run(args=['install', '-m', '666', '/dev/null', log_path])
                rem.chcon(log_path, log_context)
            misc.sudo_write_file(
                remote=rem,
                path=CONF,
                data=conf_fp,
            )
            conf_fp.seek(0)
        run.wait(
            ctx.cluster.run(
                args=[
                    'sudo', 'service',
                    # a mere reload (SIGHUP) doesn't seem to make
                    # rsyslog open the files
                    'rsyslog', 'restart',
                ],
                wait=False,
            ),
        )

        yield
    finally:
        log.info('Shutting down syslog monitoring...')

        run.wait(
            ctx.cluster.run(
                args=[
                    'sudo', 'rm', '-f', '--', CONF,
                    run.Raw('&&'),
                    'sudo', 'service', 'rsyslog', 'restart',
                ],
                wait=False,
            ),
        )
        # race condition: nothing actually says rsyslog had time to
        # flush the file fully. oh well.

        log.info('Checking logs for errors...')
        # dict.iterkeys() was removed in Python 3; use keys()
        for rem in ctx.cluster.remotes.keys():
            log.debug('Checking %s', rem.name)
            r = rem.run(
                args=[
                    'egrep', '--binary-files=text',
                    '\\bBUG\\b|\\bINFO\\b|\\bDEADLOCK\\b',
                    run.Raw('{adir}/syslog/*.log'.format(adir=archive_dir)),
                    run.Raw('|'), 'grep', '-v', 'task .* blocked for more than .* seconds',
                    run.Raw('|'), 'grep', '-v', 'lockdep is turned off',
                    run.Raw('|'), 'grep', '-v', 'trying to register non-static key',
                    run.Raw('|'), 'grep', '-v', 'DEBUG: fsize',  # xfs_fsr
                    run.Raw('|'), 'grep', '-v', 'CRON',  # ignore cron noise
                    run.Raw('|'), 'grep', '-v', 'BUG: bad unlock balance detected',  # #6097
                    run.Raw('|'), 'grep', '-v', 'inconsistent lock state',  # FIXME see #2523
                    run.Raw('|'), 'grep', '-v', '*** DEADLOCK ***',  # part of lockdep output
                    run.Raw('|'), 'grep', '-v',
                    # FIXME see #2590 and #147
                    'INFO: possible irq lock inversion dependency detected',
                    run.Raw('|'), 'grep', '-v',
                    'INFO: NMI handler (perf_event_nmi_handler) took too long to run',  # noqa
                    run.Raw('|'), 'grep', '-v', 'INFO: recovery required on readonly',
                    run.Raw('|'), 'grep', '-v', 'ceph-create-keys: INFO',
                    run.Raw('|'), 'head', '-n', '1',
                ],
                stdout=StringIO(),
            )
            stdout = r.stdout.getvalue()
            if stdout != '':
                log.error('Error in syslog on %s: %s', rem.name, stdout)
                set_status(ctx.summary, 'fail')
                if 'failure_reason' not in ctx.summary:
                    ctx.summary['failure_reason'] = \
                        "'{error}' in syslog".format(error=stdout)

        log.info('Compressing syslogs...')
        run.wait(
            ctx.cluster.run(
                args=[
                    'find',
                    '{adir}/syslog'.format(adir=archive_dir),
                    '-name',
                    '*.log',
                    '-print0',
                    run.Raw('|'),
                    'sudo',
                    'xargs',
                    '-0',
                    '--no-run-if-empty',
                    '--',
                    'gzip',
                    '--',
                ],
                wait=False,
            ),
        )
def syslog(ctx, config):
    """
    start syslog / stop syslog on exit.
    """
    if ctx.archive is None:
        # disable this whole feature if we're not going to archive the data anyway
        yield
        return

    log.info("Starting syslog monitoring...")

    archive_dir = misc.get_archive_dir(ctx)
    log_dir = "{adir}/syslog".format(adir=archive_dir)
    run.wait(ctx.cluster.run(args=["mkdir", "-p", "-m0755", "--", log_dir],
                             wait=False))

    CONF = "/etc/rsyslog.d/80-cephtest.conf"
    kern_log = "{log_dir}/kern.log".format(log_dir=log_dir)
    misc_log = "{log_dir}/misc.log".format(log_dir=log_dir)
    conf_lines = [
        "kern.* -{kern_log};RSYSLOG_FileFormat".format(kern_log=kern_log),
        "*.*;kern.none -{misc_log};RSYSLOG_FileFormat".format(misc_log=misc_log),
    ]
    conf_fp = StringIO("\n".join(conf_lines))
    try:
        # dict.iterkeys() was removed in Python 3; use keys()
        for rem in ctx.cluster.remotes.keys():
            log_context = "system_u:object_r:var_log_t:s0"
            for log_path in (kern_log, misc_log):
                rem.run(args="touch %s" % log_path)
                rem.chcon(log_path, log_context)
            misc.sudo_write_file(remote=rem, path=CONF, data=conf_fp)
            conf_fp.seek(0)
        run.wait(
            ctx.cluster.run(
                args=[
                    "sudo", "service",
                    # a mere reload (SIGHUP) doesn't seem to make
                    # rsyslog open the files
                    "rsyslog", "restart",
                ],
                wait=False,
            )
        )

        yield
    finally:
        log.info("Shutting down syslog monitoring...")

        run.wait(
            ctx.cluster.run(
                args=["sudo", "rm", "-f", "--", CONF,
                      run.Raw("&&"),
                      "sudo", "service", "rsyslog", "restart"],
                wait=False,
            )
        )
        # race condition: nothing actually says rsyslog had time to
        # flush the file fully. oh well.

        log.info("Checking logs for errors...")
        # dict.iterkeys() was removed in Python 3; use keys()
        for rem in ctx.cluster.remotes.keys():
            log.debug("Checking %s", rem.name)
            r = rem.run(
                args=[
                    "egrep", "--binary-files=text",
                    "\\bBUG\\b|\\bINFO\\b|\\bDEADLOCK\\b",
                    run.Raw("{adir}/syslog/*.log".format(adir=archive_dir)),
                    run.Raw("|"), "grep", "-v", "task .* blocked for more than .* seconds",
                    run.Raw("|"), "grep", "-v", "lockdep is turned off",
                    run.Raw("|"), "grep", "-v", "trying to register non-static key",
                    run.Raw("|"), "grep", "-v", "DEBUG: fsize",  # xfs_fsr
                    run.Raw("|"), "grep", "-v", "CRON",  # ignore cron noise
                    run.Raw("|"), "grep", "-v", "BUG: bad unlock balance detected",  # #6097
                    run.Raw("|"), "grep", "-v", "inconsistent lock state",  # FIXME see #2523
                    run.Raw("|"), "grep", "-v", "*** DEADLOCK ***",  # part of lockdep output
                    run.Raw("|"), "grep", "-v",
                    # FIXME see #2590 and #147
                    "INFO: possible irq lock inversion dependency detected",
                    run.Raw("|"), "grep", "-v",
                    "INFO: NMI handler (perf_event_nmi_handler) took too long to run",
                    run.Raw("|"), "grep", "-v", "INFO: recovery required on readonly",
                    run.Raw("|"), "grep", "-v", "ceph-create-keys: INFO",
                    run.Raw("|"), "head", "-n", "1",
                ],
                stdout=StringIO(),
            )
            stdout = r.stdout.getvalue()
            if stdout != "":
                log.error("Error in syslog on %s: %s", rem.name, stdout)
                set_status(ctx.summary, "fail")
                if "failure_reason" not in ctx.summary:
                    ctx.summary["failure_reason"] = \
                        "'{error}' in syslog".format(error=stdout)

        log.info("Compressing syslogs...")
        run.wait(
            ctx.cluster.run(
                args=[
                    "find", "{adir}/syslog".format(adir=archive_dir),
                    "-name", "*.log", "-print0",
                    run.Raw("|"),
                    "sudo", "xargs", "-0", "--no-run-if-empty", "--",
                    "gzip", "--",
                ],
                wait=False,
            )
        )
def coredump(ctx, config):
    """
    Stash a coredump of this system if an error occurs.
    """
    log.info("Enabling coredump saving...")
    archive_dir = misc.get_archive_dir(ctx)
    # Point kernel.core_pattern into the archive so cores are captured.
    run.wait(
        ctx.cluster.run(
            args=[
                "install", "-d", "-m0755", "--",
                "{adir}/coredump".format(adir=archive_dir),
                run.Raw("&&"),
                "sudo", "sysctl", "-w",
                "kernel.core_pattern={adir}/coredump/%t.%p.core".format(adir=archive_dir),
            ],
            wait=False,
        )
    )
    try:
        yield
    finally:
        run.wait(
            ctx.cluster.run(
                args=[
                    "sudo", "sysctl", "-w", "kernel.core_pattern=core",
                    run.Raw("&&"),
                    # don't litter the archive dir if there were no cores dumped
                    "rmdir", "--ignore-fail-on-non-empty", "--",
                    "{adir}/coredump".format(adir=archive_dir),
                ],
                wait=False,
            )
        )

        # set status = 'fail' if the dir is still there = coredumps were
        # seen
        # dict.iterkeys() was removed in Python 3; use keys()
        for rem in ctx.cluster.remotes.keys():
            r = rem.run(
                args=[
                    "if", "test", "!", "-e",
                    "{adir}/coredump".format(adir=archive_dir),
                    run.Raw(";"),
                    "then", "echo", "OK",
                    run.Raw(";"),
                    "fi",
                ],
                stdout=StringIO(),
            )
            if r.stdout.getvalue() != "OK\n":
                log.warning("Found coredumps on %s, flagging run as failed",
                            rem)
                set_status(ctx.summary, "fail")
                if "failure_reason" not in ctx.summary:
                    ctx.summary["failure_reason"] = \
                        "Found coredumps on {rem}".format(rem=rem)
def coredump(ctx, config):
    """
    Stash a coredump of this system if an error occurs.
    """
    log.info('Enabling coredump saving...')
    # NOTE(review): containers are filtered out here, presumably because
    # sysctl changes apply to the shared host kernel — confirm.
    cluster = ctx.cluster.filter(lambda r: not r.is_container)
    archive_dir = misc.get_archive_dir(ctx)
    run.wait(
        cluster.run(
            args=[
                'install', '-d', '-m0755', '--',
                '{adir}/coredump'.format(adir=archive_dir),
                run.Raw('&&'),
                'sudo', 'sysctl', '-w',
                'kernel.core_pattern={adir}/coredump/%t.%p.core'.format(
                    adir=archive_dir),
                run.Raw('&&'),
                # Also append the pattern to /etc/sysctl.conf so it is
                # re-applied if the node reboots mid-run.
                'echo',
                'kernel.core_pattern={adir}/coredump/%t.%p.core'.format(
                    adir=archive_dir),
                run.Raw('|'),
                'sudo', 'tee', '-a', '/etc/sysctl.conf',
            ],
            wait=False,
        ))
    try:
        yield
    finally:
        cluster = ctx.cluster.filter(lambda r: not r.is_container)
        run.wait(
            cluster.run(
                args=[
                    'sudo', 'sysctl', '-w', 'kernel.core_pattern=core',
                    run.Raw('&&'),
                    # Delete cores identified as systemd-sysusers dumps
                    # before deciding whether any "real" cores remain.
                    'sudo', 'bash', '-c',
                    (f'for f in `find {archive_dir}/coredump -type f`; do '
                     'file $f | grep -q systemd-sysusers && rm $f || true ; '
                     'done'),
                    run.Raw('&&'),
                    # don't litter the archive dir if there were no cores dumped
                    'rmdir', '--ignore-fail-on-non-empty', '--',
                    '{adir}/coredump'.format(adir=archive_dir),
                ],
                wait=False,
            ))

        # set status = 'fail' if the dir is still there = coredumps were
        # seen
        for rem in cluster.remotes.keys():
            try:
                rem.sh("test -e " + archive_dir + "/coredump")
            except run.CommandFailedError:
                # dir was removed above, i.e. no cores on this remote
                continue
            log.warning('Found coredumps on %s, flagging run as failed', rem)
            set_status(ctx.summary, 'fail')
            if 'failure_reason' not in ctx.summary:
                ctx.summary['failure_reason'] = \
                    'Found coredumps on {rem}'.format(rem=rem)
def test_set_status_dead(self):
    # Setting 'dead' must also mark the job as unsuccessful.
    summary = {}
    job_status.set_status(summary, 'dead')
    assert summary == {'status': 'dead', 'success': False}
def test_set_status_pass(self):
    # Setting 'pass' must also mark the job as successful.
    summary = {}
    job_status.set_status(summary, 'pass')
    assert summary == {'status': 'pass', 'success': True}
def syslog(ctx, config):
    """
    start syslog / stop syslog on exit.
    """
    if ctx.archive is None:
        # disable this whole feature if we're not going to archive the data anyway
        yield
        return

    log.info('Starting syslog monitoring...')

    archive_dir = misc.get_archive_dir(ctx)
    run.wait(
        ctx.cluster.run(
            args=[
                'mkdir', '-p', '-m0755', '--',
                '{adir}/syslog'.format(adir=archive_dir),
            ],
            wait=False,
        )
    )

    CONF = '/etc/rsyslog.d/80-cephtest.conf'
    conf_fp = StringIO('''
kern.* -{adir}/syslog/kern.log;RSYSLOG_FileFormat
*.*;kern.none -{adir}/syslog/misc.log;RSYSLOG_FileFormat
'''.format(adir=archive_dir))
    try:
        # dict.iterkeys() was removed in Python 3; use keys()
        for rem in ctx.cluster.remotes.keys():
            misc.sudo_write_file(
                remote=rem,
                path=CONF,
                data=conf_fp,
            )
            conf_fp.seek(0)
        run.wait(
            ctx.cluster.run(
                args=[
                    'sudo', 'service',
                    # a mere reload (SIGHUP) doesn't seem to make
                    # rsyslog open the files
                    'rsyslog', 'restart',
                ],
                wait=False,
            ),
        )

        yield
    finally:
        log.info('Shutting down syslog monitoring...')

        run.wait(
            ctx.cluster.run(
                args=[
                    'sudo', 'rm', '-f', '--', CONF,
                    run.Raw('&&'),
                    'sudo', 'service', 'rsyslog', 'restart',
                ],
                wait=False,
            ),
        )
        # race condition: nothing actually says rsyslog had time to
        # flush the file fully. oh well.

        log.info('Checking logs for errors...')
        # dict.iterkeys() was removed in Python 3; use keys()
        for rem in ctx.cluster.remotes.keys():
            log.debug('Checking %s', rem.name)
            r = rem.run(
                args=[
                    'egrep', '--binary-files=text',
                    '\\bBUG\\b|\\bINFO\\b|\\bDEADLOCK\\b',
                    run.Raw('{adir}/syslog/*.log'.format(adir=archive_dir)),
                    run.Raw('|'), 'grep', '-v', 'task .* blocked for more than .* seconds',
                    run.Raw('|'), 'grep', '-v', 'lockdep is turned off',
                    run.Raw('|'), 'grep', '-v', 'trying to register non-static key',
                    run.Raw('|'), 'grep', '-v', 'DEBUG: fsize',  # xfs_fsr
                    run.Raw('|'), 'grep', '-v', 'CRON',  # ignore cron noise
                    run.Raw('|'), 'grep', '-v', 'BUG: bad unlock balance detected',  # #6097
                    run.Raw('|'), 'grep', '-v', 'inconsistent lock state',  # FIXME see #2523
                    run.Raw('|'), 'grep', '-v', '*** DEADLOCK ***',  # part of lockdep output
                    run.Raw('|'), 'grep', '-v',
                    # FIXME see #2590 and #147
                    'INFO: possible irq lock inversion dependency detected',
                    run.Raw('|'), 'grep', '-v',
                    'INFO: NMI handler (perf_event_nmi_handler) took too long to run',
                    run.Raw('|'), 'grep', '-v', 'INFO: recovery required on readonly',
                    run.Raw('|'), 'head', '-n', '1',
                ],
                stdout=StringIO(),
            )
            stdout = r.stdout.getvalue()
            if stdout != '':
                log.error('Error in syslog on %s: %s', rem.name, stdout)
                set_status(ctx.summary, 'fail')
                if 'failure_reason' not in ctx.summary:
                    ctx.summary['failure_reason'] = \
                        "'{error}' in syslog".format(error=stdout)

        log.info('Compressing syslogs...')
        run.wait(
            ctx.cluster.run(
                args=[
                    'find',
                    '{adir}/syslog'.format(adir=archive_dir),
                    '-name',
                    '*.log',
                    '-print0',
                    run.Raw('|'),
                    'sudo',
                    'xargs',
                    '-0',
                    '--no-run-if-empty',
                    '--',
                    'gzip',
                    '--',
                ],
                wait=False,
            ),
        )
def test_set_then_get_status_dead(self):
    # get_status must round-trip the value that set_status stored.
    summary = {}
    job_status.set_status(summary, 'dead')
    assert job_status.get_status(summary) == 'dead'
def test_set_status_none(self):
    # Setting a None status is a no-op: the summary stays empty.
    summary = {}
    job_status.set_status(summary, None)
    assert summary == {}
def lock_machines(ctx, config):
    """
    Lock machines.  Called when the teuthology run finds and locks
    new machines.  This is not called if the one has teuthology-locked
    machines and placed those keys in the Targets section of a yaml file.
    """
    # It's OK for os_type and os_version to be None here.  If we're trying
    # to lock a bare metal machine, we'll take whatever is available.  If
    # we want a vps, defaults will be provided by misc.get_distro and
    # misc.get_distro_version in provision.create_if_vm
    os_type = ctx.config.get("os_type")
    os_version = ctx.config.get("os_version")
    arch = ctx.config.get('arch')
    log.info('Locking machines...')
    assert isinstance(config[0], int), 'config[0] must be an integer'
    machine_type = config[1]
    total_requested = config[0]
    # We want to make sure there are always this many machines available
    reserved = teuth_config.reserve_machines
    assert isinstance(reserved, int), 'reserve_machines must be integer'
    assert (reserved >= 0), 'reserve_machines should >= 0'

    # change the status during the locking process
    report.try_push_job_info(ctx.config, dict(status='waiting'))

    all_locked = dict()
    requested = total_requested
    while True:
        # get a candidate list of machines
        machines = lock.list_locks(machine_type=machine_type, up=True,
                                   locked=False,
                                   count=requested + reserved)
        if machines is None:
            if ctx.block:
                log.error('Error listing machines, trying again')
                time.sleep(20)
                continue
            else:
                raise RuntimeError('Error listing machines')

        # make sure there are machines for non-automated jobs to run
        if len(machines) < reserved + requested \
                and ctx.owner.startswith('scheduled'):
            if ctx.block:
                log.info(
                    'waiting for more %s machines to be free (need %s + %s, have %s)...',
                    machine_type,
                    reserved,
                    requested,
                    len(machines),
                )
                time.sleep(10)
                continue
            else:
                assert 0, ('not enough machines free; need %s + %s, have %s' %
                           (reserved, requested, len(machines)))

        try:
            newly_locked = lock.lock_many(ctx, requested, machine_type,
                                          ctx.owner, ctx.archive, os_type,
                                          os_version, arch)
        except Exception:
            # Lock failures should map to the 'dead' status instead of 'fail'
            set_status(ctx.summary, 'dead')
            raise
        all_locked.update(newly_locked)
        log.info(
            '{newly_locked} {mtype} machines locked this try, '
            '{total_locked}/{total_requested} locked so far'.format(
                newly_locked=len(newly_locked),
                mtype=machine_type,
                total_locked=len(all_locked),
                total_requested=total_requested,
            )
        )
        if len(all_locked) == total_requested:
            vmlist = []
            for lmach in all_locked:
                if misc.is_vm(lmach):
                    vmlist.append(lmach)
            if vmlist:
                log.info('Waiting for virtual machines to come up')
                keys_dict = dict()
                loopcount = 0
                while len(keys_dict) != len(vmlist):
                    loopcount += 1
                    time.sleep(10)
                    keys_dict = misc.ssh_keyscan(vmlist)
                    log.info('virtual machine is still unavailable')
                    if loopcount == 40:
                        loopcount = 0
                        log.info('virtual machine(s) still not up, ' +
                                 'recreating unresponsive ones.')
                        for guest in vmlist:
                            if guest not in keys_dict.keys():
                                log.info('recreating: ' + guest)
                                full_name = misc.canonicalize_hostname(guest)
                                provision.destroy_if_vm(ctx, full_name)
                                provision.create_if_vm(ctx, full_name)
                if lock.do_update_keys(keys_dict):
                    log.info("Error in virtual machine keys")
                newscandict = {}
                # dict.iterkeys() was removed in Python 3; use keys()
                for dkey in all_locked.keys():
                    stats = lockstatus.get_status(dkey)
                    newscandict[dkey] = stats['ssh_pub_key']
                ctx.config['targets'] = newscandict
            else:
                ctx.config['targets'] = all_locked
            locked_targets = yaml.safe_dump(
                ctx.config['targets'],
                default_flow_style=False
            ).splitlines()
            log.info('\n '.join(['Locked targets:', ] + locked_targets))
            # successfully locked machines, change status back to running
            report.try_push_job_info(ctx.config, dict(status='running'))
            break
        elif not ctx.block:
            assert 0, 'not enough machines are available'
        else:
            requested = requested - len(newly_locked)
            # Fixed: the original split string read "went""negative" with
            # no separating space.
            assert requested > 0, "lock_machines: requested counter went " \
                "negative, this shouldn't happen"
            log.info(
                "{total} machines locked ({new} new); need {more} more".format(
                    total=len(all_locked), new=len(newly_locked),
                    more=requested)
            )
            # logging.warn is deprecated; use warning()
            log.warning('Could not lock enough machines, waiting...')
            time.sleep(10)
    try:
        yield
    finally:
        # If both unlock_on_failure and nuke-on-error are set, don't unlock now
        # because we're just going to nuke (and unlock) later.
        unlock_on_failure = (
            ctx.config.get('unlock_on_failure', False)
            and not ctx.config.get('nuke-on-error', False)
        )
        if get_status(ctx.summary) == 'pass' or unlock_on_failure:
            log.info('Unlocking machines...')
            # dict.iterkeys() was removed in Python 3; use keys()
            for machine in ctx.config['targets'].keys():
                lock.unlock_one(ctx, machine, ctx.owner, ctx.archive)
def lock_machines(ctx, config):
    """
    Lock machines.  Called when the teuthology run finds and locks
    new machines.  This is not called if the one has teuthology-locked
    machines and placed those keys in the Targets section of a yaml file.

    :param ctx:    teuthology run context; reads ``ctx.config``,
                   ``ctx.block``, ``ctx.owner``, ``ctx.archive`` and writes
                   ``ctx.config['targets']`` and ``ctx.summary``.
    :param config: two-element sequence: ``config[0]`` is the number of
                   machines to lock (int), ``config[1]`` is the machine type.

    Yields once all machines are locked; on exit, unlocks them unless the
    run failed and unlock-on-failure/nuke-on-error settings say otherwise.
    """
    # It's OK for os_type and os_version to be None here.  If we're trying
    # to lock a bare metal machine, we'll take whatever is available.  If
    # we want a vps, defaults will be provided by misc.get_distro and
    # misc.get_distro_version in provision.create_if_vm
    os_type = ctx.config.get("os_type")
    os_version = ctx.config.get("os_version")
    arch = ctx.config.get('arch')
    log.info('Locking machines...')
    assert isinstance(config[0], int), 'config[0] must be an integer'
    machine_type = config[1]
    total_requested = config[0]
    # We want to make sure there are always this many machines available
    reserved = teuth_config.reserve_machines
    assert isinstance(reserved, int), 'reserve_machines must be integer'
    assert (reserved >= 0), 'reserve_machines should >= 0'

    # change the status during the locking process
    report.try_push_job_info(ctx.config, dict(status='waiting'))

    all_locked = dict()
    requested = total_requested
    while True:
        # get a candidate list of machines
        machines = teuthology.lock.query.list_locks(
            machine_type=machine_type, up=True,
            locked=False, count=requested + reserved)
        if machines is None:
            if ctx.block:
                log.error('Error listing machines, trying again')
                time.sleep(20)
                continue
            else:
                raise RuntimeError('Error listing machines')

        # make sure there are machines for non-automated jobs to run
        if len(machines) < reserved + requested and ctx.owner.startswith(
                'scheduled'):
            if ctx.block:
                log.info(
                    'waiting for more %s machines to be free (need %s + %s, have %s)...',
                    machine_type,
                    reserved,
                    requested,
                    len(machines),
                )
                time.sleep(10)
                continue
            else:
                assert 0, ('not enough machines free; need %s + %s, have %s' %
                           (reserved, requested, len(machines)))

        try:
            newly_locked = teuthology.lock.ops.lock_many(
                ctx, requested, machine_type, ctx.owner, ctx.archive,
                os_type, os_version, arch)
        except Exception:
            # Lock failures should map to the 'dead' status instead of 'fail'
            set_status(ctx.summary, 'dead')
            raise
        all_locked.update(newly_locked)
        log.info('{newly_locked} {mtype} machines locked this try, '
                 '{total_locked}/{total_requested} locked so far'.format(
                     newly_locked=len(newly_locked),
                     mtype=machine_type,
                     total_locked=len(all_locked),
                     total_requested=total_requested,
                 ))
        if len(all_locked) == total_requested:
            vmlist = []
            for lmach in all_locked:
                if teuthology.lock.query.is_vm(lmach):
                    vmlist.append(lmach)
            if vmlist:
                log.info('Waiting for virtual machines to come up')
                keys_dict = dict()
                loopcount = 0
                # poll until every VM answers an ssh keyscan; every 40
                # iterations (~400s) recreate the ones still unresponsive
                while len(keys_dict) != len(vmlist):
                    loopcount += 1
                    time.sleep(10)
                    keys_dict = misc.ssh_keyscan(vmlist)
                    log.info('virtual machine is still unavailable')
                    if loopcount == 40:
                        loopcount = 0
                        log.info('virtual machine(s) still not up, ' +
                                 'recreating unresponsive ones.')
                        for guest in vmlist:
                            if guest not in keys_dict.keys():
                                log.info('recreating: ' + guest)
                                full_name = misc.canonicalize_hostname(guest)
                                provision.destroy_if_vm(ctx, full_name)
                                provision.create_if_vm(ctx, full_name)
                if teuthology.lock.ops.do_update_keys(keys_dict)[0]:
                    log.info("Error in virtual machine keys")
                # re-read the freshly updated public keys for the targets
                newscandict = {}
                for dkey in all_locked.keys():
                    stats = teuthology.lock.query.get_status(dkey)
                    newscandict[dkey] = stats['ssh_pub_key']
                ctx.config['targets'] = newscandict
            else:
                ctx.config['targets'] = all_locked
            locked_targets = yaml.safe_dump(
                ctx.config['targets'],
                default_flow_style=False).splitlines()
            log.info('\n '.join(['Locked targets:', ] + locked_targets))
            # successfully locked machines, change status back to running
            report.try_push_job_info(ctx.config, dict(status='running'))
            break
        elif not ctx.block:
            assert 0, 'not enough machines are available'
        else:
            requested = requested - len(newly_locked)
            # BUGFIX: the two implicitly-concatenated literals previously
            # produced "wentnegative" (missing space between them)
            assert requested > 0, "lock_machines: requested counter went " \
                "negative, this shouldn't happen"
            log.info(
                "{total} machines locked ({new} new); need {more} more".format(
                    total=len(all_locked), new=len(newly_locked),
                    more=requested)
            )
        # log.warning: 'warn' is a deprecated alias on Python 3
        log.warning('Could not lock enough machines, waiting...')
        time.sleep(10)
    try:
        yield
    finally:
        # If both unlock_on_failure and nuke-on-error are set, don't unlock now
        # because we're just going to nuke (and unlock) later.
        unlock_on_failure = (
            ctx.config.get('unlock_on_failure', False)
            and not ctx.config.get('nuke-on-error', False)
        )
        if get_status(ctx.summary) == 'pass' or unlock_on_failure:
            log.info('Unlocking machines...')
            for machine in ctx.config['targets'].keys():
                teuthology.lock.ops.unlock_one(ctx, machine, ctx.owner,
                                               ctx.archive)
def run_tasks(tasks, ctx):
    """
    Run the job's tasks in order, then unwind them in reverse order.

    :param tasks: list of single-key dicts, each mapping a task name to its
                  config (the yaml ``tasks:`` section).
    :param ctx:   teuthology run context; ``ctx.summary`` is updated with
                  status/failure information, ``ctx.config`` is read for
                  archive/job/sentry/interactive settings.

    Each task's context manager is entered as it is encountered; entered
    managers are pushed onto ``stack`` so the ``finally`` clause can call
    ``__exit__`` on them in reverse order even if a later task fails.
    Timing for each enter/exit is recorded via ``timer``.
    """
    archive_path = ctx.config.get('archive_path')
    if archive_path:
        timer = Timer(
            path=os.path.join(archive_path, 'timing.yaml'),
            sync=True,
        )
    else:
        timer = Timer()
    stack = []
    try:
        for taskdict in tasks:
            try:
                # each task entry must be a dict with exactly one key
                ((taskname, config), ) = taskdict.items()
            except (ValueError, AttributeError):
                raise RuntimeError('Invalid task definition: %s' % taskdict)
            log.info('Running task %s...', taskname)
            timer.mark('%s enter' % taskname)
            manager = run_one_task(taskname, ctx=ctx, config=config)
            # only context-manager-style tasks need explicit unwinding
            if hasattr(manager, '__enter__'):
                stack.append((taskname, manager))
                manager.__enter__()
    # BaseException: also catch KeyboardInterrupt/SystemExit so status
    # and failure_reason are recorded before unwinding
    except BaseException as e:
        if isinstance(e, ConnectionLostError):
            # Prevent connection issues being flagged as failures
            set_status(ctx.summary, 'dead')
        else:
            # the status may have been set to dead, leave it as-is if so
            if not ctx.summary.get('status', '') == 'dead':
                set_status(ctx.summary, 'fail')
        if 'failure_reason' not in ctx.summary:
            ctx.summary['failure_reason'] = str(e)
        log.exception('Saw exception from tasks.')

        # optionally report the failure to Sentry, with ssh keys redacted
        if teuth_config.sentry_dsn:
            sentry_sdk.init(teuth_config.sentry_dsn)
            config = deepcopy(ctx.config)

            tags = {
                'task': taskname,
                'owner': ctx.owner,
            }
            optional_tags = ('teuthology_branch', 'branch', 'suite',
                             'machine_type', 'os_type', 'os_version')
            for tag in optional_tags:
                if tag in config:
                    tags[tag] = config[tag]

            # Remove ssh keys from reported config
            if 'targets' in config:
                targets = config['targets']
                for host in targets.keys():
                    targets[host] = '<redacted>'

            job_id = ctx.config.get('job_id')
            archive_path = ctx.config.get('archive_path')
            extras = dict(config=config, )
            if job_id:
                extras['logs'] = get_http_log_path(archive_path, job_id)

            # exceptions may provide a custom Sentry fingerprint for grouping
            fingerprint = e.fingerprint() if hasattr(e, 'fingerprint') else None
            exc_id = sentry_sdk.capture_exception(
                error=e,
                tags=tags,
                extras=extras,
                fingerprint=fingerprint,
            )
            event_url = "{server}/?query={id}".format(
                server=teuth_config.sentry_server.strip('/'), id=exc_id)
            log.exception(" Sentry event: %s" % event_url)
            ctx.summary['sentry_event'] = event_url

        if ctx.config.get('interactive-on-error'):
            # disable so the unwind path below doesn't drop into the
            # interactive shell a second time
            ctx.config['interactive-on-error'] = False
            from teuthology.task import interactive
            log.warning(
                'Saw failure during task execution, going into interactive mode...'
            )
            interactive.task(ctx=ctx, config=None)
        # Throughout teuthology, (x,) = y has been used to assign values
        # from yaml files where only one entry of type y is correct.  This
        # causes failures with 'too many values to unpack.'  We want to
        # fail as before, but with easier to understand error indicators.
        if isinstance(e, ValueError):
            if str(e) == 'too many values to unpack':
                emsg = 'Possible configuration error in yaml file'
                log.error(emsg)
                ctx.summary['failure_info'] = emsg
    finally:
        try:
            # capture the task exception (if any) so it can be passed to
            # each manager's __exit__ and re-raised afterwards
            exc_info = sys.exc_info()
            sleep_before_teardown = ctx.config.get('sleep_before_teardown')
            if sleep_before_teardown:
                log.info('Sleeping for {} seconds before unwinding because'
                         ' --sleep-before-teardown was given...'.format(
                             sleep_before_teardown))
                notify_sleep_before_teardown(ctx, stack,
                                             sleep_before_teardown)
                time.sleep(sleep_before_teardown)
            # unwind entered managers in reverse (LIFO) order
            while stack:
                taskname, manager = stack.pop()
                log.debug('Unwinding manager %s', taskname)
                timer.mark('%s exit' % taskname)
                try:
                    suppress = manager.__exit__(*exc_info)
                except Exception as e:
                    if isinstance(e, ConnectionLostError):
                        # Prevent connection issues being flagged as failures
                        set_status(ctx.summary, 'dead')
                    else:
                        set_status(ctx.summary, 'fail')
                    if 'failure_reason' not in ctx.summary:
                        ctx.summary['failure_reason'] = str(e)
                    log.exception('Manager failed: %s', taskname)

                    if exc_info == (None, None, None):
                        # if first failure is in an __exit__, we don't
                        # have exc_info set yet
                        exc_info = sys.exc_info()

                    if ctx.config.get('interactive-on-error'):
                        from teuthology.task import interactive
                        log.warning(
                            'Saw failure during task cleanup, going into interactive mode...'
                        )
                        interactive.task(ctx=ctx, config=None)
                else:
                    # a truthy return from __exit__ suppresses the exception
                    if suppress:
                        exc_info = (None, None, None)
            if exc_info != (None, None, None):
                log.debug('Exception was not quenched, exiting: %s: %s',
                          exc_info[0].__name__, exc_info[1])
                raise SystemExit(1)
        finally:
            # be careful about cyclic references
            del exc_info
            timer.mark("tasks complete")