Example 1
0
def archive(ctx, config):
    """
    Handle the creation and deletion of the archive directory.

    Creates the per-run archive directory on every remote, yields to the
    nested tasks, and on the way out pulls the archived files back into the
    local job archive (skipped when 'archive-on-error' is set and the run
    passed), then removes the remote directory.
    """
    log.info("Creating archive directory...")
    archive_dir = misc.get_archive_dir(ctx)
    run.wait(ctx.cluster.run(args=["install", "-d", "-m0755", "--", archive_dir], wait=False))

    try:
        yield
    except Exception:
        # we need to know this below
        set_status(ctx.summary, "fail")
        raise
    finally:
        passed = get_status(ctx.summary) == "pass"
        if ctx.archive is not None and not (ctx.config.get("archive-on-error") and passed):
            log.info("Transferring archived files...")
            logdir = os.path.join(ctx.archive, "remote")
            if not os.path.exists(logdir):
                os.mkdir(logdir)
            # dict.iterkeys() was removed in Python 3; keys() works on both.
            for rem in ctx.cluster.remotes.keys():
                path = os.path.join(logdir, rem.shortname)
                misc.pull_directory(rem, archive_dir, path)
                # Check for coredumps and pull binaries
                fetch_binaries_for_coredumps(path, rem)

        log.info("Removing archive directory...")
        run.wait(ctx.cluster.run(args=["rm", "-rf", "--", archive_dir], wait=False))
def task(ctx, config):
    """
    Go through filesystem creation with a synthetic failure in an MDS
    in its 'up:creating' state, to exercise the retry behaviour.

    Requires exactly one MDS role in the cluster; raises RuntimeError
    otherwise.
    """
    # Grab handles to the teuthology objects of interest
    mdslist = list(misc.all_roles_of_type(ctx.cluster, 'mds'))
    if len(mdslist) != 1:
        # Require exactly one MDS, the code path for creation failure when
        # a standby is available is different
        raise RuntimeError("This task requires exactly one MDS")

    mds_id = mdslist[0]
    # iterkeys() is Python 2 only; keys() behaves the same here.
    (mds_remote,) = ctx.cluster.only('mds.{_id}'.format(_id=mds_id)).remotes.keys()
    manager = ceph_manager.CephManager(
        mds_remote, ctx=ctx, logger=log.getChild('ceph_manager'),
    )

    # Stop MDS
    # NOTE(review): 'self' is undefined in this module-level function —
    # these calls look copied from a test-class method; confirm where the
    # Filesystem handle ('self.fs') is supposed to come from.
    self.fs.set_max_mds(0)
    self.fs.mds_stop(mds_id)
    self.fs.mds_fail(mds_id)

    # Reset the filesystem so that next start will go into CREATING
    manager.raw_cluster_cmd('fs', 'rm', "default", "--yes-i-really-mean-it")
    manager.raw_cluster_cmd('fs', 'new', "default", "metadata", "data")

    # Start the MDS with mds_kill_create_at set, it will crash during creation
    # NOTE(review): 'mds' is never bound in this function — presumably an
    # MDS daemon handle obtained elsewhere; verify before running.
    mds.restart_with_args(["--mds_kill_create_at=1"])
    try:
        mds.wait_for_exit()
    except CommandFailedError as e:
        if e.exitstatus == 1:
            log.info("MDS creation killed as expected")
        else:
            log.error("Unexpected status code %s" % e.exitstatus)
            raise

    # Since I have intentionally caused a crash, I will clean up the resulting core
    # file to avoid task.internal.coredump seeing it as a failure.
    log.info("Removing core file from synthetic MDS failure")
    mds_remote.run(args=['rm', '-f', Raw("{archive}/coredump/*.core".format(archive=misc.get_archive_dir(ctx)))])

    # It should have left the MDS map state still in CREATING
    status = self.fs.status().get_mds(mds_id)
    assert status['state'] == 'up:creating'

    # Start the MDS again without the kill flag set, it should proceed with creation successfully
    mds.restart()

    # Wait for state ACTIVE
    self.fs.wait_for_state("up:active", timeout=120, mds_id=mds_id)

    # The system should be back up in a happy healthy state, go ahead and run any further tasks
    # inside this context.
    yield
Example 3
0
def coredump(ctx, config):
    """
    Stash a coredump of this system if an error occurs.
    """
    log.info('Enabling coredump saving...')
    archive_dir = misc.get_archive_dir(ctx)
    dump_dir = '{adir}/coredump'.format(adir=archive_dir)
    # Point the kernel at the archive directory for any core files.
    procs = ctx.cluster.run(
        args=[
            'install', '-d', '-m0755', '--', dump_dir,
            run.Raw('&&'),
            'sudo', 'sysctl', '-w',
            'kernel.core_pattern={adir}/coredump/%t.%p.core'.format(
                adir=archive_dir),
        ],
        wait=False,
    )
    run.wait(procs)

    try:
        yield
    finally:
        # Restore the default core pattern; drop the directory only when
        # it is empty so we don't litter the archive with empty dirs.
        run.wait(ctx.cluster.run(
            args=[
                'sudo', 'sysctl', '-w', 'kernel.core_pattern=core',
                run.Raw('&&'),
                'rmdir', '--ignore-fail-on-non-empty', '--', dump_dir,
            ],
            wait=False,
        ))

        # If the coredump directory survived the rmdir above, cores were
        # collected on that remote: flag the whole run as failed.
        for remote in ctx.cluster.remotes.keys():
            try:
                remote.sh("test -e " + archive_dir + "/coredump")
            except run.CommandFailedError:
                continue
            log.warning('Found coredumps on %s, flagging run as failed', remote)
            set_status(ctx.summary, 'fail')
            if 'failure_reason' not in ctx.summary:
                ctx.summary['failure_reason'] = \
                    'Found coredumps on {rem}'.format(rem=remote)
Example 4
0
 def archive_log(self):
     """Copy, chown, and gzip the audit log into the run's archive dir."""
     if not hasattr(self.ctx, 'archive') or not self.ctx.archive:
         return
     audit_archive = os.path.join(get_archive_dir(self.ctx), 'audit')
     # One remote shell invocation: stop at the first failing step.
     steps = (
         "mkdir {audit_archive}",
         "sudo cp /var/log/audit/audit.log {audit_archive}",
         "sudo chown $USER {audit_archive}/audit.log",
         "gzip {audit_archive}/audit.log",
     )
     self.cluster.run(args=" && ".join(steps).format(audit_archive=audit_archive))
Example 5
0
def archive(ctx, config):
    """
    Handle the creation and deletion of the archive directory.

    Creates the remote archive directory, yields to nested tasks, then
    pulls the archived files back (compressing any file larger than the
    'log-compress-min-size' config option) and removes the remote dir.
    """
    log.info('Creating archive directory...')
    archive_dir = misc.get_archive_dir(ctx)
    run.wait(
        ctx.cluster.run(
            args=['install', '-d', '-m0755', '--', archive_dir],
            wait=False,
        )
    )

    # Add logs directory to job's info log file
    misc.add_remote_path(ctx, 'init', archive_dir)

    try:
        yield
    except Exception:
        # we need to know this below
        set_status(ctx.summary, 'fail')
        raise
    finally:
        passed = get_status(ctx.summary) == 'pass'
        if ctx.archive is not None and \
                not (ctx.config.get('archive-on-error') and passed):
            log.info('Transferring archived files...')
            logdir = os.path.join(ctx.archive, 'remote')
            if not os.path.exists(logdir):
                os.mkdir(logdir)
            # The compression threshold is identical for every remote, so
            # parse and validate it once instead of once per remote.
            min_size_option = ctx.config.get('log-compress-min-size',
                                             '128MB')
            try:
                compress_min_size_bytes = \
                    humanfriendly.parse_size(min_size_option)
            except humanfriendly.InvalidSize:
                msg = 'invalid "log-compress-min-size": {}'.format(min_size_option)
                log.error(msg)
                raise ConfigError(msg)
            maybe_compress = functools.partial(gzip_if_too_large,
                                               compress_min_size_bytes)
            for rem in ctx.cluster.remotes.keys():
                path = os.path.join(logdir, rem.shortname)
                misc.pull_directory(rem, archive_dir, path, maybe_compress)
                # Check for coredumps and pull binaries
                fetch_binaries_for_coredumps(path, rem)

        log.info('Removing archive directory...')
        run.wait(
            ctx.cluster.run(
                args=['rm', '-rf', '--', archive_dir],
                wait=False,
            ),
        )
Example 6
0
 def archive_log(self):
     """Stash a gzipped copy of the audit log under the archive dir."""
     if not hasattr(self.ctx, "archive") or not self.ctx.archive:
         return
     audit_archive = os.path.join(get_archive_dir(self.ctx), "audit")
     # Chain the steps with '&&' so a failure aborts the remainder.
     commands = [
         "mkdir {audit_archive}",
         "sudo cp /var/log/audit/audit.log {audit_archive}",
         "sudo chown $USER {audit_archive}/audit.log",
         "gzip {audit_archive}/audit.log",
     ]
     full_cmd = " && ".join(commands)
     self.cluster.run(args=full_cmd.format(audit_archive=audit_archive))
Example 7
0
 def setup(self):
     """Prepare CBT: locate the first mon, write the cbt config, fetch deps."""
     super(CBT, self).setup()
     # dict.keys() is not subscriptable in Python 3; take the first remote
     # via an iterator instead of keys()[0].
     self.first_mon = next(iter(self.ctx.cluster.only(misc.get_first_mon(self.ctx, self.config)).remotes.keys()))
     self.cbt_config = self.generate_cbt_config()
     self.log.info('cbt configuration is %s', self.cbt_config)
     self.cbt_dir = os.path.join(misc.get_archive_dir(self.ctx), 'cbt')
     self.ctx.cluster.run(args=['mkdir', '-p', '-m0755', '--', self.cbt_dir])
     misc.write_file(self.first_mon, os.path.join(self.cbt_dir, 'cbt_config.yaml'),
                     yaml.safe_dump(self.cbt_config, default_flow_style=False))
     self.checkout_cbt()
     self.install_dependencies()
Example 8
0
 def setup(self):
     """Prepare CBT: locate the first mon, write the cbt config, fetch deps."""
     super(CBT, self).setup()
     # dict.keys() is not subscriptable in Python 3; take the first remote
     # via an iterator instead of keys()[0].
     self.first_mon = next(iter(self.ctx.cluster.only(misc.get_first_mon(self.ctx, self.config)).remotes.keys()))
     self.cbt_config = self.generate_cbt_config()
     self.log.info('cbt configuration is %s', self.cbt_config)
     self.cbt_dir = os.path.join(misc.get_archive_dir(self.ctx), 'cbt')
     self.ctx.cluster.run(args=['mkdir', '-p', '-m0755', '--', self.cbt_dir])
     misc.write_file(self.first_mon, os.path.join(self.cbt_dir, 'cbt_config.yaml'),
                     yaml.safe_dump(self.cbt_config, default_flow_style=False))
     self.checkout_cbt()
     self.install_dependencies()
Example 9
0
def coredump(ctx, config):
    """
    Stash a coredump of this system if an error occurs.

    Points kernel.core_pattern at the archive directory while nested
    tasks run; afterwards restores the default pattern and flags the run
    as failed if any cores were collected.
    """
    log.info('Enabling coredump saving...')
    archive_dir = misc.get_archive_dir(ctx)
    run.wait(
        ctx.cluster.run(
            args=[
                'install', '-d', '-m0755', '--',
                '{adir}/coredump'.format(adir=archive_dir),
                run.Raw('&&'),
                'sudo', 'sysctl', '-w', 'kernel.core_pattern={adir}/coredump/%t.%p.core'.format(adir=archive_dir),
            ],
            wait=False,
        )
    )

    try:
        yield
    finally:
        run.wait(
            ctx.cluster.run(
                args=[
                    'sudo', 'sysctl', '-w', 'kernel.core_pattern=core',
                    run.Raw('&&'),
                    # don't litter the archive dir if there were no cores dumped
                    'rmdir',
                    '--ignore-fail-on-non-empty',
                    '--',
                    '{adir}/coredump'.format(adir=archive_dir),
                ],
                wait=False,
            )
        )

        # set status = 'fail' if the dir is still there = coredumps were
        # seen
        # dict.iterkeys() was removed in Python 3; keys() works on both.
        for rem in ctx.cluster.remotes.keys():
            r = rem.run(
                args=[
                    'if', 'test', '!', '-e', '{adir}/coredump'.format(adir=archive_dir), run.Raw(';'), 'then',
                    'echo', 'OK', run.Raw(';'),
                    'fi',
                ],
                stdout=StringIO(),
            )
            if r.stdout.getvalue() != 'OK\n':
                log.warning('Found coredumps on %s, flagging run as failed', rem)
                set_status(ctx.summary, 'fail')
                if 'failure_reason' not in ctx.summary:
                    ctx.summary['failure_reason'] = \
                        'Found coredumps on {rem}'.format(rem=rem)
Example 10
0
def archive(ctx, config):
    """
    Handle the creation and deletion of the archive directory.

    Creates the remote archive directory, yields to nested tasks, then
    pulls the archived files back to the local job archive and removes
    the remote directory.
    """
    log.info('Creating archive directory...')
    archive_dir = misc.get_archive_dir(ctx)
    run.wait(
        ctx.cluster.run(
            args=[
                'install',
                '-d',
                '-m0755',
                '--',
                archive_dir,
            ],
            wait=False,
        ))

    try:
        yield
    except Exception:
        # we need to know this below
        set_status(ctx.summary, 'fail')
        raise
    finally:
        passed = get_status(ctx.summary) == 'pass'
        if ctx.archive is not None and \
                not (ctx.config.get('archive-on-error') and passed):
            log.info('Transferring archived files...')
            logdir = os.path.join(ctx.archive, 'remote')
            if not os.path.exists(logdir):
                os.mkdir(logdir)
            # dict.iterkeys() was removed in Python 3; keys() works on both.
            for rem in ctx.cluster.remotes.keys():
                path = os.path.join(logdir, rem.shortname)
                misc.pull_directory(rem, archive_dir, path)
                # Check for coredumps and pull binaries
                fetch_binaries_for_coredumps(path, rem)

        log.info('Removing archive directory...')
        run.wait(
            ctx.cluster.run(
                args=[
                    'rm',
                    '-rf',
                    '--',
                    archive_dir,
                ],
                wait=False,
            ), )
Example 11
0
def archive(ctx, config):
    """
    Handle the creation and deletion of the archive directory.

    Creates the remote archive directory, yields to nested tasks, then
    pulls the archived files back to the local job archive and removes
    the remote directory.
    """
    log.info('Creating archive directory...')
    archive_dir = misc.get_archive_dir(ctx)
    run.wait(
        ctx.cluster.run(
            args=[
                'install', '-d', '-m0755', '--', archive_dir,
                ],
            wait=False,
            )
        )

    try:
        yield
    except Exception:
        # we need to know this below
        set_status(ctx.summary, 'fail')
        raise
    finally:
        passed = get_status(ctx.summary) == 'pass'
        if ctx.archive is not None and \
                not (ctx.config.get('archive-on-error') and passed):
            log.info('Transferring archived files...')
            logdir = os.path.join(ctx.archive, 'remote')
            if not os.path.exists(logdir):
                os.mkdir(logdir)
            # dict.iterkeys() was removed in Python 3; keys() works on both.
            for rem in ctx.cluster.remotes.keys():
                path = os.path.join(logdir, rem.shortname)
                misc.pull_directory(rem, archive_dir, path)
                # Check for coredumps and pull binaries
                fetch_binaries_for_coredumps(path, rem)

        log.info('Removing archive directory...')
        run.wait(
            ctx.cluster.run(
                args=[
                    'rm',
                    '-rf',
                    '--',
                    archive_dir,
                    ],
                wait=False,
                ),
            )
Example 12
0
def archive(ctx, config):
    """
    Handle the creation and deletion of the archive directory.

    Creates the remote archive directory, yields to nested tasks, then
    pulls the archived files back to the local job archive and removes
    the remote directory.
    """
    log.info('Creating archive directory...')
    archive_dir = teuthology.get_archive_dir(ctx)
    run.wait(
        ctx.cluster.run(
            args=[
                'install',
                '-d',
                '-m0755',
                '--',
                archive_dir,
            ],
            wait=False,
        ))

    try:
        yield
    except Exception:
        # we need to know this below
        ctx.summary['success'] = False
        raise
    finally:
        if ctx.archive is not None and \
                not (ctx.config.get('archive-on-error') and ctx.summary['success']):
            log.info('Transferring archived files...')
            logdir = os.path.join(ctx.archive, 'remote')
            if not os.path.exists(logdir):
                os.mkdir(logdir)
            # dict.iterkeys() was removed in Python 3; keys() works on both.
            for remote in ctx.cluster.remotes.keys():
                path = os.path.join(logdir, remote.shortname)
                teuthology.pull_directory(remote, archive_dir, path)

        log.info('Removing archive directory...')
        run.wait(
            ctx.cluster.run(
                args=[
                    'rm',
                    '-rf',
                    '--',
                    archive_dir,
                ],
                wait=False,
            ), )
Example 13
0
def archive(ctx, config):
    """
    Handle the creation and deletion of the archive directory.

    Creates the remote archive directory, yields to nested tasks, then
    pulls the archived files back to the local job archive and removes
    the remote directory.
    """
    log.info('Creating archive directory...')
    archive_dir = teuthology.get_archive_dir(ctx)
    run.wait(
        ctx.cluster.run(
            args=[
                'install', '-d', '-m0755', '--', archive_dir,
                ],
            wait=False,
            )
        )

    try:
        yield
    except Exception:
        # we need to know this below
        ctx.summary['success'] = False
        raise
    finally:
        if ctx.archive is not None and \
                not (ctx.config.get('archive-on-error') and ctx.summary['success']):
            log.info('Transferring archived files...')
            logdir = os.path.join(ctx.archive, 'remote')
            if not os.path.exists(logdir):
                os.mkdir(logdir)
            # dict.iterkeys() was removed in Python 3; keys() works on both.
            for remote in ctx.cluster.remotes.keys():
                path = os.path.join(logdir, remote.shortname)
                teuthology.pull_directory(remote, archive_dir, path)

        log.info('Removing archive directory...')
        run.wait(
            ctx.cluster.run(
                args=[
                    'rm',
                    '-rf',
                    '--',
                    archive_dir,
                    ],
                wait=False,
                ),
            )
Example 14
0
def archive(ctx, config):
    """
    Handle the creation and deletion of the archive directory.

    Creates the remote archive directory, yields to nested tasks, then
    pulls the archived files back to the local job archive and removes
    the remote directory.
    """
    log.info("Creating archive directory...")
    archive_dir = teuthology.get_archive_dir(ctx)
    run.wait(ctx.cluster.run(args=["install", "-d", "-m0755", "--", archive_dir], wait=False))

    try:
        yield
    except Exception:
        # we need to know this below
        ctx.summary["success"] = False
        raise
    finally:
        if ctx.archive is not None and not (ctx.config.get("archive-on-error") and ctx.summary["success"]):
            log.info("Transferring archived files...")
            logdir = os.path.join(ctx.archive, "remote")
            if not os.path.exists(logdir):
                os.mkdir(logdir)
            # dict.iterkeys() was removed in Python 3; keys() works on both.
            for remote in ctx.cluster.remotes.keys():
                path = os.path.join(logdir, remote.shortname)
                teuthology.pull_directory(remote, archive_dir, path)

        log.info("Removing archive directory...")
        run.wait(ctx.cluster.run(args=["rm", "-rf", "--", archive_dir], wait=False))
Example 15
0
def syslog(ctx, config):
    """
    start syslog / stop syslog on exit.

    Installs an rsyslog drop-in that tees kernel and misc messages into
    the archive dir, yields to nested tasks, then removes the drop-in,
    greps the captured logs for known-bad kernel messages (flagging the
    run as failed on a hit), and gzips the logs.
    """
    if ctx.archive is None:
        # disable this whole feature if we're not going to archive the data
        # anyway
        yield
        return

    log.info('Starting syslog monitoring...')

    archive_dir = misc.get_archive_dir(ctx)
    log_dir = '{adir}/syslog'.format(adir=archive_dir)
    run.wait(
        ctx.cluster.run(
            args=['mkdir', '-p', '-m0755', '--', log_dir],
            wait=False,
        ))

    CONF = '/etc/rsyslog.d/80-cephtest.conf'
    kern_log = '{log_dir}/kern.log'.format(log_dir=log_dir)
    misc_log = '{log_dir}/misc.log'.format(log_dir=log_dir)
    conf_lines = [
        'kern.* -{kern_log};RSYSLOG_FileFormat'.format(kern_log=kern_log),
        '*.*;kern.none -{misc_log};RSYSLOG_FileFormat'.format(
            misc_log=misc_log),
    ]
    conf_fp = StringIO('\n'.join(conf_lines))
    try:
        # dict.iterkeys() was removed in Python 3; keys() works on both.
        for rem in ctx.cluster.remotes.keys():
            log_context = 'system_u:object_r:var_log_t:s0'
            for log_path in (kern_log, misc_log):
                rem.run(args=['install', '-m', '666', '/dev/null', log_path])
                rem.chcon(log_path, log_context)
            misc.sudo_write_file(
                remote=rem,
                path=CONF,
                data=conf_fp,
            )
            conf_fp.seek(0)
        run.wait(
            ctx.cluster.run(
                args=[
                    'sudo', 'service',
                    # a mere reload (SIGHUP) doesn't seem to make
                    # rsyslog open the files
                    'rsyslog', 'restart',
                ],
                wait=False,
            ), )

        yield
    finally:
        log.info('Shutting down syslog monitoring...')

        run.wait(
            ctx.cluster.run(
                args=[
                    'sudo', 'rm', '-f', '--', CONF,
                    run.Raw('&&'),
                    'sudo', 'service', 'rsyslog', 'restart',
                ],
                wait=False,
            ), )
        # race condition: nothing actually says rsyslog had time to
        # flush the file fully. oh well.

        log.info('Checking logs for errors...')
        for rem in ctx.cluster.remotes.keys():
            log.debug('Checking %s', rem.name)
            r = rem.run(
                args=[
                    'egrep', '--binary-files=text',
                    '\\bBUG\\b|\\bINFO\\b|\\bDEADLOCK\\b',
                    run.Raw('{adir}/syslog/*.log'.format(adir=archive_dir)),
                    run.Raw('|'),
                    'grep', '-v', 'task .* blocked for more than .* seconds',
                    run.Raw('|'),
                    'grep', '-v', 'lockdep is turned off',
                    run.Raw('|'),
                    'grep', '-v', 'trying to register non-static key',
                    run.Raw('|'),
                    'grep', '-v', 'DEBUG: fsize',  # xfs_fsr
                    run.Raw('|'),
                    'grep', '-v', 'CRON',  # ignore cron noise
                    run.Raw('|'),
                    'grep', '-v', 'BUG: bad unlock balance detected',  # #6097
                    run.Raw('|'),
                    'grep', '-v', 'inconsistent lock state',  # FIXME see #2523
                    run.Raw('|'),
                    'grep', '-v', '*** DEADLOCK ***',  # part of lockdep output
                    run.Raw('|'),
                    'grep', '-v',
                    # FIXME see #2590 and #147
                    'INFO: possible irq lock inversion dependency detected',
                    run.Raw('|'),
                    'grep', '-v',
                    'INFO: NMI handler (perf_event_nmi_handler) took too long to run',  # noqa
                    run.Raw('|'),
                    'grep', '-v', 'INFO: recovery required on readonly',
                    run.Raw('|'),
                    'grep', '-v', 'ceph-create-keys: INFO',
                    run.Raw('|'),
                    'head', '-n', '1',
                ],
                stdout=StringIO(),
            )
            stdout = r.stdout.getvalue()
            if stdout != '':
                log.error('Error in syslog on %s: %s', rem.name, stdout)
                set_status(ctx.summary, 'fail')
                if 'failure_reason' not in ctx.summary:
                    ctx.summary['failure_reason'] = \
                        "'{error}' in syslog".format(error=stdout)

        log.info('Compressing syslogs...')
        run.wait(
            ctx.cluster.run(
                args=[
                    'find', '{adir}/syslog'.format(adir=archive_dir),
                    '-name', '*.log', '-print0',
                    run.Raw('|'),
                    'sudo', 'xargs', '-0', '--no-run-if-empty', '--',
                    'gzip', '--',
                ],
                wait=False,
            ), )
Example 16
0
def syslog(ctx, config):
    """
    start syslog / stop syslog on exit.

    Installs an rsyslog drop-in that tees kernel and misc messages into
    the archive dir, yields to nested tasks, then removes the drop-in,
    greps the captured logs for known-bad kernel messages (flagging the
    run as failed on a hit), and gzips the logs.
    """
    if ctx.archive is None:
        # disable this whole feature if we're not going to archive the data anyway
        yield
        return

    log.info("Starting syslog monitoring...")

    archive_dir = misc.get_archive_dir(ctx)
    log_dir = "{adir}/syslog".format(adir=archive_dir)
    run.wait(ctx.cluster.run(args=["mkdir", "-p", "-m0755", "--", log_dir], wait=False))

    CONF = "/etc/rsyslog.d/80-cephtest.conf"
    kern_log = "{log_dir}/kern.log".format(log_dir=log_dir)
    misc_log = "{log_dir}/misc.log".format(log_dir=log_dir)
    conf_lines = [
        "kern.* -{kern_log};RSYSLOG_FileFormat".format(kern_log=kern_log),
        "*.*;kern.none -{misc_log};RSYSLOG_FileFormat".format(misc_log=misc_log),
    ]
    conf_fp = StringIO("\n".join(conf_lines))
    try:
        # dict.iterkeys() was removed in Python 3; keys() works on both.
        for rem in ctx.cluster.remotes.keys():
            log_context = "system_u:object_r:var_log_t:s0"
            for log_path in (kern_log, misc_log):
                rem.run(args="touch %s" % log_path)
                rem.chcon(log_path, log_context)
            misc.sudo_write_file(remote=rem, path=CONF, data=conf_fp)
            conf_fp.seek(0)
        run.wait(
            ctx.cluster.run(
                args=[
                    "sudo", "service",
                    # a mere reload (SIGHUP) doesn't seem to make
                    # rsyslog open the files
                    "rsyslog", "restart",
                ],
                wait=False,
            )
        )

        yield
    finally:
        log.info("Shutting down syslog monitoring...")

        run.wait(
            ctx.cluster.run(
                args=["sudo", "rm", "-f", "--", CONF, run.Raw("&&"), "sudo", "service", "rsyslog", "restart"],
                wait=False,
            )
        )
        # race condition: nothing actually says rsyslog had time to
        # flush the file fully. oh well.

        log.info("Checking logs for errors...")
        for rem in ctx.cluster.remotes.keys():
            log.debug("Checking %s", rem.name)
            r = rem.run(
                args=[
                    "egrep", "--binary-files=text",
                    "\\bBUG\\b|\\bINFO\\b|\\bDEADLOCK\\b",
                    run.Raw("{adir}/syslog/*.log".format(adir=archive_dir)),
                    run.Raw("|"),
                    "grep", "-v", "task .* blocked for more than .* seconds",
                    run.Raw("|"),
                    "grep", "-v", "lockdep is turned off",
                    run.Raw("|"),
                    "grep", "-v", "trying to register non-static key",
                    run.Raw("|"),
                    "grep", "-v", "DEBUG: fsize",  # xfs_fsr
                    run.Raw("|"),
                    "grep", "-v", "CRON",  # ignore cron noise
                    run.Raw("|"),
                    "grep", "-v", "BUG: bad unlock balance detected",  # #6097
                    run.Raw("|"),
                    "grep", "-v", "inconsistent lock state",  # FIXME see #2523
                    run.Raw("|"),
                    "grep", "-v", "*** DEADLOCK ***",  # part of lockdep output
                    run.Raw("|"),
                    "grep", "-v",
                    "INFO: possible irq lock inversion dependency detected",  # FIXME see #2590 and #147
                    run.Raw("|"),
                    "grep", "-v",
                    "INFO: NMI handler (perf_event_nmi_handler) took too long to run",
                    run.Raw("|"),
                    "grep", "-v", "INFO: recovery required on readonly",
                    run.Raw("|"),
                    "grep", "-v", "ceph-create-keys: INFO",
                    run.Raw("|"),
                    "head", "-n", "1",
                ],
                stdout=StringIO(),
            )
            stdout = r.stdout.getvalue()
            if stdout != "":
                log.error("Error in syslog on %s: %s", rem.name, stdout)
                set_status(ctx.summary, "fail")
                if "failure_reason" not in ctx.summary:
                    ctx.summary["failure_reason"] = "'{error}' in syslog".format(error=stdout)

        log.info("Compressing syslogs...")
        run.wait(
            ctx.cluster.run(
                args=[
                    "find", "{adir}/syslog".format(adir=archive_dir),
                    "-name", "*.log", "-print0",
                    run.Raw("|"),
                    "sudo", "xargs", "-0", "--no-run-if-empty", "--",
                    "gzip", "--",
                ],
                wait=False,
            )
        )
Example 17
0
def coredump(ctx, config):
    """
    Stash a coredump of this system if an error occurs.

    Points kernel.core_pattern at the archive directory while nested
    tasks run; afterwards restores the default pattern and flags the run
    as failed if any cores were collected.
    """
    log.info("Enabling coredump saving...")
    archive_dir = misc.get_archive_dir(ctx)
    run.wait(
        ctx.cluster.run(
            args=[
                "install",
                "-d",
                "-m0755",
                "--",
                "{adir}/coredump".format(adir=archive_dir),
                run.Raw("&&"),
                "sudo",
                "sysctl",
                "-w",
                "kernel.core_pattern={adir}/coredump/%t.%p.core".format(adir=archive_dir),
            ],
            wait=False,
        )
    )

    try:
        yield
    finally:
        run.wait(
            ctx.cluster.run(
                args=[
                    "sudo",
                    "sysctl",
                    "-w",
                    "kernel.core_pattern=core",
                    run.Raw("&&"),
                    # don't litter the archive dir if there were no cores dumped
                    "rmdir",
                    "--ignore-fail-on-non-empty",
                    "--",
                    "{adir}/coredump".format(adir=archive_dir),
                ],
                wait=False,
            )
        )

        # set status = 'fail' if the dir is still there = coredumps were
        # seen
        # dict.iterkeys() was removed in Python 3; keys() works on both.
        for rem in ctx.cluster.remotes.keys():
            r = rem.run(
                args=[
                    "if",
                    "test",
                    "!",
                    "-e",
                    "{adir}/coredump".format(adir=archive_dir),
                    run.Raw(";"),
                    "then",
                    "echo",
                    "OK",
                    run.Raw(";"),
                    "fi",
                ],
                stdout=StringIO(),
            )
            if r.stdout.getvalue() != "OK\n":
                log.warning("Found coredumps on %s, flagging run as failed", rem)
                set_status(ctx.summary, "fail")
                if "failure_reason" not in ctx.summary:
                    ctx.summary["failure_reason"] = "Found coredumps on {rem}".format(rem=rem)
Example 18
0
def task(ctx, config):
    """
    Go through filesystem creation with a synthetic failure in an MDS
    in its 'up:creating' state, to exercise the retry behaviour.

    Requires exactly one MDS role in the cluster; yields once the
    filesystem has recovered to up:active.
    """
    # Grab handles to the teuthology objects of interest
    mdslist = list(misc.all_roles_of_type(ctx.cluster, 'mds'))
    if len(mdslist) != 1:
        # Require exactly one MDS, the code path for creation failure when
        # a standby is available is different
        raise RuntimeError("This task requires exactly one MDS")

    mds_id = mdslist[0]
    # Unpack the single remote carrying the MDS role.
    (mds_remote, ) = ctx.cluster.only(
        'mds.{_id}'.format(_id=mds_id)).remotes.keys()
    manager = ceph_manager.CephManager(
        mds_remote,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
    )

    # Stop MDS
    # NOTE(review): `self` is undefined in this module-level function —
    # these calls look pasted from a test-class method; confirm what
    # object `self.fs` should be here (applies to all `self.fs` uses below).
    self.fs.set_max_mds(0)
    self.fs.mds_stop(mds_id)
    self.fs.mds_fail(mds_id)

    # Reset the filesystem so that next start will go into CREATING
    manager.raw_cluster_cmd('fs', 'rm', "default", "--yes-i-really-mean-it")
    manager.raw_cluster_cmd('fs', 'new', "default", "metadata", "data")

    # Start the MDS with mds_kill_create_at set, it will crash during creation
    # NOTE(review): `mds` is not defined in this scope — verify which
    # daemon handle was intended (likely an MDS daemon object from ctx).
    mds.restart_with_args(["--mds_kill_create_at=1"])
    try:
        mds.wait_for_exit()
    except CommandFailedError as e:
        # exit status 1 is the expected result of the injected kill
        if e.exitstatus == 1:
            log.info("MDS creation killed as expected")
        else:
            log.error("Unexpected status code %s" % e.exitstatus)
            raise

    # Since I have intentionally caused a crash, I will clean up the resulting core
    # file to avoid task.internal.coredump seeing it as a failure.
    log.info("Removing core file from synthetic MDS failure")
    # NOTE(review): bare `Raw` — elsewhere in this file it is spelled
    # `run.Raw`; confirm the import.
    mds_remote.run(args=[
        'rm', '-f',
        Raw("{archive}/coredump/*.core".format(
            archive=misc.get_archive_dir(ctx)))
    ])

    # It should have left the MDS map state still in CREATING
    status = self.fs.status().get_mds(mds_id)
    assert status['state'] == 'up:creating'

    # Start the MDS again without the kill flag set, it should proceed with creation successfully
    mds.restart()

    # Wait for state ACTIVE
    self.fs.wait_for_state("up:active", timeout=120, mds_id=mds_id)

    # The system should be back up in a happy healthy state, go ahead and run any further tasks
    # inside this context.
    yield
Esempio n. 19
0
def syslog(ctx, config):
    """
    start syslog / stop syslog on exit.

    On entry, write an rsyslog snippet that copies kernel and misc
    messages into per-run log files under the archive dir and restart
    rsyslog.  On exit, remove the snippet, restart rsyslog again, scan
    the captured logs for kernel BUG/INFO/DEADLOCK markers (flagging
    the run as failed on a hit), and gzip the logs.
    """
    if ctx.archive is None:
        # disable this whole feature if we're not going to archive the data anyway
        yield
        return

    log.info('Starting syslog monitoring...')

    archive_dir = misc.get_archive_dir(ctx)
    log_dir = '{adir}/syslog'.format(adir=archive_dir)
    run.wait(
        ctx.cluster.run(
            args=[
                'mkdir', '-p', '-m0755', '--',
                log_dir,
                ],
            wait=False,
            )
        )

    CONF = '/etc/rsyslog.d/80-cephtest.conf'
    kern_log = '{log_dir}/kern.log'.format(log_dir=log_dir)
    misc_log = '{log_dir}/misc.log'.format(log_dir=log_dir)
    conf_lines = [
        'kern.* -{kern_log};RSYSLOG_FileFormat'.format(kern_log=kern_log),
        '*.*;kern.none -{misc_log};RSYSLOG_FileFormat'.format(
            misc_log=misc_log),
    ]
    conf_fp = StringIO('\n'.join(conf_lines))
    try:
        # Python 3: dict views replace the removed iterkeys() method.
        for rem in ctx.cluster.remotes.keys():
            # Exclude downburst VMs for now; they have SELinux disabled
            if rem.os.package_type == 'rpm' and not misc.is_vm(rem.shortname):
                log_context = 'system_u:object_r:var_log_t:s0'
                for log_path in (kern_log, misc_log):
                    rem.run(
                        args="touch {log} && sudo chcon {con} {log}".format(
                            log=log_path, con=log_context),
                    )
            misc.sudo_write_file(
                remote=rem,
                path=CONF,
                data=conf_fp,
                )
            # rewind so the next remote gets the full config
            conf_fp.seek(0)
        run.wait(
            ctx.cluster.run(
                args=[
                    'sudo',
                    'service',
                    # a mere reload (SIGHUP) doesn't seem to make
                    # rsyslog open the files
                    'rsyslog',
                    'restart',
                    ],
                wait=False,
                ),
            )

        yield
    finally:
        log.info('Shutting down syslog monitoring...')

        run.wait(
            ctx.cluster.run(
                args=[
                    'sudo',
                    'rm',
                    '-f',
                    '--',
                    CONF,
                    run.Raw('&&'),
                    'sudo',
                    'service',
                    'rsyslog',
                    'restart',
                    ],
                wait=False,
                ),
            )
        # race condition: nothing actually says rsyslog had time to
        # flush the file fully. oh well.

        log.info('Checking logs for errors...')
        for rem in ctx.cluster.remotes.keys():
            log.debug('Checking %s', rem.name)
            # grep for kernel trouble markers, filter out known noise,
            # and keep at most the first hit
            r = rem.run(
                args=[
                    'egrep', '--binary-files=text',
                    '\\bBUG\\b|\\bINFO\\b|\\bDEADLOCK\\b',
                    run.Raw('{adir}/syslog/*.log'.format(adir=archive_dir)),
                    run.Raw('|'),
                    'grep', '-v', 'task .* blocked for more than .* seconds',
                    run.Raw('|'),
                    'grep', '-v', 'lockdep is turned off',
                    run.Raw('|'),
                    'grep', '-v', 'trying to register non-static key',
                    run.Raw('|'),
                    'grep', '-v', 'DEBUG: fsize',  # xfs_fsr
                    run.Raw('|'),
                    'grep', '-v', 'CRON',  # ignore cron noise
                    run.Raw('|'),
                    'grep', '-v', 'BUG: bad unlock balance detected', # #6097
                    run.Raw('|'),
                    'grep', '-v', 'inconsistent lock state', # FIXME see #2523
                    run.Raw('|'),
                    'grep', '-v', '*** DEADLOCK ***', # part of lockdep output
                    run.Raw('|'),
                    'grep', '-v', 'INFO: possible irq lock inversion dependency detected', # FIXME see #2590 and #147
                    run.Raw('|'),
                    'grep', '-v', 'INFO: NMI handler (perf_event_nmi_handler) took too long to run',
                    run.Raw('|'),
                    'grep', '-v', 'INFO: recovery required on readonly',
                    run.Raw('|'),
                    'head', '-n', '1',
                    ],
                stdout=StringIO(),
                )
            stdout = r.stdout.getvalue()
            if stdout != '':
                log.error('Error in syslog on %s: %s', rem.name, stdout)
                set_status(ctx.summary, 'fail')
                if 'failure_reason' not in ctx.summary:
                    ctx.summary['failure_reason'] = \
                        "'{error}' in syslog".format(error=stdout)

        log.info('Compressing syslogs...')
        run.wait(
            ctx.cluster.run(
                args=[
                    'find',
                    '{adir}/syslog'.format(adir=archive_dir),
                    '-name',
                    '*.log',
                    '-print0',
                    run.Raw('|'),
                    'sudo',
                    'xargs',
                    '-0',
                    '--no-run-if-empty',
                    '--',
                    'gzip',
                    '--',
                    ],
                wait=False,
                ),
            )
Esempio n. 20
0
def syslog(ctx, config):
    """
    start syslog / stop syslog on exit.

    On entry, install an rsyslog snippet that mirrors kernel and misc
    messages into the archive dir and restart rsyslog.  On exit, remove
    the snippet, restart rsyslog, scan the captured logs for kernel
    BUG/INFO/DEADLOCK markers (marking the run unsuccessful on a hit),
    and gzip the logs.
    """
    if ctx.archive is None:
        # disable this whole feature if we're not going to archive the data anyway
        yield
        return

    log.info('Starting syslog monitoring...')

    archive_dir = teuthology.get_archive_dir(ctx)
    run.wait(
        ctx.cluster.run(
            args=[
                'mkdir', '-m0755', '--',
                '{adir}/syslog'.format(adir=archive_dir),
                ],
            wait=False,
            )
        )

    CONF = '/etc/rsyslog.d/80-cephtest.conf'
    conf_fp = StringIO('''
kern.* -{adir}/syslog/kern.log;RSYSLOG_FileFormat
*.*;kern.none -{adir}/syslog/misc.log;RSYSLOG_FileFormat
'''.format(adir=archive_dir))
    try:
        # Python 3: dict views replace the removed iterkeys() method.
        for rem in ctx.cluster.remotes.keys():
            teuthology.sudo_write_file(
                remote=rem,
                path=CONF,
                data=conf_fp,
                )
            # rewind so the next remote gets the full config
            conf_fp.seek(0)
        run.wait(
            ctx.cluster.run(
                args=[
                    'sudo',
                    'service',
                    # a mere reload (SIGHUP) doesn't seem to make
                    # rsyslog open the files
                    'rsyslog',
                    'restart',
                    ],
                wait=False,
                ),
            )

        yield
    finally:
        log.info('Shutting down syslog monitoring...')

        run.wait(
            ctx.cluster.run(
                args=[
                    'sudo',
                    'rm',
                    '-f',
                    '--',
                    CONF,
                    run.Raw('&&'),
                    'sudo',
                    'service',
                    'rsyslog',
                    'restart',
                    ],
                wait=False,
                ),
            )
        # race condition: nothing actually says rsyslog had time to
        # flush the file fully. oh well.

        log.info('Checking logs for errors...')
        for remote in ctx.cluster.remotes.keys():
            log.debug('Checking %s', remote.name)
            # grep for kernel trouble markers, filter out known noise,
            # and keep at most the first hit
            r = remote.run(
                args=[
                    'egrep', '--binary-files=text',
                    '\\bBUG\\b|\\bINFO\\b|\\bDEADLOCK\\b',
                    run.Raw('{adir}/syslog/*.log'.format(adir=archive_dir)),
                    run.Raw('|'),
                    'grep', '-v', 'task .* blocked for more than .* seconds',
                    run.Raw('|'),
                    'grep', '-v', 'lockdep is turned off',
                    run.Raw('|'),
                    'grep', '-v', 'trying to register non-static key',
                    run.Raw('|'),
                    'grep', '-v', 'DEBUG: fsize',  # xfs_fsr
                    run.Raw('|'),
                    'grep', '-v', 'CRON',  # ignore cron noise
                    run.Raw('|'),
                    'grep', '-v', 'BUG: bad unlock balance detected', # #6097
                    run.Raw('|'),
                    'grep', '-v', 'inconsistent lock state', # FIXME see #2523
                    run.Raw('|'),
                    'grep', '-v', '*** DEADLOCK ***', # part of lockdep output
                    run.Raw('|'),
                    'grep', '-v', 'INFO: possible irq lock inversion dependency detected', # FIXME see #2590 and #147
                    run.Raw('|'),
                    'grep', '-v', 'INFO: NMI handler (perf_event_nmi_handler) took too long to run',
                    run.Raw('|'),
                    'grep', '-v', 'INFO: recovery required on readonly',
                    run.Raw('|'),
                    'head', '-n', '1',
                    ],
                stdout=StringIO(),
                )
            stdout = r.stdout.getvalue()
            if stdout != '':
                log.error('Error in syslog on %s: %s', remote.name, stdout)
                ctx.summary['success'] = False
                if 'failure_reason' not in ctx.summary:
                    ctx.summary['failure_reason'] = \
                        "'{error}' in syslog".format(error=stdout)

        log.info('Compressing syslogs...')
        run.wait(
            ctx.cluster.run(
                args=[
                    'find',
                    '{adir}/syslog'.format(adir=archive_dir),
                    '-name',
                    '*.log',
                    '-print0',
                    run.Raw('|'),
                    'sudo',
                    'xargs',
                    '-0',
                    '--no-run-if-empty',
                    '--',
                    'gzip',
                    '--',
                    ],
                wait=False,
                ),
            )
Esempio n. 21
0
def coredump(ctx, config):
    """
    Stash a coredump of this system if an error occurs.

    On entry, point kernel.core_pattern at a per-run coredump directory
    under the archive dir (persisting the setting to /etc/sysctl.conf).
    On exit, restore the default pattern, drop cores produced by
    systemd-sysusers, remove the directory when empty, and mark the run
    failed on any remote where coredumps remain.
    """
    log.info('Enabling coredump saving...')
    # containers don't get their own core_pattern; skip them
    dumpable = ctx.cluster.filter(lambda r: not r.is_container)
    archive_dir = misc.get_archive_dir(ctx)
    dump_dir = '{adir}/coredump'.format(adir=archive_dir)
    sysctl_setting = 'kernel.core_pattern={dir}/%t.%p.core'.format(
        dir=dump_dir)
    run.wait(
        dumpable.run(
            args=[
                'install', '-d', '-m0755', '--', dump_dir,
                run.Raw('&&'),
                'sudo', 'sysctl', '-w', sysctl_setting,
                run.Raw('&&'),
                # persist across reboots too
                'echo', sysctl_setting,
                run.Raw('|'),
                'sudo', 'tee', '-a', '/etc/sysctl.conf',
            ],
            wait=False,
        ))

    try:
        yield
    finally:
        # re-filter: cluster membership may have changed while running
        dumpable = ctx.cluster.filter(lambda r: not r.is_container)
        prune_script = (
            'for f in `find {dir}/coredump -type f`; do '
            'file $f | grep -q systemd-sysusers && rm $f || true ; '
            'done'
        ).format(dir=archive_dir)
        run.wait(
            dumpable.run(
                args=[
                    'sudo', 'sysctl', '-w', 'kernel.core_pattern=core',
                    run.Raw('&&'),
                    # drop uninteresting systemd-sysusers cores
                    'sudo', 'bash', '-c', prune_script,
                    run.Raw('&&'),
                    # don't litter the archive dir if there were no cores dumped
                    'rmdir', '--ignore-fail-on-non-empty', '--', dump_dir,
                ],
                wait=False,
            ))

        # set status = 'fail' if the dir is still there = coredumps were
        # seen
        for remote in dumpable.remotes.keys():
            try:
                remote.sh("test -e " + archive_dir + "/coredump")
            except run.CommandFailedError:
                # directory gone: no cores on this remote
                continue
            log.warning('Found coredumps on %s, flagging run as failed',
                        remote)
            set_status(ctx.summary, 'fail')
            if 'failure_reason' not in ctx.summary:
                ctx.summary['failure_reason'] = \
                    'Found coredumps on {rem}'.format(rem=remote)
Esempio n. 22
0
def syslog(ctx, config):
    """
    start syslog / stop syslog on exit.

    On entry, install an rsyslog snippet that copies kernel and misc
    messages into the archive dir and restart rsyslog.  On exit, remove
    the snippet, restart rsyslog, scan the captured logs for kernel
    BUG/INFO/DEADLOCK markers (marking the run unsuccessful on a hit),
    and gzip the logs.
    """
    if ctx.archive is None:
        # disable this whole feature if we're not going to archive the data anyway
        yield
        return

    log.info('Starting syslog monitoring...')

    archive_dir = teuthology.get_archive_dir(ctx)
    run.wait(
        ctx.cluster.run(
            args=[
                'mkdir',
                '-m0755',
                '--',
                '{adir}/syslog'.format(adir=archive_dir),
            ],
            wait=False,
        ))

    CONF = '/etc/rsyslog.d/80-cephtest.conf'
    conf_fp = StringIO("""
kern.* -{adir}/syslog/kern.log;RSYSLOG_FileFormat
*.*;kern.none -{adir}/syslog/misc.log;RSYSLOG_FileFormat
""".format(adir=archive_dir))
    try:
        # Python 3: dict views replace the removed iterkeys() method.
        for rem in ctx.cluster.remotes.keys():
            teuthology.sudo_write_file(
                remote=rem,
                path=CONF,
                data=conf_fp,
            )
            # rewind so the next remote gets the full config
            conf_fp.seek(0)
        run.wait(
            ctx.cluster.run(
                args=[
                    'sudo',
                    'service',
                    # a mere reload (SIGHUP) doesn't seem to make
                    # rsyslog open the files
                    'rsyslog',
                    'restart',
                ],
                wait=False,
            ), )

        yield
    finally:
        log.info('Shutting down syslog monitoring...')

        run.wait(
            ctx.cluster.run(
                args=[
                    'sudo',
                    'rm',
                    '-f',
                    '--',
                    CONF,
                    run.Raw('&&'),
                    'sudo',
                    'service',
                    'rsyslog',
                    'restart',
                ],
                wait=False,
            ), )
        # race condition: nothing actually says rsyslog had time to
        # flush the file fully. oh well.

        log.info('Checking logs for errors...')
        for remote in ctx.cluster.remotes.keys():
            log.debug('Checking %s', remote.name)
            # grep for kernel trouble markers, filter out known noise,
            # and keep at most the first hit
            r = remote.run(
                args=[
                    'egrep',
                    '\\bBUG\\b|\\bINFO\\b|\\bDEADLOCK\\b',
                    run.Raw('{adir}/syslog/*.log'.format(adir=archive_dir)),
                    run.Raw('|'),
                    'grep',
                    '-v',
                    'task .* blocked for more than .* seconds',
                    run.Raw('|'),
                    'grep',
                    '-v',
                    'lockdep is turned off',
                    run.Raw('|'),
                    'grep',
                    '-v',
                    'trying to register non-static key',
                    run.Raw('|'),
                    'grep',
                    '-v',
                    'DEBUG: fsize',  # xfs_fsr
                    run.Raw('|'),
                    'grep',
                    '-v',
                    'CRON',  # ignore cron noise
                    run.Raw('|'),
                    'grep',
                    '-v',
                    'BUG: bad unlock balance detected',  # #6097
                    run.Raw('|'),
                    'grep',
                    '-v',
                    'inconsistent lock state',  # FIXME see #2523
                    run.Raw('|'),
                    'grep',
                    '-v',
                    '*** DEADLOCK ***',  # part of lockdep output
                    run.Raw('|'),
                    'grep',
                    '-v',
                    'INFO: possible irq lock inversion dependency detected',  # FIXME see #2590 and #147
                    run.Raw('|'),
                    'head',
                    '-n',
                    '1',
                ],
                stdout=StringIO(),
            )
            stdout = r.stdout.getvalue()
            if stdout != '':
                log.error('Error in syslog on %s: %s', remote.name, stdout)
                ctx.summary['success'] = False
                if 'failure_reason' not in ctx.summary:
                    ctx.summary['failure_reason'] = \
                        "'{error}' in syslog".format(error=stdout)

        log.info('Compressing syslogs...')
        run.wait(
            ctx.cluster.run(
                args=[
                    'find',
                    '{adir}/syslog'.format(adir=archive_dir),
                    '-name',
                    '*.log',
                    '-print0',
                    run.Raw('|'),
                    'sudo',
                    'xargs',
                    '-0',
                    '--no-run-if-empty',
                    '--',
                    'gzip',
                    '--',
                ],
                wait=False,
            ), )
Esempio n. 23
0
def setup_logging(ctx, cpar):
    """Point the tempest config at the run's archive dir for logging."""
    # log output goes to <archive_dir>/tempest.log
    cpar.set('DEFAULT', 'log_dir', teuthology.get_archive_dir(ctx))
    cpar.set('DEFAULT', 'log_file', 'tempest.log')