Ejemplo n.º 1
0
def vm_setup(ctx, config):
    """
    Look for virtual machines and handle their initialization.

    Every remote in ``ctx.cluster.remotes`` whose shortname ``misc.is_vm``
    reports as a VM is probed for ``/ceph-qa-ready``.  When the probe fails,
    the local ``edit_sudoers.sh`` script is piped to ``sudo sh`` on that host
    over ssh.  Finally, if the job has a ``kernel`` task but no
    ``ansible.cephlab`` task, ``ansible.CephLab`` is run against the VMs.

    :param ctx:    Run context; ``ctx.config['tasks']`` and
                   ``ctx.cluster.remotes`` are read.
    :param config: Not used by this function.
    """
    all_tasks = [x.keys()[0] for x in ctx.config['tasks']]
    need_ansible = 'kernel' in all_tasks and 'ansible.cephlab' not in all_tasks
    ansible_hosts = set()
    # NOTE(review): nothing is spawned into this parallel() group; it is kept
    # so that entering/leaving it behaves exactly as before.
    with parallel():
        editinfo = os.path.join(os.path.dirname(__file__), 'edit_sudoers.sh')
        for rem in ctx.cluster.remotes.iterkeys():
            if not misc.is_vm(rem.shortname):
                continue
            ansible_hosts.add(rem.shortname)
            r = rem.run(args=['test', '-e', '/ceph-qa-ready'],
                        stdout=StringIO(),
                        check_status=False)
            if r.returncode == 0:
                # Marker file exists, nothing to do for this host.
                continue
            p1 = subprocess.Popen(['cat', editinfo],
                                  stdout=subprocess.PIPE)
            p2 = subprocess.Popen(
                [
                    'ssh', '-o', 'StrictHostKeyChecking=no', '-t', '-t',
                    str(rem), 'sudo', 'sh'
                ],
                stdin=p1.stdout,
                stdout=subprocess.PIPE,
                # Without this, communicate() always returns None for stderr
                # and a failure could never be reported.
                stderr=subprocess.PIPE)
            # Drop our copy of the pipe so cat gets SIGPIPE if ssh exits early.
            p1.stdout.close()
            out, err = p2.communicate()
            p1.wait()
            # The exit status is the reliable failure signal; stderr alone may
            # hold harmless ssh client warnings.
            if p2.returncode != 0:
                log.error("Edit of /etc/sudoers failed: %s", err or out)
    if need_ansible and ansible_hosts:
        log.info("Running ansible on %s", list(ansible_hosts))
        ansible_config = dict(hosts=list(ansible_hosts), )
        with ansible.CephLab(ctx, config=ansible_config):
            pass
Ejemplo n.º 2
0
def vm_setup(ctx, config):
    """
    Look for virtual machines and handle their initialization.

    Every remote in ``ctx.cluster.remotes`` whose shortname ``misc.is_vm``
    reports as a VM is probed for ``/ceph-qa-ready``.  When the probe fails,
    the local ``edit_sudoers.sh`` script is piped to ``sudo sh`` on that host
    over ssh and, if the job has a ``chef`` or ``kernel`` task,
    ``_download_and_run_chef`` is spawned for the host in the parallel group.

    :param ctx:    Run context; ``ctx.config['tasks']`` and
                   ``ctx.cluster.remotes`` are read.
    :param config: Not used by this function.
    """
    all_tasks = [x.keys()[0] for x in ctx.config['tasks']]
    need_chef = 'chef' in all_tasks or 'kernel' in all_tasks
    with parallel() as p:
        editinfo = os.path.join(os.path.dirname(__file__), 'edit_sudoers.sh')
        for rem in ctx.cluster.remotes.iterkeys():
            mname = rem.shortname
            if not misc.is_vm(mname):
                continue
            r = rem.run(args=['test', '-e', '/ceph-qa-ready'],
                        stdout=StringIO(),
                        check_status=False)
            if r.returncode == 0:
                # Marker file exists, nothing to do for this host.
                continue
            p1 = subprocess.Popen(['cat', editinfo], stdout=subprocess.PIPE)
            p2 = subprocess.Popen(
                [
                    'ssh',
                    '-o', 'StrictHostKeyChecking=no',
                    '-t', '-t',
                    str(rem),
                    'sudo',
                    'sh'
                ],
                stdin=p1.stdout,
                stdout=subprocess.PIPE,
                # Without this, communicate() always returns None for stderr
                # and a failure could never be reported.
                stderr=subprocess.PIPE,
            )
            # Drop our copy of the pipe so cat gets SIGPIPE if ssh exits early.
            p1.stdout.close()
            out, err = p2.communicate()
            p1.wait()
            # The exit status is the reliable failure signal; stderr alone may
            # hold harmless ssh client warnings.
            if p2.returncode != 0:
                log.info("Edit of /etc/sudoers failed: %s", err or out)
            # Chef is started even when the sudoers edit failed, as before.
            if need_chef:
                p.spawn(_download_and_run_chef, rem)
Ejemplo n.º 3
0
def connect(ctx, config):
    """
    Open a connection to every target and register the remotes in a cluster.

    A ``remote.Remote`` is created for each entry of ``ctx.config["targets"]``
    (host name -> host key).  Host keys are dropped (``None``) for all targets
    when ``ctx.config["sshkeys"]`` is ``"ignore"`` or when any target is a VM.
    The remotes are then added to a fresh ``cluster.Cluster`` stored in
    ``ctx.cluster``: paired with ``ctx.config["roles"]`` when present,
    otherwise each under its own name.

    :param ctx:    Run context; ``ctx.config`` is read, ``ctx.cluster`` is set.
    :param config: Not used by this function.
    """
    log.info("Opening connections...")
    from ..orchestra import connection, remote
    from ..orchestra import cluster

    targets = ctx.config["targets"]

    # Both conditions below are the same for every target, so evaluate them
    # once instead of once per target.
    try:
        drop_keys = ctx.config["sshkeys"] == "ignore"
    except (AttributeError, KeyError):
        drop_keys = False
    if not drop_keys:
        # NOTE(review): a single VM among the targets disables host-key
        # checking for every target; preserved as-is -- confirm intended.
        drop_keys = any(teuthology.is_vm(machine)
                        for machine in targets.iterkeys())

    remotes = []
    for t, key in targets.iteritems():
        log.debug("connecting to %s", t)
        if drop_keys:
            key = None
        remotes.append(
            remote.Remote(
                name=t,
                ssh=connection.connect(user_at_host=t, host_key=key,
                                       keep_alive=True),
                console=None,
            )
        )
    ctx.cluster = cluster.Cluster()
    if "roles" in ctx.config:
        # Remotes and role lists are matched purely by position.
        for rem, roles in zip(remotes, ctx.config["roles"]):
            assert all(isinstance(role, str) for role in roles), "Roles in config must be strings: %r" % roles
            ctx.cluster.add(rem, roles)
            log.info("roles: %s - %s", rem, roles)
    else:
        for rem in remotes:
            ctx.cluster.add(rem, rem.name)
Ejemplo n.º 4
0
def connect(ctx, config):
    """
    Open a connection to every target and register the remotes in a cluster.

    A ``remote.Remote`` is created for each entry of ``ctx.config['targets']``
    (host name -> host key).  The host key is dropped (``None``) when
    ``ctx.config['sshkeys']`` is ``'ignore'``, or when the key is an
    ``ssh-rsa``/``ssh-dss`` key and the target is a VM.  The remotes are then
    added to a fresh ``cluster.Cluster`` stored in ``ctx.cluster``: paired with
    ``ctx.config['roles']`` when present, otherwise each under its own name.

    :param ctx:    Run context; ``ctx.config`` is read, ``ctx.cluster`` is set.
    :param config: Not used by this function.
    """
    log.info('Opening connections...')
    from ..orchestra import remote
    from ..orchestra import cluster
    remotes = []
    for t, key in ctx.config['targets'].iteritems():
        log.debug('connecting to %s', t)
        try:
            if ctx.config['sshkeys'] == 'ignore':
                key = None
        except (AttributeError, KeyError):
            pass
        # key is None once keys are ignored (or if the target has no key);
        # calling startswith() on it would raise AttributeError.
        if key is not None and key.startswith(('ssh-rsa ', 'ssh-dss ')):
            if teuthology.is_vm(t):
                key = None
        remotes.append(
            remote.Remote(name=t, host_key=key, keep_alive=True, console=None))
    ctx.cluster = cluster.Cluster()
    if 'roles' in ctx.config:
        # Remotes and role lists are matched purely by position.
        for rem, roles in zip(remotes, ctx.config['roles']):
            assert all(isinstance(role, str) for role in roles), \
                "Roles in config must be strings: %r" % roles
            ctx.cluster.add(rem, roles)
            log.info('roles: %s - %s' % (rem, roles))
    else:
        for rem in remotes:
            ctx.cluster.add(rem, rem.name)
Ejemplo n.º 5
0
def vm_setup(ctx, config):
    """
    Look for virtual machines and handle their initialization.

    The machine name of every remote in ``ctx.cluster.remotes`` is taken from
    ``str(remote)`` (the text after ``@`` up to the first dot).  Remotes that
    ``teuthology.is_vm`` reports as VMs are probed for ``/ceph-qa-ready``;
    when the probe fails, the local ``edit_sudoers.sh`` script is piped to
    ``sudo sh`` on that host over ssh and ``_handle_vm_init`` is spawned for
    the remote in the parallel group.

    :param ctx:    Run context; ``ctx.cluster.remotes`` is read.
    :param config: Not used by this function.
    """
    with parallel() as p:
        editinfo = os.path.join(os.path.dirname(__file__), 'edit_sudoers.sh')
        for remote in ctx.cluster.remotes.iterkeys():
            # Raw string: same pattern, without relying on unknown string
            # escapes being passed through.
            mname = re.match(r".*@([^\.]*)\.?.*", str(remote)).group(1)
            if not teuthology.is_vm(mname):
                continue
            r = remote.run(
                args=[
                    'test',
                    '-e',
                    '/ceph-qa-ready',
                ],
                stdout=StringIO(),
                check_status=False,
            )
            if r.exitstatus == 0:
                # Marker file exists, nothing to do for this host.
                continue
            p1 = subprocess.Popen(['cat', editinfo],
                                  stdout=subprocess.PIPE)
            p2 = subprocess.Popen(
                ['ssh', '-t', '-t',
                 str(remote), 'sudo', 'sh'],
                stdin=p1.stdout,
                stdout=subprocess.PIPE,
                # Without this, communicate() always returns None for stderr
                # and a failure could never be reported.
                stderr=subprocess.PIPE)
            # Drop our copy of the pipe so cat gets SIGPIPE if ssh exits early.
            p1.stdout.close()
            out, err = p2.communicate()
            p1.wait()
            # The exit status is the reliable failure signal; stderr alone may
            # hold harmless ssh client warnings.
            if p2.returncode != 0:
                log.info("Edit of /etc/sudoers failed: %s", err or out)
            # VM init is started even when the sudoers edit failed, as before.
            p.spawn(_handle_vm_init, remote)
Ejemplo n.º 6
0
def connect(ctx, config):
    """
    Open a connection to every target and register the remotes in a cluster.

    A ``remote.Remote`` is created for each entry of ``ctx.config['targets']``
    (host name -> host key).  The host key is dropped (``None``) when
    ``ctx.config['sshkeys']`` is ``'ignore'``, or when the key is an
    ``ssh-rsa``/``ssh-dss`` key and the target is a VM.  The remotes are then
    added to a fresh ``cluster.Cluster`` stored in ``ctx.cluster``: paired with
    ``ctx.config['roles']`` when present, otherwise each under its own name.

    :param ctx:    Run context; ``ctx.config`` is read, ``ctx.cluster`` is set.
    :param config: Not used by this function.
    """
    log.info('Opening connections...')
    from ..orchestra import remote
    from ..orchestra import cluster
    remotes = []
    for t, key in ctx.config['targets'].iteritems():
        log.debug('connecting to %s', t)
        try:
            if ctx.config['sshkeys'] == 'ignore':
                key = None
        except (AttributeError, KeyError):
            pass
        # key is None once keys are ignored (or if the target has no key);
        # calling startswith() on it would raise AttributeError.
        if key is not None and key.startswith(('ssh-rsa ', 'ssh-dss ')):
            if teuthology.is_vm(t):
                key = None
        remotes.append(
            remote.Remote(name=t, host_key=key, keep_alive=True, console=None))
    ctx.cluster = cluster.Cluster()
    if 'roles' in ctx.config:
        # Remotes and role lists are matched purely by position.
        for rem, roles in zip(remotes, ctx.config['roles']):
            assert all(isinstance(role, str) for role in roles), \
                "Roles in config must be strings: %r" % roles
            ctx.cluster.add(rem, roles)
            log.info('roles: %s - %s' % (rem, roles))
    else:
        for rem in remotes:
            ctx.cluster.add(rem, rem.name)
Ejemplo n.º 7
0
def vm_setup(ctx, config):
    """
    Look for virtual machines and handle their initialization.

    Every remote in ``ctx.cluster.remotes`` whose shortname ``misc.is_vm``
    reports as a VM is probed for ``/ceph-qa-ready``.  When the probe fails,
    the local ``edit_sudoers.sh`` script is piped to ``sudo sh`` on that host
    over ssh.  Finally, if the job has a ``kernel`` task but no
    ``ansible.cephlab`` task, ``ansible.CephLab`` is run against the VMs.

    :param ctx:    Run context; ``ctx.config["tasks"]`` and
                   ``ctx.cluster.remotes`` are read.
    :param config: Not used by this function.
    """
    all_tasks = [x.keys()[0] for x in ctx.config["tasks"]]
    need_ansible = "kernel" in all_tasks and "ansible.cephlab" not in all_tasks
    ansible_hosts = set()
    # NOTE(review): nothing is spawned into this parallel() group; it is kept
    # so that entering/leaving it behaves exactly as before.
    with parallel():
        editinfo = os.path.join(os.path.dirname(__file__), "edit_sudoers.sh")
        for rem in ctx.cluster.remotes.iterkeys():
            if not misc.is_vm(rem.shortname):
                continue
            ansible_hosts.add(rem.shortname)
            r = rem.run(args=["test", "-e", "/ceph-qa-ready"], stdout=StringIO(), check_status=False)
            if r.returncode == 0:
                # Marker file exists, nothing to do for this host.
                continue
            p1 = subprocess.Popen(["cat", editinfo], stdout=subprocess.PIPE)
            p2 = subprocess.Popen(
                ["ssh", "-o", "StrictHostKeyChecking=no", "-t", "-t", str(rem), "sudo", "sh"],
                stdin=p1.stdout,
                stdout=subprocess.PIPE,
                # Without this, communicate() always returns None for stderr
                # and a failure could never be reported.
                stderr=subprocess.PIPE,
            )
            # Drop our copy of the pipe so cat gets SIGPIPE if ssh exits early.
            p1.stdout.close()
            out, err = p2.communicate()
            p1.wait()
            # The exit status is the reliable failure signal; stderr alone may
            # hold harmless ssh client warnings.
            if p2.returncode != 0:
                log.error("Edit of /etc/sudoers failed: %s", err or out)
    if need_ansible and ansible_hosts:
        log.info("Running ansible on %s", list(ansible_hosts))
        ansible_config = dict(hosts=list(ansible_hosts))
        with ansible.CephLab(ctx, config=ansible_config):
            pass
Ejemplo n.º 8
0
def getRemoteConsole(name, ipmiuser=None, ipmipass=None, ipmidomain=None,
                     logfile=None, timeout=20):
    """
    Build the console object for the host called ``name``.

    Hosts that ``misc.is_vm`` reports as virtual get a
    ``console.VirtualConsole`` built from the name alone; every other host
    gets a ``console.PhysicalConsole`` built from all of the arguments.
    """
    if not misc.is_vm(name):
        # Physical hardware: hand over the IPMI settings, logfile and timeout.
        return console.PhysicalConsole(
            name, ipmiuser, ipmipass, ipmidomain, logfile, timeout)
    return console.VirtualConsole(name)
Ejemplo n.º 9
0
def getRemoteConsole(name, ipmiuser, ipmipass, ipmidomain, logfile=None,
                     timeout=20):
    """
    Build the console object for the host called ``name``.

    ``VirtualConsole`` is used when ``misc.is_vm`` reports the host as a VM,
    ``PhysicalConsole`` otherwise; both receive the same arguments.
    """
    # Pick the class first; the constructor arguments are identical.
    console_cls = VirtualConsole if misc.is_vm(name) else PhysicalConsole
    return console_cls(name, ipmiuser, ipmipass, ipmidomain, logfile, timeout)
Ejemplo n.º 10
0
def getRemoteConsole(name,
                     ipmiuser,
                     ipmipass,
                     ipmidomain,
                     logfile=None,
                     timeout=20):
    """
    Build the console object for the host called ``name``.

    Returns a ``VirtualConsole`` when ``misc.is_vm`` reports the host as a VM
    and a ``PhysicalConsole`` otherwise; both receive the same arguments.
    """
    console_args = (name, ipmiuser, ipmipass, ipmidomain, logfile, timeout)
    if not misc.is_vm(name):
        return PhysicalConsole(*console_args)
    return VirtualConsole(*console_args)
Ejemplo n.º 11
0
    def chcon(self, file_path, context):
        """
        Apply an SELinux context to a file by running ``sudo chcon``.

        Nothing is run on hosts whose package type is not ``rpm`` or that are
        VMs, because ours currently have SELinux disabled.

        :param file_path: The path to the file
        :param context:   The SELinux context to be used
        """
        # The package type is checked first; the VM lookup only happens for
        # RPM-based hosts.
        if self.os.package_type != 'rpm' or misc.is_vm(self.shortname):
            return
        command = "sudo chcon {con} {path}".format(con=context, path=file_path)
        self.run(args=command)
Ejemplo n.º 12
0
def vm_setup(ctx, config):
    """
    Look for virtual machines and handle their initialization.

    The machine name of every remote in ``ctx.cluster.remotes`` is taken from
    ``str(remote)`` (the text after ``@`` up to the first dot).  Remotes that
    ``teuthology.is_vm`` reports as VMs are probed for ``/ceph-qa-ready``;
    when the probe fails, the local ``edit_sudoers.sh`` script is piped to
    ``sudo sh`` on that host over ssh and ``_handle_vm_init`` is spawned for
    the remote in the parallel group.

    :param ctx:    Run context; ``ctx.cluster.remotes`` is read.
    :param config: Not used by this function.
    """
    with parallel() as p:
        editinfo = os.path.join(os.path.dirname(__file__), 'edit_sudoers.sh')
        for remote in ctx.cluster.remotes.iterkeys():
            # Raw string: same pattern, without relying on unknown string
            # escapes being passed through.
            mname = re.match(r".*@([^\.]*)\.?.*", str(remote)).group(1)
            if not teuthology.is_vm(mname):
                continue
            r = remote.run(args=['test', '-e', '/ceph-qa-ready'],
                           stdout=StringIO(),
                           check_status=False)
            if r.exitstatus == 0:
                # Marker file exists, nothing to do for this host.
                continue
            p1 = subprocess.Popen(['cat', editinfo], stdout=subprocess.PIPE)
            p2 = subprocess.Popen(
                ['ssh', '-t', '-t', str(remote), 'sudo', 'sh'],
                stdin=p1.stdout,
                stdout=subprocess.PIPE,
                # Without this, communicate() always returns None for stderr
                # and a failure could never be reported.
                stderr=subprocess.PIPE,
            )
            # Drop our copy of the pipe so cat gets SIGPIPE if ssh exits early.
            p1.stdout.close()
            out, err = p2.communicate()
            p1.wait()
            # The exit status is the reliable failure signal; stderr alone may
            # hold harmless ssh client warnings.
            if p2.returncode != 0:
                log.info("Edit of /etc/sudoers failed: %s", err or out)
            # VM init is started even when the sudoers edit failed, as before.
            p.spawn(_handle_vm_init, remote)
Ejemplo n.º 13
0
def vm_setup(ctx, config):
    """
    Look for virtual machines and handle their initialization.

    Every remote in ``ctx.cluster.remotes`` whose shortname ``misc.is_vm``
    reports as a VM is probed for ``/ceph-qa-ready``.  When the probe fails,
    the local ``edit_sudoers.sh`` script is piped to ``sudo sh`` on that host
    over ssh and, if the job has a ``chef`` or ``kernel`` task,
    ``_download_and_run_chef`` is spawned for the host in the parallel group.

    :param ctx:    Run context; ``ctx.config['tasks']`` and
                   ``ctx.cluster.remotes`` are read.
    :param config: Not used by this function.
    """
    all_tasks = [x.keys()[0] for x in ctx.config['tasks']]
    need_chef = 'chef' in all_tasks or 'kernel' in all_tasks
    with parallel() as p:
        editinfo = os.path.join(os.path.dirname(__file__), 'edit_sudoers.sh')
        for rem in ctx.cluster.remotes.iterkeys():
            mname = rem.shortname
            if not misc.is_vm(mname):
                continue
            r = rem.run(
                args=[
                    'test',
                    '-e',
                    '/ceph-qa-ready',
                ],
                stdout=StringIO(),
                check_status=False,
            )
            if r.returncode == 0:
                # Marker file exists, nothing to do for this host.
                continue
            p1 = subprocess.Popen(['cat', editinfo],
                                  stdout=subprocess.PIPE)
            p2 = subprocess.Popen(
                [
                    'ssh', '-o', 'StrictHostKeyChecking=no', '-t', '-t',
                    str(rem), 'sudo', 'sh'
                ],
                stdin=p1.stdout,
                stdout=subprocess.PIPE,
                # Without this, communicate() always returns None for stderr
                # and a failure could never be reported.
                stderr=subprocess.PIPE)
            # Drop our copy of the pipe so cat gets SIGPIPE if ssh exits early.
            p1.stdout.close()
            out, err = p2.communicate()
            p1.wait()
            # The exit status is the reliable failure signal; stderr alone may
            # hold harmless ssh client warnings.
            if p2.returncode != 0:
                log.info("Edit of /etc/sudoers failed: %s", err or out)
            # Chef is started even when the sudoers edit failed, as before.
            if need_chef:
                p.spawn(_download_and_run_chef, rem)
Ejemplo n.º 14
0
def lock_machines(ctx, config):
    """
    Lock machines.  Called when the teuthology run finds and locks
    new machines.  This is not called if the one has teuthology-locked
    machines and placed those keys in the Targets section of a yaml file.

    The function is a generator with a single ``yield``: everything before
    the yield locks the machines and stores them in ``ctx.config['targets']``;
    the ``finally`` block after it unlocks them again when allowed.
    (Presumably wrapped as a context manager outside this block -- confirm.)

    :param ctx:    Run context; ``ctx.block``, ``ctx.owner``, ``ctx.archive``,
                   ``ctx.config`` and ``ctx.summary`` are read.
    :param config: Sequence where ``config[0]`` is the number of machines to
                   lock (must be an int) and ``config[1]`` is the machine type.
    """
    log.info('Locking machines...')
    assert isinstance(config[0], int), 'config[0] must be an integer'
    machine_type = config[1]
    machine_types = teuthology.get_multi_machine_types(machine_type)
    how_many = config[0]

    # Retry loop: left via ``break`` once exactly how_many machines are locked,
    # or via a failed assert when ctx.block is false.
    while True:
        # make sure there are enough machines up
        machines = lock.list_locks()
        if machines is None:
            # Listing failed: retry after 20s when blocking, otherwise abort.
            if ctx.block:
                log.warn('error listing machines, trying again')
                time.sleep(20)
                continue
            else:
                assert 0, 'error listing machines'

        is_up = lambda machine: machine['up'] and machine['type'] in machine_types  # noqa
        # NOTE: len(filter(...)) relies on filter() returning a list.
        num_up = len(filter(is_up, machines))
        assert num_up >= how_many, 'not enough machines are up'

        # make sure there are machines for non-automated jobs to run
        is_up_and_free = lambda machine: machine['up'] and machine['locked'] == 0 and machine['type'] in machine_types  # noqa
        up_and_free = filter(is_up_and_free, machines)
        num_free = len(up_and_free)
        # Owners starting with 'scheduled' only proceed when at least 6
        # machines are up and free.
        if num_free < 6 and ctx.owner.startswith('scheduled'):
            if ctx.block:
                log.info(
                    'waiting for more machines to be free (need %s see %s)...',
                    how_many,
                    num_free,
                )
                time.sleep(10)
                continue
            else:
                assert 0, 'not enough machines free'

        newly_locked = lock.lock_many(ctx, how_many, machine_type, ctx.owner,
                                      ctx.archive)
        if len(newly_locked) == how_many:
            # Collect the virtual machines among the newly locked hosts.
            vmlist = []
            for lmach in newly_locked:
                if teuthology.is_vm(lmach):
                    vmlist.append(lmach)
            if vmlist:
                log.info('Waiting for virtual machines to come up')
                keyscan_out = ''
                loopcount = 0
                # Poll every 10s until the keyscan output has one line per VM.
                while len(keyscan_out.splitlines()) != len(vmlist):
                    loopcount += 1
                    time.sleep(10)
                    keyscan_out, current_locks = lock.keyscan_check(ctx,
                                                                    vmlist)
                    log.info('virtual machine is still unavailable')
                    # After 40 polls, recreate the guests missing from the
                    # keyscan output and start counting again.
                    if loopcount == 40:
                        loopcount = 0
                        log.info('virtual machine(s) still not up, ' +
                                 'recreating unresponsive ones.')
                        for guest in vmlist:
                            if guest not in keyscan_out:
                                log.info('recreating: ' + guest)
                                lock.destroy_if_vm(ctx, 'ubuntu@' + guest)
                                lock.create_if_vm(ctx, 'ubuntu@' + guest)
                # A truthy return from update_keys is only logged, not fatal.
                if lock.update_keys(ctx, keyscan_out, current_locks):
                    log.info("Error in virtual machine keys")
                # Rebuild the targets mapping from each machine's current
                # lock status ('sshpubkey' field).
                newscandict = {}
                for dkey in newly_locked.iterkeys():
                    stats = lockstatus.get_status(ctx, dkey)
                    newscandict[dkey] = stats['sshpubkey']
                ctx.config['targets'] = newscandict
            else:
                ctx.config['targets'] = newly_locked
            # FIXME: Ugh.
            log.info('\n  '.join(['Locked targets:', ] + yaml.safe_dump(ctx.config['targets'], default_flow_style=False).splitlines()))
            break
        elif not ctx.block:
            assert 0, 'not enough machines are available'

        log.warn('Could not lock enough machines, waiting...')
        time.sleep(10)
    try:
        yield
    finally:
        # Unlock when the run succeeded or when unlock_on_failure is set.
        if ctx.config.get('unlock_on_failure', False) or \
           ctx.summary.get('success', False):
            log.info('Unlocking machines...')
            for machine in ctx.config['targets'].iterkeys():
                lock.unlock_one(ctx, machine, ctx.owner)
Ejemplo n.º 15
0
def lock_machines(ctx, config):
    """
    Lock machines.  Called when the teuthology run finds and locks
    new machines.  This is not called if the one has teuthology-locked
    machines and placed those keys in the Targets section of a yaml file.

    The function is a generator with a single ``yield``: everything before
    the yield locks the machines and stores them in ``ctx.config['targets']``;
    the ``finally`` block after it unlocks them again when allowed.
    (Presumably wrapped as a context manager outside this block -- confirm.)

    :param ctx:    Run context; ``ctx.block``, ``ctx.owner``, ``ctx.archive``,
                   ``ctx.config`` and ``ctx.summary`` are read.
    :param config: Sequence where ``config[0]`` is the number of machines to
                   lock (must be an int) and ``config[1]`` is the machine type.
    :raises RuntimeError: if listing machines fails while not blocking, or if
                          ``lock.lock_many`` returns a falsy non-list value.
    """
    log.info('Locking machines...')
    assert isinstance(config[0], int), 'config[0] must be an integer'
    machine_type = config[1]
    how_many = config[0]
    # We want to make sure there are always this many machines available
    to_reserve = 5

    # Retry loop: left via ``break`` once exactly how_many machines are locked,
    # or via an exception/failed assert.
    while True:
        # get a candidate list of machines
        machines = lock.list_locks(machine_type=machine_type, up=True,
                                   locked=False, count=how_many + to_reserve)
        if machines is None:
            # Listing failed: retry after 20s when blocking, otherwise abort.
            if ctx.block:
                log.error('Error listing machines, trying again')
                time.sleep(20)
                continue
            else:
                raise RuntimeError('Error listing machines')

        # make sure there are machines for non-automated jobs to run
        if len(machines) <= to_reserve and ctx.owner.startswith('scheduled'):
            if ctx.block:
                log.info(
                    'waiting for more machines to be free (need %s see %s)...',
                    how_many,
                    len(machines),
                )
                time.sleep(10)
                continue
            else:
                assert 0, 'not enough machines free'

        newly_locked = lock.lock_many(ctx, how_many, machine_type, ctx.owner,
                                      ctx.archive)
        # A falsy result that is not a list is treated as an error; an empty
        # list falls through to the "not enough machines" handling below.
        if not newly_locked and not isinstance(newly_locked, list):
            raise RuntimeError('Invalid parameters specified')
        if len(newly_locked) == how_many:
            # Collect the virtual machines among the newly locked hosts.
            vmlist = []
            for lmach in newly_locked:
                if misc.is_vm(lmach):
                    vmlist.append(lmach)
            if vmlist:
                log.info('Waiting for virtual machines to come up')
                keys_dict = dict()
                loopcount = 0
                # Poll every 10s until the keyscan has an entry for every VM.
                while len(keys_dict) != len(vmlist):
                    loopcount += 1
                    time.sleep(10)
                    keys_dict = lock.ssh_keyscan(vmlist)
                    log.info('virtual machine is still unavailable')
                    # After 40 polls, recreate the guests missing from the
                    # keyscan result and start counting again.
                    if loopcount == 40:
                        loopcount = 0
                        log.info('virtual machine(s) still not up, ' +
                                 'recreating unresponsive ones.')
                        for guest in vmlist:
                            if guest not in keys_dict.keys():
                                log.info('recreating: ' + guest)
                                full_name = misc.canonicalize_hostname(guest)
                                provision.destroy_if_vm(ctx, full_name)
                                provision.create_if_vm(ctx, full_name)
                # A truthy return from do_update_keys is only logged, not fatal.
                if lock.do_update_keys(keys_dict):
                    log.info("Error in virtual machine keys")
                # Rebuild the targets mapping from each machine's current
                # lock status ('ssh_pub_key' field).
                newscandict = {}
                for dkey in newly_locked.iterkeys():
                    stats = lockstatus.get_status(dkey)
                    newscandict[dkey] = stats['ssh_pub_key']
                ctx.config['targets'] = newscandict
            else:
                ctx.config['targets'] = newly_locked
            # FIXME: Ugh.
            log.info('\n  '.join(['Locked targets:', ] + yaml.safe_dump(ctx.config['targets'], default_flow_style=False).splitlines()))
            break
        elif not ctx.block:
            assert 0, 'not enough machines are available'

        log.warn('Could not lock enough machines, waiting...')
        time.sleep(10)
    try:
        yield
    finally:
        # Unlock when the run succeeded or when unlock_on_failure is set.
        if ctx.config.get('unlock_on_failure', False) or \
           ctx.summary.get('success', False):
            log.info('Unlocking machines...')
            for machine in ctx.config['targets'].iterkeys():
                lock.unlock_one(ctx, machine, ctx.owner)
Ejemplo n.º 16
0
def lock_machines(ctx, config):
    """
    Lock machines.  Called when the teuthology run finds and locks
    new machines.  This is not called if the one has teuthology-locked
    machines and placed those keys in the Targets section of a yaml file.

    The function is a generator with a single ``yield``: everything before
    the yield locks the machines and stores them in ``ctx.config['targets']``;
    the ``finally`` block after it unlocks them again when allowed.
    (Presumably wrapped as a context manager outside this block -- confirm.)

    Machines locked by an attempt that did not get enough hosts are kept and
    counted towards the total, so they end up in ``ctx.config['targets']``
    (and are unlocked at the end) together with the ones locked later.

    :param ctx:    Run context; ``ctx.block``, ``ctx.owner``, ``ctx.archive``,
                   ``ctx.config`` and ``ctx.summary`` are read.
    :param config: Sequence where ``config[0]`` is the number of machines to
                   lock (must be an int) and ``config[1]`` is the machine type.
    :raises RuntimeError: if listing machines fails while not blocking, or if
                          ``lock.lock_many`` returns a falsy non-list value.
    """
    # It's OK for os_type and os_version to be None here.  If we're trying
    # to lock a bare metal machine, we'll take whatever is available.  If
    # we want a vps, defaults will be provided by misc.get_distro and
    # misc.get_distro_version in provision.create_if_vm
    os_type = ctx.config.get("os_type")
    os_version = ctx.config.get("os_version")
    arch = ctx.config.get('arch')
    log.info('Locking machines...')
    assert isinstance(config[0], int), 'config[0] must be an integer'
    machine_type = config[1]
    total_requested = config[0]
    # how_many is the number still missing; it shrinks after partial locks.
    how_many = total_requested
    # We want to make sure there are always this many machines available
    to_reserve = teuth_config.reserve_machines
    assert isinstance(to_reserve, int), 'reserve_machines must be integer'
    assert (to_reserve >= 0), 'reserve_machines should >= 0'

    # change the status during the locking process
    report.try_push_job_info(ctx.config, dict(status='waiting'))

    # Everything locked so far, across all attempts.
    all_locked = dict()
    while True:
        # get a candidate list of machines
        machines = lock.list_locks(machine_type=machine_type, up=True,
                                   locked=False, count=how_many + to_reserve)
        if machines is None:
            # Listing failed: retry after 20s when blocking, otherwise abort.
            if ctx.block:
                log.error('Error listing machines, trying again')
                time.sleep(20)
                continue
            else:
                raise RuntimeError('Error listing machines')

        # make sure there are machines for non-automated jobs to run
        if len(machines) < to_reserve + how_many and ctx.owner.startswith('scheduled'):
            if ctx.block:
                log.info(
                    'waiting for more machines to be free (need %s + %s, have %s)...',
                    to_reserve,
                    how_many,
                    len(machines),
                )
                time.sleep(10)
                continue
            else:
                assert 0, ('not enough machines free; need %s + %s, have %s' %
                           (to_reserve, how_many, len(machines)))

        newly_locked = lock.lock_many(ctx, how_many, machine_type, ctx.owner,
                                      ctx.archive, os_type, os_version, arch)
        # A falsy result that is not a list is treated as an error; an empty
        # list falls through to the "not enough machines" handling below.
        if not newly_locked and not isinstance(newly_locked, list):
            raise RuntimeError('Invalid parameters specified')
        # Remember partial results so machines locked by an earlier attempt
        # are not dropped from the final target list.
        all_locked.update(newly_locked)
        if len(all_locked) == total_requested:
            vmlist = [lmach for lmach in all_locked if misc.is_vm(lmach)]
            if vmlist:
                log.info('Waiting for virtual machines to come up')
                keys_dict = dict()
                loopcount = 0
                # Poll every 10s until the keyscan has an entry for every VM.
                while len(keys_dict) != len(vmlist):
                    loopcount += 1
                    time.sleep(10)
                    keys_dict = lock.ssh_keyscan(vmlist)
                    log.info('virtual machine is still unavailable')
                    # After 40 polls, recreate the guests missing from the
                    # keyscan result and start counting again.
                    if loopcount == 40:
                        loopcount = 0
                        log.info('virtual machine(s) still not up, ' +
                                 'recreating unresponsive ones.')
                        for guest in vmlist:
                            if guest not in keys_dict.keys():
                                log.info('recreating: ' + guest)
                                full_name = misc.canonicalize_hostname(guest)
                                provision.destroy_if_vm(ctx, full_name)
                                provision.create_if_vm(ctx, full_name)
                # A truthy return from do_update_keys is only logged, not fatal.
                if lock.do_update_keys(keys_dict):
                    log.info("Error in virtual machine keys")
                # Rebuild the targets mapping from each machine's current
                # lock status ('ssh_pub_key' field).
                newscandict = {}
                for dkey in all_locked.iterkeys():
                    stats = lockstatus.get_status(dkey)
                    newscandict[dkey] = stats['ssh_pub_key']
                ctx.config['targets'] = newscandict
            else:
                ctx.config['targets'] = all_locked
            locked_targets = yaml.safe_dump(
                ctx.config['targets'],
                default_flow_style=False
            ).splitlines()
            log.info('\n  '.join(['Locked targets:', ] + locked_targets))
            # successfully locked machines, change status back to running
            report.try_push_job_info(ctx.config, dict(status='running'))
            break
        elif not ctx.block:
            assert 0, 'not enough machines are available'
        else:
            # Only ask for what is still missing on the next attempt.
            how_many = total_requested - len(all_locked)
            assert how_many > 0, "lock_machines: how_many counter went " \
                                 "negative, this shouldn't happen"

        log.warn('Could not lock enough machines, waiting...')
        time.sleep(10)
    try:
        yield
    finally:
        # If both unlock_on_failure and nuke-on-error are set, don't unlock now
        # because we're just going to nuke (and unlock) later.
        unlock_on_failure = (
            ctx.config.get('unlock_on_failure', False)
            and not ctx.config.get('nuke-on-error', False)
        )
        if get_status(ctx.summary) == 'pass' or unlock_on_failure:
            log.info('Unlocking machines...')
            for machine in ctx.config['targets'].iterkeys():
                lock.unlock_one(ctx, machine, ctx.owner, ctx.archive)
Ejemplo n.º 17
0
def lock_machines(ctx, config):
    """
    Lock machines.  Called when the teuthology run finds and locks
    new machines.  This is not called if the one has teuthology-locked
    machines and placed those keys in the Targets section of a yaml file.

    The function is a generator with a single ``yield``: everything before
    the yield locks the machines and stores them in ``ctx.config['targets']``;
    the ``finally`` block after it unlocks them again when allowed.
    (Presumably wrapped as a context manager outside this block -- confirm.)

    Machines locked by an attempt that did not get enough hosts are kept in
    ``all_locked`` and only the missing number is requested on the next try.

    :param ctx:    Run context; ``ctx.block``, ``ctx.owner``, ``ctx.archive``,
                   ``ctx.config`` and ``ctx.summary`` are read.
    :param config: Sequence where ``config[0]`` is the number of machines to
                   lock (must be an int) and ``config[1]`` is the machine type.
    :raises RuntimeError: if listing machines fails while not blocking.
    """
    # It's OK for os_type and os_version to be None here.  If we're trying
    # to lock a bare metal machine, we'll take whatever is available.  If
    # we want a vps, defaults will be provided by misc.get_distro and
    # misc.get_distro_version in provision.create_if_vm
    os_type = ctx.config.get("os_type")
    os_version = ctx.config.get("os_version")
    arch = ctx.config.get('arch')
    log.info('Locking machines...')
    assert isinstance(config[0], int), 'config[0] must be an integer'
    machine_type = config[1]
    total_requested = config[0]
    # We want to make sure there are always this many machines available
    reserved = teuth_config.reserve_machines
    assert isinstance(reserved, int), 'reserve_machines must be integer'
    assert (reserved >= 0), 'reserve_machines should >= 0'

    # change the status during the locking process
    report.try_push_job_info(ctx.config, dict(status='waiting'))

    # all_locked accumulates across attempts; requested is what is still
    # missing.
    all_locked = dict()
    requested = total_requested
    while True:
        # get a candidate list of machines
        machines = lock.list_locks(machine_type=machine_type, up=True,
                                   locked=False, count=requested + reserved)
        if machines is None:
            # Listing failed: retry after 20s when blocking, otherwise abort.
            if ctx.block:
                log.error('Error listing machines, trying again')
                time.sleep(20)
                continue
            else:
                raise RuntimeError('Error listing machines')

        # make sure there are machines for non-automated jobs to run
        if len(machines) < reserved + requested and ctx.owner.startswith('scheduled'):
            if ctx.block:
                log.info(
                    'waiting for more %s machines to be free (need %s + %s, have %s)...',
                    machine_type,
                    reserved,
                    requested,
                    len(machines),
                )
                time.sleep(10)
                continue
            else:
                assert 0, ('not enough machines free; need %s + %s, have %s' %
                           (reserved, requested, len(machines)))

        newly_locked = lock.lock_many(ctx, requested, machine_type, ctx.owner,
                                      ctx.archive, os_type, os_version, arch)
        all_locked.update(newly_locked)
        log.info(
            '{newly_locked} {mtype} machines locked this try, '
            '{total_locked}/{total_requested} locked so far'.format(
                newly_locked=len(newly_locked),
                mtype=machine_type,
                total_locked=len(all_locked),
                total_requested=total_requested,
            )
        )
        if len(all_locked) == total_requested:
            # Collect the virtual machines among all locked hosts.
            vmlist = []
            for lmach in all_locked:
                if misc.is_vm(lmach):
                    vmlist.append(lmach)
            if vmlist:
                log.info('Waiting for virtual machines to come up')
                keys_dict = dict()
                loopcount = 0
                # Poll every 10s until the keyscan has an entry for every VM.
                while len(keys_dict) != len(vmlist):
                    loopcount += 1
                    time.sleep(10)
                    keys_dict = misc.ssh_keyscan(vmlist)
                    log.info('virtual machine is still unavailable')
                    # After 40 polls, recreate the guests missing from the
                    # keyscan result and start counting again.
                    if loopcount == 40:
                        loopcount = 0
                        log.info('virtual machine(s) still not up, ' +
                                 'recreating unresponsive ones.')
                        for guest in vmlist:
                            if guest not in keys_dict.keys():
                                log.info('recreating: ' + guest)
                                full_name = misc.canonicalize_hostname(guest)
                                provision.destroy_if_vm(ctx, full_name)
                                provision.create_if_vm(ctx, full_name)
                # A truthy return from do_update_keys is only logged, not fatal.
                if lock.do_update_keys(keys_dict):
                    log.info("Error in virtual machine keys")
                # Rebuild the targets mapping from each machine's current
                # lock status ('ssh_pub_key' field).
                newscandict = {}
                for dkey in all_locked.iterkeys():
                    stats = lockstatus.get_status(dkey)
                    newscandict[dkey] = stats['ssh_pub_key']
                ctx.config['targets'] = newscandict
            else:
                ctx.config['targets'] = all_locked
            locked_targets = yaml.safe_dump(
                ctx.config['targets'],
                default_flow_style=False
            ).splitlines()
            log.info('\n  '.join(['Locked targets:', ] + locked_targets))
            # successfully locked machines, change status back to running
            report.try_push_job_info(ctx.config, dict(status='running'))
            break
        elif not ctx.block:
            assert 0, 'not enough machines are available'
        else:
            # Only ask for what is still missing on the next attempt.
            requested = requested - len(newly_locked)
            assert requested > 0, "lock_machines: requested counter went " \
                                  "negative, this shouldn't happen"

        log.info(
            "{total} machines locked ({new} new); need {more} more".format(
                total=len(all_locked), new=len(newly_locked), more=requested)
        )
        log.warn('Could not lock enough machines, waiting...')
        time.sleep(10)
    try:
        yield
    finally:
        # If both unlock_on_failure and nuke-on-error are set, don't unlock now
        # because we're just going to nuke (and unlock) later.
        unlock_on_failure = (
            ctx.config.get('unlock_on_failure', False)
            and not ctx.config.get('nuke-on-error', False)
        )
        if get_status(ctx.summary) == 'pass' or unlock_on_failure:
            log.info('Unlocking machines...')
            for machine in ctx.config['targets'].iterkeys():
                lock.unlock_one(ctx, machine, ctx.owner, ctx.archive)
Ejemplo n.º 18
0
def lock_machines(ctx, config):
    """
    Lock machines.  Called when the teuthology run finds and locks
    new machines.  This is not called if one has already teuthology-locked
    machines and placed those keys in the Targets section of a yaml file.

    This is a generator.  It loops until ``config[0]`` machines of type
    ``config[1]`` are locked, stores them in ``ctx.config['targets']`` and
    then yields once.  When the generator is resumed or closed, the machines
    are unlocked if the job status is 'pass', or if ``unlock_on_failure`` is
    set and ``nuke-on-error`` is not.

    :param ctx:    run context; ``ctx.config``, ``ctx.block``, ``ctx.owner``,
                   ``ctx.archive`` and ``ctx.summary`` are read here
    :param config: sequence whose first item is the number of machines to
                   lock (an int) and whose second item is the machine type
    :raises RuntimeError:   if listing machines fails and ``ctx.block`` is
                            false
    :raises AssertionError: on an invalid ``config[0]`` or
                            ``reserve_machines`` setting, or when not enough
                            machines are free and ``ctx.block`` is false
    """
    # It's OK for os_type and os_version to be None here.  If we're trying
    # to lock a bare metal machine, we'll take whatever is available.  If
    # we want a vps, defaults will be provided by misc.get_distro and
    # misc.get_distro_version in provision.create_if_vm
    os_type = ctx.config.get("os_type")
    os_version = ctx.config.get("os_version")
    arch = ctx.config.get("arch")
    log.info("Locking machines...")
    assert isinstance(config[0], int), "config[0] must be an integer"
    machine_type = config[1]
    total_requested = config[0]
    # We want to make sure there are always this many machines available
    reserved = teuth_config.reserve_machines
    assert isinstance(reserved, int), "reserve_machines must be integer"
    assert reserved >= 0, "reserve_machines should >= 0"

    # change the status during the locking process
    report.try_push_job_info(ctx.config, dict(status="waiting"))

    all_locked = dict()
    # Number of machines still missing; shrinks as partial locks succeed.
    requested = total_requested
    while True:
        # get a candidate list of machines
        machines = lock.list_locks(machine_type=machine_type, up=True,
                                   locked=False, count=requested + reserved)
        if machines is None:
            if ctx.block:
                log.error("Error listing machines, trying again")
                time.sleep(20)
                continue
            else:
                raise RuntimeError("Error listing machines")

        # make sure there are machines for non-automated jobs to run
        if len(machines) < reserved + requested and ctx.owner.startswith("scheduled"):
            if ctx.block:
                log.info(
                    "waiting for more %s machines to be free (need %s + %s, have %s)...",
                    machine_type,
                    reserved,
                    requested,
                    len(machines),
                )
                time.sleep(10)
                continue
            else:
                assert 0, "not enough machines free; need %s + %s, have %s" % (reserved, requested, len(machines))

        newly_locked = lock.lock_many(ctx, requested, machine_type, ctx.owner,
                                      ctx.archive, os_type, os_version, arch)
        all_locked.update(newly_locked)
        log.info(
            "{newly_locked} {mtype} machines locked this try, "
            "{total_locked}/{total_requested} locked so far".format(
                newly_locked=len(newly_locked),
                mtype=machine_type,
                total_locked=len(all_locked),
                total_requested=total_requested,
            )
        )
        if len(all_locked) == total_requested:
            vmlist = [name for name in all_locked if misc.is_vm(name)]
            if vmlist:
                log.info("Waiting for virtual machines to come up")
                loopcount = 0
                # vmlist is non-empty, so at least one scan is always needed.
                while True:
                    loopcount += 1
                    time.sleep(10)
                    keys_dict = misc.ssh_keyscan(vmlist)
                    if len(keys_dict) == len(vmlist):
                        break
                    # Only report unavailability when the scan really came
                    # back incomplete.
                    log.info("virtual machine is still unavailable")
                    if loopcount == 40:
                        # 40 polls of 10 seconds each have elapsed; rebuild
                        # the guests that never answered the key scan.
                        loopcount = 0
                        log.info("virtual machine(s) still not up, "
                                 "recreating unresponsive ones.")
                        for guest in vmlist:
                            if guest not in keys_dict:
                                log.info("recreating: " + guest)
                                full_name = misc.canonicalize_hostname(guest)
                                provision.destroy_if_vm(ctx, full_name)
                                provision.create_if_vm(ctx, full_name)
                if lock.do_update_keys(keys_dict):
                    log.info("Error in virtual machine keys")
                # Re-read the ssh public key of every locked machine from
                # its lock status after the key update above.
                newscandict = {}
                for dkey in all_locked:
                    stats = lockstatus.get_status(dkey)
                    newscandict[dkey] = stats["ssh_pub_key"]
                ctx.config["targets"] = newscandict
            else:
                ctx.config["targets"] = all_locked
            locked_targets = yaml.safe_dump(
                ctx.config["targets"],
                default_flow_style=False
            ).splitlines()
            log.info("\n  ".join(["Locked targets:"] + locked_targets))
            # successfully locked machines, change status back to running
            report.try_push_job_info(ctx.config, dict(status="running"))
            break
        elif not ctx.block:
            assert 0, "not enough machines are available"
        else:
            requested = requested - len(newly_locked)
            # The two literals are concatenated, so the first one must end
            # with a space to keep the message readable.
            assert requested > 0, "lock_machines: requested counter went " \
                                  "negative, this shouldn't happen"

        log.info(
            "{total} machines locked ({new} new); need {more} more".format(
                total=len(all_locked), new=len(newly_locked), more=requested
            )
        )
        log.warning("Could not lock enough machines, waiting...")
        time.sleep(10)
    try:
        yield
    finally:
        # If both unlock_on_failure and nuke-on-error are set, don't unlock now
        # because we're just going to nuke (and unlock) later.
        unlock_on_failure = (
            ctx.config.get("unlock_on_failure", False)
            and not ctx.config.get("nuke-on-error", False)
        )
        if get_status(ctx.summary) == "pass" or unlock_on_failure:
            log.info("Unlocking machines...")
            for machine in ctx.config["targets"].iterkeys():
                lock.unlock_one(ctx, machine, ctx.owner, ctx.archive)
Ejemplo n.º 19
0
def lock_machines(ctx, config):
    """
    Lock machines.  Called when the teuthology run finds and locks
    new machines.  This is not called if one has already teuthology-locked
    machines and placed those keys in the Targets section of a yaml file.

    This is a generator.  It loops until ``config[0]`` machines are locked
    in a single ``lock.lock_many`` call, stores them in
    ``ctx.config['targets']`` and then yields once.  When the generator is
    resumed or closed, the machines are unlocked if ``unlock_on_failure`` is
    set in the config or the summary reports success.

    :param ctx:    run context; ``ctx.config``, ``ctx.block``, ``ctx.owner``,
                   ``ctx.archive`` and ``ctx.summary`` are read here
    :param config: sequence whose first item is the number of machines to
                   lock (an int) and whose second item is the machine type
    :raises AssertionError: on an invalid ``config[0]``, when too few
                            machines are up, or when listing/locking fails
                            and ``ctx.block`` is false
    """
    log.info('Locking machines...')
    assert isinstance(config[0], int), 'config[0] must be an integer'
    machine_type = config[1]
    machine_types = teuthology.get_multi_machine_types(machine_type)
    how_many = config[0]

    while True:
        # make sure there are enough machines up
        machines = lock.list_locks()
        if machines is None:
            if ctx.block:
                log.warning('error listing machines, trying again')
                time.sleep(20)
                continue
            else:
                assert 0, 'error listing machines'

        num_up = len([m for m in machines
                      if m['up'] and m['type'] in machine_types])
        assert num_up >= how_many, 'not enough machines are up'

        # make sure there are machines for non-automated jobs to run
        up_and_free = [
            m for m in machines
            if m['up'] and m['locked'] == 0 and m['type'] in machine_types
        ]
        num_free = len(up_and_free)
        # NOTE(review): the threshold is a fixed 6 free machines, while the
        # log message below reports how_many as the "need" figure.
        if num_free < 6 and ctx.owner.startswith('scheduled'):
            if ctx.block:
                log.info(
                    'waiting for more machines to be free (need %s see %s)...',
                    how_many,
                    num_free,
                )
                time.sleep(10)
                continue
            else:
                assert 0, 'not enough machines free'

        newly_locked = lock.lock_many(ctx, how_many, machine_type, ctx.owner,
                                      ctx.archive)
        if len(newly_locked) == how_many:
            vmlist = [name for name in newly_locked if teuthology.is_vm(name)]
            if vmlist:
                log.info('Waiting for virtual machines to come up')
                loopcount = 0
                # vmlist is non-empty, so at least one scan is always needed.
                while True:
                    loopcount += 1
                    time.sleep(10)
                    keyscan_out, current_locks = lock.keyscan_check(ctx,
                                                                    vmlist)
                    # One keyscan output line is expected per guest.
                    if len(keyscan_out.splitlines()) == len(vmlist):
                        break
                    # Only report unavailability when the scan really came
                    # back incomplete.
                    log.info('virtual machine is still unavailable')
                    if loopcount == 40:
                        # 40 polls of 10 seconds each have elapsed; rebuild
                        # the guests missing from the keyscan output.
                        loopcount = 0
                        log.info('virtual machine(s) still not up, '
                                 'recreating unresponsive ones.')
                        for guest in vmlist:
                            if guest not in keyscan_out:
                                log.info('recreating: ' + guest)
                                lock.destroy_if_vm(ctx, 'ubuntu@' + guest)
                                lock.create_if_vm(ctx, 'ubuntu@' + guest)
                if lock.update_keys(ctx, keyscan_out, current_locks):
                    log.info("Error in virtual machine keys")
                # Re-read the ssh public key of every locked machine from
                # its lock status after the key update above.
                newscandict = {}
                for dkey in newly_locked:
                    stats = lockstatus.get_status(ctx, dkey)
                    newscandict[dkey] = stats['sshpubkey']
                ctx.config['targets'] = newscandict
            else:
                ctx.config['targets'] = newly_locked
            locked_targets = yaml.safe_dump(
                ctx.config['targets'],
                default_flow_style=False
            ).splitlines()
            log.info('\n  '.join(['Locked targets:', ] + locked_targets))
            break
        elif not ctx.block:
            assert 0, 'not enough machines are available'

        log.warning('Could not lock enough machines, waiting...')
        time.sleep(10)
    try:
        yield
    finally:
        if ctx.config.get('unlock_on_failure', False) or \
           ctx.summary.get('success', False):
            log.info('Unlocking machines...')
            for machine in ctx.config['targets'].iterkeys():
                lock.unlock_one(ctx, machine, ctx.owner)
Ejemplo n.º 20
0
def syslog(ctx, config):
    """
    Start syslog capture on every remote and stop it again on exit.

    This is a generator that yields exactly once.  If ``ctx.archive`` is
    None it yields immediately and does nothing else.  Otherwise, before
    yielding, it creates ``<archive dir>/syslog`` on the cluster, writes an
    rsyslog config sending kernel messages to ``kern.log`` and everything
    else to ``misc.log``, and restarts rsyslog.  On exit it removes that
    config, restarts rsyslog, scans the collected logs for BUG/INFO/DEADLOCK
    lines (minus a list of known-noise patterns), marks the job as failed
    when a remote has such a line, and finally gzips the logs.

    :param ctx:    run context; ``ctx.archive``, ``ctx.cluster`` and
                   ``ctx.summary`` are used here
    :param config: not used by this function
    """
    if ctx.archive is None:
        # Nothing is going to be archived, so the whole feature is skipped.
        yield
        return

    log.info('Starting syslog monitoring...')

    archive_dir = misc.get_archive_dir(ctx)
    log_dir = '{adir}/syslog'.format(adir=archive_dir)
    mkdir_procs = ctx.cluster.run(
        args=['mkdir', '-p', '-m0755', '--', log_dir],
        wait=False,
    )
    run.wait(mkdir_procs)

    CONF = '/etc/rsyslog.d/80-cephtest.conf'
    kern_log = '{log_dir}/kern.log'.format(log_dir=log_dir)
    misc_log = '{log_dir}/misc.log'.format(log_dir=log_dir)
    kern_rule = 'kern.* -{kern_log};RSYSLOG_FileFormat'.format(
        kern_log=kern_log)
    misc_rule = '*.*;kern.none -{misc_log};RSYSLOG_FileFormat'.format(
        misc_log=misc_log)
    conf_fp = StringIO('\n'.join([kern_rule, misc_rule]))
    try:
        for rem in ctx.cluster.remotes.iterkeys():
            # Exclude downburst VMs for now; they have SELinux disabled
            wants_selinux_label = (rem.os.package_type == 'rpm' and
                                   not misc.is_vm(rem.shortname))
            if wants_selinux_label:
                log_context = 'system_u:object_r:var_log_t:s0'
                for log_path in (kern_log, misc_log):
                    label_cmd = "touch {log} && sudo chcon {con} {log}".format(
                        log=log_path, con=log_context)
                    rem.run(args=label_cmd)
            misc.sudo_write_file(remote=rem, path=CONF, data=conf_fp)
            # Rewind so the next remote receives the full config again.
            conf_fp.seek(0)

        # a mere reload (SIGHUP) doesn't seem to make rsyslog open the
        # files, hence the full restart
        restart_procs = ctx.cluster.run(
            args=['sudo', 'service', 'rsyslog', 'restart'],
            wait=False,
        )
        run.wait(restart_procs)

        yield
    finally:
        log.info('Shutting down syslog monitoring...')

        teardown_procs = ctx.cluster.run(
            args=[
                'sudo', 'rm', '-f', '--', CONF,
                run.Raw('&&'),
                'sudo', 'service', 'rsyslog', 'restart',
            ],
            wait=False,
        )
        run.wait(teardown_procs)
        # race condition: nothing actually says rsyslog had time to
        # flush the file fully. oh well.

        log.info('Checking logs for errors...')
        # Lines matching any of these are dropped (grep -v) before deciding
        # whether the syslog contains an error; order matches the pipeline.
        ignored_patterns = (
            'task .* blocked for more than .* seconds',
            'lockdep is turned off',
            'trying to register non-static key',
            'DEBUG: fsize',  # xfs_fsr
            'CRON',  # ignore cron noise
            'BUG: bad unlock balance detected',  # #6097
            'inconsistent lock state',  # FIXME see #2523
            '*** DEADLOCK ***',  # part of lockdep output
            'INFO: possible irq lock inversion dependency detected',  # FIXME see #2590 and #147
            'INFO: NMI handler (perf_event_nmi_handler) took too long to run',
            'INFO: recovery required on readonly',
        )
        for rem in ctx.cluster.remotes.iterkeys():
            log.debug('Checking %s', rem.name)
            scan_args = [
                'egrep', '--binary-files=text',
                '\\bBUG\\b|\\bINFO\\b|\\bDEADLOCK\\b',
                run.Raw('{adir}/syslog/*.log'.format(adir=archive_dir)),
            ]
            for pattern in ignored_patterns:
                scan_args.extend([run.Raw('|'), 'grep', '-v', pattern])
            # Only the first surviving line is needed to flag a failure.
            scan_args.extend([run.Raw('|'), 'head', '-n', '1'])
            r = rem.run(args=scan_args, stdout=StringIO())
            first_hit = r.stdout.getvalue()
            if first_hit == '':
                continue
            log.error('Error in syslog on %s: %s', rem.name, first_hit)
            set_status(ctx.summary, 'fail')
            # Keep an earlier failure reason if one was already recorded.
            if 'failure_reason' not in ctx.summary:
                ctx.summary['failure_reason'] = \
                    "'{error}' in syslog".format(error=first_hit)

        log.info('Compressing syslogs...')
        gzip_procs = ctx.cluster.run(
            args=[
                'find', '{adir}/syslog'.format(adir=archive_dir),
                '-name', '*.log',
                '-print0',
                run.Raw('|'),
                'sudo', 'xargs', '-0', '--no-run-if-empty', '--',
                'gzip', '--',
            ],
            wait=False,
        )
        run.wait(gzip_procs)
Ejemplo n.º 21
0
def getRemoteConsole(name, ipmiuser, ipmipass, ipmidomain, logfile=None, timeout=20):
    """
    Build the console object matching the kind of machine ``name`` is.

    Returns a ``VirtualConsole`` when ``misc.is_vm(name)`` is true and a
    ``PhysicalConsole`` otherwise.  All six arguments are forwarded
    positionally, unchanged, to the chosen class.
    """
    console_cls = VirtualConsole if misc.is_vm(name) else PhysicalConsole
    return console_cls(name, ipmiuser, ipmipass, ipmidomain, logfile, timeout)
Ejemplo n.º 22
0
def lock_machines(ctx, config):
    """
    Lock machines.  Called when the teuthology run finds and locks
    new machines.  This is not called if one has already teuthology-locked
    machines and placed those keys in the Targets section of a yaml file.

    This is a generator.  It loops until ``config[0]`` machines of type
    ``config[1]`` are locked in a single ``lock.lock_many`` call, stores
    them in ``ctx.config['targets']`` and then yields once.  When the
    generator is resumed or closed, the machines are unlocked if
    ``unlock_on_failure`` is set in the config or the job status is 'pass'.

    :param ctx:    run context; ``ctx.config``, ``ctx.block``, ``ctx.owner``,
                   ``ctx.archive`` and ``ctx.summary`` are read here
    :param config: sequence whose first item is the number of machines to
                   lock (an int) and whose second item is the machine type
    :raises RuntimeError:   if listing machines fails and ``ctx.block`` is
                            false, or if ``lock.lock_many`` returns a falsy
                            value that is not a list
    :raises AssertionError: on an invalid ``config[0]``, or when not enough
                            machines are free and ``ctx.block`` is false
    """
    # It's OK for os_type and os_version to be None here.  If we're trying
    # to lock a bare metal machine, we'll take whatever is available.  If
    # we want a vps, defaults will be provided by misc.get_distro and
    # misc.get_distro_version in provision.create_if_vm
    os_type = ctx.config.get("os_type")
    os_version = ctx.config.get("os_version")
    arch = ctx.config.get('arch')
    log.info('Locking machines...')
    assert isinstance(config[0], int), 'config[0] must be an integer'
    machine_type = config[1]
    how_many = config[0]
    # We want to make sure there are always this many machines available
    to_reserve = 5

    # change the status during the locking process
    report.try_push_job_info(ctx.config, dict(status='waiting'))

    while True:
        # get a candidate list of machines
        machines = lock.list_locks(machine_type=machine_type,
                                   up=True,
                                   locked=False,
                                   count=how_many + to_reserve)
        if machines is None:
            if ctx.block:
                log.error('Error listing machines, trying again')
                time.sleep(20)
                continue
            else:
                raise RuntimeError('Error listing machines')

        # make sure there are machines for non-automated jobs to run
        if len(machines) < to_reserve + how_many and ctx.owner.startswith(
                'scheduled'):
            if ctx.block:
                log.info(
                    'waiting for more machines to be free (need %s + %s, have %s)...',
                    to_reserve,
                    how_many,
                    len(machines),
                )
                time.sleep(10)
                continue
            else:
                assert 0, ('not enough machines free; need %s + %s, have %s' %
                           (to_reserve, how_many, len(machines)))

        newly_locked = lock.lock_many(ctx, how_many, machine_type, ctx.owner,
                                      ctx.archive, os_type, os_version, arch)
        # A falsy result that is not a list is treated as a parameter error
        # rather than "nothing could be locked this time".
        if not newly_locked and not isinstance(newly_locked, list):
            raise RuntimeError('Invalid parameters specified')
        if len(newly_locked) == how_many:
            vmlist = [name for name in newly_locked if misc.is_vm(name)]
            if vmlist:
                log.info('Waiting for virtual machines to come up')
                loopcount = 0
                # vmlist is non-empty, so at least one scan is always needed.
                while True:
                    loopcount += 1
                    time.sleep(10)
                    keys_dict = lock.ssh_keyscan(vmlist)
                    if len(keys_dict) == len(vmlist):
                        break
                    # Only report unavailability when the scan really came
                    # back incomplete.
                    log.info('virtual machine is still unavailable')
                    if loopcount == 40:
                        # 40 polls of 10 seconds each have elapsed; rebuild
                        # the guests that never answered the key scan.
                        loopcount = 0
                        log.info('virtual machine(s) still not up, '
                                 'recreating unresponsive ones.')
                        for guest in vmlist:
                            if guest not in keys_dict:
                                log.info('recreating: ' + guest)
                                full_name = misc.canonicalize_hostname(guest)
                                provision.destroy_if_vm(ctx, full_name)
                                provision.create_if_vm(ctx, full_name)
                if lock.do_update_keys(keys_dict):
                    log.info("Error in virtual machine keys")
                # Re-read the ssh public key of every locked machine from
                # its lock status after the key update above.
                newscandict = {}
                for dkey in newly_locked:
                    stats = lockstatus.get_status(dkey)
                    newscandict[dkey] = stats['ssh_pub_key']
                ctx.config['targets'] = newscandict
            else:
                ctx.config['targets'] = newly_locked
            locked_targets = yaml.safe_dump(
                ctx.config['targets'], default_flow_style=False).splitlines()
            log.info('\n  '.join(['Locked targets:', ] + locked_targets))
            # successfully locked machines, change status back to running
            report.try_push_job_info(ctx.config, dict(status='running'))
            break
        elif not ctx.block:
            assert 0, 'not enough machines are available'

        log.warning('Could not lock enough machines, waiting...')
        time.sleep(10)
    try:
        yield
    finally:
        if ctx.config.get('unlock_on_failure', False) or \
                get_status(ctx.summary) == 'pass':
            log.info('Unlocking machines...')
            for machine in ctx.config['targets'].iterkeys():
                lock.unlock_one(ctx, machine, ctx.owner, ctx.archive)