def check_lock(ctx, config, check_up=True):
    """
    Check lock status of remote machines.

    Asserts that every target is locked by the current owner; when
    check_up is True, also asserts that each machine is marked up.
    """
    lock_checking_off = (not teuth_config.lock_server or
                         ctx.config.get('check-locks') is False)
    if lock_checking_off:
        log.info('Lock checking disabled.')
        return
    log.info('Checking locks...')
    for machine in ctx.config['targets'].iterkeys():
        status = lockstatus.get_status(machine)
        log.debug('machine status is %s', repr(status))
        assert status is not None, \
            'could not read lock status for {name}'.format(name=machine)
        if check_up:
            assert status['up'], \
                'machine {name} is marked down'.format(name=machine)
        assert status['locked'], \
            'machine {name} is not locked'.format(name=machine)
        locker = status['locked_by']
        assert locker == ctx.owner, \
            'machine {name} is locked by {user}, not {owner}'.format(
                name=machine, user=locker, owner=ctx.owner,
            )
def machine_type(self):
    """
    Return this host's machine type, querying the lock server at most
    once and caching the answer on the instance.
    """
    cached = getattr(self, '_machine_type', None)
    if cached:
        return cached
    remote_info = ls.get_status(self.hostname)
    # No status record: report unknown without caching, so a later
    # call can retry the lookup.
    if not remote_info:
        return None
    self._machine_type = remote_info.get("machine_type", None)
    return self._machine_type
def get_testdir(ctx):
    """
    Compute the per-run test directory for this job.

    Resolution order:
      1. an explicit ``test_path`` from the teuthology config;
      2. ``<basedir>/<jobid>`` when all targets report the same job id;
      3. ``<basedir>/<short-run-name>`` derived from ctx.name;
      4. ``<basedir>/<user><stamp>`` as a last resort.
    """
    if 'test_path' in ctx.teuthology_config:
        return ctx.teuthology_config['test_path']
    basedir = ctx.teuthology_config.get('base_test_dir', '/home/ubuntu/cephtest')
    # Module-level cache: the job-id probe below is done at most once
    # per process.
    global global_jobid
    global checked_jobid
    # check if a jobid exists in the machine status for all our targets
    # and if its the same jobid, use that as the subdir for the test
    if not checked_jobid:
        jobids = {}
        for machine in ctx.config['targets'].iterkeys():
            status = lockstatus.get_status(ctx, machine)
            if status is None or 'description' not in status or status['description'] is None:
                continue
            # The job id is the last path component of the description.
            jid = status['description'].split('/')[-1]
            # split() never returns None; the 'None' string check covers
            # descriptions that literally end in ".../None".
            if jid is None or jid == 'None':
                continue
            jobids[jid] = 1
            # More than one distinct id: targets disagree, give up early.
            if len(jobids) > 1:
                break
        if len(jobids) == 1:
            # same job id on all machines, use that as the test subdir
            (jobid,) = jobids.iterkeys()
            if jobid is not None:
                global_jobid = jobid
                log.debug('setting my jobid to {jid}'.format(jid=global_jobid))
        checked_jobid = True
    # the subdir is chosen using the priority:
    # 1. jobid chosen by the teuthology beanstalk queue
    # 2. run name specified by teuthology schedule
    # 3. user@timestamp
    if global_jobid is not None:
        log.debug('with jobid basedir: {b}'.format(b=global_jobid))
        return '{basedir}/{jobid}'.format(
            basedir=basedir,
            jobid=global_jobid,
        )
    elif hasattr(ctx, 'name') and ctx.name:
        log.debug('with name basedir: {b}'.format(b=basedir))
        # we need a short string to keep the path short
        import re
        # Run names look like user-YYYY-MM-DD_HH:MM:SS-suite-ceph-kernel-flavor.
        # NOTE: `m` is rebound below from the match object to the month
        # group — intentional but easy to misread.
        m = re.match(r"(.*)-(.*)-(.*)-(.*)_(.*)-(.*)-(.*)-(.*)-(.*)", ctx.name)
        (u, y, m, d, hms, s, c, k, f) = m.groups()
        # Two characters from each component keeps the path short.
        short = u[0:2] + y[2:4] + m[0:2] + d[0:2] + hms[0:2] + hms[3:5] + s[0] + c[0] + k[0] + f[0]
        return '{basedir}/{rundir}'.format(
            basedir=basedir,
            rundir=short,
        )
    else:
        log.debug('basedir: {b}'.format(b=basedir))
        # `stamp` is presumably a module-level timestamp — not defined in
        # this function; TODO confirm against module scope.
        return '{basedir}/{user}{stamp}'.format(
            basedir=basedir,
            user=get_user()[0:2],
            stamp=stamp)
def __init__(self, name, ipmiuser, ipmipass, ipmidomain, logfile=None, timeout=20):
    """
    Connect to the libvirt host serving the named VM and remember its
    domain handle in ``self.vm_domain``.

    The ipmi*/logfile/timeout parameters are accepted but unused in
    this body — presumably kept for interface compatibility with a
    physical-console sibling class; TODO confirm.
    """
    self.shortname = getShortName(name)
    status_info = ls.get_status('', self.shortname)
    try:
        phys_host = status_info['vpshost']
    except TypeError:
        # status_info was None (no lock record): bail out, leaving the
        # object without `connection`/`vm_domain` attributes.
        return
    self.connection = libvirt.open(phys_host)
    # Scan running domains for the one matching our short hostname.
    for i in self.connection.listDomainsID():
        d = self.connection.lookupByID(i)
        if d.name() == self.shortname:
            self.vm_domain = d
            break
    return
def check_lock(ctx, config):
    """
    Check lock status of remote machines.

    Asserts that every target is up and locked by the current owner.
    """
    # `is False` rather than `== False`: only an explicit
    # `check-locks: false` disables checking; other falsy values (0, '')
    # do not.  This matches the other check_lock variants in this module.
    if ctx.config.get("check-locks") is False:
        log.info("Lock checking disabled.")
        return
    log.info("Checking locks...")
    for machine in ctx.config["targets"].iterkeys():
        status = lockstatus.get_status(ctx, machine)
        log.debug("machine status is %s", repr(status))
        assert status is not None, "could not read lock status for {name}".format(name=machine)
        assert status["up"], "machine {name} is marked down".format(name=machine)
        assert status["locked"], "machine {name} is not locked".format(name=machine)
        assert status["locked_by"] == ctx.owner, "machine {name} is locked by {user}, not {owner}".format(
            name=machine, user=status["locked_by"], owner=ctx.owner
        )
def create_if_vm(ctx, machine_name):
    """
    Create the named machine via downburst if it is a VM.

    Returns False when the machine is not a VM (no ``vpshost``) or no
    downburst executable is found; True otherwise.
    """
    status_info = ls.get_status(ctx, machine_name)
    phys_host = status_info['vpshost']
    # Bare-metal machines have no vpshost; nothing to create.
    if not phys_host:
        return False
    try:
        vm_type = ctx.vm_type
    except AttributeError:
        vm_type = 'ubuntu'
    createMe = decanonicalize_hostname(machine_name)
    # Write a downburst meta-data YAML file describing the guest; the
    # temp file lives only for the duration of the subprocess call.
    with tempfile.NamedTemporaryFile() as tmp:
        try:
            lcnfg = ctx.config['downburst']
        except KeyError:
            lcnfg = {}
        # Guest parameters, each overridable from the 'downburst'
        # section of the job config.
        file_info = {}
        file_info['disk-size'] = lcnfg.get('disk-size', '30G')
        file_info['ram'] = lcnfg.get('ram', '1.9G')
        file_info['cpus'] = lcnfg.get('cpus', 1)
        file_info['networks'] = lcnfg.get('networks',
                                          [{'source' : 'front', 'mac' : status_info['mac']}])
        file_info['distro'] = lcnfg.get('distro', vm_type.lower())
        file_info['additional-disks'] = lcnfg.get(
            'additional-disks', 3)
        file_info['additional-disks-size'] = lcnfg.get(
            'additional-disks-size', '200G')
        file_info['arch'] = lcnfg.get('arch', 'x86_64')
        file_out = {'downburst': file_info}
        yaml.safe_dump(file_out, tmp)
        metadata = "--meta-data=%s" % tmp.name
        dbrst = _get_downburst_exec()
        if not dbrst:
            log.info("Error: no downburst executable found")
            return False
        p = subprocess.Popen([dbrst, '-c', phys_host,
                              'create', metadata, createMe],
                             stdout=subprocess.PIPE, stderr=subprocess.PIPE,)
        owt, err = p.communicate()
        # NOTE(review): any stderr output lands in the "completed" branch
        # below — downburst seems to use stderr for status text rather
        # than errors; confirm before treating this as a failure path.
        if err:
            log.info("Downburst completed on %s: %s" % (machine_name, err))
        else:
            log.info("%s created: %s" % (machine_name, owt))
    return True
def __init__(self, name, ipmiuser, ipmipass, ipmidomain, logfile=None, timeout=20):
    """
    Connect to the libvirt host serving the named VM and remember its
    domain handle in ``self.vm_domain``.

    Raises RuntimeError when the libvirt bindings are not importable.
    The ipmi*/logfile/timeout parameters are accepted but unused in
    this body — presumably kept for interface compatibility with a
    physical-console sibling class; TODO confirm.
    """
    # `libvirt` is an optional import at module level; fail loudly
    # instead of with an AttributeError later.
    if libvirt is None:
        raise RuntimeError("libvirt not found")
    self.shortname = getShortName(name)
    status_info = ls.get_status('', self.shortname)
    try:
        phys_host = status_info['vpshost']
    except TypeError:
        # status_info was None (no lock record): bail out, leaving the
        # object without `connection`/`vm_domain` attributes.
        return
    self.connection = libvirt.open(phys_host)
    # Scan running domains for the one matching our short hostname.
    for i in self.connection.listDomainsID():
        d = self.connection.lookupByID(i)
        if d.name() == self.shortname:
            self.vm_domain = d
            break
    return
def __init__(self, name):
    """
    Connect to the libvirt host serving the named VM and remember its
    domain handle in ``self.vm_domain``.

    Bare-metal hosts and hosts with no status record are left without
    `connection`/`vm_domain` attributes.  Raises RuntimeError when the
    libvirt bindings are not importable.
    """
    if libvirt is None:
        raise RuntimeError("libvirt not found")
    self.shortname = remote.getShortName(name)
    status_info = ls.get_status(self.shortname)
    try:
        # BUG FIX: previously `phys_host` was only assigned inside the
        # `if is_vm` branch, so a non-VM host with a valid status record
        # hit `libvirt.open(phys_host)` with an unbound local
        # (NameError).  Return early for non-VMs instead.
        if not status_info.get('is_vm', False):
            return
        phys_host = status_info['vm_host']['name'].split('.')[0]
    except TypeError:
        # status_info lookup failed (e.g. None record): leave the
        # object unconnected, matching the original behavior.
        return
    self.connection = libvirt.open(phys_host)
    # Scan running domains for the one matching our short hostname.
    for i in self.connection.listDomainsID():
        d = self.connection.lookupByID(i)
        if d.name() == self.shortname:
            self.vm_domain = d
            break
    return
def filter_hosts(self):
    """
    Exclude any non-RPM-based hosts, and any downburst VMs
    """
    super(SELinux, self).filter_hosts()
    kept = Cluster()
    for (rem, roles) in self.cluster.remotes.iteritems():
        status_info = get_status(rem.name)
        if status_info and status_info.get('is_vm', False):
            log.info(
                "Excluding {host}: VMs are not yet supported".format(
                    host=rem.shortname))
        elif rem.os.package_type == 'rpm':
            kept.add(rem, roles)
        else:
            log.debug(
                "Excluding {host}: OS '{os}' does not support SELinux".format(
                    host=rem.shortname, os=rem.os.name))
    self.cluster = kept
    return self.cluster
def check_lock(ctx, config, check_up=True):
    """
    Check lock status of remote machines.

    Asserts that every target is locked by the current owner; when
    check_up is True, also asserts that each machine is marked up.
    """
    if not teuth_config.lock_server or ctx.config.get("check-locks") is False:
        log.info("Lock checking disabled.")
        return
    log.info("Checking locks...")
    for machine in ctx.config["targets"].iterkeys():
        status = lockstatus.get_status(machine)
        log.debug("machine status is %s", repr(status))
        assert status is not None, \
            "could not read lock status for {name}".format(name=machine)
        if check_up:
            assert status["up"], \
                "machine {name} is marked down".format(name=machine)
        assert status["locked"], \
            "machine {name} is not locked".format(name=machine)
        locker = status["locked_by"]
        assert locker == ctx.owner, \
            "machine {name} is locked by {user}, not {owner}".format(
                name=machine, user=locker, owner=ctx.owner)
def check_lock(ctx, config):
    """
    Check lock status of remote machines.

    Asserts that every target is up and locked by the current owner.
    """
    # `is False` rather than `== False`: only an explicit
    # `check-locks: false` disables checking; other falsy values (0, '')
    # do not.  This matches the other check_lock variants in this module.
    if ctx.config.get('check-locks') is False:
        log.info('Lock checking disabled.')
        return
    log.info('Checking locks...')
    for machine in ctx.config['targets'].iterkeys():
        status = lockstatus.get_status(ctx, machine)
        log.debug('machine status is %s', repr(status))
        assert status is not None, \
            'could not read lock status for {name}'.format(name=machine)
        assert status['up'], 'machine {name} is marked down'.format(name=machine)
        assert status['locked'], \
            'machine {name} is not locked'.format(name=machine)
        assert status['locked_by'] == ctx.owner, \
            'machine {name} is locked by {user}, not {owner}'.format(
                name=machine,
                user=status['locked_by'],
                owner=ctx.owner,
            )
def check_lock(ctx, config):
    """
    Check lock status of remote machines.

    Asserts that every target is up and locked by the current owner.
    """
    # `is False` rather than `== False`: only an explicit
    # `check-locks: false` disables checking; other falsy values (0, '')
    # do not.  This matches the other check_lock variants in this module.
    if ctx.config.get('check-locks') is False:
        log.info('Lock checking disabled.')
        return
    log.info('Checking locks...')
    for machine in ctx.config['targets'].iterkeys():
        status = lockstatus.get_status(ctx, machine)
        log.debug('machine status is %s', repr(status))
        assert status is not None, \
            'could not read lock status for {name}'.format(name=machine)
        assert status['up'], 'machine {name} is marked down'.format(
            name=machine)
        assert status['locked'], \
            'machine {name} is not locked'.format(name=machine)
        assert status['locked_by'] == ctx.owner, \
            'machine {name} is locked by {user}, not {owner}'.format(
                name=machine,
                user=status['locked_by'],
                owner=ctx.owner,
            )
def update_lock(ctx, name, description=None, status=None, sshpubkey=None):
    """
    Update a machine's lock record on the lock server.

    Only the fields actually supplied are sent.  For VMs, waits until
    the guest answers an ssh keyscan before updating.  Returns True
    when nothing needed updating, otherwise the server's success flag.
    """
    status_info = ls.get_status(ctx, name)
    # A non-empty vpshost marks a VM: poll until its ssh key is
    # scannable so the record we push is meaningful.
    if status_info['vpshost']:
        keyscan_out = ''
        while not keyscan_out:
            time.sleep(10)
            keyscan_out, _ = keyscan_check(ctx, [name])
    updated = {}
    for field, value in (('desc', description),
                         ('status', status),
                         ('sshpubkey', sshpubkey)):
        if value is not None:
            updated[field] = value
    if not updated:
        return True
    success, _, _ = ls.send_request(
        'PUT', ls._lock_url(ctx) + '/' + name,
        body=urllib.urlencode(updated),
        headers={'Content-type': 'application/x-www-form-urlencoded'})
    return success
def destroy_if_vm(ctx, machine_name):
    """
    Return False only on vm downburst failures.
    """
    status_info = ls.get_status(ctx, machine_name)
    phys_host = status_info['vpshost']
    # Bare metal (no vpshost): nothing to tear down, and that is not
    # a failure.
    if not phys_host:
        return True
    destroyMe = decanonicalize_hostname(machine_name)
    dbrst = _get_downburst_exec()
    if not dbrst:
        log.info("Error: no downburst executable found")
        return False
    proc = subprocess.Popen(
        [dbrst, '-c', phys_host, 'destroy', destroyMe],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    owt, err = proc.communicate()
    # Downburst writes to stderr on failure.
    if err:
        log.info("Error occurred while deleting %s" % destroyMe)
        return False
    log.info("%s destroyed: %s" % (machine_name, owt))
    return True
def lock_machines(ctx, config):
    """
    Lock machines.  Called when the teuthology run finds and locks
    new machines.  This is not called if the one has teuthology-locked
    machines and placed those keys in the Targets section of a yaml file.

    ``config`` is a (count, machine_type) pair.  Yields once machines
    are locked (context-manager task style); on exit, unlocks them when
    the run passed or unlock_on_failure is set.
    """
    # It's OK for os_type and os_version to be None here.  If we're trying
    # to lock a bare metal machine, we'll take whatever is available.  If
    # we want a vps, defaults will be provided by misc.get_distro and
    # misc.get_distro_version in provision.create_if_vm
    os_type = ctx.config.get("os_type")
    os_version = ctx.config.get("os_version")
    arch = ctx.config.get('arch')
    log.info('Locking machines...')
    assert isinstance(config[0], int), 'config[0] must be an integer'
    machine_type = config[1]
    how_many = config[0]
    # We want to make sure there are always this many machines available
    to_reserve = 5
    # change the status during the locking process
    report.try_push_job_info(ctx.config, dict(status='waiting'))
    while True:
        # get a candidate list of machines
        machines = lock.list_locks(machine_type=machine_type, up=True,
                                   locked=False,
                                   count=how_many + to_reserve)
        if machines is None:
            if ctx.block:
                log.error('Error listing machines, trying again')
                time.sleep(20)
                continue
            else:
                raise RuntimeError('Error listing machines')
        # make sure there are machines for non-automated jobs to run
        if len(machines) < to_reserve + how_many and ctx.owner.startswith(
                'scheduled'):
            if ctx.block:
                log.info(
                    'waiting for more machines to be free (need %s + %s, have %s)...',
                    to_reserve,
                    how_many,
                    len(machines),
                )
                time.sleep(10)
                continue
            else:
                assert 0, ('not enough machines free; need %s + %s, have %s' %
                           (to_reserve, how_many, len(machines)))
        newly_locked = lock.lock_many(ctx, how_many, machine_type, ctx.owner,
                                      ctx.archive, os_type, os_version, arch)
        # lock_many returning a non-list signals bad arguments rather
        # than "nothing available".
        if not newly_locked and not isinstance(newly_locked, list):
            raise RuntimeError('Invalid parameters specified')
        if len(newly_locked) == how_many:
            vmlist = []
            for lmach in newly_locked:
                if misc.is_vm(lmach):
                    vmlist.append(lmach)
            if vmlist:
                log.info('Waiting for virtual machines to come up')
                keys_dict = dict()
                loopcount = 0
                # Poll every 10s until every VM answers a keyscan;
                # every 40 iterations, recreate the unresponsive ones.
                while len(keys_dict) != len(vmlist):
                    loopcount += 1
                    time.sleep(10)
                    keys_dict = lock.ssh_keyscan(vmlist)
                    log.info('virtual machine is still unavailable')
                    if loopcount == 40:
                        loopcount = 0
                        log.info('virtual machine(s) still not up, ' +
                                 'recreating unresponsive ones.')
                        for guest in vmlist:
                            if guest not in keys_dict.keys():
                                log.info('recreating: ' + guest)
                                full_name = misc.canonicalize_hostname(guest)
                                provision.destroy_if_vm(ctx, full_name)
                                provision.create_if_vm(ctx, full_name)
                if lock.do_update_keys(keys_dict):
                    log.info("Error in virtual machine keys")
                # Re-read each VM's freshly scanned public key so the
                # targets dict carries usable host keys.
                newscandict = {}
                for dkey in newly_locked.iterkeys():
                    stats = lockstatus.get_status(dkey)
                    newscandict[dkey] = stats['ssh_pub_key']
                ctx.config['targets'] = newscandict
            else:
                ctx.config['targets'] = newly_locked
            locked_targets = yaml.safe_dump(
                ctx.config['targets'],
                default_flow_style=False).splitlines()
            log.info('\n '.join([
                'Locked targets:',
            ] + locked_targets))
            # successfully locked machines, change status back to running
            report.try_push_job_info(ctx.config, dict(status='running'))
            break
        elif not ctx.block:
            assert 0, 'not enough machines are available'
        log.warn('Could not lock enough machines, waiting...')
        time.sleep(10)
    try:
        yield
    finally:
        if ctx.config.get('unlock_on_failure', False) or \
                get_status(ctx.summary) == 'pass':
            log.info('Unlocking machines...')
            for machine in ctx.config['targets'].iterkeys():
                lock.unlock_one(ctx, machine, ctx.owner, ctx.archive)
def lock_machines(ctx, config):
    """
    Lock machines for the run.

    ``config`` is a (count, machine_type) pair; note the count is
    rebound into the name ``config`` below.  Yields once machines are
    locked; on exit, unlocks them only when the run succeeded.
    """
    log.info('Locking machines...')
    assert isinstance(config[0], int), 'config must be an integer'
    machine_type = config[1]
    # From here on, `config` holds the requested machine count.
    config = config[0]
    while True:
        # make sure there are enough machines up
        machines = lock.list_locks(ctx)
        if machines is None:
            if ctx.block:
                log.warn('error listing machines, trying again')
                time.sleep(20)
                continue
            else:
                assert 0, 'error listing machines'
        num_up = len(
            filter(
                lambda machine: machine['up'] and machine['type'] == machine_type,
                machines))
        assert num_up >= config, 'not enough machines are up'
        # make sure there are machines for non-automated jobs to run
        num_free = len(
            filter(
                lambda machine: machine['up'] and machine['locked'] == 0 and machine['type'] == machine_type,
                machines))
        # Keep a few machines free for interactive (non-scheduled) users.
        if num_free < 6 and ctx.owner.startswith('scheduled'):
            if ctx.block:
                log.info('waiting for more machines to be free...')
                time.sleep(10)
                continue
            else:
                assert 0, 'not enough machines free'
        newly_locked = lock.lock_many(ctx, config, machine_type, ctx.owner,
                                      ctx.archive)
        if len(newly_locked) == config:
            vmlist = []
            for lmach in newly_locked:
                # create_if_vm returns truthy only for VM targets.
                if lock.create_if_vm(ctx, lmach):
                    vmlist.append(lmach)
            if vmlist:
                log.info('Waiting for virtual machines to come up')
                keyscan_out = ''
                loopcount = 0
                # Poll every 10s until every VM answers a keyscan;
                # every 40 iterations, recreate the unresponsive ones.
                while len(keyscan_out.splitlines()) != len(vmlist):
                    loopcount += 1
                    time.sleep(10)
                    keyscan_out, current_locks = lock.keyscan_check(
                        ctx, vmlist)
                    log.info('virtual machine is stil unavailable')
                    if loopcount == 40:
                        loopcount = 0
                        log.info(
                            'virtual machine(s) still not up, recreating unresponsive ones.'
                        )
                        for guest in vmlist:
                            if guest not in keyscan_out:
                                log.info('recreating: ' + guest)
                                lock.destroy_if_vm(ctx, 'ubuntu@' + guest)
                                lock.create_if_vm(ctx, 'ubuntu@' + guest)
                if lock.update_keys(ctx, keyscan_out, current_locks):
                    log.info("Error in virtual machine keys")
                # Re-read each VM's ssh key so targets carry usable keys.
                newscandict = {}
                for dkey in newly_locked.iterkeys():
                    stats = lockstatus.get_status(ctx, dkey)
                    newscandict[dkey] = stats['sshpubkey']
                ctx.config['targets'] = newscandict
            else:
                ctx.config['targets'] = newly_locked
            log.info('\n '.join([
                'Locked targets:',
            ] + yaml.safe_dump(ctx.config['targets'],
                               default_flow_style=False).splitlines()))
            break
        elif not ctx.block:
            assert 0, 'not enough machines are available'
        log.warn('Could not lock enough machines, waiting...')
        time.sleep(10)
    try:
        yield
    finally:
        if ctx.summary.get('success', False):
            log.info('Unlocking machines...')
            for machine in ctx.config['targets'].iterkeys():
                lock.unlock(ctx, machine, ctx.owner)
def lock_machines(ctx, config):
    """
    Lock machines for the run.

    ``config`` is a (count, machine_type) pair.  Yields once machines
    are locked; on exit, unlocks them only when the run succeeded.
    """
    log.info("Locking machines...")
    assert isinstance(config[0], int), "config[0] must be an integer"
    machine_type = config[1]
    how_many = config[0]
    while True:
        # make sure there are enough machines up
        machines = lock.list_locks()
        if machines is None:
            if ctx.block:
                log.warn("error listing machines, trying again")
                time.sleep(20)
                continue
            else:
                assert 0, "error listing machines"
        is_up = lambda machine: machine["up"] and machine["type"] == machine_type  # noqa
        num_up = len(filter(is_up, machines))
        assert num_up >= how_many, "not enough machines are up"
        # make sure there are machines for non-automated jobs to run
        is_up_and_free = (
            lambda machine: machine["up"] and machine["locked"] == 0 and machine["type"] == machine_type
        )  # noqa
        up_and_free = filter(is_up_and_free, machines)
        num_free = len(up_and_free)
        # Keep a few machines free for interactive (non-scheduled) users.
        if num_free < 6 and ctx.owner.startswith("scheduled"):
            if ctx.block:
                log.info("waiting for more machines to be free (need %s see %s)...", how_many, num_free)
                time.sleep(10)
                continue
            else:
                assert 0, "not enough machines free"
        newly_locked = lock.lock_many(ctx, how_many, machine_type, ctx.owner,
                                      ctx.archive)
        if len(newly_locked) == how_many:
            vmlist = []
            for lmach in newly_locked:
                # create_if_vm returns truthy only for VM targets.
                if lock.create_if_vm(ctx, lmach):
                    vmlist.append(lmach)
            if vmlist:
                log.info("Waiting for virtual machines to come up")
                keyscan_out = ""
                loopcount = 0
                # Poll every 10s until every VM answers a keyscan;
                # every 40 iterations, recreate the unresponsive ones.
                while len(keyscan_out.splitlines()) != len(vmlist):
                    loopcount += 1
                    time.sleep(10)
                    keyscan_out, current_locks = lock.keyscan_check(ctx, vmlist)
                    log.info("virtual machine is stil unavailable")
                    if loopcount == 40:
                        loopcount = 0
                        log.info("virtual machine(s) still not up, " +
                                 "recreating unresponsive ones.")
                        for guest in vmlist:
                            if guest not in keyscan_out:
                                log.info("recreating: " + guest)
                                lock.destroy_if_vm(ctx, "ubuntu@" + guest)
                                lock.create_if_vm(ctx, "ubuntu@" + guest)
                if lock.update_keys(ctx, keyscan_out, current_locks):
                    log.info("Error in virtual machine keys")
                # Re-read each VM's ssh key so targets carry usable keys.
                newscandict = {}
                for dkey in newly_locked.iterkeys():
                    stats = lockstatus.get_status(ctx, dkey)
                    newscandict[dkey] = stats["sshpubkey"]
                ctx.config["targets"] = newscandict
            else:
                ctx.config["targets"] = newly_locked
            # FIXME: Ugh.
            log.info(
                "\n ".join(
                    ["Locked targets:"]
                    + yaml.safe_dump(ctx.config["targets"], default_flow_style=False).splitlines()
                )
            )
            break
        elif not ctx.block:
            assert 0, "not enough machines are available"
        log.warn("Could not lock enough machines, waiting...")
        time.sleep(10)
    try:
        yield
    finally:
        if ctx.summary.get("success", False):
            log.info("Unlocking machines...")
            for machine in ctx.config["targets"].iterkeys():
                lock.unlock_one(ctx, machine, ctx.owner)
def lock_machines(ctx, config):
    """
    Lock machines.  Called when the teuthology run finds and locks
    new machines.  This is not called if the one has teuthology-locked
    machines and placed those keys in the Targets section of a yaml file.

    ``config`` is a (count, machine_type) pair.  Accumulates locks over
    multiple attempts until the full count is held, then yields; on
    exit, unlocks when the run passed or unlock_on_failure applies.
    """
    # It's OK for os_type and os_version to be None here.  If we're trying
    # to lock a bare metal machine, we'll take whatever is available.  If
    # we want a vps, defaults will be provided by misc.get_distro and
    # misc.get_distro_version in provision.create_if_vm
    os_type = ctx.config.get("os_type")
    os_version = ctx.config.get("os_version")
    arch = ctx.config.get("arch")
    log.info("Locking machines...")
    assert isinstance(config[0], int), "config[0] must be an integer"
    machine_type = config[1]
    total_requested = config[0]
    # We want to make sure there are always this many machines available
    reserved = teuth_config.reserve_machines
    assert isinstance(reserved, int), "reserve_machines must be integer"
    assert reserved >= 0, "reserve_machines should >= 0"
    # change the status during the locking process
    report.try_push_job_info(ctx.config, dict(status="waiting"))
    all_locked = dict()
    requested = total_requested
    while True:
        # get a candidate list of machines
        machines = lock.list_locks(machine_type=machine_type, up=True,
                                   locked=False, count=requested + reserved)
        if machines is None:
            if ctx.block:
                log.error("Error listing machines, trying again")
                time.sleep(20)
                continue
            else:
                raise RuntimeError("Error listing machines")
        # make sure there are machines for non-automated jobs to run
        if len(machines) < reserved + requested and ctx.owner.startswith("scheduled"):
            if ctx.block:
                log.info(
                    "waiting for more %s machines to be free (need %s + %s, have %s)...",
                    machine_type,
                    reserved,
                    requested,
                    len(machines),
                )
                time.sleep(10)
                continue
            else:
                assert 0, "not enough machines free; need %s + %s, have %s" % (reserved, requested, len(machines))
        newly_locked = lock.lock_many(ctx, requested, machine_type, ctx.owner,
                                      ctx.archive, os_type, os_version, arch)
        all_locked.update(newly_locked)
        log.info(
            "{newly_locked} {mtype} machines locked this try, "
            "{total_locked}/{total_requested} locked so far".format(
                newly_locked=len(newly_locked),
                mtype=machine_type,
                total_locked=len(all_locked),
                total_requested=total_requested,
            )
        )
        if len(all_locked) == total_requested:
            vmlist = []
            for lmach in all_locked:
                if misc.is_vm(lmach):
                    vmlist.append(lmach)
            if vmlist:
                log.info("Waiting for virtual machines to come up")
                keys_dict = dict()
                loopcount = 0
                # Poll every 10s until every VM answers a keyscan;
                # every 40 iterations, recreate the unresponsive ones.
                while len(keys_dict) != len(vmlist):
                    loopcount += 1
                    time.sleep(10)
                    keys_dict = misc.ssh_keyscan(vmlist)
                    log.info("virtual machine is still unavailable")
                    if loopcount == 40:
                        loopcount = 0
                        log.info("virtual machine(s) still not up, " +
                                 "recreating unresponsive ones.")
                        for guest in vmlist:
                            if guest not in keys_dict.keys():
                                log.info("recreating: " + guest)
                                full_name = misc.canonicalize_hostname(guest)
                                provision.destroy_if_vm(ctx, full_name)
                                provision.create_if_vm(ctx, full_name)
                if lock.do_update_keys(keys_dict):
                    log.info("Error in virtual machine keys")
                # Re-read each VM's freshly scanned key so targets carry
                # usable host keys.
                newscandict = {}
                for dkey in all_locked.iterkeys():
                    stats = lockstatus.get_status(dkey)
                    newscandict[dkey] = stats["ssh_pub_key"]
                ctx.config["targets"] = newscandict
            else:
                ctx.config["targets"] = all_locked
            locked_targets = yaml.safe_dump(ctx.config["targets"],
                                            default_flow_style=False).splitlines()
            log.info("\n ".join(["Locked targets:"] + locked_targets))
            # successfully locked machines, change status back to running
            report.try_push_job_info(ctx.config, dict(status="running"))
            break
        elif not ctx.block:
            assert 0, "not enough machines are available"
        else:
            requested = requested - len(newly_locked)
            # BUG FIX: the two literals previously concatenated as
            # "...counter wentnegative..." (missing space).
            assert requested > 0, "lock_machines: requested counter went " \
                "negative, this shouldn't happen"
            log.info(
                "{total} machines locked ({new} new); need {more} more".format(
                    total=len(all_locked), new=len(newly_locked), more=requested
                )
            )
        log.warn("Could not lock enough machines, waiting...")
        time.sleep(10)
    try:
        yield
    finally:
        # If both unlock_on_failure and nuke-on-error are set, don't unlock now
        # because we're just going to nuke (and unlock) later.
        unlock_on_failure = ctx.config.get("unlock_on_failure", False) and not ctx.config.get("nuke-on-error", False)
        if get_status(ctx.summary) == "pass" or unlock_on_failure:
            log.info("Unlocking machines...")
            for machine in ctx.config["targets"].iterkeys():
                lock.unlock_one(ctx, machine, ctx.owner, ctx.archive)
def lock_machines(ctx, config):
    """
    Lock machines for the run.

    ``config`` is a (count, machine_type) pair; note the count is
    rebound into the name ``config`` below.  Yields once machines are
    locked; on exit, unlocks them only when the run succeeded.
    """
    log.info('Locking machines...')
    assert isinstance(config[0], int), 'config must be an integer'
    machine_type = config[1]
    # From here on, `config` holds the requested machine count.
    config = config[0]
    while True:
        # make sure there are enough machines up
        machines = lock.list_locks(ctx)
        if machines is None:
            if ctx.block:
                log.warn('error listing machines, trying again')
                time.sleep(20)
                continue
            else:
                assert 0, 'error listing machines'
        num_up = len(filter(lambda machine: machine['up'] and machine['type'] == machine_type, machines))
        assert num_up >= config, 'not enough machines are up'
        # make sure there are machines for non-automated jobs to run
        num_free = len(filter(
            lambda machine: machine['up'] and machine['locked'] == 0 and machine['type'] == machine_type,
            machines
        ))
        # Keep a few machines free for interactive (non-scheduled) users.
        if num_free < 6 and ctx.owner.startswith('scheduled'):
            if ctx.block:
                log.info('waiting for more machines to be free...')
                time.sleep(10)
                continue
            else:
                assert 0, 'not enough machines free'
        newly_locked = lock.lock_many(ctx, config, machine_type, ctx.owner,
                                      ctx.archive)
        if len(newly_locked) == config:
            vmlist = []
            for lmach in newly_locked:
                # create_if_vm returns truthy only for VM targets.
                if lock.create_if_vm(ctx, lmach):
                    vmlist.append(lmach)
            if vmlist:
                log.info('Waiting for virtual machines to come up')
                keyscan_out = ''
                # Poll every 10s until every VM answers a keyscan.
                # NOTE(review): unlike sibling variants there is no
                # recreate-after-40-tries escape hatch here — an
                # unresponsive VM loops forever.
                while len(keyscan_out.splitlines()) != len(vmlist):
                    time.sleep(10)
                    keyscan_out, current_locks = lock.keyscan_check(ctx, vmlist)
                    log.info('virtual machine is stil unavailable')
                if lock.update_keys(ctx, keyscan_out, current_locks):
                    log.info("Error in virtual machine keys")
                # Re-read each VM's ssh key so targets carry usable keys.
                newscandict = {}
                for dkey in newly_locked.iterkeys():
                    stats = lockstatus.get_status(ctx, dkey)
                    newscandict[dkey] = stats['sshpubkey']
                ctx.config['targets'] = newscandict
            else:
                ctx.config['targets'] = newly_locked
            log.info('\n '.join(['Locked targets:', ] +
                                yaml.safe_dump(ctx.config['targets'],
                                               default_flow_style=False).splitlines()))
            break
        elif not ctx.block:
            assert 0, 'not enough machines are available'
        log.warn('Could not lock enough machines, waiting...')
        time.sleep(10)
    try:
        yield
    finally:
        if ctx.summary.get('success', False):
            log.info('Unlocking machines...')
            for machine in ctx.config['targets'].iterkeys():
                lock.unlock(ctx, machine, ctx.owner)
def lock_machines(ctx, config):
    """
    Lock machines.  Called when the teuthology run finds and locks
    new machines.  This is not called if the one has teuthology-locked
    machines and placed those keys in the Targets section of a yaml file.

    ``config`` is a (count, machine_type) pair; the machine type may
    expand to several concrete types via get_multi_machine_types.
    Yields once machines are locked; on exit, unlocks when the run
    succeeded or unlock_on_failure is set.
    """
    log.info('Locking machines...')
    assert isinstance(config[0], int), 'config[0] must be an integer'
    machine_type = config[1]
    # A composite type (e.g. "plana,burnupi") expands to a list of
    # acceptable concrete types.
    machine_types = teuthology.get_multi_machine_types(machine_type)
    how_many = config[0]
    while True:
        # make sure there are enough machines up
        machines = lock.list_locks()
        if machines is None:
            if ctx.block:
                log.warn('error listing machines, trying again')
                time.sleep(20)
                continue
            else:
                assert 0, 'error listing machines'
        is_up = lambda machine: machine['up'] and machine['type'] in machine_types  # noqa
        num_up = len(filter(is_up, machines))
        assert num_up >= how_many, 'not enough machines are up'
        # make sure there are machines for non-automated jobs to run
        is_up_and_free = lambda machine: machine['up'] and machine['locked'] == 0 and machine['type'] in machine_types  # noqa
        up_and_free = filter(is_up_and_free, machines)
        num_free = len(up_and_free)
        # Keep a few machines free for interactive (non-scheduled) users.
        if num_free < 6 and ctx.owner.startswith('scheduled'):
            if ctx.block:
                log.info(
                    'waiting for more machines to be free (need %s see %s)...',
                    how_many,
                    num_free,
                )
                time.sleep(10)
                continue
            else:
                assert 0, 'not enough machines free'
        newly_locked = lock.lock_many(ctx, how_many, machine_type, ctx.owner,
                                      ctx.archive)
        if len(newly_locked) == how_many:
            vmlist = []
            for lmach in newly_locked:
                if teuthology.is_vm(lmach):
                    vmlist.append(lmach)
            if vmlist:
                log.info('Waiting for virtual machines to come up')
                keyscan_out = ''
                loopcount = 0
                # Poll every 10s until every VM answers a keyscan;
                # every 40 iterations, recreate the unresponsive ones.
                while len(keyscan_out.splitlines()) != len(vmlist):
                    loopcount += 1
                    time.sleep(10)
                    keyscan_out, current_locks = lock.keyscan_check(ctx,
                                                                    vmlist)
                    log.info('virtual machine is still unavailable')
                    if loopcount == 40:
                        loopcount = 0
                        log.info('virtual machine(s) still not up, ' +
                                 'recreating unresponsive ones.')
                        for guest in vmlist:
                            if guest not in keyscan_out:
                                log.info('recreating: ' + guest)
                                lock.destroy_if_vm(ctx, 'ubuntu@' + guest)
                                lock.create_if_vm(ctx, 'ubuntu@' + guest)
                if lock.update_keys(ctx, keyscan_out, current_locks):
                    log.info("Error in virtual machine keys")
                # Re-read each VM's ssh key so targets carry usable keys.
                newscandict = {}
                for dkey in newly_locked.iterkeys():
                    stats = lockstatus.get_status(ctx, dkey)
                    newscandict[dkey] = stats['sshpubkey']
                ctx.config['targets'] = newscandict
            else:
                ctx.config['targets'] = newly_locked
            # FIXME: Ugh.
            log.info('\n '.join(['Locked targets:', ] +
                                yaml.safe_dump(ctx.config['targets'],
                                               default_flow_style=False).splitlines()))
            break
        elif not ctx.block:
            assert 0, 'not enough machines are available'
        log.warn('Could not lock enough machines, waiting...')
        time.sleep(10)
    try:
        yield
    finally:
        if ctx.config.get('unlock_on_failure', False) or \
                ctx.summary.get('success', False):
            log.info('Unlocking machines...')
            for machine in ctx.config['targets'].iterkeys():
                lock.unlock_one(ctx, machine, ctx.owner)
def lock_machines(ctx, config):
    """
    Lock machines.  Called when the teuthology run finds and locks
    new machines.  This is not called if the one has teuthology-locked
    machines and placed those keys in the Targets section of a yaml file.

    ``config`` is a (count, machine_type) pair.  Yields once machines
    are locked; on exit, unlocks when the run passed or
    unlock_on_failure applies (and nuke-on-error is not set).
    """
    # It's OK for os_type and os_version to be None here.  If we're trying
    # to lock a bare metal machine, we'll take whatever is available.  If
    # we want a vps, defaults will be provided by misc.get_distro and
    # misc.get_distro_version in provision.create_if_vm
    os_type = ctx.config.get("os_type")
    os_version = ctx.config.get("os_version")
    arch = ctx.config.get('arch')
    log.info('Locking machines...')
    assert isinstance(config[0], int), 'config[0] must be an integer'
    machine_type = config[1]
    how_many = config[0]
    # We want to make sure there are always this many machines available
    to_reserve = teuth_config.reserve_machines
    assert isinstance(to_reserve, int), 'reserve_machines must be integer'
    assert (to_reserve >= 0), 'reserve_machines should >= 0'
    # change the status during the locking process
    report.try_push_job_info(ctx.config, dict(status='waiting'))
    while True:
        # get a candidate list of machines
        machines = lock.list_locks(machine_type=machine_type, up=True,
                                   locked=False,
                                   count=how_many + to_reserve)
        if machines is None:
            if ctx.block:
                log.error('Error listing machines, trying again')
                time.sleep(20)
                continue
            else:
                raise RuntimeError('Error listing machines')
        # make sure there are machines for non-automated jobs to run
        if len(machines) < to_reserve + how_many and ctx.owner.startswith('scheduled'):
            if ctx.block:
                log.info(
                    'waiting for more machines to be free (need %s + %s, have %s)...',
                    to_reserve,
                    how_many,
                    len(machines),
                )
                time.sleep(10)
                continue
            else:
                assert 0, ('not enough machines free; need %s + %s, have %s' %
                           (to_reserve, how_many, len(machines)))
        newly_locked = lock.lock_many(ctx, how_many, machine_type, ctx.owner,
                                      ctx.archive, os_type, os_version, arch)
        # lock_many returning a non-list signals bad arguments rather
        # than "nothing available".
        if not newly_locked and not isinstance(newly_locked, list):
            raise RuntimeError('Invalid parameters specified')
        if len(newly_locked) == how_many:
            vmlist = []
            for lmach in newly_locked:
                if misc.is_vm(lmach):
                    vmlist.append(lmach)
            if vmlist:
                log.info('Waiting for virtual machines to come up')
                keys_dict = dict()
                loopcount = 0
                # Poll every 10s until every VM answers a keyscan;
                # every 40 iterations, recreate the unresponsive ones.
                while len(keys_dict) != len(vmlist):
                    loopcount += 1
                    time.sleep(10)
                    keys_dict = lock.ssh_keyscan(vmlist)
                    log.info('virtual machine is still unavailable')
                    if loopcount == 40:
                        loopcount = 0
                        log.info('virtual machine(s) still not up, ' +
                                 'recreating unresponsive ones.')
                        for guest in vmlist:
                            if guest not in keys_dict.keys():
                                log.info('recreating: ' + guest)
                                full_name = misc.canonicalize_hostname(guest)
                                provision.destroy_if_vm(ctx, full_name)
                                provision.create_if_vm(ctx, full_name)
                if lock.do_update_keys(keys_dict):
                    log.info("Error in virtual machine keys")
                # Re-read each VM's freshly scanned key so targets carry
                # usable host keys.
                newscandict = {}
                for dkey in newly_locked.iterkeys():
                    stats = lockstatus.get_status(dkey)
                    newscandict[dkey] = stats['ssh_pub_key']
                ctx.config['targets'] = newscandict
            else:
                ctx.config['targets'] = newly_locked
            locked_targets = yaml.safe_dump(
                ctx.config['targets'],
                default_flow_style=False
            ).splitlines()
            log.info('\n '.join(['Locked targets:', ] + locked_targets))
            # successfully locked machines, change status back to running
            report.try_push_job_info(ctx.config, dict(status='running'))
            break
        elif not ctx.block:
            assert 0, 'not enough machines are available'
        else:
            how_many = how_many - len(newly_locked)
            # BUG FIX: the two literals previously concatenated as
            # "...counter wentnegative..." (missing space).
            assert how_many > 0, "lock_machines: how_many counter went " \
                "negative, this shouldn't happen"
        log.warn('Could not lock enough machines, waiting...')
        time.sleep(10)
    try:
        yield
    finally:
        # If both unlock_on_failure and nuke-on-error are set, don't unlock now
        # because we're just going to nuke (and unlock) later.
        unlock_on_failure = (
            ctx.config.get('unlock_on_failure', False)
            and not ctx.config.get('nuke-on-error', False)
        )
        if get_status(ctx.summary) == 'pass' or unlock_on_failure:
            log.info('Unlocking machines...')
            for machine in ctx.config['targets'].iterkeys():
                lock.unlock_one(ctx, machine, ctx.owner, ctx.archive)
def lock_machines(ctx, config):
    """
    Lock machines.  Called when the teuthology run finds and locks
    new machines.  This is not called if the one has teuthology-locked
    machines and placed those keys in the Targets section of a yaml file.

    Generator task: everything before ``yield`` runs at setup (block until
    enough machines are locked and record them in ``ctx.config['targets']``),
    everything in the ``finally`` runs at teardown (unlock).

    :param ctx:    run context; reads ctx.block, ctx.owner, ctx.archive,
                   ctx.summary and mutates ctx.config['targets']
    :param config: two-element sequence: config[0] is the number of machines
                   to lock (int), config[1] is the machine type
    """
    log.info('Locking machines...')
    assert isinstance(config[0], int), 'config[0] must be an integer'
    machine_type = config[1]
    how_many = config[0]
    # We want to make sure there are always this many machines available
    to_reserve = 5
    # Retry loop: keeps polling/locking until `how_many` machines are locked
    # in a single lock_many call, or fails fast when ctx.block is not set.
    while True:
        # get a candidate list of machines
        machines = lock.list_locks(machine_type=machine_type, up=True,
                                   locked=False, count=how_many + to_reserve)
        if machines is None:
            if ctx.block:
                log.error('Error listing machines, trying again')
                time.sleep(20)
                continue
            else:
                raise RuntimeError('Error listing machines')

        # make sure there are machines for non-automated jobs to run
        if len(machines) <= to_reserve and ctx.owner.startswith('scheduled'):
            if ctx.block:
                log.info(
                    'waiting for more machines to be free (need %s see %s)...',
                    how_many,
                    len(machines),
                )
                time.sleep(10)
                continue
            else:
                assert 0, 'not enough machines free'

        newly_locked = lock.lock_many(ctx, how_many, machine_type, ctx.owner,
                                      ctx.archive)
        # NOTE(review): presumably lock_many returns a dict of
        # machine-name -> ssh key (iterkeys is used below) and a non-dict,
        # falsy value signals bad parameters — verify against lock.lock_many.
        if not newly_locked and not isinstance(newly_locked, list):
            raise RuntimeError('Invalid parameters specified')
        if len(newly_locked) == how_many:
            # All machines acquired in one shot.  Wait for any VMs among
            # them to come up before declaring the targets usable.
            vmlist = []
            for lmach in newly_locked:
                if misc.is_vm(lmach):
                    vmlist.append(lmach)
            if vmlist:
                log.info('Waiting for virtual machines to come up')
                keys_dict = dict()
                loopcount = 0
                # Poll every 10s until every VM answers ssh-keyscan; every
                # 40 iterations, destroy and recreate the unresponsive ones.
                while len(keys_dict) != len(vmlist):
                    loopcount += 1
                    time.sleep(10)
                    keys_dict = lock.ssh_keyscan(vmlist)
                    log.info('virtual machine is still unavailable')
                    if loopcount == 40:
                        loopcount = 0
                        log.info('virtual machine(s) still not up, ' +
                                 'recreating unresponsive ones.')
                        for guest in vmlist:
                            if guest not in keys_dict.keys():
                                log.info('recreating: ' + guest)
                                full_name = misc.canonicalize_hostname(guest)
                                provision.destroy_if_vm(ctx, full_name)
                                provision.create_if_vm(ctx, full_name)
                # Push the freshly scanned keys to the lock server, then
                # re-read each machine's status so targets carry the new keys.
                if lock.do_update_keys(keys_dict):
                    log.info("Error in virtual machine keys")
                newscandict = {}
                for dkey in newly_locked.iterkeys():
                    stats = lockstatus.get_status(dkey)
                    newscandict[dkey] = stats['ssh_pub_key']
                ctx.config['targets'] = newscandict
            else:
                ctx.config['targets'] = newly_locked
            # FIXME: Ugh.
            log.info('\n '.join(['Locked targets:', ] +
                                yaml.safe_dump(ctx.config['targets'],
                                               default_flow_style=False).splitlines()))
            break
        elif not ctx.block:
            assert 0, 'not enough machines are available'

        # NOTE(review): machines locked in a partial lock_many call are not
        # accumulated here; the full count is re-requested next iteration —
        # confirm lock_many tolerates that.
        log.warn('Could not lock enough machines, waiting...')
        time.sleep(10)
    try:
        # Setup done; hand control to the tasks running inside this context.
        yield
    finally:
        # Teardown: unlock only on success, or when the user asked for
        # unlock_on_failure.
        if ctx.config.get('unlock_on_failure', False) or \
                ctx.summary.get('success', False):
            log.info('Unlocking machines...')
            for machine in ctx.config['targets'].iterkeys():
                lock.unlock_one(ctx, machine, ctx.owner)
def lock_machines(ctx, config):
    """
    Lock machines.  Called when the teuthology run finds and locks
    new machines.  This is not called if the one has teuthology-locked
    machines and placed those keys in the Targets section of a yaml file.

    Generator task: everything before ``yield`` runs at setup (accumulate
    locked machines until the requested count is reached and record them in
    ``ctx.config['targets']``), everything in the ``finally`` runs at
    teardown (unlock, unless nuke-on-error will handle it).

    :param ctx:    run context; reads ctx.block, ctx.owner, ctx.archive,
                   ctx.summary and mutates ctx.config['targets']
    :param config: two-element sequence: config[0] is the number of machines
                   to lock (int), config[1] is the machine type
    """
    # It's OK for os_type and os_version to be None here.  If we're trying
    # to lock a bare metal machine, we'll take whatever is available.  If
    # we want a vps, defaults will be provided by misc.get_distro and
    # misc.get_distro_version in provision.create_if_vm
    os_type = ctx.config.get("os_type")
    os_version = ctx.config.get("os_version")
    arch = ctx.config.get('arch')
    log.info('Locking machines...')
    assert isinstance(config[0], int), 'config[0] must be an integer'
    machine_type = config[1]
    total_requested = config[0]
    # We want to make sure there are always this many machines available
    reserved = teuth_config.reserve_machines
    assert isinstance(reserved, int), 'reserve_machines must be integer'
    # Fixed message grammar: was 'reserve_machines should >= 0'
    assert (reserved >= 0), 'reserve_machines should be >= 0'

    # change the status during the locking process
    report.try_push_job_info(ctx.config, dict(status='waiting'))

    # Machines accumulate across iterations; `requested` counts down as
    # partial lock_many calls succeed.
    all_locked = dict()
    requested = total_requested
    while True:
        # get a candidate list of machines
        machines = lock.list_locks(machine_type=machine_type, up=True,
                                   locked=False,
                                   count=requested + reserved)
        if machines is None:
            if ctx.block:
                log.error('Error listing machines, trying again')
                time.sleep(20)
                continue
            else:
                raise RuntimeError('Error listing machines')

        # make sure there are machines for non-automated jobs to run
        if len(machines) < reserved + requested and \
                ctx.owner.startswith('scheduled'):
            if ctx.block:
                log.info(
                    'waiting for more %s machines to be free '
                    '(need %s + %s, have %s)...',
                    machine_type,
                    reserved,
                    requested,
                    len(machines),
                )
                time.sleep(10)
                continue
            else:
                assert 0, ('not enough machines free; need %s + %s, have %s' %
                           (reserved, requested, len(machines)))

        newly_locked = lock.lock_many(ctx, requested, machine_type, ctx.owner,
                                      ctx.archive, os_type, os_version, arch)
        all_locked.update(newly_locked)
        log.info(
            '{newly_locked} {mtype} machines locked this try, '
            '{total_locked}/{total_requested} locked so far'.format(
                newly_locked=len(newly_locked),
                mtype=machine_type,
                total_locked=len(all_locked),
                total_requested=total_requested,
            )
        )
        if len(all_locked) == total_requested:
            # All machines acquired.  Wait for any VMs among them to come up
            # before declaring the targets usable.
            vmlist = [lmach for lmach in all_locked if misc.is_vm(lmach)]
            if vmlist:
                log.info('Waiting for virtual machines to come up')
                keys_dict = dict()
                loopcount = 0
                # Poll every 10s until every VM answers ssh-keyscan; every
                # 40 iterations, destroy and recreate the unresponsive ones.
                while len(keys_dict) != len(vmlist):
                    loopcount += 1
                    time.sleep(10)
                    keys_dict = misc.ssh_keyscan(vmlist)
                    log.info('virtual machine is still unavailable')
                    if loopcount == 40:
                        loopcount = 0
                        log.info('virtual machine(s) still not up, ' +
                                 'recreating unresponsive ones.')
                        for guest in vmlist:
                            if guest not in keys_dict:
                                log.info('recreating: ' + guest)
                                full_name = misc.canonicalize_hostname(guest)
                                provision.destroy_if_vm(ctx, full_name)
                                provision.create_if_vm(ctx, full_name)
                # Push the freshly scanned keys to the lock server, then
                # re-read each machine's status so targets carry the new keys.
                if lock.do_update_keys(keys_dict):
                    log.info("Error in virtual machine keys")
                newscandict = {}
                for dkey in all_locked.iterkeys():
                    stats = lockstatus.get_status(dkey)
                    newscandict[dkey] = stats['ssh_pub_key']
                ctx.config['targets'] = newscandict
            else:
                ctx.config['targets'] = all_locked
            locked_targets = yaml.safe_dump(
                ctx.config['targets'],
                default_flow_style=False
            ).splitlines()
            log.info('\n '.join(['Locked targets:', ] + locked_targets))
            # successfully locked machines, change status back to running
            report.try_push_job_info(ctx.config, dict(status='running'))
            break
        elif not ctx.block:
            assert 0, 'not enough machines are available'
        else:
            requested = requested - len(newly_locked)
            # Fixed implicit-concatenation message: was "wentnegative"
            assert requested > 0, "lock_machines: requested counter went " \
                "negative, this shouldn't happen"
            log.info(
                "{total} machines locked ({new} new); need {more} more".format(
                    total=len(all_locked), new=len(newly_locked),
                    more=requested)
            )
        log.warn('Could not lock enough machines, waiting...')
        time.sleep(10)
    try:
        # Setup done; hand control to the tasks running inside this context.
        yield
    finally:
        # If both unlock_on_failure and nuke-on-error are set, don't unlock now
        # because we're just going to nuke (and unlock) later.
        unlock_on_failure = (
            ctx.config.get('unlock_on_failure', False)
            and not ctx.config.get('nuke-on-error', False)
        )
        if get_status(ctx.summary) == 'pass' or unlock_on_failure:
            log.info('Unlocking machines...')
            for machine in ctx.config['targets'].iterkeys():
                lock.unlock_one(ctx, machine, ctx.owner, ctx.archive)
assert ctx.owner is None, \ '--all and --owner are mutually exclusive' assert not machines, \ '--all and listing specific machines are incompatible' if ctx.num_to_lock: assert ctx.machine_type, \ 'must specify machine type to lock' if ctx.brief: assert ctx.list, '--brief only applies to --list' if ctx.list or ctx.list_targets: assert ctx.desc is None, '--desc does nothing with --list' if machines: statuses = [ls.get_status(ctx, machine) for machine in machines] else: statuses = list_locks(ctx) vmachines = [] for vmachine in statuses: if vmachine['vpshost']: if vmachine['locked']: vmachines.append(vmachine['name']) if vmachines: # Avoid ssh-keyscans for everybody when listing all machines # Listing specific machines will update the keys. if machines: scan_for_locks(ctx, vmachines) statuses = [ls.get_status(ctx, machine) for machine in machines] else: