def is_vm(name=None, status=None):
    """
    Return whether a node is a virtual machine.

    :param name:   Hostname; canonicalized and used to query the lock
                   server when no status dict is given.
    :param status: A status dict as returned by get_status(); if provided,
                   no server query is made.
    :returns:      True if the node is a VM, False otherwise (including
                   when the lock server could not be queried).
    :raises ValueError: if neither name nor status is provided.
    """
    if status is None:
        if name is None:
            raise ValueError("Must provide either name or status, or both")
        name = misc.canonicalize_hostname(name)
        status = get_status(name)
        if status is None:
            # get_status() returns None when the lock-server query fails;
            # previously this fell through to status.get() and raised
            # AttributeError.
            return False
    return status.get('is_vm', False)
def connect(ctx, config):
    """
    Open a connection to a remote host.

    Builds a Remote for every entry in ctx.config['targets'] and populates
    ctx.cluster, mapping remotes to ctx.config['roles'] when present.

    :param ctx:    Context; reads ctx.config['targets'], optional
                   ctx.config['sshkeys'] and ctx.config['roles'].
    :param config: Unused; kept for task-interface compatibility.
    """
    log.info('Opening connections...')
    remotes = []
    # dict.iterkeys()/iteritems() are Python 2-only and break under
    # Python 3; the unused 'machs' list was dead code and is removed.
    for t, key in ctx.config['targets'].items():
        t = misc.canonicalize_hostname(t)
        log.debug('connecting to %s', t)
        try:
            # 'sshkeys: ignore' disables host-key verification
            if ctx.config['sshkeys'] == 'ignore':
                key = None
        except (AttributeError, KeyError):
            pass
        remotes.append(
            remote.Remote(name=t, host_key=key, keep_alive=True, console=None))
    ctx.cluster = cluster.Cluster()
    if 'roles' in ctx.config:
        for rem, roles in zip(remotes, ctx.config['roles']):
            assert all(isinstance(role, str) for role in roles), \
                "Roles in config must be strings: %r" % roles
            ctx.cluster.add(rem, roles)
            log.info('roles: %s - %s' % (rem, roles))
    else:
        for rem in remotes:
            ctx.cluster.add(rem, rem.name)
def unlock_one(ctx, name, user, description=None):
    """
    Unlock a single machine on the lock server, destroying it first if it
    is a virtual machine.

    :param ctx:         Context; passed through to destroy_if_vm
    :param name:        Hostname; canonicalized before use
    :param user:        The user the node is locked by
    :param description: Optional lock description
    :returns:           True on success, False otherwise
    """
    name = misc.canonicalize_hostname(name, user=None)
    if not teuthology.provision.destroy_if_vm(ctx, name, user, description):
        log.error('destroy failed for %s', name)
        return False
    request = dict(name=name, locked=False, locked_by=user,
                   description=description)
    uri = os.path.join(config.lock_server, 'nodes', name, 'lock', '')
    with safe_while(
            sleep=1, increment=0.5, action="unlock %s" % name) as proceed:
        while proceed():
            try:
                response = requests.put(uri, json.dumps(request))
                break
            # Work around https://github.com/kennethreitz/requests/issues/2364
            except requests.ConnectionError as e:
                # logging.Logger.warn is deprecated; use warning()
                log.warning("Saw %s while unlocking; retrying...", str(e))
    success = response.ok
    if success:
        log.info('unlocked %s', name)
    else:
        try:
            reason = response.json().get('message')
        except ValueError:
            reason = str(response.status_code)
        log.error('failed to unlock {node}. reason: {reason}'.format(
            node=name, reason=reason))
    return success
def add_remotes(ctx, config):
    """
    Create a ctx.cluster object populated with remotes mapped to roles
    """
    ctx.cluster = cluster.Cluster()
    # Allow jobs to run without using nodes, for self-testing
    if 'roles' not in ctx.config and 'targets' not in ctx.config:
        return
    remotes = []
    # dict.iterkeys()/iteritems() are Python 2-only; use items().  The
    # unused 'machs' list was dead code and is removed.
    for t, key in ctx.config['targets'].items():
        t = misc.canonicalize_hostname(t)
        try:
            # 'sshkeys: ignore' disables host-key verification
            if ctx.config['sshkeys'] == 'ignore':
                key = None
        except (AttributeError, KeyError):
            pass
        rem = remote.Remote(name=t, host_key=key, keep_alive=True)
        remotes.append(rem)
    if 'roles' in ctx.config:
        for rem, roles in zip(remotes, ctx.config['roles']):
            assert all(isinstance(role, str) for role in roles), \
                "Roles in config must be strings: %r" % roles
            ctx.cluster.add(rem, roles)
            log.info('roles: %s - %s' % (rem, roles))
    else:
        for rem in remotes:
            ctx.cluster.add(rem, rem.name)
def add_remotes(ctx, config):
    """
    Create a ctx.cluster object populated with remotes mapped to roles
    """
    remotes = []
    # dict.iterkeys()/iteritems() are Python 2-only and raise
    # AttributeError on Python 3; the unused 'machs' list is removed.
    for t, key in ctx.config["targets"].items():
        t = misc.canonicalize_hostname(t)
        try:
            # "sshkeys: ignore" disables host-key verification
            if ctx.config["sshkeys"] == "ignore":
                key = None
        except (AttributeError, KeyError):
            pass
        rem = remote.Remote(name=t, host_key=key, keep_alive=True)
        remotes.append(rem)
    ctx.cluster = cluster.Cluster()
    if "roles" in ctx.config:
        for rem, roles in zip(remotes, ctx.config["roles"]):
            assert all(isinstance(role, str) for role in roles), "Roles in config must be strings: %r" % roles
            ctx.cluster.add(rem, roles)
            log.info("roles: %s - %s" % (rem, roles))
    else:
        for rem in remotes:
            ctx.cluster.add(rem, rem.name)
def unlock_one(ctx, name, user, description=None):
    """
    Unlock a single node, destroying it first when it is a virtual machine.

    :param ctx:         Context; passed through to destroy_if_vm
    :param name:        Hostname; canonicalized before use
    :param user:        The user the node is locked by
    :param description: Optional lock description
    :returns:           True if the node was unlocked, False otherwise
    """
    name = misc.canonicalize_hostname(name, user=None)
    if not teuthology.provision.destroy_if_vm(ctx, name, user, description):
        log.error('destroy failed for %s', name)
        return False
    request = dict(name=name, locked=False, locked_by=user,
                   description=description)
    uri = os.path.join(config.lock_server, 'nodes', name, 'lock', '')
    with safe_while(sleep=1, increment=0.5,
                    action="unlock %s" % name) as proceed:
        while proceed():
            try:
                response = requests.put(uri, json.dumps(request))
                break
            # Work around https://github.com/kennethreitz/requests/issues/2364
            except requests.ConnectionError as e:
                # logging.Logger.warn is deprecated; use warning()
                log.warning("Saw %s while unlocking; retrying...", str(e))
    success = response.ok
    if success:
        log.info('unlocked %s', name)
    else:
        try:
            reason = response.json().get('message')
        except ValueError:
            reason = str(response.status_code)
        log.error('failed to unlock {node}. reason: {reason}'.format(
            node=name, reason=reason))
    return success
def add_remotes(ctx, config):
    """
    Create a ctx.cluster object populated with remotes mapped to roles
    """
    ctx.cluster = cluster.Cluster()
    # Allow jobs to run without using nodes, for self-testing
    if 'roles' not in ctx.config and 'targets' not in ctx.config:
        return
    remotes = []
    # The 'machs' list built from targets.keys() was never used; removed.
    for t, key in ctx.config['targets'].items():
        t = misc.canonicalize_hostname(t)
        try:
            # 'sshkeys: ignore' disables host-key verification
            if ctx.config['sshkeys'] == 'ignore':
                key = None
        except (AttributeError, KeyError):
            pass
        rem = remote.Remote(name=t, host_key=key, keep_alive=True)
        remotes.append(rem)
    if 'roles' in ctx.config:
        for rem, roles in zip(remotes, ctx.config['roles']):
            assert all(isinstance(role, str) for role in roles), \
                "Roles in config must be strings: %r" % roles
            ctx.cluster.add(rem, roles)
            log.info('roles: %s - %s' % (rem, roles))
    else:
        for rem in remotes:
            ctx.cluster.add(rem, rem.name)
def stale_openstack_instances(ctx, instances, locked_nodes):
    """
    Destroy stale OpenStack instances.

    An instance is destroyed when it is older than max_job_time plus a
    delay, or older than the delay alone while not appearing in
    locked_nodes.  Instances that no longer exist are ignored.

    :param ctx:          Context; ctx.dry_run suppresses destruction
    :param instances:    Mapping of instance id -> instance info
    :param locked_nodes: Mapping keyed by canonical hostname; membership
                         means the node is locked
    """
    for (instance_id, instance) in instances.items():
        i = OpenStackInstance(instance_id)
        if not i.exists():
            # Instance vanished between listing and inspection.
            log.debug(
                "stale-openstack: {instance} disappeared, ignored".format(
                    instance=instance_id))
            continue
        # Older than the longest a job may run (plus slack): always stale.
        if (i.get_created() >
                config['max_job_time'] + OPENSTACK_DELAY):
            log.info("stale-openstack: destroying instance {instance}"
                     " because it was created {created} seconds ago"
                     " which is older than"
                     " max_job_time {max_job_time} + {delay}".format(
                         instance=i['name'],
                         created=i.get_created(),
                         max_job_time=config['max_job_time'],
                         delay=OPENSTACK_DELAY))
            if not ctx.dry_run:
                i.destroy()
            continue
        name = canonicalize_hostname(i['name'], user=None)
        # Past the grace period and not locked by anyone: stale.
        if i.get_created() > OPENSTACK_DELAY and name not in locked_nodes:
            log.info("stale-openstack: destroying instance {instance}"
                     " because it was created {created} seconds ago"
                     " is older than {delay}s and it is not locked".format(
                         instance=i['name'],
                         created=i.get_created(),
                         delay=OPENSTACK_DELAY))
            if not ctx.dry_run:
                i.destroy()
            continue
        log.debug("stale-openstack: instance " + i['name'] + " OK")
def __init__(self, name, os_type, os_version):
    """Wrap *name* in a Remote and record its OS identity and a child logger."""
    rem = teuthology.orchestra.remote.Remote(
        misc.canonicalize_hostname(name))
    self.remote = rem
    self.name = rem.hostname
    self.shortname = rem.shortname
    self.os_type = os_type
    self.os_version = os_version
    self.log = log.getChild(rem.shortname)
def get_status(name):
    """
    Query the lock server for a node's status.

    Returns the decoded JSON status dict, or None when the query fails.
    """
    name = misc.canonicalize_hostname(name, user=None)
    uri = os.path.join(config.lock_server, 'nodes', name, '')
    response = requests.get(uri)
    if response.ok:
        return response.json()
    log.warning(
        "Failed to query lock server for status of {name}".format(name=name))
    return None
def get_status(name):
    """
    Query the lock server for a node's status, retrying until it responds.

    :param name: Hostname; canonicalized with the user part stripped
    :returns: the decoded JSON status dict, or an empty dict on failure
    """
    name = misc.canonicalize_hostname(name, user=None)
    uri = os.path.join(config.lock_server, 'nodes', name, '')
    # Retry the GET with increasing sleeps until a 2xx response arrives.
    with safe_while(sleep=1, increment=0.5,
                    action=f'get_status {name}') as proceed:
        while proceed():
            response = requests.get(uri)
            if response.ok:
                return response.json()
    # NOTE(review): this fallback is reached only if safe_while returns
    # instead of raising when its attempts are exhausted — confirm
    # safe_while's exhaustion behavior.
    log.warning(
        "Failed to query lock server for status of {name}".format(name=name))
    return dict()
def __init__(
    self,
    provider, name,
    os_type=None, os_version=None,
    conf=None,
    user='******',
):
    """
    :param provider: A cloud provider object, or its name to look up via
                     teuthology.provision.cloud.get_provider
    :param name:     Node name; stored decanonicalized, with the canonical
                     form kept in self.hostname
    :param os_type:    Optional OS type
    :param os_version: Optional OS version
    :param conf:       Optional config (not used in this initializer)
    :param user:       Username associated with the node
    """
    # basestring is Python 2-only (NameError on Python 3); str covers
    # all text strings under Python 3.
    if isinstance(provider, str):
        provider = teuthology.provision.cloud.get_provider(provider)
    self.provider = provider
    self.name = decanonicalize_hostname(name)
    self.hostname = canonicalize_hostname(name, user=None)
    self.os_type = os_type
    self.os_version = os_version
    self.user = user
def update_nodes(nodes, reset_os=False):
    """
    Push inventory information for each node to the server.

    With reset_os=True the OS type/version fields are blanked; otherwise
    the remote's own inventory_info is sent.
    """
    for node in nodes:
        rem = teuthology.orchestra.remote.Remote(
            canonicalize_hostname(node))
        if reset_os:
            log.info("Updating [%s]: reset os type and version on server", node)
            info = {
                'os_type': '',
                'os_version': '',
                'name': rem.hostname,
            }
        else:
            log.info("Updating [%s]: set os type and version on server", node)
            info = rem.inventory_info
        update_inventory(info)
def get_statuses(machines):
    """
    Return status dicts for the given machines, or every known lock when
    no machines are named.
    """
    if not machines:
        return list_locks()
    statuses = []
    for machine in machines:
        machine = misc.canonicalize_hostname(machine)
        status = get_status(machine)
        if not status:
            log.error("Lockserver doesn't know about machine: %s" % machine)
            continue
        statuses.append(status)
    return statuses
def __init__(self, name, os_type, os_version=""):
    """Record the bare hostname (no user@ prefix) and the OS identity."""
    # The service wants a plain hostname, not user@host.
    match = re.search(r'(\w*)@(.+)', canonicalize_hostname(name))
    self.name = match.group(2) if match is not None else name
    self.os_type = os_type
    self.os_version = os_version
    # Combined identifier, e.g. "ubuntu-20.04", or just the type when no
    # version was given.
    self.os_name = os_type + "-" + os_version if os_version else os_type
    self.log = log.getChild(self.name)
def update_lock(name, description=None, status=None, ssh_pub_key=None):
    """
    PUT any provided fields to the lock server's record for *name*.

    Returns True when there is nothing to send, otherwise the truth of
    the HTTP response.
    """
    name = misc.canonicalize_hostname(name, user=None)
    updated = dict()
    if description is not None:
        updated['description'] = description
    if status is not None:
        updated['up'] = status == 'up'
    if ssh_pub_key is not None:
        updated['ssh_pub_key'] = ssh_pub_key
    if not updated:
        return True
    uri = os.path.join(config.lock_server, 'nodes', name, '')
    return requests.put(uri, json.dumps(updated)).ok
def update_lock(name, description=None, status=None, ssh_pub_key=None):
    """
    Update the lock server's record for a node.

    Only the fields that were actually supplied are sent.  Returns True
    when there is nothing to update, otherwise whether the PUT succeeded.
    """
    name = misc.canonicalize_hostname(name, user=None)
    candidates = {
        'description': description,
        'up': (status == 'up') if status is not None else None,
        'ssh_pub_key': ssh_pub_key,
    }
    updated = {key: value for key, value in candidates.items()
               if value is not None}
    if updated:
        uri = os.path.join(config.lock_server, 'nodes', name, '')
        response = requests.put(
            uri,
            json.dumps(updated))
        return response.ok
    return True
def updatekeys(args):
    """
    Update SSH host keys for selected machines.

    :param args: docopt-style dict; reads '--verbose', '--all',
                 '<machine>' and '--targets'.
    :returns: the first element of keys.do_update_keys()'s result
    """
    loglevel = logging.DEBUG if args['--verbose'] else logging.INFO
    logging.basicConfig(level=loglevel, )
    all_ = args['--all']
    machines = []
    if args['<machine>']:
        machines = [
            misc.canonicalize_hostname(m, user=None)
            for m in args['<machine>']
        ]
    elif args['--targets']:
        targets = args['--targets']
        with open(targets) as f:
            docs = yaml.safe_load_all(f)
            for doc in docs:
                # dict.iterkeys() is Python 2-only; iterating the dict
                # yields its keys directly.
                machines = [n for n in doc.get('targets', dict())]
    return keys.do_update_keys(machines, all_)[0]
def updatekeys(args):
    """
    Update SSH host keys for selected machines.

    :param args: docopt-style dict; reads '--verbose', '--all',
                 '<machine>' and '--targets'.
    :returns: the first element of keys.do_update_keys()'s result
    """
    loglevel = logging.DEBUG if args['--verbose'] else logging.INFO
    logging.basicConfig(
        level=loglevel,
    )
    all_ = args['--all']
    machines = []
    if args['<machine>']:
        machines = [misc.canonicalize_hostname(m, user=None)
                    for m in args['<machine>']]
    elif args['--targets']:
        targets = args['--targets']
        # file() was removed in Python 3; open() is the portable builtin.
        with open(targets) as f:
            docs = yaml.safe_load_all(f)
            for doc in docs:
                # dict.iterkeys() is Python 2-only; iterating the dict
                # yields its keys directly.
                machines = [n for n in doc.get('targets', dict())]
    return keys.do_update_keys(machines, all_)[0]
def lock_one(name, user=None, description=None):
    """
    Lock a single node on the lock server.

    :returns: the HTTP response object; response.ok indicates success.
    """
    name = misc.canonicalize_hostname(name, user=None)
    if user is None:
        user = misc.get_user()
    payload = dict(name=name, locked=True, locked_by=user,
                   description=description)
    uri = os.path.join(config.lock_server, 'nodes', name, 'lock', '')
    response = requests.put(uri, json.dumps(payload))
    if response.ok:
        log.debug('locked %s as %s', name, user)
        return response
    try:
        reason = response.json().get('message')
    except ValueError:
        reason = str(response.status_code)
    log.error('failed to lock {node}. reason: {reason}'.format(
        node=name, reason=reason))
    return response
def unlock_many(names, user):
    """
    Unlock several nodes in a single lock-server call.

    :returns: True when the POST succeeded, False otherwise.
    """
    names = [misc.canonicalize_hostname(name, user=None) for name in names]
    uri = os.path.join(config.lock_server, 'nodes', 'unlock_many', '')
    payload = dict(locked_by=user, names=names)
    response = requests.post(
        uri,
        data=json.dumps(payload),
        headers={'content-type': 'application/json'},
    )
    if response.ok:
        log.debug("Unlocked: %s", ', '.join(names))
    else:
        log.error("Failed to unlock: %s", ', '.join(names))
    return response.ok
def connect(ctx, config):
    """
    Open a connection to a remote host, assigning roles against the
    nicely-sorted list of target hostnames.

    :param ctx:    Context; reads ctx.config['targets'], optional
                   ctx.config['sshkeys'] and ctx.config['roles'].
    :param config: Unused; kept for task-interface compatibility.
    """
    log.info('Opening connections...')
    remotes = []
    # dict.iterkeys()/iteritems() are Python 2-only; use items().  The
    # unused 'machs' list was dead code and is removed.
    for t, key in ctx.config['targets'].items():
        t = misc.canonicalize_hostname(t)
        log.debug('connecting to %s', t)
        try:
            # 'sshkeys: ignore' disables host-key verification
            if ctx.config['sshkeys'] == 'ignore':
                key = None
        except (AttributeError, KeyError):
            pass
        remotes.append(
            remote.Remote(name=t, host_key=key, keep_alive=True, console=None))
    ctx.cluster = cluster.Cluster()
    # Sort hostnames so role assignment does not depend on dict ordering.
    sorted_names = sorted_nicely(list(ctx.config['targets']))
    if 'roles' in ctx.config:
        for rem, roles in zip(sorted_names, ctx.config['roles']):
            assert all(isinstance(role, str) for role in roles), \
                "Roles in config must be strings: %r" % roles
            # Bug fix: the match flag must be reset for every hostname;
            # previously it stuck after the first match, so later
            # unmatched hostnames were never reported.
            found = False
            for obj in remotes:
                if rem == obj.name:
                    ctx.cluster.add(obj, roles)
                    found = True
                    break
            if not found:
                log.error('role matching error %s' % rem)
            log.info('roles: %s - %s' % (rem, roles))
    else:
        for rem in remotes:
            ctx.cluster.add(rem, rem.name)
def update_lock(name, description=None, status=None, ssh_pub_key=None):
    """
    Send any provided fields to the lock server, retrying the PUT with a
    randomly jittered backoff until it succeeds.
    """
    name = misc.canonicalize_hostname(name, user=None)
    updated = dict()
    if description is not None:
        updated['description'] = description
    if status is not None:
        updated['up'] = status == 'up'
    if ssh_pub_key is not None:
        updated['ssh_pub_key'] = ssh_pub_key
    if not updated:
        # Nothing to change; treat as success.
        return True
    uri = os.path.join(config.lock_server, 'nodes', name, '')
    inc = random.uniform(0, 1)
    with safe_while(sleep=1, increment=inc,
                    action=f'update lock {name}') as proceed:
        while proceed():
            response = requests.put(uri, json.dumps(updated))
            if response.ok:
                return True
    return response.ok
def unlock_many(names, user):
    """
    Unlock several nodes in one lock-server call, retrying the POST until
    it succeeds.

    :returns: True once the unlock succeeded, False otherwise.
    """
    names = [misc.canonicalize_hostname(name, user=None) for name in names]
    uri = os.path.join(config.lock_server, 'nodes', 'unlock_many', '')
    data = dict(locked_by=user, names=names)
    with safe_while(sleep=1, increment=0.5,
                    action=f'unlock_many {names}') as proceed:
        while proceed():
            response = requests.post(
                uri,
                data=json.dumps(data),
                headers={'content-type': 'application/json'},
            )
            if response.ok:
                log.debug("Unlocked: %s", ', '.join(names))
                return True
    log.error("Failed to unlock: %s", ', '.join(names))
    return False
def lock_machines(ctx, config):
    """
    Lock machines.  Called when the teuthology run finds and locks
    new machines.  This is not called if the one has teuthology-locked
    machines and placed those keys in the Targets section of a yaml file.

    Yields once the machines are locked; on exit, unlocks them unless
    nuke-on-error will handle that later.
    """
    # It's OK for os_type and os_version to be None here.  If we're trying
    # to lock a bare metal machine, we'll take whatever is available.  If
    # we want a vps, defaults will be provided by misc.get_distro and
    # misc.get_distro_version in provision.create_if_vm
    os_type = ctx.config.get("os_type")
    os_version = ctx.config.get("os_version")
    arch = ctx.config.get("arch")
    log.info("Locking machines...")
    assert isinstance(config[0], int), "config[0] must be an integer"
    machine_type = config[1]
    total_requested = config[0]
    # We want to make sure there are always this many machines available
    reserved = teuth_config.reserve_machines
    assert isinstance(reserved, int), "reserve_machines must be integer"
    assert reserved >= 0, "reserve_machines should >= 0"
    # change the status during the locking process
    report.try_push_job_info(ctx.config, dict(status="waiting"))
    all_locked = dict()
    requested = total_requested
    while True:
        # get a candidate list of machines
        machines = lock.list_locks(machine_type=machine_type, up=True,
                                   locked=False,
                                   count=requested + reserved)
        if machines is None:
            if ctx.block:
                log.error("Error listing machines, trying again")
                time.sleep(20)
                continue
            else:
                raise RuntimeError("Error listing machines")
        # make sure there are machines for non-automated jobs to run
        if len(machines) < reserved + requested \
                and ctx.owner.startswith("scheduled"):
            if ctx.block:
                log.info(
                    "waiting for more %s machines to be free (need %s + %s, have %s)...",
                    machine_type,
                    reserved,
                    requested,
                    len(machines),
                )
                time.sleep(10)
                continue
            else:
                assert 0, "not enough machines free; need %s + %s, have %s" % (
                    reserved, requested, len(machines))
        newly_locked = lock.lock_many(ctx, requested, machine_type,
                                      ctx.owner, ctx.archive, os_type,
                                      os_version, arch)
        all_locked.update(newly_locked)
        log.info(
            "{newly_locked} {mtype} machines locked this try, "
            "{total_locked}/{total_requested} locked so far".format(
                newly_locked=len(newly_locked),
                mtype=machine_type,
                total_locked=len(all_locked),
                total_requested=total_requested,
            )
        )
        if len(all_locked) == total_requested:
            vmlist = []
            for lmach in all_locked:
                if misc.is_vm(lmach):
                    vmlist.append(lmach)
            if vmlist:
                log.info("Waiting for virtual machines to come up")
                keys_dict = dict()
                loopcount = 0
                while len(keys_dict) != len(vmlist):
                    loopcount += 1
                    time.sleep(10)
                    keys_dict = misc.ssh_keyscan(vmlist)
                    log.info("virtual machine is still unavailable")
                    if loopcount == 40:
                        loopcount = 0
                        log.info("virtual machine(s) still not up, " +
                                 "recreating unresponsive ones.")
                        for guest in vmlist:
                            if guest not in keys_dict.keys():
                                log.info("recreating: " + guest)
                                full_name = misc.canonicalize_hostname(guest)
                                provision.destroy_if_vm(ctx, full_name)
                                provision.create_if_vm(ctx, full_name)
                if lock.do_update_keys(keys_dict):
                    log.info("Error in virtual machine keys")
                newscandict = {}
                # dict.iterkeys() is Python 2-only; iterating the dict
                # yields its keys directly.
                for dkey in all_locked:
                    stats = lockstatus.get_status(dkey)
                    newscandict[dkey] = stats["ssh_pub_key"]
                ctx.config["targets"] = newscandict
            else:
                ctx.config["targets"] = all_locked
            locked_targets = yaml.safe_dump(
                ctx.config["targets"],
                default_flow_style=False).splitlines()
            log.info("\n ".join(["Locked targets:"] + locked_targets))
            # successfully locked machines, change status back to running
            report.try_push_job_info(ctx.config, dict(status="running"))
            break
        elif not ctx.block:
            assert 0, "not enough machines are available"
        else:
            requested = requested - len(newly_locked)
            # The concatenated message previously read "wentnegative";
            # a space was missing between the string fragments.
            assert requested > 0, ("lock_machines: requested counter went "
                                   "negative, this shouldn't happen")
            log.info(
                "{total} machines locked ({new} new); need {more} more".format(
                    total=len(all_locked), new=len(newly_locked),
                    more=requested)
            )
            # logging.Logger.warn is deprecated; use warning()
            log.warning("Could not lock enough machines, waiting...")
            time.sleep(10)
    try:
        yield
    finally:
        # If both unlock_on_failure and nuke-on-error are set, don't unlock
        # now because we're just going to nuke (and unlock) later.
        unlock_on_failure = (
            ctx.config.get("unlock_on_failure", False)
            and not ctx.config.get("nuke-on-error", False)
        )
        if get_status(ctx.summary) == "pass" or unlock_on_failure:
            log.info("Unlocking machines...")
            # dict.iterkeys() is Python 2-only
            for machine in ctx.config["targets"].keys():
                lock.unlock_one(ctx, machine, ctx.owner, ctx.archive)
def test_stale_openstack_instances(self):
    """
    Exercise nuke.stale_openstack_instances with a mocked
    OpenStackInstance, covering fresh, very old, vanished, unlocked-stale
    and locked-fresh instances.
    """
    # Skipped unless real OpenStack credentials are in the environment.
    if 'OS_AUTH_URL' not in os.environ:
        pytest.skip('no OS_AUTH_URL environment variable')
    ctx = Mock()
    ctx.teuthology_config = config
    ctx.dry_run = False
    name = 'target1'
    uuid = 'UUID1'
    #
    # An instance created a second ago is left untouched,
    # even when it is not locked.
    #
    with patch.multiple(
            nuke.OpenStackInstance,
            exists=lambda _: True,
            get_created=lambda _: 1,
            __getitem__=lambda _, key: name,
            destroy=DEFAULT,
    ) as m:
        nuke.stale_openstack_instances(ctx, {
            uuid: {
                'Name': name,
            },
        }, {})
        m['destroy'].assert_not_called()
    #
    # An instance created a very long time ago is destroyed
    #
    with patch.multiple(
            nuke.OpenStackInstance,
            exists=lambda _: True,
            get_created=lambda _: 1000000000,
            __getitem__=lambda _, key: name,
            destroy=DEFAULT,
    ) as m:
        nuke.stale_openstack_instances(ctx, {
            uuid: {
                'Name': name,
            },
        }, {
            misc.canonicalize_hostname(name, user=None): {},
        })
        m['destroy'].assert_called_with()
    #
    # An instance that turns out to not exist any longer
    # is ignored.
    #
    with patch.multiple(
            nuke.OpenStackInstance,
            exists=lambda _: False,
            __getitem__=lambda _, key: name,
            destroy=DEFAULT,
    ) as m:
        nuke.stale_openstack_instances(ctx, {
            uuid: {
                'Name': name,
            },
        }, {
            misc.canonicalize_hostname(name, user=None): {},
        })
        m['destroy'].assert_not_called()
    #
    # An instance created but not locked after a while is
    # destroyed.
    #
    with patch.multiple(
            nuke.OpenStackInstance,
            exists=lambda _: True,
            get_created=lambda _: nuke.OPENSTACK_DELAY + 1,
            __getitem__=lambda _, key: name,
            destroy=DEFAULT,
    ) as m:
        nuke.stale_openstack_instances(ctx, {
            uuid: {
                'Name': name,
            },
        }, {})
        m['destroy'].assert_called_with()
    #
    # An instance created within the expected lifetime
    # of a job and locked is left untouched.
    #
    with patch.multiple(
            nuke.OpenStackInstance,
            exists=lambda _: True,
            get_created=lambda _: nuke.OPENSTACK_DELAY + 1,
            __getitem__=lambda _, key: name,
            destroy=DEFAULT,
    ) as m:
        nuke.stale_openstack_instances(ctx, {
            uuid: {
                'Name': name,
            },
        }, {
            misc.canonicalize_hostname(name, user=None): {},
        })
        m['destroy'].assert_not_called()
def lock_many(ctx, num, machine_type, user=None, description=None,
              os_type=None, os_version=None, arch=None):
    """
    Lock *num* machines of *machine_type* on the lock server.

    For VM types the machines are created after locking; for reimage
    types they are reimaged.  Returns a dict mapping canonical hostname
    to ssh public key, or an empty list when no machines could be locked.
    """
    if user is None:
        user = misc.get_user()
    if not util.vps_version_or_type_valid(ctx.machine_type, os_type,
                                          os_version):
        log.error('Invalid os-type or version detected -- lock failed')
        return
    # In the for loop below we can safely query for all bare-metal machine_type
    # values at once. So, if we're being asked for 'plana,mira,burnupi', do it
    # all in one shot. If we are passed 'plana,mira,burnupi,vps', do one query
    # for 'plana,mira,burnupi' and one for 'vps'
    machine_types_list = misc.get_multi_machine_types(machine_type)
    if machine_types_list == ['vps']:
        machine_types = machine_types_list
    elif machine_types_list == ['openstack']:
        # OpenStack has its own locking path entirely.
        return lock_many_openstack(ctx, num, machine_type,
                                   user=user,
                                   description=description,
                                   arch=arch)
    elif 'vps' in machine_types_list:
        machine_types_non_vps = list(machine_types_list)
        machine_types_non_vps.remove('vps')
        machine_types_non_vps = '|'.join(machine_types_non_vps)
        machine_types = [machine_types_non_vps, 'vps']
    else:
        machine_types_str = '|'.join(machine_types_list)
        machine_types = [machine_types_str, ]
    for machine_type in machine_types:
        uri = os.path.join(config.lock_server, 'nodes', 'lock_many', '')
        data = dict(
            locked_by=user,
            count=num,
            machine_type=machine_type,
            description=description,
        )
        # Only query for os_type/os_version if non-vps and non-libcloud, since
        # in that case we just create them.
        vm_types = ['vps'] + teuthology.provision.cloud.get_types()
        reimage_types = teuthology.provision.get_reimage_types()
        if machine_type not in vm_types + reimage_types:
            if os_type:
                data['os_type'] = os_type
            if os_version:
                data['os_version'] = os_version
        if arch:
            data['arch'] = arch
        log.debug("lock_many request: %s", repr(data))
        response = requests.post(
            uri,
            data=json.dumps(data),
            headers={'content-type': 'application/json'},
        )
        if response.ok:
            machines = {misc.canonicalize_hostname(machine['name']):
                        machine['ssh_pub_key']
                        for machine in response.json()}
            log.debug('locked {machines}'.format(
                machines=', '.join(machines.keys())))
            if machine_type in vm_types:
                ok_machs = {}
                # Reset OS info on the server before (re)creating the VMs.
                update_nodes(machines, True)
                for machine in machines:
                    if teuthology.provision.create_if_vm(ctx, machine):
                        ok_machs[machine] = machines[machine]
                    else:
                        log.error('Unable to create virtual machine: %s',
                                  machine)
                        unlock_one(ctx, machine, user)
                ok_machs = do_update_keys(list(ok_machs.keys()))[1]
                update_nodes(ok_machs)
                return ok_machs
            elif machine_type in reimage_types:
                reimaged = dict()
                console_log_conf = dict(
                    logfile_name='{shortname}_reimage.log',
                    remotes=[teuthology.orchestra.remote.Remote(machine)
                             for machine in machines],
                )
                with console_log.task(ctx, console_log_conf):
                    # NOTE(review): reimaged is still empty here, so this
                    # resets nothing — presumably `machines` was intended;
                    # confirm against the lock-server workflow.
                    update_nodes(reimaged, True)
                    with teuthology.parallel.parallel() as p:
                        for machine in machines:
                            p.spawn(teuthology.provision.reimage, ctx,
                                    machine, machine_type)
                            reimaged[machine] = machines[machine]
                reimaged = do_update_keys(reimaged.keys())[1]
                update_nodes(reimaged)
                return reimaged
            return machines
        elif response.status_code == 503:
            log.error('Insufficient nodes available to lock %d %s nodes.',
                      num, machine_type)
            log.error(response.text)
        else:
            log.error('Could not lock %d %s nodes, reason: unknown.',
                      num, machine_type)
    return []
def block_and_lock_machines(ctx, total_requested, machine_type, reimage=True):
    """
    Repeatedly try to lock total_requested machines of machine_type,
    blocking (when ctx.block is set) until enough are available.  On
    success the target -> ssh key map is stored in ctx.config['targets'].

    :param ctx:             Context with config, owner, archive, block, summary
    :param total_requested: Number of machines to lock
    :param machine_type:    Machine type to request from the lock server
    :param reimage:         Passed through to lock_many
    """
    # It's OK for os_type and os_version to be None here.  If we're trying
    # to lock a bare metal machine, we'll take whatever is available.  If
    # we want a vps, defaults will be provided by misc.get_distro and
    # misc.get_distro_version in provision.create_if_vm
    os_type = ctx.config.get("os_type")
    os_version = ctx.config.get("os_version")
    arch = ctx.config.get('arch')
    reserved = config.reserve_machines
    assert isinstance(reserved, int), 'reserve_machines must be integer'
    assert (reserved >= 0), 'reserve_machines should >= 0'
    log.info('Locking machines...')
    # change the status during the locking process
    report.try_push_job_info(ctx.config, dict(status='waiting'))
    all_locked = dict()
    requested = total_requested
    while True:
        # get a candidate list of machines
        machines = query.list_locks(machine_type=machine_type, up=True,
                                    locked=False,
                                    count=requested + reserved)
        if machines is None:
            if ctx.block:
                log.error('Error listing machines, trying again')
                time.sleep(20)
                continue
            else:
                raise RuntimeError('Error listing machines')
        # make sure there are machines for non-automated jobs to run
        if len(machines) < reserved + requested \
                and ctx.owner.startswith('scheduled'):
            if ctx.block:
                log.info(
                    'waiting for more %s machines to be free (need %s + %s, have %s)...',
                    machine_type,
                    reserved,
                    requested,
                    len(machines),
                )
                time.sleep(10)
                continue
            else:
                assert 0, ('not enough machines free; need %s + %s, have %s' %
                           (reserved, requested, len(machines)))
        try:
            newly_locked = lock_many(ctx, requested, machine_type,
                                     ctx.owner, ctx.archive, os_type,
                                     os_version, arch, reimage=reimage)
        except Exception:
            # Lock failures should map to the 'dead' status instead of 'fail'
            if 'summary' in ctx:
                set_status(ctx.summary, 'dead')
            raise
        all_locked.update(newly_locked)
        log.info(
            '{newly_locked} {mtype} machines locked this try, '
            '{total_locked}/{total_requested} locked so far'.format(
                newly_locked=len(newly_locked),
                mtype=machine_type,
                total_locked=len(all_locked),
                total_requested=total_requested,
            )
        )
        if len(all_locked) == total_requested:
            vmlist = []
            for lmach in all_locked:
                if teuthology.lock.query.is_vm(lmach):
                    vmlist.append(lmach)
            if vmlist:
                log.info('Waiting for virtual machines to come up')
                keys_dict = dict()
                loopcount = 0
                while len(keys_dict) != len(vmlist):
                    loopcount += 1
                    time.sleep(10)
                    keys_dict = misc.ssh_keyscan(vmlist)
                    log.info('virtual machine is still unavailable')
                    if loopcount == 40:
                        loopcount = 0
                        log.info('virtual machine(s) still not up, ' +
                                 'recreating unresponsive ones.')
                        for guest in vmlist:
                            if guest not in keys_dict.keys():
                                log.info('recreating: ' + guest)
                                full_name = misc.canonicalize_hostname(guest)
                                teuthology.provision.destroy_if_vm(
                                    ctx, full_name)
                                teuthology.provision.create_if_vm(
                                    ctx, full_name)
                if teuthology.lock.ops.do_update_keys(keys_dict)[0]:
                    log.info("Error in virtual machine keys")
                newscandict = {}
                for dkey in all_locked.keys():
                    stats = teuthology.lock.query.get_status(dkey)
                    newscandict[dkey] = stats['ssh_pub_key']
                ctx.config['targets'] = newscandict
            else:
                ctx.config['targets'] = all_locked
            locked_targets = yaml.safe_dump(
                ctx.config['targets'],
                default_flow_style=False
            ).splitlines()
            log.info('\n '.join(['Locked targets:', ] + locked_targets))
            # successfully locked machines, change status back to running
            report.try_push_job_info(ctx.config, dict(status='running'))
            break
        elif not ctx.block:
            assert 0, 'not enough machines are available'
        else:
            requested = requested - len(newly_locked)
            # The concatenated message previously read "wentnegative";
            # a space was missing between the string fragments.
            assert requested > 0, ("lock_machines: requested counter went "
                                   "negative, this shouldn't happen")
            log.info(
                "{total} machines locked ({new} new); need {more} more".format(
                    total=len(all_locked), new=len(newly_locked),
                    more=requested)
            )
            # logging.Logger.warn is deprecated; use warning()
            log.warning('Could not lock enough machines, waiting...')
            time.sleep(10)
def lock_machines(ctx, config):
    """
    Lock machines.  Called when the teuthology run finds and locks
    new machines.  This is not called if the one has teuthology-locked
    machines and placed those keys in the Targets section of a yaml file.

    Yields once the machines are locked; on exit, unlocks them when the
    run passed or unlock_on_failure is set.
    """
    # It's OK for os_type and os_version to be None here.  If we're trying
    # to lock a bare metal machine, we'll take whatever is available.  If
    # we want a vps, defaults will be provided by misc.get_distro and
    # misc.get_distro_version in provision.create_if_vm
    os_type = ctx.config.get("os_type")
    os_version = ctx.config.get("os_version")
    arch = ctx.config.get('arch')
    log.info('Locking machines...')
    assert isinstance(config[0], int), 'config[0] must be an integer'
    machine_type = config[1]
    how_many = config[0]
    # We want to make sure there are always this many machines available
    to_reserve = 5
    # change the status during the locking process
    report.try_push_job_info(ctx.config, dict(status='waiting'))
    while True:
        # get a candidate list of machines
        machines = lock.list_locks(machine_type=machine_type, up=True,
                                   locked=False,
                                   count=how_many + to_reserve)
        if machines is None:
            if ctx.block:
                log.error('Error listing machines, trying again')
                time.sleep(20)
                continue
            else:
                raise RuntimeError('Error listing machines')
        # make sure there are machines for non-automated jobs to run
        if len(machines) < to_reserve + how_many and ctx.owner.startswith(
                'scheduled'):
            if ctx.block:
                log.info(
                    'waiting for more machines to be free (need %s + %s, have %s)...',
                    to_reserve,
                    how_many,
                    len(machines),
                )
                time.sleep(10)
                continue
            else:
                assert 0, ('not enough machines free; need %s + %s, have %s' %
                           (to_reserve, how_many, len(machines)))
        newly_locked = lock.lock_many(ctx, how_many, machine_type, ctx.owner,
                                      ctx.archive, os_type, os_version, arch)
        if not newly_locked and not isinstance(newly_locked, list):
            raise RuntimeError('Invalid parameters specified')
        if len(newly_locked) == how_many:
            vmlist = []
            for lmach in newly_locked:
                if misc.is_vm(lmach):
                    vmlist.append(lmach)
            if vmlist:
                log.info('Waiting for virtual machines to come up')
                keys_dict = dict()
                loopcount = 0
                while len(keys_dict) != len(vmlist):
                    loopcount += 1
                    time.sleep(10)
                    keys_dict = lock.ssh_keyscan(vmlist)
                    log.info('virtual machine is still unavailable')
                    if loopcount == 40:
                        loopcount = 0
                        log.info('virtual machine(s) still not up, ' +
                                 'recreating unresponsive ones.')
                        for guest in vmlist:
                            if guest not in keys_dict.keys():
                                log.info('recreating: ' + guest)
                                full_name = misc.canonicalize_hostname(guest)
                                provision.destroy_if_vm(ctx, full_name)
                                provision.create_if_vm(ctx, full_name)
                if lock.do_update_keys(keys_dict):
                    log.info("Error in virtual machine keys")
                newscandict = {}
                # dict.iterkeys() is Python 2-only; iterating the dict
                # yields its keys directly.
                for dkey in newly_locked:
                    stats = lockstatus.get_status(dkey)
                    newscandict[dkey] = stats['ssh_pub_key']
                ctx.config['targets'] = newscandict
            else:
                ctx.config['targets'] = newly_locked
            locked_targets = yaml.safe_dump(
                ctx.config['targets'],
                default_flow_style=False).splitlines()
            log.info('\n '.join(['Locked targets:', ] + locked_targets))
            # successfully locked machines, change status back to running
            report.try_push_job_info(ctx.config, dict(status='running'))
            break
        elif not ctx.block:
            assert 0, 'not enough machines are available'
        # logging.Logger.warn is deprecated; use warning()
        log.warning('Could not lock enough machines, waiting...')
        time.sleep(10)
    try:
        yield
    finally:
        if ctx.config.get('unlock_on_failure', False) or \
                get_status(ctx.summary) == 'pass':
            log.info('Unlocking machines...')
            # dict.iterkeys() is Python 2-only
            for machine in ctx.config['targets'].keys():
                lock.unlock_one(ctx, machine, ctx.owner, ctx.archive)
def test_stale_openstack_instances(self):
    """
    Exercise nuke.stale_openstack_instances with a mocked
    OpenStackInstance, covering fresh, very old, vanished, unlocked-stale
    and locked-fresh instances.
    """
    ctx = Mock()
    ctx.teuthology_config = config
    ctx.dry_run = False
    name = 'target1'
    uuid = 'UUID1'
    #
    # An instance created a second ago is left untouched,
    # even when it is not locked.
    #
    with patch.multiple(
            nuke.OpenStackInstance,
            exists=lambda _: True,
            get_created=lambda _: 1,
            __getitem__=lambda _, key: name,
            destroy=DEFAULT,
    ) as m:
        nuke.stale_openstack_instances(ctx, {
            uuid: {
                'Name': name,
            },
        }, {
        })
        m['destroy'].assert_not_called()
    #
    # An instance created a very long time ago is destroyed
    #
    with patch.multiple(
            nuke.OpenStackInstance,
            exists=lambda _: True,
            get_created=lambda _: 1000000000,
            __getitem__=lambda _, key: name,
            destroy=DEFAULT,
    ) as m:
        nuke.stale_openstack_instances(ctx, {
            uuid: {
                'Name': name,
            },
        }, {
            misc.canonicalize_hostname(name, user=None): {},
        })
        m['destroy'].assert_called_with()
    #
    # An instance that turns out to not exist any longer
    # is ignored.
    #
    with patch.multiple(
            nuke.OpenStackInstance,
            exists=lambda _: False,
            __getitem__=lambda _, key: name,
            destroy=DEFAULT,
    ) as m:
        nuke.stale_openstack_instances(ctx, {
            uuid: {
                'Name': name,
            },
        }, {
            misc.canonicalize_hostname(name, user=None): {},
        })
        m['destroy'].assert_not_called()
    #
    # An instance created but not locked after a while is
    # destroyed.
    #
    with patch.multiple(
            nuke.OpenStackInstance,
            exists=lambda _: True,
            get_created=lambda _: nuke.OPENSTACK_DELAY + 1,
            __getitem__=lambda _, key: name,
            destroy=DEFAULT,
    ) as m:
        nuke.stale_openstack_instances(ctx, {
            uuid: {
                'Name': name,
            },
        }, {
        })
        m['destroy'].assert_called_with()
    #
    # An instance created within the expected lifetime
    # of a job and locked is left untouched.
    #
    with patch.multiple(
            nuke.OpenStackInstance,
            exists=lambda _: True,
            get_created=lambda _: nuke.OPENSTACK_DELAY + 1,
            __getitem__=lambda _, key: name,
            destroy=DEFAULT,
    ) as m:
        nuke.stale_openstack_instances(ctx, {
            uuid: {
                'Name': name,
            },
        }, {
            misc.canonicalize_hostname(name, user=None): {},
        })
        m['destroy'].assert_not_called()
def test_canonicalize_hostname_otherlab(self):
    """A bare shortname is canonicalized using the configured lab domain."""
    config.lab_domain = 'example.com'
    shortname = 'box1'
    assert misc.canonicalize_hostname(shortname) == '*****@*****.**'
def test_canonicalize_hostname_nodomain(self):
    """With an empty lab_domain, no domain suffix is appended."""
    config.lab_domain = ''
    shortname = 'box2'
    expected = 'ubuntu@' + shortname
    assert misc.canonicalize_hostname(shortname) == expected
def test_canonicalize_hostname(self):
    """A bare shortname gets the default user and lab domain applied."""
    assert misc.canonicalize_hostname('box1') == '*****@*****.**'
def test_canonicalize_hostname_nouser(self):
    """Passing user=None yields a hostname with no user@ prefix."""
    canonical = misc.canonicalize_hostname('box1', user=None)
    assert canonical == 'box1.front.sepia.ceph.com'
def lock_many(ctx, num, machine_type, user=None, description=None,
              os_type=None, os_version=None, arch=None):
    """
    Lock ``num`` machines of ``machine_type`` via the lock server.

    :param ctx:          Context object (provides ctx.machine_type).
    :param num:          How many machines to request per lock-server query.
    :param machine_type: Machine type string; may be a comma-separated list.
    :param user:         Lock owner; defaults to the invoking user.
    :param description:  Free-form description recorded with the lock.
    :param os_type:      Requested OS type (ignored for vm/reimage types).
    :param os_version:   Requested OS version (ignored for vm/reimage types).
    :param arch:         Requested architecture, if any.
    :returns: dict mapping canonical hostname -> ssh public key on success;
              an empty list when nothing could be locked; None on invalid
              os-type/version.  (Note the mixed return types.)
    """
    if user is None:
        user = misc.get_user()
    # Validate the os-type/version combination up front; bail out early
    # rather than asking the lock server for something unsatisfiable.
    if not util.vps_version_or_type_valid(
        ctx.machine_type,
        os_type,
        os_version
    ):
        log.error('Invalid os-type or version detected -- lock failed')
        return
    # In the for loop below we can safely query for all bare-metal machine_type
    # values at once. So, if we're being asked for 'plana,mira,burnupi', do it
    # all in one shot. If we are passed 'plana,mira,burnupi,vps', do one query
    # for 'plana,mira,burnupi' and one for 'vps'
    machine_types_list = misc.get_multi_machine_types(machine_type)
    if machine_types_list == ['vps']:
        machine_types = machine_types_list
    elif machine_types_list == ['openstack']:
        # OpenStack locking goes through a dedicated helper.
        return lock_many_openstack(ctx, num, machine_type,
                                   user=user,
                                   description=description,
                                   arch=arch)
    elif 'vps' in machine_types_list:
        # Split the request: bare-metal types in one pipe-joined query,
        # 'vps' in a second query.
        machine_types_non_vps = list(machine_types_list)
        machine_types_non_vps.remove('vps')
        machine_types_non_vps = '|'.join(machine_types_non_vps)
        machine_types = [machine_types_non_vps, 'vps']
    else:
        machine_types_str = '|'.join(machine_types_list)
        machine_types = [machine_types_str, ]
    for machine_type in machine_types:
        uri = os.path.join(config.lock_server, 'nodes', 'lock_many', '')
        data = dict(
            locked_by=user,
            count=num,
            machine_type=machine_type,
            description=description,
        )
        # Only query for os_type/os_version if non-vps and non-libcloud, since
        # in that case we just create them.
        vm_types = ['vps'] + teuthology.provision.cloud.get_types()
        reimage_types = teuthology.provision.fog.get_types()
        if machine_type not in vm_types + reimage_types:
            if os_type:
                data['os_type'] = os_type
            if os_version:
                data['os_version'] = os_version
        if arch:
            data['arch'] = arch
        log.debug("lock_many request: %s", repr(data))
        response = requests.post(
            uri,
            data=json.dumps(data),
            headers={'content-type': 'application/json'},
        )
        if response.ok:
            # Map canonical hostname -> ssh public key for each locked node.
            machines = {misc.canonicalize_hostname(machine['name']):
                        machine['ssh_pub_key'] for machine in response.json()}
            log.debug('locked {machines}'.format(
                machines=', '.join(machines.keys())))
            if machine_type in vm_types:
                # Locked VMs must actually be created; drop (and unlock)
                # any that fail, then refresh host keys for the survivors.
                ok_machs = {}
                for machine in machines:
                    if teuthology.provision.create_if_vm(ctx, machine):
                        ok_machs[machine] = machines[machine]
                    else:
                        log.error('Unable to create virtual machine: %s',
                                  machine)
                        unlock_one(ctx, machine, user)
                ok_machs = keys.do_update_keys(ok_machs.keys())[1]
                return ok_machs
            elif machine_type in reimage_types:
                # Reimage all locked nodes in parallel, capturing console
                # logs, then refresh their host keys.
                reimaged = dict()
                console_log_conf = dict(
                    logfile_name='{shortname}_reimage.log',
                    remotes=[teuthology.orchestra.remote.Remote(machine)
                             for machine in machines],
                )
                with console_log.task(
                        ctx, console_log_conf):
                    with teuthology.parallel.parallel() as p:
                        for machine in machines:
                            p.spawn(teuthology.provision.reimage, ctx,
                                    machine)
                            reimaged[machine] = machines[machine]
                reimaged = keys.do_update_keys(reimaged.keys())[1]
                return reimaged
            return machines
        elif response.status_code == 503:
            # Lock server reports not enough free nodes; fall through to
            # try the next machine_type (if any).
            log.error('Insufficient nodes available to lock %d %s nodes.',
                      num, machine_type)
            log.error(response.text)
        else:
            log.error('Could not lock %d %s nodes, reason: unknown.',
                      num, machine_type)
    # Nothing could be locked for any requested machine type.
    return []
def test_canonicalize_hostname_full_other_user(self):
    """Canonicalizing a name that already carries a user and domain."""
    config.lab_domain = 'example.com'
    fqdn = '*****@*****.**'
    assert misc.canonicalize_hostname(fqdn) == '*****@*****.**'
def main(ctx): if ctx.verbose: teuthology.log.setLevel(logging.DEBUG) set_config_attr(ctx) ret = 0 user = ctx.owner machines = [misc.canonicalize_hostname(m, user=False) for m in ctx.machines] machines_to_update = [] if ctx.targets: try: with file(ctx.targets) as f: g = yaml.safe_load_all(f) for new in g: if 'targets' in new: for t in new['targets'].iterkeys(): machines.append(t) except IOError as e: raise argparse.ArgumentTypeError(str(e)) if ctx.f: assert ctx.lock or ctx.unlock, \ '-f is only supported by --lock and --unlock' if machines: assert ctx.lock or ctx.unlock or ctx.list or ctx.list_targets \ or ctx.update or ctx.brief, \ 'machines cannot be specified with that operation' else: if ctx.lock: log.error("--lock requires specific machines passed as arguments") else: # This condition might never be hit, but it's not clear. assert ctx.num_to_lock or ctx.list or ctx.list_targets or \ ctx.summary or ctx.brief, \ 'machines must be specified for that operation' if ctx.all: assert ctx.list or ctx.list_targets or ctx.brief, \ '--all can only be used with --list, --list-targets, and --brief' assert ctx.owner is None, \ '--all and --owner are mutually exclusive' assert not machines, \ '--all and listing specific machines are incompatible' if ctx.num_to_lock: assert ctx.machine_type, \ 'must specify machine type to lock' if ctx.brief or ctx.list or ctx.list_targets: assert ctx.desc is None, '--desc does nothing with --list/--brief' # we may need to update host keys for vms. Don't do it for # every vm; however, update any vms included in the list given # to the CLI (machines), or any owned by the specified owner or # invoking user if no machines are specified. 
vmachines = [] statuses = query.get_statuses(machines) owner = ctx.owner or misc.get_user() for machine in statuses: if query.is_vm(status=machine) and machine['locked'] and \ (machines or machine['locked_by'] == owner): vmachines.append(machine['name']) if vmachines: log.info("updating host keys for %s", ' '.join(sorted(vmachines))) keys.do_update_keys(vmachines, _raise=False) # get statuses again to refresh any updated keys statuses = query.get_statuses(machines) if statuses: statuses = util.winnow(statuses, ctx.machine_type, 'machine_type') if not machines and ctx.owner is None and not ctx.all: ctx.owner = misc.get_user() statuses = util.winnow(statuses, ctx.owner, 'locked_by') statuses = util.winnow(statuses, ctx.status, 'up', lambda s: s['up'] == (ctx.status == 'up')) statuses = util.winnow(statuses, ctx.locked, 'locked', lambda s: s['locked'] == (ctx.locked == 'true')) statuses = util.winnow(statuses, ctx.desc, 'description') statuses = util.winnow(statuses, ctx.desc_pattern, 'description', lambda s: s['description'] and \ ctx.desc_pattern in s['description']) if ctx.json_query: statuses = util.json_matching_statuses(ctx.json_query, statuses) statuses = util.winnow(statuses, ctx.os_type, 'os_type') statuses = util.winnow(statuses, ctx.os_version, 'os_version') # When listing, only show the vm_host's name, not every detail for s in statuses: if not query.is_vm(status=s): continue # with an OpenStack API, there is no host for a VM if s['vm_host'] is None: continue vm_host_name = s.get('vm_host', dict())['name'] if vm_host_name: s['vm_host'] = vm_host_name if ctx.list: print json.dumps(statuses, indent=4) elif ctx.brief: for s in sorted(statuses, key=lambda s: s.get('name')): locked = "un" if s['locked'] == 0 else " " mo = re.match('\w+@(\w+?)\..*', s['name']) host = mo.group(1) if mo else s['name'] print '{host} {locked}locked {owner} "{desc}"'.format( locked=locked, host=host, owner=s['locked_by'], desc=s['description']) else: frag = {'targets': {}} for f in 
statuses: frag['targets'][f['name']] = f['ssh_pub_key'] print yaml.safe_dump(frag, default_flow_style=False) else: log.error('error retrieving lock statuses') ret = 1 elif ctx.summary: do_summary(ctx) return 0 elif ctx.lock: if not util.vps_version_or_type_valid( ctx.machine_type, ctx.os_type, ctx.os_version): log.error('Invalid os-type or version detected -- lock failed') return 1 reimage_types = teuthology.provision.fog.get_types() reimage_machines = list() updatekeys_machines = list() for machine in machines: resp = ops.lock_one(machine, user, ctx.desc) if resp.ok: machine_status = resp.json() machine_type = machine_status['machine_type'] if not resp.ok: ret = 1 if not ctx.f: return ret elif not query.is_vm(machine, machine_status): if machine_type in reimage_types: # Reimage in parallel just below here reimage_machines.append(machine) # Update keys last updatekeys_machines = list() else: machines_to_update.append(machine) teuthology.provision.create_if_vm( ctx, misc.canonicalize_hostname(machine), ) with teuthology.parallel.parallel() as p: for machine in reimage_machines: p.spawn(teuthology.provision.reimage, ctx, machine) for machine in updatekeys_machines: keys.do_update_keys([machine]) elif ctx.unlock: if ctx.owner is None and user is None: user = misc.get_user() # If none of them are vpm, do them all in one shot if not filter(query.is_vm, machines): res = ops.unlock_many(machines, user) return 0 if res else 1 for machine in machines: if not ops.unlock_one(ctx, machine, user): ret = 1 if not ctx.f: return ret else: machines_to_update.append(machine) elif ctx.num_to_lock: result = ops.lock_many(ctx, ctx.num_to_lock, ctx.machine_type, user, ctx.desc, ctx.os_type, ctx.os_version, ctx.arch) if not result: ret = 1 else: machines_to_update = result.keys() if ctx.machine_type == 'vps': shortnames = ' '.join( [misc.decanonicalize_hostname(name) for name in result.keys()] ) if len(result) < ctx.num_to_lock: log.error("Locking failed.") for machine in result: 
ops.unlock_one(ctx, machine, user) ret = 1 else: log.info("Successfully Locked:\n%s\n" % shortnames) log.info( "Unable to display keys at this time (virtual " + "machines are booting).") log.info( "Please run teuthology-lock --list-targets %s once " + "these machines come up.", shortnames) else: print yaml.safe_dump( dict(targets=result), default_flow_style=False) elif ctx.update: assert ctx.desc is not None or ctx.status is not None, \ 'you must specify description or status to update' assert ctx.owner is None, 'only description and status may be updated' machines_to_update = machines if ctx.desc is not None or ctx.status is not None: for machine in machines_to_update: ops.update_lock(machine, ctx.desc, ctx.status) return ret
def lock_machines(ctx, config):
    """
    Lock machines.  Called when the teuthology run finds and locks
    new machines.  This is not called if the one has teuthology-locked
    machines and placed those keys in the Targets section of a yaml file.

    Generator task: locks ``config[0]`` machines of type ``config[1]``,
    yields while the job runs, then (conditionally) unlocks on the way out.
    """
    # It's OK for os_type and os_version to be None here.  If we're trying
    # to lock a bare metal machine, we'll take whatever is available.  If
    # we want a vps, defaults will be provided by misc.get_distro and
    # misc.get_distro_version in provision.create_if_vm
    os_type = ctx.config.get("os_type")
    os_version = ctx.config.get("os_version")
    arch = ctx.config.get('arch')
    log.info('Locking machines...')
    assert isinstance(config[0], int), 'config[0] must be an integer'
    machine_type = config[1]
    total_requested = config[0]
    # We want to make sure there are always this many machines available
    reserved = teuth_config.reserve_machines
    assert isinstance(reserved, int), 'reserve_machines must be integer'
    assert (reserved >= 0), 'reserve_machines should >= 0'

    # change the status during the locking process
    report.try_push_job_info(ctx.config, dict(status='waiting'))

    all_locked = dict()
    requested = total_requested
    while True:
        # get a candidate list of machines
        machines = lock.list_locks(machine_type=machine_type, up=True,
                                   locked=False,
                                   count=requested + reserved)
        if machines is None:
            if ctx.block:
                log.error('Error listing machines, trying again')
                time.sleep(20)
                continue
            else:
                raise RuntimeError('Error listing machines')

        # make sure there are machines for non-automated jobs to run
        if len(machines) < reserved + requested \
                and ctx.owner.startswith('scheduled'):
            if ctx.block:
                log.info(
                    'waiting for more %s machines to be free '
                    '(need %s + %s, have %s)...',
                    machine_type,
                    reserved,
                    requested,
                    len(machines),
                )
                time.sleep(10)
                continue
            else:
                assert 0, ('not enough machines free; need %s + %s, have %s' %
                           (reserved, requested, len(machines)))

        newly_locked = lock.lock_many(ctx, requested, machine_type,
                                      ctx.owner, ctx.archive, os_type,
                                      os_version, arch)
        # Accumulate across retries; a partial lock keeps what it got and
        # asks for the remainder on the next pass.
        all_locked.update(newly_locked)
        log.info(
            '{newly_locked} {mtype} machines locked this try, '
            '{total_locked}/{total_requested} locked so far'.format(
                newly_locked=len(newly_locked),
                mtype=machine_type,
                total_locked=len(all_locked),
                total_requested=total_requested,
            )
        )
        if len(all_locked) == total_requested:
            vmlist = []
            for lmach in all_locked:
                if misc.is_vm(lmach):
                    vmlist.append(lmach)
            if vmlist:
                log.info('Waiting for virtual machines to come up')
                keys_dict = dict()
                loopcount = 0
                # Poll via ssh-keyscan until every VM answers; every 40
                # iterations (~400s) recreate the ones still silent.
                while len(keys_dict) != len(vmlist):
                    loopcount += 1
                    time.sleep(10)
                    keys_dict = misc.ssh_keyscan(vmlist)
                    log.info('virtual machine is still unavailable')
                    if loopcount == 40:
                        loopcount = 0
                        log.info('virtual machine(s) still not up, ' +
                                 'recreating unresponsive ones.')
                        for guest in vmlist:
                            if guest not in keys_dict.keys():
                                log.info('recreating: ' + guest)
                                full_name = misc.canonicalize_hostname(guest)
                                provision.destroy_if_vm(ctx, full_name)
                                provision.create_if_vm(ctx, full_name)
                # A truthy return from do_update_keys indicates an error.
                if lock.do_update_keys(keys_dict):
                    log.info("Error in virtual machine keys")
                # Re-fetch public keys from the lock server so targets
                # carry the keys current after the VM (re)creations.
                newscandict = {}
                for dkey in all_locked.iterkeys():
                    stats = lockstatus.get_status(dkey)
                    newscandict[dkey] = stats['ssh_pub_key']
                ctx.config['targets'] = newscandict
            else:
                ctx.config['targets'] = all_locked
            locked_targets = yaml.safe_dump(
                ctx.config['targets'],
                default_flow_style=False
            ).splitlines()
            log.info('\n '.join(['Locked targets:', ] + locked_targets))
            # successfully locked machines, change status back to running
            report.try_push_job_info(ctx.config, dict(status='running'))
            break
        elif not ctx.block:
            assert 0, 'not enough machines are available'
        else:
            requested = requested - len(newly_locked)
            assert requested > 0, "lock_machines: requested counter went" \
                "negative, this shouldn't happen"
            log.info(
                "{total} machines locked ({new} new); need {more} more".format(
                    total=len(all_locked), new=len(newly_locked),
                    more=requested)
            )
        log.warn('Could not lock enough machines, waiting...')
        time.sleep(10)
    try:
        yield
    finally:
        # If both unlock_on_failure and nuke-on-error are set, don't unlock now
        # because we're just going to nuke (and unlock) later.
        unlock_on_failure = (
            ctx.config.get('unlock_on_failure', False)
            and not ctx.config.get('nuke-on-error', False)
        )
        if get_status(ctx.summary) == 'pass' or unlock_on_failure:
            log.info('Unlocking machines...')
            for machine in ctx.config['targets'].iterkeys():
                lock.unlock_one(ctx, machine, ctx.owner, ctx.archive)
def lock_machines(ctx, config):
    """
    Lock machines.  Called when the teuthology run finds and locks
    new machines.  This is not called if the one has teuthology-locked
    machines and placed those keys in the Targets section of a yaml file.

    Generator task: locks ``config[0]`` machines of type ``config[1]``,
    yields while the job runs, then (conditionally) unlocks on the way out.
    """
    # It's OK for os_type and os_version to be None here.  If we're trying
    # to lock a bare metal machine, we'll take whatever is available.  If
    # we want a vps, defaults will be provided by misc.get_distro and
    # misc.get_distro_version in provision.create_if_vm
    os_type = ctx.config.get("os_type")
    os_version = ctx.config.get("os_version")
    arch = ctx.config.get('arch')
    log.info('Locking machines...')
    assert isinstance(config[0], int), 'config[0] must be an integer'
    machine_type = config[1]
    how_many = config[0]
    # We want to make sure there are always this many machines available
    to_reserve = teuth_config.reserve_machines
    assert isinstance(to_reserve, int), 'reserve_machines must be integer'
    assert (to_reserve >= 0), 'reserve_machines should >= 0'

    # change the status during the locking process
    report.try_push_job_info(ctx.config, dict(status='waiting'))

    while True:
        # get a candidate list of machines
        machines = lock.list_locks(machine_type=machine_type, up=True,
                                   locked=False,
                                   count=how_many + to_reserve)
        if machines is None:
            if ctx.block:
                log.error('Error listing machines, trying again')
                time.sleep(20)
                continue
            else:
                raise RuntimeError('Error listing machines')

        # make sure there are machines for non-automated jobs to run
        if len(machines) < to_reserve + how_many \
                and ctx.owner.startswith('scheduled'):
            if ctx.block:
                log.info(
                    'waiting for more machines to be free '
                    '(need %s + %s, have %s)...',
                    to_reserve,
                    how_many,
                    len(machines),
                )
                time.sleep(10)
                continue
            else:
                assert 0, ('not enough machines free; need %s + %s, have %s' %
                           (to_reserve, how_many, len(machines)))

        newly_locked = lock.lock_many(ctx, how_many, machine_type, ctx.owner,
                                      ctx.archive, os_type, os_version, arch)
        # lock_many returns a non-list falsy value on bad input.
        if not newly_locked and not isinstance(newly_locked, list):
            raise RuntimeError('Invalid parameters specified')
        if len(newly_locked) == how_many:
            vmlist = []
            for lmach in newly_locked:
                if misc.is_vm(lmach):
                    vmlist.append(lmach)
            if vmlist:
                log.info('Waiting for virtual machines to come up')
                keys_dict = dict()
                loopcount = 0
                # Poll via ssh-keyscan until every VM answers; every 40
                # iterations (~400s) recreate the ones still silent.
                while len(keys_dict) != len(vmlist):
                    loopcount += 1
                    time.sleep(10)
                    keys_dict = lock.ssh_keyscan(vmlist)
                    log.info('virtual machine is still unavailable')
                    if loopcount == 40:
                        loopcount = 0
                        log.info('virtual machine(s) still not up, ' +
                                 'recreating unresponsive ones.')
                        for guest in vmlist:
                            if guest not in keys_dict.keys():
                                log.info('recreating: ' + guest)
                                full_name = misc.canonicalize_hostname(guest)
                                provision.destroy_if_vm(ctx, full_name)
                                provision.create_if_vm(ctx, full_name)
                # A truthy return from do_update_keys indicates an error.
                if lock.do_update_keys(keys_dict):
                    log.info("Error in virtual machine keys")
                # Re-fetch public keys from the lock server so targets
                # carry the keys current after the VM (re)creations.
                newscandict = {}
                for dkey in newly_locked.iterkeys():
                    stats = lockstatus.get_status(dkey)
                    newscandict[dkey] = stats['ssh_pub_key']
                ctx.config['targets'] = newscandict
            else:
                ctx.config['targets'] = newly_locked
            locked_targets = yaml.safe_dump(
                ctx.config['targets'],
                default_flow_style=False
            ).splitlines()
            log.info('\n '.join(['Locked targets:', ] + locked_targets))
            # successfully locked machines, change status back to running
            report.try_push_job_info(ctx.config, dict(status='running'))
            break
        elif not ctx.block:
            assert 0, 'not enough machines are available'
        else:
            how_many = how_many - len(newly_locked)
            assert how_many > 0, "lock_machines: how_many counter went" \
                "negative, this shouldn't happen"
        log.warn('Could not lock enough machines, waiting...')
        time.sleep(10)
    try:
        yield
    finally:
        # If both unlock_on_failure and nuke-on-error are set, don't unlock now
        # because we're just going to nuke (and unlock) later.
        unlock_on_failure = (
            ctx.config.get('unlock_on_failure', False)
            and not ctx.config.get('nuke-on-error', False)
        )
        if get_status(ctx.summary) == 'pass' or unlock_on_failure:
            log.info('Unlocking machines...')
            for machine in ctx.config['targets'].iterkeys():
                lock.unlock_one(ctx, machine, ctx.owner, ctx.archive)
def main(ctx): if ctx.verbose: teuthology.log.setLevel(logging.DEBUG) set_config_attr(ctx) ret = 0 user = ctx.owner machines = [ misc.canonicalize_hostname(m, user=False) for m in ctx.machines ] machines_to_update = [] if ctx.targets: try: with open(ctx.targets) as f: g = yaml.safe_load_all(f) for new in g: if 'targets' in new: for t in new['targets'].iterkeys(): machines.append(t) except IOError as e: raise argparse.ArgumentTypeError(str(e)) if ctx.f: assert ctx.lock or ctx.unlock, \ '-f is only supported by --lock and --unlock' if machines: assert ctx.lock or ctx.unlock or ctx.list or ctx.list_targets \ or ctx.update or ctx.brief, \ 'machines cannot be specified with that operation' else: if ctx.lock: log.error("--lock requires specific machines passed as arguments") else: # This condition might never be hit, but it's not clear. assert ctx.num_to_lock or ctx.list or ctx.list_targets or \ ctx.summary or ctx.brief, \ 'machines must be specified for that operation' if ctx.all: assert ctx.list or ctx.list_targets or ctx.brief, \ '--all can only be used with --list, --list-targets, and --brief' assert ctx.owner is None, \ '--all and --owner are mutually exclusive' assert not machines, \ '--all and listing specific machines are incompatible' if ctx.num_to_lock: assert ctx.machine_type, \ 'must specify machine type to lock' if ctx.brief or ctx.list or ctx.list_targets: assert ctx.desc is None, '--desc does nothing with --list/--brief' # we may need to update host keys for vms. Don't do it for # every vm; however, update any vms included in the list given # to the CLI (machines), or any owned by the specified owner or # invoking user if no machines are specified. 
vmachines = [] statuses = query.get_statuses(machines) owner = ctx.owner or misc.get_user() for machine in statuses: if query.is_vm(status=machine) and machine['locked'] and \ (machines or machine['locked_by'] == owner): vmachines.append(machine['name']) if vmachines: log.info("updating host keys for %s", ' '.join(sorted(vmachines))) keys.do_update_keys(vmachines, _raise=False) # get statuses again to refresh any updated keys statuses = query.get_statuses(machines) if statuses: statuses = util.winnow(statuses, ctx.machine_type, 'machine_type') if not machines and ctx.owner is None and not ctx.all: ctx.owner = misc.get_user() statuses = util.winnow(statuses, ctx.owner, 'locked_by') statuses = util.winnow(statuses, ctx.status, 'up', lambda s: s['up'] == (ctx.status == 'up')) statuses = util.winnow( statuses, ctx.locked, 'locked', lambda s: s['locked'] == (ctx.locked == 'true')) statuses = util.winnow(statuses, ctx.desc, 'description') statuses = util.winnow(statuses, ctx.desc_pattern, 'description', lambda s: s['description'] and \ ctx.desc_pattern in s['description']) if ctx.json_query: statuses = util.json_matching_statuses(ctx.json_query, statuses) statuses = util.winnow(statuses, ctx.os_type, 'os_type') statuses = util.winnow(statuses, ctx.os_version, 'os_version') # When listing, only show the vm_host's name, not every detail for s in statuses: if not query.is_vm(status=s): continue # with an OpenStack API, there is no host for a VM if s['vm_host'] is None: continue vm_host_name = s.get('vm_host', dict())['name'] if vm_host_name: s['vm_host'] = vm_host_name if ctx.list: print json.dumps(statuses, indent=4) elif ctx.brief: for s in sorted(statuses, key=lambda s: s.get('name')): locked = "un" if s['locked'] == 0 else " " mo = re.match('\w+@(\w+?)\..*', s['name']) host = mo.group(1) if mo else s['name'] print '{host} {locked}locked {owner} "{desc}"'.format( locked=locked, host=host, owner=s['locked_by'], desc=s['description']) else: frag = {'targets': {}} for f in 
statuses: frag['targets'][f['name']] = f['ssh_pub_key'] print yaml.safe_dump(frag, default_flow_style=False) else: log.error('error retrieving lock statuses') ret = 1 elif ctx.summary: do_summary(ctx) return 0 elif ctx.lock: if not util.vps_version_or_type_valid(ctx.machine_type, ctx.os_type, ctx.os_version): log.error('Invalid os-type or version detected -- lock failed') return 1 reimage_types = teuthology.provision.fog.get_types() reimage_machines = list() updatekeys_machines = list() for machine in machines: resp = ops.lock_one(machine, user, ctx.desc) if resp.ok: machine_status = resp.json() machine_type = machine_status['machine_type'] if not resp.ok: ret = 1 if not ctx.f: return ret elif not query.is_vm(machine, machine_status): if machine_type in reimage_types: # Reimage in parallel just below here reimage_machines.append(machine) # Update keys last updatekeys_machines = list() else: machines_to_update.append(machine) teuthology.provision.create_if_vm( ctx, misc.canonicalize_hostname(machine), ) with teuthology.parallel.parallel() as p: for machine in reimage_machines: p.spawn(teuthology.provision.reimage, ctx, machine) for machine in updatekeys_machines: keys.do_update_keys([machine]) elif ctx.unlock: if ctx.owner is None and user is None: user = misc.get_user() # If none of them are vpm, do them all in one shot if not filter(query.is_vm, machines): res = ops.unlock_many(machines, user) return 0 if res else 1 for machine in machines: if not ops.unlock_one(ctx, machine, user): ret = 1 if not ctx.f: return ret else: machines_to_update.append(machine) elif ctx.num_to_lock: result = ops.lock_many(ctx, ctx.num_to_lock, ctx.machine_type, user, ctx.desc, ctx.os_type, ctx.os_version, ctx.arch) if not result: ret = 1 else: machines_to_update = result.keys() if ctx.machine_type == 'vps': shortnames = ' '.join([ misc.decanonicalize_hostname(name) for name in result.keys() ]) if len(result) < ctx.num_to_lock: log.error("Locking failed.") for machine in result: 
ops.unlock_one(ctx, machine, user) ret = 1 else: log.info("Successfully Locked:\n%s\n" % shortnames) log.info("Unable to display keys at this time (virtual " + "machines are booting).") log.info( "Please run teuthology-lock --list-targets %s once " + "these machines come up.", shortnames) else: print yaml.safe_dump(dict(targets=result), default_flow_style=False) elif ctx.update: assert ctx.desc is not None or ctx.status is not None, \ 'you must specify description or status to update' assert ctx.owner is None, 'only description and status may be updated' machines_to_update = machines if ctx.desc is not None or ctx.status is not None: for machine in machines_to_update: ops.update_lock(machine, ctx.desc, ctx.status) return ret
def lock_machines(ctx, config):
    """
    Lock machines.  Called when the teuthology run finds and locks
    new machines.  This is not called if the one has teuthology-locked
    machines and placed those keys in the Targets section of a yaml file.

    Generator task: locks ``config[0]`` machines of type ``config[1]``,
    yields while the job runs, then (conditionally) unlocks on the way out.
    """
    log.info('Locking machines...')
    assert isinstance(config[0], int), 'config[0] must be an integer'
    machine_type = config[1]
    how_many = config[0]
    # We want to make sure there are always this many machines available
    to_reserve = 5
    while True:
        # get a candidate list of machines
        machines = lock.list_locks(machine_type=machine_type, up=True,
                                   locked=False,
                                   count=how_many + to_reserve)
        if machines is None:
            if ctx.block:
                log.error('Error listing machines, trying again')
                time.sleep(20)
                continue
            else:
                raise RuntimeError('Error listing machines')
        # make sure there are machines for non-automated jobs to run
        if len(machines) <= to_reserve and ctx.owner.startswith('scheduled'):
            if ctx.block:
                log.info(
                    'waiting for more machines to be free (need %s see %s)...',
                    how_many,
                    len(machines),
                )
                time.sleep(10)
                continue
            else:
                assert 0, 'not enough machines free'

        newly_locked = lock.lock_many(ctx, how_many, machine_type, ctx.owner,
                                      ctx.archive)
        # lock_many returns a non-list falsy value on bad input.
        if not newly_locked and not isinstance(newly_locked, list):
            raise RuntimeError('Invalid parameters specified')
        if len(newly_locked) == how_many:
            vmlist = []
            for lmach in newly_locked:
                if misc.is_vm(lmach):
                    vmlist.append(lmach)
            if vmlist:
                log.info('Waiting for virtual machines to come up')
                keys_dict = dict()
                loopcount = 0
                # Poll via ssh-keyscan until every VM answers; every 40
                # iterations (~400s) recreate the ones still silent.
                while len(keys_dict) != len(vmlist):
                    loopcount += 1
                    time.sleep(10)
                    keys_dict = lock.ssh_keyscan(vmlist)
                    log.info('virtual machine is still unavailable')
                    if loopcount == 40:
                        loopcount = 0
                        log.info('virtual machine(s) still not up, ' +
                                 'recreating unresponsive ones.')
                        for guest in vmlist:
                            if guest not in keys_dict.keys():
                                log.info('recreating: ' + guest)
                                full_name = misc.canonicalize_hostname(guest)
                                provision.destroy_if_vm(ctx, full_name)
                                provision.create_if_vm(ctx, full_name)
                # A truthy return from do_update_keys indicates an error.
                if lock.do_update_keys(keys_dict):
                    log.info("Error in virtual machine keys")
                # Re-fetch public keys from the lock server so targets
                # carry the keys current after the VM (re)creations.
                newscandict = {}
                for dkey in newly_locked.iterkeys():
                    stats = lockstatus.get_status(dkey)
                    newscandict[dkey] = stats['ssh_pub_key']
                ctx.config['targets'] = newscandict
            else:
                ctx.config['targets'] = newly_locked
            # FIXME: Ugh.
            log.info('\n '.join(['Locked targets:', ] +
                                yaml.safe_dump(
                                    ctx.config['targets'],
                                    default_flow_style=False).splitlines()))
            break
        elif not ctx.block:
            assert 0, 'not enough machines are available'
        log.warn('Could not lock enough machines, waiting...')
        time.sleep(10)
    try:
        yield
    finally:
        # Unlock on the way out only on success, or when the job opted in
        # via unlock_on_failure.
        if ctx.config.get('unlock_on_failure', False) or \
                ctx.summary.get('success', False):
            log.info('Unlocking machines...')
            for machine in ctx.config['targets'].iterkeys():
                lock.unlock_one(ctx, machine, ctx.owner)