def do_summary(ctx):
    """Print a table of lock counts and up counts, grouped by owner and
    machine type, followed by column totals.

    :param ctx: Context with a ``machine_type`` attribute; when set, only
                locks of that machine type are summarized.
    """
    # Value layout per key: [count, up_count, machine_type].
    lockd = collections.defaultdict(lambda: [0, 0, 'unknown'])
    if ctx.machine_type:
        locks = query.list_locks(machine_type=ctx.machine_type)
    else:
        locks = query.list_locks()
    for lock in locks:
        # Key by (owner-or-free, machine_type) so that free nodes are
        # tallied separately for each machine type.
        who = lock['locked_by'] if lock['locked'] == 1 \
            else '(free)', lock['machine_type']
        lockd[who][0] += 1
        lockd[who][1] += 1 if lock['up'] else 0
        lockd[who][2] = lock['machine_type']

    # Sort rows by machine type, then by count.
    locks = sorted(lockd.items(),
                   key=lambda entry: (entry[1][2], entry[1][0]))
    total_count, total_up = 0, 0
    # print() is a statement-compatible call on py2 and required on py3.
    print("TYPE COUNT UP OWNER")
    for (owner, (count, upcount, machinetype)) in locks:
        print("{machinetype:8s} {count:3d} {up:3d} {owner}".format(
            count=count, up=upcount, owner=owner[0],
            machinetype=machinetype))
        total_count += count
        total_up += upcount

    print(" --- ---")
    print("{cnt:12d} {up:3d}".format(cnt=total_count, up=total_up))
def do_summary(ctx):
    """Print a table of lock counts and up counts, grouped by owner and
    machine type, followed by column totals.

    :param ctx: Context with a ``machine_type`` attribute; when set, only
                locks of that machine type are summarized.
    """
    # Value layout per key: [count, up_count, machine_type].
    lockd = collections.defaultdict(lambda: [0, 0, 'unknown'])
    if ctx.machine_type:
        locks = query.list_locks(machine_type=ctx.machine_type)
    else:
        locks = query.list_locks()
    for lock in locks:
        # Key by (owner-or-free, machine_type) so that free nodes are
        # tallied separately for each machine type.
        who = lock['locked_by'] if lock['locked'] == 1 \
            else '(free)', lock['machine_type']
        lockd[who][0] += 1
        lockd[who][1] += 1 if lock['up'] else 0
        lockd[who][2] = lock['machine_type']

    # .items() replaces py2-only .iteritems(); sort rows by machine type,
    # then by count.
    locks = sorted(lockd.items(),
                   key=lambda entry: (entry[1][2], entry[1][0]))
    total_count, total_up = 0, 0
    # print() is a statement-compatible call on py2 and required on py3.
    print("TYPE COUNT UP OWNER")
    for (owner, (count, upcount, machinetype)) in locks:
        print("{machinetype:8s} {count:3d} {up:3d} {owner}".format(
            count=count, up=upcount, owner=owner[0],
            machinetype=machinetype))
        total_count += count
        total_up += upcount

    print(" --- ---")
    print("{cnt:12d} {up:3d}".format(cnt=total_count, up=total_up))
def stale_openstack(ctx):
    """Detect stale OpenStack instances, nodes and volumes, and clean up
    leftovers unless this is a dry run.

    :param ctx: Context with a ``dry_run`` attribute; when true, the final
                removal pass is skipped.
    """
    # Index instances by their OpenStack ID for direct lookup.
    targets = {instance['ID']: instance
               for instance in OpenStack.list_instances()}
    nodes = list_locks(keyed_by_name=True, locked=True)
    stale_openstack_instances(ctx, targets, nodes)
    stale_openstack_nodes(ctx, targets, nodes)
    stale_openstack_volumes(ctx, OpenStack.list_volumes())
    if not ctx.dry_run:
        openstack_remove_again()
def nuke(ctx, should_unlock, sync_clocks=True, noipmi=False, keep_logs=False,
         should_reboot=True):
    """Nuke (reset) every target in ``ctx.config['targets']`` in parallel.

    When ``ctx.name`` is set, targets whose lock description does not match
    the run name, or whose lock is marked down, are skipped. Targets that
    could not be nuked are reported at the end.

    :param ctx:           Context holding ``config`` (with ``targets``) and
                          optionally ``name``.
    :param should_unlock: Passed through to ``nuke_one``.
    :param sync_clocks:   Passed through to ``nuke_one``.
    :param noipmi:        Passed through to ``nuke_one``.
    :param keep_logs:     Passed through to ``nuke_one``.
    :param should_reboot: Passed through to ``nuke_one``.
    """
    if 'targets' not in ctx.config:
        return
    total_unnuked = {}
    # Snapshot of the target names; ctx.config['targets'] may shrink below.
    targets = dict(ctx.config['targets'])
    if ctx.name:
        log.info('Checking targets against current locks')
        locks = list_locks()
        # Remove targets whose description doesn't match the archive name,
        # or whose lock reports the machine as down.
        for lock in locks:
            name = lock['name']
            # Membership test instead of scanning all targets per lock.
            if name not in targets:
                continue
            if ctx.name not in lock['description']:
                del ctx.config['targets'][name]
                log.info(
                    "Not nuking %s because description doesn't match",
                    name)
            elif lock.get('up') is False:
                del ctx.config['targets'][name]
                log.info(
                    "Not nuking %s because it is down",
                    name)
    with parallel() as p:
        for target, hostkey in ctx.config['targets'].items():
            p.spawn(
                nuke_one,
                ctx,
                {target: hostkey},
                should_unlock,
                sync_clocks,
                ctx.config.get('check-locks', True),
                noipmi,
                keep_logs,
                should_reboot,
            )
        for unnuked in p:
            if unnuked:
                total_unnuked.update(unnuked)
    if total_unnuked:
        log.error('Could not nuke the following targets:\n' +
                  '\n '.join(['targets:', ] + yaml.safe_dump(
                      total_unnuked,
                      default_flow_style=False).splitlines()))
def nuke(ctx, should_unlock, sync_clocks=True, reboot_all=True, noipmi=False):
    """Nuke (reset) every target in ``ctx.config['targets']`` in parallel.

    When ``ctx.name`` is set, targets whose lock description does not match
    the run name are skipped. Targets that could not be nuked are reported
    at the end.

    :param ctx:           Context holding ``config`` (with ``targets``) and
                          optionally ``name``.
    :param should_unlock: Passed through to ``nuke_one``.
    :param sync_clocks:   Passed through to ``nuke_one``.
    :param reboot_all:    Passed through to ``nuke_one``.
    :param noipmi:        Passed through to ``nuke_one``.
    """
    if 'targets' not in ctx.config:
        return
    total_unnuked = {}
    # Snapshot of the target names; ctx.config['targets'] may shrink below.
    targets = dict(ctx.config['targets'])
    if ctx.name:
        log.info('Checking targets against current locks')
        locks = list_locks()
        # Remove targets whose description doesn't match the archive name.
        for lock in locks:
            name = lock['name']
            # Membership test instead of scanning all targets per lock.
            if name not in targets:
                continue
            if ctx.name not in lock['description']:
                del ctx.config['targets'][name]
                log.info(
                    "Not nuking %s because description doesn't match",
                    name)
    with parallel() as p:
        # .items() replaces py2-only .iteritems().
        for target, hostkey in ctx.config['targets'].items():
            p.spawn(
                nuke_one,
                ctx,
                {target: hostkey},
                should_unlock,
                sync_clocks,
                reboot_all,
                ctx.config.get('check-locks', True),
                noipmi,
            )
        for unnuked in p:
            if unnuked:
                total_unnuked.update(unnuked)
    if total_unnuked:
        log.error('Could not nuke the following targets:\n' +
                  '\n '.join(['targets:', ] + yaml.safe_dump(
                      total_unnuked,
                      default_flow_style=False).splitlines()))
def do_update_keys(machines, all_=False, _raise=True):
    """Re-scan SSH host keys and push any changes to the lock database.

    :param machines: Hostnames to scan; ignored when ``all_`` is true.
    :param all_:     When true, scan every machine known to the lock server.
    :param _raise:   Forwarded to ``misc.ssh_keyscan``.
    :returns: Tuple of (result of ``push_new_keys``, scanned keys dict).
    """
    reference = query.list_locks(keyed_by_name=True)
    if all_:
        machines = reference.keys()
    keys_dict = misc.ssh_keyscan(machines, _raise=_raise)
    result = push_new_keys(keys_dict, reference)
    return result, keys_dict
def block_and_lock_machines(ctx, total_requested, machine_type, reimage=True):
    """Lock ``total_requested`` machines of ``machine_type``, retrying until
    enough are locked when ``ctx.block`` is set.

    On success, writes the locked targets into ``ctx.config['targets']``
    (ssh public keys for VMs) and pushes a 'running' job status. When not
    blocking and machines are unavailable, fails via assertion. Lock
    exceptions mark the job summary 'dead' before re-raising.

    :param ctx:             Context with ``config``, ``block``, ``owner``,
                            ``archive``, and optionally ``summary``.
    :param total_requested: Number of machines to lock in total.
    :param machine_type:    Machine type to request from the lock server.
    :param reimage:         Forwarded to ``lock_many``.
    """
    # It's OK for os_type and os_version to be None here.  If we're trying
    # to lock a bare metal machine, we'll take whatever is available.  If
    # we want a vps, defaults will be provided by misc.get_distro and
    # misc.get_distro_version in provision.create_if_vm
    os_type = ctx.config.get("os_type")
    os_version = ctx.config.get("os_version")
    arch = ctx.config.get('arch')
    reserved = config.reserve_machines
    assert isinstance(reserved, int), 'reserve_machines must be integer'
    assert (reserved >= 0), 'reserve_machines should >= 0'
    log.info('Locking machines...')
    # change the status during the locking process
    report.try_push_job_info(ctx.config, dict(status='waiting'))
    all_locked = dict()
    # 'requested' counts down as machines are locked across retries.
    requested = total_requested
    while True:
        # get a candidate list of machines
        machines = query.list_locks(machine_type=machine_type, up=True,
                                    locked=False,
                                    count=requested + reserved)
        # None signals a listing error (distinct from an empty list).
        if machines is None:
            if ctx.block:
                log.error('Error listing machines, trying again')
                time.sleep(20)
                continue
            else:
                raise RuntimeError('Error listing machines')
        # make sure there are machines for non-automated jobs to run
        if len(machines) < reserved + requested \
                and ctx.owner.startswith('scheduled'):
            if ctx.block:
                log.info(
                    'waiting for more %s machines to be free (need %s + %s, have %s)...',
                    machine_type,
                    reserved,
                    requested,
                    len(machines),
                )
                time.sleep(10)
                continue
            else:
                assert 0, ('not enough machines free; need %s + %s, have %s' %
                           (reserved, requested, len(machines)))
        try:
            newly_locked = lock_many(ctx, requested, machine_type,
                                     ctx.owner, ctx.archive, os_type,
                                     os_version, arch, reimage=reimage)
        except Exception:
            # Lock failures should map to the 'dead' status instead of 'fail'
            if 'summary' in ctx:
                set_status(ctx.summary, 'dead')
            raise
        all_locked.update(newly_locked)
        log.info(
            '{newly_locked} {mtype} machines locked this try, '
            '{total_locked}/{total_requested} locked so far'.format(
                newly_locked=len(newly_locked),
                mtype=machine_type,
                total_locked=len(all_locked),
                total_requested=total_requested,
            )
        )
        if len(all_locked) == total_requested:
            vmlist = []
            for lmach in all_locked:
                if teuthology.lock.query.is_vm(lmach):
                    vmlist.append(lmach)
            if vmlist:
                # Poll every 10s until every VM answers an ssh keyscan;
                # every 40 iterations, recreate VMs that still don't answer.
                log.info('Waiting for virtual machines to come up')
                keys_dict = dict()
                loopcount = 0
                while len(keys_dict) != len(vmlist):
                    loopcount += 1
                    time.sleep(10)
                    keys_dict = misc.ssh_keyscan(vmlist)
                    log.info('virtual machine is still unavailable')
                    if loopcount == 40:
                        loopcount = 0
                        log.info('virtual machine(s) still not up, ' +
                                 'recreating unresponsive ones.')
                        for guest in vmlist:
                            if guest not in keys_dict.keys():
                                log.info('recreating: ' + guest)
                                full_name = misc.canonicalize_hostname(guest)
                                teuthology.provision.destroy_if_vm(ctx, full_name)
                                teuthology.provision.create_if_vm(ctx, full_name)
                # NOTE(review): a truthy first element here appears to
                # indicate failure — confirm do_update_keys' contract.
                if teuthology.lock.ops.do_update_keys(keys_dict)[0]:
                    log.info("Error in virtual machine keys")
                # For VM runs, targets map each machine to its ssh pub key.
                newscandict = {}
                for dkey in all_locked.keys():
                    stats = teuthology.lock.query.get_status(dkey)
                    newscandict[dkey] = stats['ssh_pub_key']
                ctx.config['targets'] = newscandict
            else:
                ctx.config['targets'] = all_locked
            locked_targets = yaml.safe_dump(
                ctx.config['targets'],
                default_flow_style=False
            ).splitlines()
            log.info('\n '.join(['Locked targets:', ] + locked_targets))
            # successfully locked machines, change status back to running
            report.try_push_job_info(ctx.config, dict(status='running'))
            break
        elif not ctx.block:
            assert 0, 'not enough machines are available'
        else:
            # Partial success: reduce the outstanding request and retry.
            requested = requested - len(newly_locked)
            assert requested > 0, "lock_machines: requested counter went" \
                "negative, this shouldn't happen"
            log.info(
                "{total} machines locked ({new} new); need {more} more".format(
                    total=len(all_locked), new=len(newly_locked),
                    more=requested)
            )
            log.warn('Could not lock enough machines, waiting...')
            time.sleep(10)