def reimage(tpu, zone, project, version, yes, dry_run, async_, silent):
    """Reimages the OS on a TPU. Equivalent to `gcloud compute tpus reimage`"""
    # NOTE(review): docstring left verbatim -- presumably surfaced as CLI help.
    #
    # Parameters, as used below:
    #   tpu      -- one TPU selector string, or a sequence of selectors.
    #   zone, project, version -- forwarded to the tpunicorn helpers.
    #   yes      -- when true, skip the step preview and the confirmation.
    #   dry_run  -- forwarded to do_step(); also changes the wording printed.
    #   async_   -- forwarded to the reimage command; when true the
    #               "wait until HEALTHY" step is skipped.
    #   silent   -- forwarded to tpunicorn.get_tpu().
    #
    # The body previously iterated an undefined name (`tpus`), which raised
    # NameError as soon as the function ran.  Iterate the `tpu` parameter
    # instead; a bare string is wrapped so it is not iterated per character.
    choices = [tpu] if isinstance(tpu, str) else list(tpu)

    for choice in choices:
        # get_tpu(..., multi=True) is iterated: one selector may yield
        # several TPUs.
        for node in tpunicorn.get_tpu(choice, zone=zone, project=project,
                                      silent=silent, multi=True):
            reimage_cmd = tpunicorn.reimage_tpu_command(
                node, zone=zone, project=project, version=version,
                async_=async_)

            # Only ever invoked during this iteration, so closing over the
            # loop variable is safe here.
            def wait():
                wait_healthy(node, zone=zone, project=project)

            if not yes:
                print_step('Step 1: reimage TPU.', reimage_cmd)
                if not async_:
                    print_step('Step 2: wait until TPU is HEALTHY.', wait)
                prompt = 'Proceed? {}'.format('(dry run)' if dry_run else '')
                if not click.confirm(prompt):
                    # Declining aborts the whole run, not just this TPU.
                    return

            do_step('Step 1: reimage TPU...', reimage_cmd, dry_run=dry_run)
            if not async_:
                do_step('Step 2: wait for TPU to become HEALTHY...', wait,
                        dry_run=dry_run)
                # Readiness is only reported when we actually waited for it.
                click.echo('TPU {} {} ready for training.'.format(
                    tpunicorn.tpu.parse_tpu_id(node),
                    'would be' if dry_run else 'is'))
def start(tpus, zone, project, yes, dry_run, async_, silent):
    """Start TPU. Equivalent to `gcloud compute tpus start`"""
    # NOTE(review): docstring left verbatim -- presumably surfaced as CLI help.
    #
    # For every selector in `tpus`, each TPU returned by tpunicorn.get_tpu()
    # has its status printed, is started via do_step(), and the matching stop
    # command is shown afterwards.  `yes` skips the preview/confirmation;
    # `dry_run` is forwarded to do_step() and changes the printed wording.
    for selector in tpus:
        resolved = tpunicorn.get_tpu(selector, zone=zone, project=project,
                                     silent=silent, multi=True)
        for node in resolved:
            click.echo('Current status of TPU:')
            print_tpu_status_headers()
            print_tpu_status(node)
            click.echo('')

            stop_cmd = tpunicorn.stop_tpu_command(
                node, zone=zone, project=project, async_=async_)
            start_cmd = tpunicorn.start_tpu_command(
                node, zone=zone, project=project, async_=async_)

            if not yes:
                print_step('Step 1: start TPU.', start_cmd)
                prompt = 'Proceed? {}'.format('(dry run)' if dry_run else '')
                if not click.confirm(prompt):
                    # Declining ends the whole command, remaining TPUs included.
                    return

            do_step('Step 1: start TPU...', start_cmd, dry_run=dry_run)

            verb = 'would be' if dry_run else 'is'
            suffix = 'ing' if async_ else 'ed'
            click.echo('TPU {} {} start{}.'.format(
                tpunicorn.tpu.parse_tpu_id(node), verb, suffix))

            modal = 'could then' if dry_run else 'can'
            print_step('You {} stop the TPU with:'.format(modal), stop_cmd)
def check_healthy(tpu, zone=None, project=None, color=True, noisy=True):
    """Return True when the TPU reports status READY and health HEALTHY.

    The TPU is looked up through ``tpunicorn.get_tpu``; when ``noisy`` is
    true its status line is printed (with ``color`` forwarded) before the
    check is made.
    """
    info = tpunicorn.get_tpu(tpu, zone=zone, project=project)
    if noisy:
        print_tpu_status(info, color=color)
    observed = (
        tpunicorn.format(info, '{status}'),
        tpunicorn.format(info, '{health}'),
    )
    return observed == ('READY', 'HEALTHY')
def list_tpus(zone, project, format, color, tpu, silent):
    """List TPUs."""
    # NOTE(review): docstring left verbatim -- presumably surfaced as CLI help.
    #
    # With no entries in `tpu`, the general status listing is printed;
    # otherwise each entry is looked up individually and printed if found.
    requested = tpu
    if len(requested) == 0:
        print_tpus_status(zone=zone, project=project, format=format,
                          color=color)
        return

    if format == 'text':
        print_tpu_status_headers()
    for name in requested:
        found = tpunicorn.get_tpu(name, zone=zone, project=project,
                                  silent=silent)
        if found is None:
            # Lookup returned nothing for this entry; skip it.
            continue
        print_tpu_status(found, format=format, color=color)
def start(tpu, zone, project, yes, dry_run, async_):
    # Start a single TPU: print its current status, optionally preview the
    # step and ask for confirmation (skipped when `yes` is true), run the
    # start command through do_step(), then show how to stop it again.
    # `dry_run` is forwarded to do_step() and changes the printed wording.
    node = tpunicorn.get_tpu(tpu=tpu, zone=zone, project=project)

    click.echo('Current status of TPU:')
    print_tpu_status_headers()
    print_tpu_status(node)
    click.echo('')

    stop_cmd = tpunicorn.stop_tpu_command(
        node, zone=zone, project=project, async_=async_)
    start_cmd = tpunicorn.start_tpu_command(
        node, zone=zone, project=project, async_=async_)

    if not yes:
        print_step('Step 1: start TPU.', start_cmd)
        prompt = 'Proceed? {}'.format('(dry run)' if dry_run else '')
        if not click.confirm(prompt):
            return

    do_step('Step 1: start TPU...', start_cmd, dry_run=dry_run)

    verb = 'would be' if dry_run else 'is'
    click.echo('TPU {} {} started.'.format(
        tpunicorn.tpu.parse_tpu_id(node), verb))

    modal = 'could then' if dry_run else 'can'
    print_step('You {} stop the TPU with:'.format(modal), stop_cmd)
def delete(tpu, zone, project, yes, dry_run, async_):
    # Delete a single TPU: print its current status, optionally preview the
    # step and ask for confirmation (skipped when `yes` is true), run the
    # delete command through do_step(), then show the command that would
    # recreate it.  `dry_run` is forwarded to do_step() and changes wording.
    node = tpunicorn.get_tpu(tpu=tpu, zone=zone, project=project)

    click.echo('Current status of TPU:')
    print_tpu_status_headers()
    print_tpu_status(node)
    click.echo('')

    delete_cmd = tpunicorn.delete_tpu_command(
        node, zone=zone, project=project, async_=async_)
    create_cmd = tpunicorn.create_tpu_command(
        node, zone=zone, project=project, async_=async_)

    if not yes:
        print_step('Step 1: delete TPU.', delete_cmd)
        prompt = 'Proceed? {}'.format('(dry run)' if dry_run else '')
        if not click.confirm(prompt):
            return

    do_step('Step 1: delete TPU...', delete_cmd, dry_run=dry_run)

    verb = 'would be' if dry_run else 'is'
    click.echo('TPU {} {} deleted.'.format(
        tpunicorn.tpu.parse_tpu_id(node), verb))

    modal = 'could then' if dry_run else 'can'
    print_step('You {} recreate the TPU with:'.format(modal), create_cmd)
def recreate(tpu, zone, project, version, yes, dry_run, preempted, command, **kws):
    """
    Recreates a TPU, optionally switching the system software to the specified TF_VERSION.
    """
    # NOTE(review): docstring left verbatim -- presumably surfaced as CLI help.
    #
    # Flow: show status -> (if `preempted` is set, stop unless the TPU is
    # preempted) -> delete -> create -> wait until healthy -> run each entry
    # of `command` as an extra numbered step (numbering starts at 4).
    # `**kws` is accepted but not read here.
    node = tpunicorn.get_tpu(tpu=tpu, zone=zone, project=project)

    banner = 'Current status of TPU {} as of {}:'.format(
        tpunicorn.tpu.parse_tpu_id(node), tpunicorn.tpu.get_timestamp())
    click.echo(banner)
    print_tpu_status_headers()
    print_tpu_status(node)

    if preempted:
        if not is_preempted(node, zone=zone, project=project):
            return
    click.echo('')

    delete_cmd = tpunicorn.delete_tpu_command(node, zone=zone, project=project)
    create_cmd = tpunicorn.create_tpu_command(
        node, zone=zone, project=project, version=version)

    def wait():
        wait_healthy(node, zone=zone, project=project)

    if not yes:
        print_step('Step 1: delete TPU.', delete_cmd)
        print_step('Step 2: create TPU.', create_cmd)
        print_step('Step 3: wait until TPU is HEALTHY.', wait)
        if len(command) > 0:
            for step_no, cmd in enumerate(command, 4):
                print_step('Step {}: run this command:'.format(step_no), cmd)
        prompt = 'Proceed? {}'.format('(dry run)' if dry_run else '')
        if not click.confirm(prompt):
            return

    do_step('Step 1: delete TPU...', delete_cmd, dry_run=dry_run)
    do_step('Step 2: create TPU...', create_cmd, dry_run=dry_run)
    do_step('Step 3: wait for TPU to become HEALTHY...', wait,
            dry_run=dry_run)
    if len(command) > 0:
        for step_no, cmd in enumerate(command, 4):
            do_step('Step {}: running command...'.format(step_no), cmd,
                    dry_run=dry_run)

    verb = 'would be' if dry_run else 'is'
    click.echo('TPU {} {} ready for training.'.format(
        tpunicorn.tpu.parse_tpu_id(node), verb))
def reimage(tpu, zone, project, version, yes, dry_run):
    """Reimages the OS on a TPU."""
    # NOTE(review): docstring left verbatim -- presumably surfaced as CLI help.
    #
    # Two steps: run the reimage command, then wait for the TPU to become
    # healthy.  `yes` skips the preview/confirmation; `dry_run` is forwarded
    # to do_step() and changes the final wording.
    node = tpunicorn.get_tpu(tpu=tpu, zone=zone, project=project)
    reimage_cmd = tpunicorn.reimage_tpu_command(
        node, zone=zone, project=project, version=version)

    def wait():
        wait_healthy(node, zone=zone, project=project)

    if not yes:
        print_step('Step 1: reimage TPU.', reimage_cmd)
        print_step('Step 2: wait until TPU is HEALTHY.', wait)
        prompt = 'Proceed? {}'.format('(dry run)' if dry_run else '')
        if not click.confirm(prompt):
            return

    do_step('Step 1: reimage TPU...', reimage_cmd, dry_run=dry_run)
    do_step('Step 2: wait for TPU to become HEALTHY...', wait,
            dry_run=dry_run)

    verb = 'would be' if dry_run else 'is'
    click.echo('TPU {} {} ready for training.'.format(
        tpunicorn.tpu.parse_tpu_id(node), verb))
def recreate(tpu, zone, project, version, yes, dry_run, preempted, command, retry, retry_randomness, **kws):
    """
    Recreates a TPU, optionally switching the system software to the specified TF_VERSION.
    """
    # NOTE(review): docstring left verbatim -- presumably surfaced as CLI help.
    #
    # Flow: show status -> (if `preempted` is set, stop unless the TPU is
    # preempted) -> delete -> create (repeated while do_step() returns a
    # non-zero result and `retry` is not None) -> wait until healthy -> run
    # each entry of `command` as an extra numbered step starting at 4.
    # `**kws` is accepted but not read here.
    node = tpunicorn.get_tpu(tpu=tpu, zone=zone, project=project)

    banner = 'Current status of TPU {} as of {}:'.format(
        tpunicorn.tpu.parse_tpu_id(node), tpunicorn.tpu.get_timestamp())
    click.echo(banner)
    print_tpu_status_headers()
    print_tpu_status(node)

    if preempted:
        if not is_preempted(node, zone=zone, project=project):
            return
    click.echo('')

    delete_cmd = tpunicorn.delete_tpu_command(node, zone=zone, project=project)
    create_cmd = tpunicorn.create_tpu_command(
        node, zone=zone, project=project, version=version)

    def wait():
        wait_healthy(node, zone=zone, project=project)

    if not yes:
        print_step('Step 1: delete TPU.', delete_cmd)
        print_step('Step 2: create TPU.', create_cmd)
        print_step('Step 3: wait until TPU is HEALTHY.', wait)
        if len(command) > 0:
            for step_no, cmd in enumerate(command, 4):
                print_step('Step {}: run this command:'.format(step_no), cmd)
        prompt = 'Proceed? {}'.format('(dry run)' if dry_run else '')
        if not click.confirm(prompt):
            return

    do_step('Step 1: delete TPU...', delete_cmd, dry_run=dry_run)

    while True:
        outcome = do_step('Step 2: create TPU...', create_cmd,
                          dry_run=dry_run)
        failed = outcome != 0
        if not failed:
            break
        if retry is None:
            click.echo(
                'TPU {} failed to create (is the region out of capacity?)'.format(
                    tpunicorn.tpu.parse_tpu_id(node)),
                err=True)
            # NOTE(review): execution still falls through to step 3 after a
            # failed create -- confirm this is intended.
            break
        # Delay is `retry` scaled by a random factor in [1, retry_randomness];
        # the message divides by 60, so `retry` is presumably in seconds.
        factor = random.uniform(1, retry_randomness)
        click.echo(
            'TPU {} failed to create; trying again in {} minutes...'.format(
                tpunicorn.tpu.parse_tpu_id(node),
                int((retry * factor)//60)),
            err=True)
        time.sleep(retry * factor)

    do_step('Step 3: wait for TPU to become HEALTHY...', wait,
            dry_run=dry_run)
    if len(command) > 0:
        for step_no, cmd in enumerate(command, 4):
            do_step('Step {}: running command...'.format(step_no), cmd,
                    dry_run=dry_run)

    verb = 'would be' if dry_run else 'is'
    click.echo('TPU {} {} ready for training.'.format(
        tpunicorn.tpu.parse_tpu_id(node), verb))
def top(tpus, zone, project, color, silent):
    """Repeatedly print info about TPUs."""
    # NOTE(review): docstring left verbatim -- presumably surfaced as CLI help.
    #
    # Loops forever: clear the screen, print either the general listing (no
    # selectors given) or one status line per matching TPU, then sleep 5 s.
    fmt = 'text'
    while True:
        click.clear()
        if len(tpus) == 0:
            print_tpus_status(zone=zone, project=project, format=fmt,
                              color=color)
        else:
            if fmt == 'text':
                print_tpu_status_headers()
            for selector in tpus:
                matches = tpunicorn.get_tpu(selector, zone=zone,
                                            project=project, silent=silent,
                                            multi=True)
                for node in matches:
                    if node is None:
                        continue
                    print_tpu_status(node, format=fmt, color=color)
        time.sleep(5.0)
def list_tpus(tpus, zone, project, format, color, tpu, silent):
    """List TPUs."""
    # NOTE(review): docstring left verbatim -- presumably surfaced as CLI help.
    #
    # `tpus` are the selectors passed directly; `tpu` carries the values of
    # the deprecated -t/--tpu option and is merged into the same list.
    # With no selectors at all the general listing is printed; otherwise one
    # status line is printed per TPU that get_tpu() yields.
    if len(tpu) > 0:
        # logging.warn is a deprecated alias; logging.warning is the
        # supported spelling and emits the same record.
        logging.warning(
            '-t/--tpu argument is deprecated; you can just pass in the TPU IDs directly now. (in other words, just remove -t or --tpu from your script)'
        )
    # Copy before extending so the caller's sequence is never mutated.
    selectors = list(tpus)
    selectors.extend(tpu)

    if not selectors:
        print_tpus_status(zone=zone, project=project, format=format,
                          color=color)
        return

    if format == 'text':
        print_tpu_status_headers()
    for choice in selectors:
        for node in tpunicorn.get_tpu(choice, zone=zone, project=project,
                                      silent=silent, multi=True):
            if node is not None:
                print_tpu_status(node, format=format, color=color)
def is_preempted(tpu, zone=None, project=None):
    """Return True when the TPU's formatted status equals 'PREEMPTED'.

    The TPU is looked up through ``tpunicorn.get_tpu`` using the given
    ``zone`` and ``project`` before its status is read.
    """
    info = tpunicorn.get_tpu(tpu, zone=zone, project=project)
    return tpunicorn.format(info, '{status}') == 'PREEMPTED'