Exemple #1
0
def reimage(tpu, zone, project, version, yes, dry_run, async_, silent):
    """Reimages the OS on a TPU. Equivalent to `gcloud compute tpus reimage`

    `tpu` may be a single TPU identifier or a list/tuple of identifiers.
    Each identifier is resolved with `tpunicorn.get_tpu(..., multi=True)`
    and every resulting TPU is reimaged in turn. Unless `yes` is set, the
    planned steps are printed and the user must confirm; declining returns
    immediately and skips any remaining TPUs. When `async_` is set, the
    "wait until HEALTHY" step is skipped.
    """
    # The previous implementation looped over an undefined name (`tpus`),
    # raising NameError on the first call. Iterate over the `tpu` parameter
    # instead, accepting either one identifier or a sequence of them.
    choices = tpu if isinstance(tpu, (list, tuple)) else (tpu,)
    for choice in choices:
        # Use a distinct loop variable so the `tpu` parameter is not shadowed.
        for node in tpunicorn.get_tpu(choice,
                                      zone=zone,
                                      project=project,
                                      silent=silent,
                                      multi=True):
            reimage_cmd = tpunicorn.reimage_tpu_command(node,
                                                        zone=zone,
                                                        project=project,
                                                        version=version,
                                                        async_=async_)

            def wait():
                # Invoked within this same iteration, so `node` is the
                # TPU currently being processed.
                wait_healthy(node, zone=zone, project=project)

            if not yes:
                print_step('Step 1: reimage TPU.', reimage_cmd)
                if not async_:
                    print_step('Step 2: wait until TPU is HEALTHY.', wait)
                if not click.confirm(
                        'Proceed? {}'.format('(dry run)' if dry_run else '')):
                    return
            do_step('Step 1: reimage TPU...', reimage_cmd, dry_run=dry_run)
            if not async_:
                do_step('Step 2: wait for TPU to become HEALTHY...',
                        wait,
                        dry_run=dry_run)
                click.echo('TPU {} {} ready for training.'.format(
                    tpunicorn.tpu.parse_tpu_id(node),
                    'would be' if dry_run else 'is'))
Exemple #2
0
def start(tpus, zone, project, yes, dry_run, async_, silent):
    """Start TPU. Equivalent to `gcloud compute tpus start`

    Each entry of `tpus` is resolved with `tpunicorn.get_tpu(..., multi=True)`
    and every resulting TPU is started in turn. The current status is printed
    first. Unless `yes` is set, the planned step is shown and the user must
    confirm; declining returns immediately and skips any remaining TPUs.
    Afterwards the matching stop command is printed as a hint.
    """
    for choice in tpus:
        matches = tpunicorn.get_tpu(choice,
                                    zone=zone,
                                    project=project,
                                    silent=silent,
                                    multi=True)
        for node in matches:
            # Show where the TPU stands before touching it.
            click.echo('Current status of TPU:')
            print_tpu_status_headers()
            print_tpu_status(node)
            click.echo('')

            # Build both commands up front; the stop command is only printed.
            stop_cmd = tpunicorn.stop_tpu_command(
                node, zone=zone, project=project, async_=async_)
            start_cmd = tpunicorn.start_tpu_command(
                node, zone=zone, project=project, async_=async_)

            if not yes:
                print_step('Step 1: start TPU.', start_cmd)
                dry_note = '(dry run)' if dry_run else ''
                confirmed = click.confirm('Proceed? {}'.format(dry_note))
                if not confirmed:
                    return

            do_step('Step 1: start TPU...', start_cmd, dry_run=dry_run)

            tpu_id = tpunicorn.tpu.parse_tpu_id(node)
            tense = 'would be' if dry_run else 'is'
            ending = 'ing' if async_ else 'ed'
            click.echo('TPU {} {} start{}.'.format(tpu_id, tense, ending))

            modal = 'could then' if dry_run else 'can'
            print_step('You {} stop the TPU with:'.format(modal), stop_cmd)
Exemple #3
0
def check_healthy(tpu, zone=None, project=None, color=True, noisy=True):
  """Return True when the TPU's status is READY and its health is HEALTHY.

  The TPU is looked up via `tpunicorn.get_tpu`. When `noisy` is true its
  status line is printed first (with `color` forwarded to the printer).
  """
  info = tpunicorn.get_tpu(tpu, zone=zone, project=project)
  if noisy:
    print_tpu_status(info, color=color)
  status = tpunicorn.format(info, '{status}')
  health = tpunicorn.format(info, '{health}')
  # Both fields must match for the TPU to count as healthy.
  return (status, health) == ('READY', 'HEALTHY')
Exemple #4
0
def list_tpus(zone, project, format, color, tpu, silent):
  """List TPUs.

  With no entries in `tpu`, the status of all TPUs is printed via
  `print_tpus_status`. Otherwise each requested TPU is looked up
  individually and printed; lookups that return None are skipped.
  """
  if len(tpu) == 0:
    print_tpus_status(zone=zone, project=project, format=format, color=color)
    return

  # Headers are only emitted for the plain-text format.
  if format == 'text':
    print_tpu_status_headers()
  for name in tpu:
    info = tpunicorn.get_tpu(name, zone=zone, project=project, silent=silent)
    if info is None:
      continue
    print_tpu_status(info, format=format, color=color)
Exemple #5
0
def start(tpu, zone, project, yes, dry_run, async_):
  """Start a single TPU after printing its current status.

  Unless `yes` is set, the planned step is shown and the user must confirm;
  declining returns without doing anything. Afterwards the matching stop
  command is printed as a hint.
  """
  node = tpunicorn.get_tpu(tpu=tpu, zone=zone, project=project)

  click.echo('Current status of TPU:')
  print_tpu_status_headers()
  print_tpu_status(node)
  click.echo('')

  # Build both commands up front; the stop command is only printed.
  stop_cmd = tpunicorn.stop_tpu_command(node, zone=zone, project=project, async_=async_)
  start_cmd = tpunicorn.start_tpu_command(node, zone=zone, project=project, async_=async_)

  if not yes:
    print_step('Step 1: start TPU.', start_cmd)
    dry_note = '(dry run)' if dry_run else ''
    if not click.confirm('Proceed? {}'.format(dry_note)):
      return

  do_step('Step 1: start TPU...', start_cmd, dry_run=dry_run)

  tpu_id = tpunicorn.tpu.parse_tpu_id(node)
  tense = 'would be' if dry_run else 'is'
  click.echo('TPU {} {} started.'.format(tpu_id, tense))

  modal = 'could then' if dry_run else 'can'
  print_step('You {} stop the TPU with:'.format(modal), stop_cmd)
Exemple #6
0
def delete(tpu, zone, project, yes, dry_run, async_):
  """Delete a single TPU after printing its current status.

  Unless `yes` is set, the planned step is shown and the user must confirm;
  declining returns without doing anything. Afterwards the command that
  would recreate the TPU is printed as a hint.
  """
  node = tpunicorn.get_tpu(tpu=tpu, zone=zone, project=project)

  click.echo('Current status of TPU:')
  print_tpu_status_headers()
  print_tpu_status(node)
  click.echo('')

  # The create command is built before deletion so it can be shown afterwards.
  delete_cmd = tpunicorn.delete_tpu_command(node, zone=zone, project=project, async_=async_)
  create_cmd = tpunicorn.create_tpu_command(node, zone=zone, project=project, async_=async_)

  if not yes:
    print_step('Step 1: delete TPU.', delete_cmd)
    dry_note = '(dry run)' if dry_run else ''
    if not click.confirm('Proceed? {}'.format(dry_note)):
      return

  do_step('Step 1: delete TPU...', delete_cmd, dry_run=dry_run)

  tpu_id = tpunicorn.tpu.parse_tpu_id(node)
  tense = 'would be' if dry_run else 'is'
  click.echo('TPU {} {} deleted.'.format(tpu_id, tense))

  modal = 'could then' if dry_run else 'can'
  print_step('You {} recreate the TPU with:'.format(modal), create_cmd)
Exemple #7
0
def recreate(tpu, zone, project, version, yes, dry_run, preempted, command,
             **kws):
    """
    Recreates a TPU, optionally switching the system software to the specified TF_VERSION.

    The flow is: delete the TPU, create it again, wait until it is healthy,
    then run each entry of `command` as a further numbered step. When
    `preempted` is set and the TPU is not currently preempted, nothing is
    done. Unless `yes` is set, all steps are listed and must be confirmed.
    """
    node = tpunicorn.get_tpu(tpu=tpu, zone=zone, project=project)

    header = 'Current status of TPU {} as of {}:'.format(
        tpunicorn.tpu.parse_tpu_id(node), tpunicorn.tpu.get_timestamp())
    click.echo(header)
    print_tpu_status_headers()
    print_tpu_status(node)

    # With the "only if preempted" flag, leave a non-preempted TPU alone.
    if preempted and not is_preempted(node, zone=zone, project=project):
        return
    click.echo('')

    delete_cmd = tpunicorn.delete_tpu_command(node, zone=zone, project=project)
    create_cmd = tpunicorn.create_tpu_command(
        node, zone=zone, project=project, version=version)

    def wait():
        wait_healthy(node, zone=zone, project=project)

    # User-supplied commands are numbered after the three fixed steps.
    first_command_step = 4

    if not yes:
        print_step('Step 1: delete TPU.', delete_cmd)
        print_step('Step 2: create TPU.', create_cmd)
        print_step('Step 3: wait until TPU is HEALTHY.', wait)
        if len(command) > 0:
            for step_no, cmd in enumerate(command, first_command_step):
                print_step('Step {}: run this command:'.format(step_no), cmd)
        dry_note = '(dry run)' if dry_run else ''
        if not click.confirm('Proceed? {}'.format(dry_note)):
            return

    do_step('Step 1: delete TPU...', delete_cmd, dry_run=dry_run)
    do_step('Step 2: create TPU...', create_cmd, dry_run=dry_run)
    do_step('Step 3: wait for TPU to become HEALTHY...', wait, dry_run=dry_run)
    if len(command) > 0:
        for step_no, cmd in enumerate(command, first_command_step):
            do_step('Step {}: running command...'.format(step_no),
                    cmd,
                    dry_run=dry_run)

    tense = 'would be' if dry_run else 'is'
    click.echo('TPU {} {} ready for training.'.format(
        tpunicorn.tpu.parse_tpu_id(node), tense))
Exemple #8
0
def reimage(tpu, zone, project, version, yes, dry_run):
    """Reimages the OS on a TPU.

    The TPU is reimaged and then waited on until it is healthy. Unless
    `yes` is set, both steps are listed and the user must confirm;
    declining returns without doing anything.
    """
    node = tpunicorn.get_tpu(tpu=tpu, zone=zone, project=project)
    reimage_cmd = tpunicorn.reimage_tpu_command(
        node, zone=zone, project=project, version=version)

    def wait():
        wait_healthy(node, zone=zone, project=project)

    if not yes:
        print_step('Step 1: reimage TPU.', reimage_cmd)
        print_step('Step 2: wait until TPU is HEALTHY.', wait)
        dry_note = '(dry run)' if dry_run else ''
        if not click.confirm('Proceed? {}'.format(dry_note)):
            return

    do_step('Step 1: reimage TPU...', reimage_cmd, dry_run=dry_run)
    do_step('Step 2: wait for TPU to become HEALTHY...', wait, dry_run=dry_run)

    tense = 'would be' if dry_run else 'is'
    click.echo('TPU {} {} ready for training.'.format(
        tpunicorn.tpu.parse_tpu_id(node), tense))
Exemple #9
0
def recreate(tpu, zone, project, version, yes, dry_run, preempted, command, retry, retry_randomness, **kws):
  """
  Recreates a TPU, optionally switching the system software to the specified TF_VERSION.

  The flow is: delete the TPU, create it again (retrying while the create
  step returns a non-zero result and `retry` is not None), wait until it is
  healthy, then run each entry of `command` as a further numbered step.
  Between attempts the function sleeps for `retry` multiplied by a random
  factor drawn uniformly from [1, retry_randomness]; the message reports
  that delay divided by 60 as "minutes".
  """
  node = tpunicorn.get_tpu(tpu=tpu, zone=zone, project=project)

  header = 'Current status of TPU {} as of {}:'.format(
    tpunicorn.tpu.parse_tpu_id(node), tpunicorn.tpu.get_timestamp())
  click.echo(header)
  print_tpu_status_headers()
  print_tpu_status(node)

  # With the "only if preempted" flag, leave a non-preempted TPU alone.
  if preempted and not is_preempted(node, zone=zone, project=project):
    return
  click.echo('')

  delete_cmd = tpunicorn.delete_tpu_command(node, zone=zone, project=project)
  create_cmd = tpunicorn.create_tpu_command(node, zone=zone, project=project, version=version)

  def wait():
    wait_healthy(node, zone=zone, project=project)

  # User-supplied commands are numbered after the three fixed steps.
  first_command_step = 4

  if not yes:
    print_step('Step 1: delete TPU.', delete_cmd)
    print_step('Step 2: create TPU.', create_cmd)
    print_step('Step 3: wait until TPU is HEALTHY.', wait)
    if len(command) > 0:
      for step_no, cmd in enumerate(command, first_command_step):
        print_step('Step {}: run this command:'.format(step_no), cmd)
    dry_note = '(dry run)' if dry_run else ''
    if not click.confirm('Proceed? {}'.format(dry_note)):
      return

  do_step('Step 1: delete TPU...', delete_cmd, dry_run=dry_run)

  # Keep attempting creation until the step reports 0.
  while True:
    if do_step('Step 2: create TPU...', create_cmd, dry_run=dry_run) == 0:
      break
    if retry is None:
      click.echo('TPU {} failed to create (is the region out of capacity?)'.format(
        tpunicorn.tpu.parse_tpu_id(node)), err=True)
      # NOTE(review): after this failure the flow still continues to step 3
      # and the final "ready" message; confirm that this is intended.
      break
    delay = retry * random.uniform(1, retry_randomness)
    click.echo('TPU {} failed to create; trying again in {} minutes...'.format(
      tpunicorn.tpu.parse_tpu_id(node), int(delay // 60)), err=True)
    time.sleep(delay)

  do_step('Step 3: wait for TPU to become HEALTHY...', wait, dry_run=dry_run)
  if len(command) > 0:
    for step_no, cmd in enumerate(command, first_command_step):
      do_step('Step {}: running command...'.format(step_no), cmd, dry_run=dry_run)

  tense = 'would be' if dry_run else 'is'
  click.echo('TPU {} {} ready for training.'.format(
    tpunicorn.tpu.parse_tpu_id(node), tense))
Exemple #10
0
 def top(tpus, zone, project, color, silent):
     """Repeatedly print info about TPUs.

     Loops forever: clears the screen, prints the status of all TPUs (when
     `tpus` is empty) or of each requested TPU, then sleeps five seconds.
     """
     fmt = 'text'  # output is always rendered as text here
     while True:
         click.clear()
         if len(tpus) > 0:
             # The format is fixed to text, so headers are always printed.
             print_tpu_status_headers()
             for choice in tpus:
                 matches = tpunicorn.get_tpu(choice,
                                             zone=zone,
                                             project=project,
                                             silent=silent,
                                             multi=True)
                 for info in matches:
                     if info is None:
                         continue
                     print_tpu_status(info, format=fmt, color=color)
         else:
             print_tpus_status(zone=zone,
                               project=project,
                               format=fmt,
                               color=color)
         time.sleep(5.0)
Exemple #11
0
 def list_tpus(tpus, zone, project, format, color, tpu, silent):
     """List TPUs.

     `tpus` holds the TPU identifiers passed directly. `tpu` holds those
     given through the deprecated -t/--tpu option; any such entries are
     appended to `tpus` after a deprecation warning is logged. With no
     identifiers at all, the status of every TPU is printed via
     `print_tpus_status`. Otherwise each identifier is resolved with
     `tpunicorn.get_tpu(..., multi=True)` and every non-None result is
     printed.
     """
     if len(tpu) > 0:
         # logging.warn is a deprecated alias that was removed in
         # Python 3.13; logging.warning is the supported spelling.
         logging.warning(
             '-t/--tpu argument is deprecated; you can just pass in the TPU IDs directly now. (in other words, just remove -t or --tpu from your script)'
         )
         # Copy before extending so the caller's sequence is not mutated
         # (and so a tuple can be extended at all).
         tpus = list(tpus)
         tpus.extend(tpu)
     if len(tpus) <= 0:
         print_tpus_status(zone=zone,
                           project=project,
                           format=format,
                           color=color)
     else:
         # Headers are only emitted for the plain-text format.
         if format == 'text':
             print_tpu_status_headers()
         for choice in tpus:
             for info in tpunicorn.get_tpu(choice,
                                           zone=zone,
                                           project=project,
                                           silent=silent,
                                           multi=True):
                 if info is not None:
                     print_tpu_status(info, format=format, color=color)
Exemple #12
0
def is_preempted(tpu, zone=None, project=None):
  """Return True when the TPU's formatted status equals 'PREEMPTED'.

  The TPU is looked up via `tpunicorn.get_tpu` before its status is read.
  """
  info = tpunicorn.get_tpu(tpu, zone=zone, project=project)
  return tpunicorn.format(info, '{status}') == 'PREEMPTED'