Example #1
 def testTrainResults(self):
     samples = mnist_benchmark.MakeSamplesFromTrainOutput(
         self.metadata, self.contents, 0)
     for s in samples:
         print(s)
     golden = [
         Sample(
             'Loss', 5.7193503, '', {
                 'num_examples_per_epoch': 1251.1,
                 'epoch': 3.197186475901207,
                 'elapsed seconds': 0,
                 'step': 4000
             }),
         Sample(
             'Global Steps Per Second', 1.4384171428571428,
             'global_steps/sec', {
                 'num_examples_per_epoch': 1251.1,
                 'epoch': 3.197186475901207,
                 'elapsed seconds': 0,
                 'step': 4000
             }),
         Sample(
             'Examples Per Second', 1472.9414285714283, 'examples/sec', {
                 'num_examples_per_epoch': 1251.1,
                 'epoch': 3.197186475901207,
                 'elapsed seconds': 0,
                 'step': 4000
             })
     ]
     self.assertEqual(samples, golden)
Example #2
 def testTrainResults(self):
     samples = mnist_benchmark.MakeSamplesFromTrainOutput(
         self.metadata_input, self.contents, 0)
     golden = [
         Sample('Loss', 0.09562386, '', self.metadata_output),
         Sample('Global Steps Per Second', 217.69966666666664,
                'global_steps/sec', self.metadata_output),
         Sample('Examples Per Second', 222924.33333333334, 'examples/sec',
                self.metadata_output)
     ]
     self.assertEqual(samples, golden)
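These tests compare against a Sample type and rely on fixtures (self.metadata_input, self.metadata_output, self.contents) defined elsewhere in the test module. A minimal sketch of the scaffolding they assume; this timestamp-free namedtuple is a hypothetical stand-in chosen so assertEqual can compare the golden tuples directly, not necessarily PerfKitBenchmarker's own sample.Sample:

import collections

# Hypothetical stand-in for the Sample type used in the goldens above.
# Omitting a timestamp field keeps tuple equality exact in assertEqual.
Sample = collections.namedtuple('Sample', ['metric', 'value', 'unit', 'metadata'])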
Example #3
 def testTrainResults(self):
   samples = mnist_benchmark.MakeSamplesFromTrainOutput(
       self.metadata_input, self.contents, 0)
   golden = [
       Sample('Loss', 3.6859958, '', self.metadata_output),
       Sample('Global Steps Per Second', 3.6699466666666667,
              'global_steps/sec', self.metadata_output),
       Sample('Examples Per Second', 3758.023333333333,
              'examples/sec', self.metadata_output)
   ]
   self.assertEqual(samples, golden)
Example #4
 def testTrainResults(self):
   samples = mnist_benchmark.MakeSamplesFromTrainOutput(
       self.metadata_input, self.contents, 0)
   golden = [
       Sample('Loss', 5.7193503, '', self.metadata_output),
       Sample('Global Steps Per Second', 1.4384171428571428,
              'global_steps/sec', self.metadata_output),
       Sample('Examples Per Second', 1472.9414285714283,
              'examples/sec', self.metadata_output)
   ]
   self.assertEqual(samples, golden)
Example #5
 def testTrainResults(self):
   samples = mnist_benchmark.MakeSamplesFromTrainOutput(
       self.metadata, self.contents, 0)
   golden = [
       Sample(
           'Loss', 0.09562386, '',
           {'num_examples_per_epoch': 1251.1, 'step': 2000,
            'elapsed seconds': 0, 'epoch': 1.5985932379506036}),
       Sample(
           'Global Steps Per Second', 217.69966666666664, 'global_steps/sec',
           {'num_examples_per_epoch': 1251.1, 'step': 2000,
            'elapsed seconds': 0, 'epoch': 1.5985932379506036}),
       Sample(
           'Examples Per Second', 222924.33333333334, 'examples/sec',
           {'num_examples_per_epoch': 1251.1, 'step': 2000,
            'elapsed seconds': 0, 'epoch': 1.5985932379506036})
   ]
   self.assertEqual(samples, golden)
Example #6
 def testTrainResults(self):
   samples = mnist_benchmark.MakeSamplesFromTrainOutput(
       self.metadata, self.contents, 0)
   golden = [
       Sample(
           'Loss', 3.6859958, '',
           {'epoch': 4.000479577971386, 'elapsed seconds': 0,
            'num_examples_per_epoch': 1251.1, 'step': 5005}),
       Sample(
           'Global Steps Per Second', 3.6699466666666667, 'global_steps/sec',
           {'epoch': 4.000479577971386, 'elapsed seconds': 0,
            'num_examples_per_epoch': 1251.1, 'step': 5005}),
       Sample(
           'Examples Per Second', 3758.023333333333, 'examples/sec',
           {'epoch': 4.000479577971386, 'elapsed seconds': 0,
            'num_examples_per_epoch': 1251.1, 'step': 5005})
   ]
   self.assertEqual(samples, golden)
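One detail worth noting across the goldens above: the epoch metadata is derived directly from the step count, epoch = step / num_examples_per_epoch. A quick check against the step values that appear in the goldens (2000, 4000, 5005):

# Reproduce the epoch values from the golden metadata above
# (num_examples_per_epoch = 1251.1).
for step in (2000, 4000, 5005):
    print(step / 1251.1)  # 1.5985932379506036, 3.197186475901207, 4.000479577971386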
Example #7
def Run(benchmark_spec):
  """Run ResNet on the cluster.

  Args:
    benchmark_spec: The benchmark specification. Contains all data that is
        required to run the benchmark.

  Returns:
    A list of sample.Sample objects.
  """
  _UpdateBenchmarkSpecWithFlags(benchmark_spec)
  vm = benchmark_spec.vms[0]
  resnet_benchmark_script = 'resnet_main.py'
  resnet_benchmark_cmd = (
      '{env_cmd} && cd tpu/models/official/resnet && '
      'python {script} '
      '--use_tpu={use_tpu} '
      '--data_dir={data_dir} '
      '--model_dir={model_dir} '
      '--resnet_depth={depth} '
      '--train_batch_size={train_batch_size} '
      '--eval_batch_size={eval_batch_size} '
      '--iterations_per_loop={iterations} '
      '--data_format={data_format} '
      '--precision={precision} '
      '--skip_host_call={skip_host_call} '
      '--num_train_images={num_train_images} '
      '--num_eval_images={num_eval_images}'.format(
          env_cmd=benchmark_spec.env_cmd,
          script=resnet_benchmark_script,
          use_tpu=bool(benchmark_spec.tpus),
          data_dir=benchmark_spec.data_dir,
          model_dir=benchmark_spec.model_dir,
          depth=benchmark_spec.depth,
          train_batch_size=benchmark_spec.train_batch_size,
          eval_batch_size=benchmark_spec.eval_batch_size,
          iterations=benchmark_spec.iterations,
          data_format=benchmark_spec.data_format,
          precision=benchmark_spec.precision,
          skip_host_call=benchmark_spec.skip_host_call,
          num_train_images=benchmark_spec.num_train_images,
          num_eval_images=benchmark_spec.num_eval_images
      ))
  if FLAGS.tf_device == 'gpu':
    resnet_benchmark_cmd = '{env} {cmd}'.format(
        env=tensorflow.GetEnvironmentVars(vm), cmd=resnet_benchmark_cmd)
  samples = []
  metadata = _CreateMetadataDict(benchmark_spec)
  elapsed_seconds = 0
  steps_per_eval = benchmark_spec.steps_per_eval
  train_steps = benchmark_spec.train_steps
  for step in range(steps_per_eval, train_steps + steps_per_eval,
                    steps_per_eval):
    step = min(step, train_steps)
    resnet_benchmark_cmd_step = '{cmd} --train_steps={step}'.format(
        cmd=resnet_benchmark_cmd, step=step)
    if benchmark_spec.mode in ('train', 'train_and_eval'):
      if benchmark_spec.tpus:
        tpu = benchmark_spec.tpu_groups['train'].GetName()
        num_cores = '--num_cores={}'.format(
            benchmark_spec.tpu_groups['train'].GetNumShards())
      else:
        tpu = num_cores = ''
      resnet_benchmark_train_cmd = (
          '{cmd} --tpu={tpu} --mode=train {num_cores}'.format(
              cmd=resnet_benchmark_cmd_step,
              tpu=tpu, num_cores=num_cores))
      start = time.time()
      stdout, stderr = vm.RobustRemoteCommand(resnet_benchmark_train_cmd,
                                              should_log=True)
      elapsed_seconds += (time.time() - start)
      samples.extend(mnist_benchmark.MakeSamplesFromTrainOutput(
          metadata, stdout + stderr, elapsed_seconds, step))
    if benchmark_spec.mode in ('train_and_eval', 'eval'):
      if benchmark_spec.tpus:
        tpu = benchmark_spec.tpu_groups['eval'].GetName()
        num_cores = '--num_cores={}'.format(
            benchmark_spec.tpu_groups['eval'].GetNumShards())
      else:
        tpu = num_cores = ''
      resnet_benchmark_eval_cmd = (
          '{cmd} --tpu={tpu} --mode=eval {num_cores}'.format(
              cmd=resnet_benchmark_cmd_step,
              tpu=tpu, num_cores=num_cores))
      stdout, stderr = vm.RobustRemoteCommand(resnet_benchmark_eval_cmd,
                                              should_log=True)
      samples.extend(MakeSamplesFromEvalOutput(
          metadata, stdout + stderr, elapsed_seconds))
  return samples
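The loop above advances --train_steps in increments of steps_per_eval, clamping the final chunk to train_steps, and interleaves an eval after each chunk in train_and_eval mode. A standalone sketch of the resulting schedule, using hypothetical values that match the steps seen in the test goldens:

# Checkpoint schedule produced by the range/min pattern in Run above.
steps_per_eval, train_steps = 2000, 5005
schedule = [min(step, train_steps)
            for step in range(steps_per_eval, train_steps + steps_per_eval,
                              steps_per_eval)]
print(schedule)  # [2000, 4000, 5005]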
Example #8
def Run(benchmark_spec):
    """Run Inception V3 on the cluster.

    Args:
      benchmark_spec: The benchmark specification. Contains all data that is
          required to run the benchmark.

    Returns:
      A list of sample.Sample objects.
    """
    _UpdateBenchmarkSpecWithFlags(benchmark_spec)
    vm = benchmark_spec.vms[0]
    inception3_benchmark_script = (
        'tpu/models/experimental/inception/inception_v3.py')
    inception3_benchmark_cmd = (
        '{env_cmd} && python {script} '
        '--learning_rate={learning_rate} '
        '--iterations={iterations} '
        '--use_tpu={use_tpu} '
        '--use_data={use_data} '
        '--train_steps_per_eval={steps_per_eval} '
        '--data_dir={data_dir} '
        '--model_dir={model_dir} '
        '--save_checkpoints_secs={save_checkpoints_secs} '
        '--train_batch_size={train_batch_size} '
        '--eval_batch_size={eval_batch_size} '
        '--precision={precision}'.format(
            env_cmd=benchmark_spec.env_cmd,
            script=inception3_benchmark_script,
            learning_rate=benchmark_spec.learning_rate,
            iterations=benchmark_spec.iterations,
            use_tpu=bool(benchmark_spec.tpus),
            use_data=benchmark_spec.use_data,
            steps_per_eval=benchmark_spec.steps_per_eval,
            data_dir=benchmark_spec.data_dir,
            model_dir=benchmark_spec.model_dir,
            save_checkpoints_secs=benchmark_spec.save_checkpoints_secs,
            train_batch_size=benchmark_spec.train_batch_size,
            eval_batch_size=benchmark_spec.eval_batch_size,
            precision=benchmark_spec.precision))
    if FLAGS.tf_device == 'gpu':
        inception3_benchmark_cmd = '{env} {cmd}'.format(
            env=tensorflow.GetEnvironmentVars(vm),
            cmd=inception3_benchmark_cmd)
    samples = []
    metadata = _CreateMetadataDict(benchmark_spec)
    elapsed_seconds = 0
    steps_per_eval = benchmark_spec.steps_per_eval
    train_steps = benchmark_spec.train_steps
    for step in range(steps_per_eval, train_steps + steps_per_eval,
                      steps_per_eval):
        step = min(step, train_steps)
        inception3_benchmark_cmd_step = '{cmd} --train_steps={step}'.format(
            cmd=inception3_benchmark_cmd, step=step)
        if benchmark_spec.mode in ('train', 'train_and_eval'):
            if benchmark_spec.tpus:
                tpu = benchmark_spec.tpu_groups['train'].GetName()
                num_shards = '--num_shards={}'.format(
                    benchmark_spec.tpu_groups['train'].GetNumShards())
            else:
                tpu = num_shards = ''
            inception3_benchmark_train_cmd = (
                '{cmd} --tpu={tpu} --mode=train {num_shards}'.format(
                    cmd=inception3_benchmark_cmd_step,
                    tpu=tpu,
                    num_shards=num_shards))
            start = time.time()
            stdout, stderr = vm.RobustRemoteCommand(
                inception3_benchmark_train_cmd, should_log=True)
            elapsed_seconds += (time.time() - start)
            samples.extend(
                mnist_benchmark.MakeSamplesFromTrainOutput(
                    metadata, stdout + stderr, elapsed_seconds, step))
        if benchmark_spec.mode in ('train_and_eval', 'eval'):
            if benchmark_spec.tpus:
                tpu = benchmark_spec.tpu_groups['eval'].GetName()
                num_shards = '--num_shards={}'.format(
                    benchmark_spec.tpu_groups['eval'].GetNumShards())
            else:
                tpu = num_shards = ''
            inception3_benchmark_eval_cmd = (
                '{cmd} --tpu={tpu} --mode=eval {num_shards}'.format(
                    cmd=inception3_benchmark_cmd_step,
                    tpu=tpu,
                    num_shards=num_shards))
            stdout, stderr = vm.RobustRemoteCommand(
                inception3_benchmark_eval_cmd, should_log=True)
            samples.extend(
                resnet_benchmark.MakeSamplesFromEvalOutput(
                    metadata, stdout + stderr, elapsed_seconds))
    return samples
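Example #8 mirrors the ResNet Run above; the notable differences are the TPU parallelism flag (--num_shards here versus --num_cores for ResNet) and that eval output is parsed by resnet_benchmark.MakeSamplesFromEvalOutput. The shared command-building pattern, reduced to a sketch with hypothetical names and values:

# Format the base command once, then append per-step and per-mode flags.
base_cmd = 'python model_main.py --model_dir=/tmp/model'  # hypothetical
for step in (2000, 4000):
    step_cmd = '{cmd} --train_steps={step}'.format(cmd=base_cmd, step=step)
    train_cmd = '{cmd} --mode=train'.format(cmd=step_cmd)
    eval_cmd = '{cmd} --mode=eval'.format(cmd=step_cmd)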
Example #9
def Run(benchmark_spec):
  """Run ResNet on the cluster.

  Args:
    benchmark_spec: The benchmark specification. Contains all data that is
        required to run the benchmark.

  Returns:
    A list of sample.Sample objects.
  """
  _UpdateBenchmarkSpecWithFlags(benchmark_spec)
  vm = benchmark_spec.vms[0]
  if benchmark_spec.tpus:
    resnet_benchmark_script = 'resnet_main.py'
    resnet_benchmark_cmd = (
        '{env_cmd} && '
        'cd tpu/models && '
        'export PYTHONPATH=$(pwd) && '
        'cd official/resnet && '
        'python {script} '
        '--use_tpu={use_tpu} '
        '--data_dir={data_dir} '
        '--model_dir={model_dir} '
        '--resnet_depth={depth} '
        '--train_batch_size={train_batch_size} '
        '--eval_batch_size={eval_batch_size} '
        '--iterations_per_loop={iterations} '
        '--data_format={data_format} '
        '--precision={precision} '
        '--skip_host_call={skip_host_call} '
        '--num_train_images={num_train_images} '
        '--num_eval_images={num_eval_images}'.format(
            env_cmd=benchmark_spec.env_cmd,
            script=resnet_benchmark_script,
            use_tpu=bool(benchmark_spec.tpus),
            data_dir=benchmark_spec.data_dir,
            model_dir=benchmark_spec.model_dir,
            depth=benchmark_spec.depth,
            train_batch_size=benchmark_spec.train_batch_size,
            eval_batch_size=benchmark_spec.eval_batch_size,
            iterations=benchmark_spec.iterations,
            data_format=benchmark_spec.data_format,
            precision=benchmark_spec.precision,
            skip_host_call=benchmark_spec.skip_host_call,
            num_train_images=benchmark_spec.num_train_images,
            num_eval_images=benchmark_spec.num_eval_images))
  else:
    resnet_benchmark_script = 'imagenet_main.py'
    resnet_benchmark_cmd = ('{env_cmd} && '
                            'cd models && '
                            'export PYTHONPATH=$(pwd) && '
                            'cd official/r1/resnet && '
                            'python {script} '
                            '--data_dir=/data/imagenet '
                            '--model_dir={model_dir} '
                            '--resnet_size={resnet_size} '
                            '--batch_size={batch_size} '
                            '--data_format={data_format} '.format(
                                env_cmd=benchmark_spec.env_cmd,
                                script=resnet_benchmark_script,
                                model_dir=benchmark_spec.model_dir,
                                resnet_size=benchmark_spec.depth,
                                batch_size=benchmark_spec.train_batch_size,
                                data_format=benchmark_spec.data_format))
    precision = benchmark_spec.precision
    dtype = 'fp16' if precision == 'bfloat16' else 'fp32'
    resnet_benchmark_cmd = '{cmd} --dtype={dtype}'.format(
        cmd=resnet_benchmark_cmd, dtype=dtype)

    if nvidia_driver.CheckNvidiaGpuExists(vm):
      resnet_benchmark_cmd = '{env} {cmd} --num_gpus={num_gpus}'.format(
          env=tensorflow.GetEnvironmentVars(vm),
          cmd=resnet_benchmark_cmd,
          num_gpus=nvidia_driver.QueryNumberOfGpus(vm))

  samples = []
  metadata = _CreateMetadataDict(benchmark_spec)
  elapsed_seconds = 0
  steps_per_eval = benchmark_spec.steps_per_eval
  train_steps = benchmark_spec.train_steps
  for step in range(steps_per_eval, train_steps + steps_per_eval,
                    steps_per_eval):
    step = min(step, train_steps)
    resnet_benchmark_cmd_step = '{cmd} --train_steps={step}'.format(
        cmd=resnet_benchmark_cmd, step=step)

    if benchmark_spec.mode in ('train', 'train_and_eval'):
      if benchmark_spec.tpus:
        tpu = benchmark_spec.tpu_groups['train'].GetName()
        num_cores = '--num_cores={}'.format(
            benchmark_spec.tpu_groups['train'].GetNumShards())
        resnet_benchmark_train_cmd = (
            '{cmd} --tpu={tpu} --mode=train {num_cores}'.format(
                cmd=resnet_benchmark_cmd_step, tpu=tpu, num_cores=num_cores))
      else:
        resnet_benchmark_train_cmd = (
            '{cmd} --max_train_steps={max_train_steps} '
            '--train_epochs={train_epochs} --noeval_only'.format(
                cmd=resnet_benchmark_cmd,
                train_epochs=benchmark_spec.epochs_per_eval,
                max_train_steps=step))

      start = time.time()
      stdout, stderr = vm.RobustRemoteCommand(resnet_benchmark_train_cmd,
                                              should_log=True)
      elapsed_seconds += (time.time() - start)
      samples.extend(mnist_benchmark.MakeSamplesFromTrainOutput(
          metadata, stdout + stderr, elapsed_seconds, step))

    if benchmark_spec.mode in ('train_and_eval', 'eval'):
      if benchmark_spec.tpus:
        tpu = benchmark_spec.tpu_groups['eval'].GetName()
        num_cores = '--num_cores={}'.format(
            benchmark_spec.tpu_groups['eval'].GetNumShards())
        resnet_benchmark_eval_cmd = (
            '{cmd} --tpu={tpu} --mode=eval {num_cores}'.format(
                cmd=resnet_benchmark_cmd_step, tpu=tpu, num_cores=num_cores))
      else:
        resnet_benchmark_eval_cmd = ('{cmd} --eval_only'.format(
            cmd=resnet_benchmark_cmd))

      stdout, stderr = vm.RobustRemoteCommand(resnet_benchmark_eval_cmd,
                                              should_log=True)
      samples.extend(
          MakeSamplesFromEvalOutput(
              metadata,
              stdout + stderr,
              elapsed_seconds,
              use_tpu=bool(benchmark_spec.tpus)))
  return samples
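A subtlety shared by all three Run examples: elapsed_seconds is accumulated only around the training command, and the eval samples are stamped with that same running total, so eval wall-clock time never enters the elapsed-seconds metadata. A minimal sketch of the pattern:

import time

# Only training chunks are timed; eval reuses the running total unchanged.
elapsed_seconds = 0
for _ in range(3):
    start = time.time()
    # ... training chunk runs here (vm.RobustRemoteCommand in the examples) ...
    elapsed_seconds += time.time() - start
    # ... eval runs here untimed; its samples carry elapsed_seconds as-is ...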