Example #1
  def _create_tpu_strategy():
    # `steps_per_run` and `**kwargs` are closed over from the enclosing
    # factory function (see Example #5 for the fuller version of this helper).
    resolver = cluster_resolver.TPUClusterResolver("")
    tpu_lib.initialize_tpu_system(resolver)
    strategy = tpu_lib.TPUStrategy(resolver,
                                   steps_per_run=steps_per_run,
                                   **kwargs)
    return strategy
  def testEagerTPUDistributionStrategy(self):
    self.skipTest("b/121387144")
    num_training_steps = 10
    checkpoint_directory = self.get_temp_dir()
    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")

    def _train_fn(optimizer, model):
      input_value = constant_op.constant([[3.]])
      optimizer.minimize(
          functools.partial(model, input_value),
          global_step=root.optimizer_step)

    for training_continuation in range(3):
      strategy = tpu_strategy.TPUStrategy()
      with strategy.scope():
        model = Subclassed()
        optimizer = adam_v1.AdamOptimizer(0.001)
        root = checkpointable_utils.Checkpoint(
            optimizer=optimizer, model=model,
            optimizer_step=training_util.get_or_create_global_step())
        root.restore(checkpoint_management.latest_checkpoint(
            checkpoint_directory))

        for _ in range(num_training_steps):
          strategy.extended.call_for_each_replica(
              functools.partial(_train_fn, optimizer, model))
        root.save(file_prefix=checkpoint_prefix)
        self.assertEqual((training_continuation + 1) * num_training_steps,
                         root.optimizer_step.numpy())
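The test above assumes a `Subclassed` model defined elsewhere in the same test file. A minimal, hypothetical stand-in (not the original definition, and written against the public `tf.keras` API for brevity) that would satisfy the `model(input_value)` call and the checkpoint round-trip could look like this:

import tensorflow as tf


class Subclassed(tf.keras.Model):
  """Hypothetical stand-in for the test's model."""

  def __init__(self):
    super(Subclassed, self).__init__()
    # A single dense layer is enough for the save/restore cycle exercised above.
    self.dense = tf.keras.layers.Dense(1)

  def call(self, inputs):
    return self.dense(inputs)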
Example #3
def TPUDistributionStrategy(tpu_cluster_resolver=None):  # pylint: disable=invalid-name
    """Construct a TPUDistributionStrategy."""
    from tensorflow.contrib.distribute.python import tpu_strategy  # pylint: disable=g-import-not-at-top
    # TODO(b/112705069): Remove this when TPUStrategy API is consistent.
    if tpu_cluster_resolver is None:
        tpu_cluster_resolver = tpu_cluster_resolver_lib.TPUClusterResolver('')

    args, _, _, _ = tf_inspect.getargspec(tpu_strategy.TPUStrategy.__init__)
    if len(args) == 3:
        logging.info('Detected new TPUStrategy API.')
        return tpu_strategy.TPUStrategy(tpu_cluster_resolver, steps_per_run=1)
    else:
        logging.info('Detected old TPUStrategy API.')
        strategy = tpu_strategy.TPUStrategy(num_cores_per_host=8)
        strategy._tpu_cluster_resolver = tpu_cluster_resolver

    return strategy
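The branch on `len(args)` is an argspec-based version check: the newer constructor takes `(self, tpu_cluster_resolver, steps_per_run)`, i.e. three positional arguments, while the older one does not. A minimal, standalone sketch of the same detection trick (class and function names here are illustrative, not part of the snippet above):

import inspect


class _NewStyle(object):
  def __init__(self, resolver, steps_per_run=1):
    self.resolver, self.steps_per_run = resolver, steps_per_run


class _OldStyle(object):
  def __init__(self, num_cores_per_host=8):
    self.num_cores_per_host = num_cores_per_host


def build(cls, resolver=None):
  # (self, resolver, steps_per_run) -> 3 positional args -> "new" API.
  args = inspect.getfullargspec(cls.__init__).args
  if len(args) == 3:
    return cls(resolver, steps_per_run=1)
  return cls(num_cores_per_host=8)


assert isinstance(build(_NewStyle), _NewStyle)
assert isinstance(build(_OldStyle), _OldStyle)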
Example #4
def main(argv):
    logging.info('Building Keras ResNet-50 model.')
    model = tf.keras.applications.resnet50.ResNet50(include_top=True,
                                                    weights=None,
                                                    input_tensor=None,
                                                    input_shape=None,
                                                    pooling=None,
                                                    classes=NUM_CLASSES)

    num_cores = 8
    batch_size = PER_CORE_BATCH_SIZE * num_cores

    if FLAGS.use_tpu:
        resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            tpu=FLAGS.tpu)
        strategy = tpu_lib.TPUStrategy(resolver, steps_per_run=100)
    else:
        strategy = None

    logging.info('Compiling model.')
    model.compile(
        optimizer=tf.train.GradientDescentOptimizer(learning_rate=1.0),
        loss='sparse_categorical_crossentropy',
        metrics=['sparse_categorical_accuracy'],
        distribute=strategy)

    # TODO(sourabhbajaj): Add support for synthetic dataset.
    if FLAGS.data_dir is None:
        raise ValueError('data_dir must be provided to train the model.')

    imagenet_train, imagenet_eval = [
        imagenet_input.ImageNetInput(is_training=is_training,
                                     data_dir=FLAGS.data_dir,
                                     per_core_batch_size=PER_CORE_BATCH_SIZE)
        for is_training in [True, False]
    ]
    logging.info('Training model using real data in directory "%s".',
                 FLAGS.data_dir)
    num_epochs = 90  # Standard imagenet training regime.
    model.fit(imagenet_train.input_fn(),
              epochs=num_epochs,
              steps_per_epoch=int(APPROX_IMAGENET_TRAINING_IMAGES /
                                  batch_size))

    if HAS_H5PY:
        weights_path = os.path.join(FLAGS.model_dir, WEIGHTS_TXT)
        logging.info('Save weights into %s', weights_path)
        model.save_weights(weights_path, overwrite=True)

    logging.info('Evaluating the model on the validation dataset.')
    score = model.evaluate(imagenet_eval.input_fn(),
                           steps=int(APPROX_IMAGENET_TEST_IMAGES //
                                     batch_size),
                           verbose=1)
    logging.info('Evaluation score: %s', score)
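The script references several module-level names (flags and constants) defined outside the excerpt. A hedged sketch of what those definitions typically look like; the exact values, filename, and flag help strings below are illustrative, not taken from the original file:

from absl import flags  # assumption: the script uses absl-style flags

FLAGS = flags.FLAGS
flags.DEFINE_bool('use_tpu', True, 'Whether to run on a TPU.')
flags.DEFINE_string('tpu', None, 'Name or address of the TPU to use.')
flags.DEFINE_string('data_dir', None, 'Directory with ImageNet TFRecords.')
flags.DEFINE_string('model_dir', None, 'Directory for checkpoints and weights.')

NUM_CLASSES = 1000                         # ImageNet classes
PER_CORE_BATCH_SIZE = 128                  # illustrative value
APPROX_IMAGENET_TRAINING_IMAGES = 1281167  # ImageNet train split size
APPROX_IMAGENET_TEST_IMAGES = 50000        # ImageNet validation split size
WEIGHTS_TXT = 'resnet50_weights.h5'        # illustrative filename

try:
  import h5py  # noqa: F401 -- only needed for saving weights in HDF5 format
  HAS_H5PY = True
except ImportError:
  HAS_H5PY = False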
Example #5
  def _create_tpu_strategy():
    resolver = cluster_resolver.TPUClusterResolver("")
    topology = tpu_lib.initialize_tpu_system(resolver)
    device_assignment = None
    if use_single_core:
      device_assignment = device_assignment_lib.DeviceAssignment(
          topology, core_assignment=device_assignment_lib.
          SINGLE_CORE_ASSIGNMENT)

    strategy = tpu_lib.TPUStrategy(resolver, steps_per_run=steps_per_run,
                                   device_assignment=device_assignment,
                                   **kwargs)
    return strategy
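As in Example #1, `steps_per_run`, `use_single_core`, and `**kwargs` are closed over from an enclosing factory. A hedged sketch of how such a factory might wrap the helper, assuming the same module aliases (`cluster_resolver`, `tpu_lib`, `device_assignment_lib`) as the snippet above; the wrapper name and defaults are assumptions:

def make_tpu_strategy_fn(steps_per_run=1, use_single_core=False, **kwargs):
  """Returns a zero-arg callable that builds a TPUStrategy when invoked."""

  def _create_tpu_strategy():
    resolver = cluster_resolver.TPUClusterResolver("")
    topology = tpu_lib.initialize_tpu_system(resolver)
    device_assignment = None
    if use_single_core:
      # Restrict the strategy to a single TPU core.
      device_assignment = device_assignment_lib.DeviceAssignment(
          topology,
          core_assignment=device_assignment_lib.SINGLE_CORE_ASSIGNMENT)
    return tpu_lib.TPUStrategy(resolver, steps_per_run=steps_per_run,
                               device_assignment=device_assignment, **kwargs)

  return _create_tpu_strategy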
Example #6
def TPUDistributionStrategy(tpu_cluster_resolver=None, num_cores=None):  # pylint: disable=invalid-name
    """Construct a TPUDistributionStrategy."""
    from tensorflow.contrib.distribute.python import tpu_strategy  # pylint: disable=g-import-not-at-top
    # TODO(b/112705069): Remove this when TPUStrategy API is consistent.
    # We are including this for (a) backwards compatibility for open sourced
    # releases of TensorFlow and (b) to work around a circular dependency
    # where keras_support and tpu_strategy depend on each other. Once we release
    # a final version and remove support for the old API, this will be deleted.
    # (See bug above for more details)
    if tpu_cluster_resolver is None:
        tpu_cluster_resolver = tpu_cluster_resolver_lib.TPUClusterResolver('')

    args, _, _, _ = tf_inspect.getargspec(tpu_strategy.TPUStrategy.__init__)
    if len(args) == 4:
        logging.info('Detected new TPUStrategy API.')
        return tpu_strategy.TPUStrategy(tpu_cluster_resolver,
                                        steps_per_run=1,
                                        num_cores=num_cores)
    else:
        logging.info('Detected old TPUStrategy API.')
        strategy = tpu_strategy.TPUStrategy(num_cores_per_host=8)
        strategy._tpu_cluster_resolver = tpu_cluster_resolver

    return strategy
Example #7
  @property
  def required_tpu(self):
    return self._required_tpu


# pylint: disable=g-long-lambda
default_strategy = NamedDistribution(
    "Default",
    distribution_strategy_context._get_default_distribution_strategy,  # pylint: disable=protected-access
    required_gpus=None)
one_device_strategy = NamedDistribution(
    "OneDeviceCPU", lambda: one_device_lib.OneDeviceStrategy("/cpu:0"),
    required_gpus=None)
tpu_strategy = NamedDistribution(
    "TPU", lambda: tpu_lib.TPUStrategy(
        TPUClusterResolver(""), steps_per_run=2),
    required_tpu=True)
tpu_strategy_one_step = NamedDistribution(
    "TPUOneStep", lambda: tpu_lib.TPUStrategy(
        TPUClusterResolver(""), steps_per_run=1),
    required_tpu=True)
mirrored_strategy_with_one_cpu = NamedDistribution(
    "Mirrored1CPU",
    lambda: mirrored_lib.MirroredStrategy(["/cpu:0"]))
mirrored_strategy_with_one_gpu = NamedDistribution(
    "Mirrored1GPU",
    lambda: mirrored_lib.MirroredStrategy(["/gpu:0"]),
    required_gpus=1)
mirrored_strategy_with_gpu_and_cpu = NamedDistribution(
    "MirroredCPUAndGPU",
    lambda: mirrored_lib.MirroredStrategy(["/gpu:0", "/cpu:0"]),
Example #8
    def required_tpu(self):
        return self._required_tpu


# pylint: disable=g-long-lambda
default_strategy = NamedDistribution(
    "Default",
    distribution_strategy_context._get_default_distribution_strategy,  # pylint: disable=protected-access
    required_gpus=None)
one_device_strategy = NamedDistribution(
    "OneDeviceCPU",
    lambda: one_device_lib.OneDeviceStrategy("/cpu:0"),
    required_gpus=None)
tpu_strategy = NamedDistribution(
    "TPU",
    lambda: tpu_lib.TPUStrategy(TPUClusterResolver(""), steps_per_run=5),
    required_tpu=True)
# Note that we disable prefetching for testing since prefetching makes
# the input non-deterministic.
mirrored_strategy_with_gpu_and_cpu = NamedDistribution(
    "MirroredCPUAndGPU",
    lambda: mirrored_lib.MirroredStrategy(["/gpu:0", "/cpu:0"],
                                          prefetch_on_device=False),
    required_gpus=1)
mirrored_strategy_with_two_gpus = NamedDistribution(
    "Mirrored2GPUs",
    lambda: mirrored_lib.MirroredStrategy(["/gpu:0", "/gpu:1"],
                                          prefetch_on_device=False),
    required_gpus=2)

adam_optimizer_v1_fn = NamedObject("AdamV1",
Example #9
def main(unused_argv):
  """Starts a ResNet training session."""
  tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
      FLAGS.tpu,
      zone=FLAGS.tpu_zone,
      project=FLAGS.gcp_project)

  # Estimator looks at the master it connects to for MonitoredTrainingSession
  # by reading the `TF_CONFIG` environment variable.
  tf_config_env = {
      'session_master': tpu_cluster_resolver.get_master(),
      'eval_session_master': tpu_cluster_resolver.get_master()
  }
  if tpu_cluster_resolver.cluster_spec():
    tf_config_env['cluster'] = tpu_cluster_resolver.cluster_spec().as_dict()
    tf_config_env['task'] = {'type': 'worker', 'index': 0}
  os.environ['TF_CONFIG'] = json.dumps(tf_config_env)

  steps_per_run_train = _NUM_TRAIN_IMAGES // (
      FLAGS.train_batch_size * FLAGS.num_cores)
  steps_per_run_eval = _NUM_EVAL_IMAGES // (
      FLAGS.eval_batch_size * FLAGS.num_cores)
  steps_per_eval = steps_per_run_train

  train_distribution = tpu_lib.TPUStrategy(tpu_cluster_resolver,
                                           steps_per_run=steps_per_run_train)
  eval_distribution = tpu_lib.TPUStrategy(tpu_cluster_resolver,
                                          steps_per_run=steps_per_run_eval)
  config = tf.estimator.RunConfig(
      model_dir=FLAGS.model_dir,
      train_distribute=train_distribution,
      eval_distribute=eval_distribution,
      save_checkpoints_steps=steps_per_eval,
      save_checkpoints_secs=None,
      keep_checkpoint_max=1000)

  resnet_estimator = tf.estimator.Estimator(
      model_fn=model_fn, config=config)

  train_input, eval_input = [imagenet_input.ImageNetInput(
      is_training=is_training,
      data_dir=FLAGS.data_dir,
      transpose_input=True,
      use_bfloat16=True) for is_training in [True, False]]

  try:
    current_step = resnet_estimator.get_variable_value(tf.GraphKeys.GLOBAL_STEP)
  except ValueError:
    current_step = 0

  while current_step < _TRAIN_STEPS:
    next_checkpoint = min(current_step + steps_per_eval, _TRAIN_STEPS)

    resnet_estimator.train(
        input_fn=lambda: train_input.input_fn(  # pylint: disable=g-long-lambda
            {'batch_size': FLAGS.train_batch_size}),
        max_steps=next_checkpoint)
    current_step = next_checkpoint

    eval_results = resnet_estimator.evaluate(
        input_fn=lambda: eval_input.input_fn(  # pylint: disable=g-long-lambda
            {'batch_size': FLAGS.eval_batch_size}),
        steps=_NUM_EVAL_IMAGES // (FLAGS.eval_batch_size * FLAGS.num_cores))

    tf.logging.info('Eval results: %s' % eval_results)
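For reference, the `TF_CONFIG` environment variable assembled near the top of `main` ends up as a small JSON document. The addresses below are placeholders, shown only to illustrate the shape of the value:

# Illustrative only -- the master address and cluster spec depend on the
# resolved TPU, so the real values will differ:
#
# os.environ['TF_CONFIG'] == json.dumps({
#     'session_master': 'grpc://10.240.1.2:8470',
#     'eval_session_master': 'grpc://10.240.1.2:8470',
#     'cluster': {'worker': ['10.240.1.2:8470']},
#     'task': {'type': 'worker', 'index': 0},
# })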
Example #10
    @property
    def required_tpu(self):
        return self._required_tpu


default_strategy = NamedDistribution(
    "Default",
    distribute_lib._default_distribution_strategy,  # pylint: disable=protected-access
    required_gpus=None)
one_device_strategy = NamedDistribution(
    "OneDeviceCPU",
    one_device_strategy.OneDeviceStrategy("/cpu:0"),
    required_gpus=None)
tpu_strategy_single_iteration = NamedDistribution(
    "TPUSingleIteration",
    tpu_strategy.TPUStrategy(iterations_per_step=1),
    required_tpu=True)
tpu_strategy = NamedDistribution("TPU",
                                 tpu_strategy.TPUStrategy(),
                                 required_tpu=True)
# Note that we disable prefetching for testing since prefetching makes
# the input non-deterministic.
mirrored_strategy_with_gpu_and_cpu = NamedDistribution(
    "MirroredCPUAndGPU",
    mirrored_strategy.MirroredStrategy(["/gpu:0", "/cpu:0"],
                                       prefetch_on_device=False),
    required_gpus=1)
mirrored_strategy_with_two_gpus = NamedDistribution(
    "Mirrored2GPUs",
    mirrored_strategy.MirroredStrategy(["/gpu:0", "/gpu:1"],
                                       prefetch_on_device=False),
    required_gpus=2)
Example #11
def TPUDistributionStrategy(*args, **kw):  # pylint: disable=invalid-name
    from tensorflow.contrib.distribute.python import tpu_strategy  # pylint: disable=g-import-not-at-top
    return tpu_strategy.TPUStrategy(*args, **kw)
Example #12
    def required_tpu(self):
        return self._required_tpu


# pylint: disable=g-long-lambda
default_strategy = NamedDistribution(
    "Default",
    lambda: distribute_lib._default_distribution_strategy,  # pylint: disable=protected-access
    required_gpus=None)
one_device_strategy = NamedDistribution(
    "OneDeviceCPU",
    lambda: one_device_lib.OneDeviceStrategy("/cpu:0"),
    required_gpus=None)
tpu_strategy = NamedDistribution(
    "TPU",
    lambda: tpu_lib.TPUStrategy(TPUClusterResolver("")),
    required_tpu=True)
# Note that we disable prefetching for testing since prefetching makes
# the input non-deterministic.
mirrored_strategy_with_gpu_and_cpu = NamedDistribution(
    "MirroredCPUAndGPU",
    lambda: mirrored_lib.MirroredStrategy(["/gpu:0", "/cpu:0"],
                                          prefetch_on_device=False),
    required_gpus=1)
mirrored_strategy_with_two_gpus = NamedDistribution(
    "Mirrored2GPUs",
    lambda: mirrored_lib.MirroredStrategy(["/gpu:0", "/gpu:1"],
                                          prefetch_on_device=False),
    required_gpus=2)

multi_worker_strategy_with_cpu = NamedDistribution(
Example #13
  @property
  def required_tpu(self):
    return self._required_tpu


default_strategy = NamedDistribution(
    "Default",
    distribute_lib._default_distribution_strategy,  # pylint: disable=protected-access
    required_gpus=None)
one_device_strategy = NamedDistribution(
    "OneDeviceCPU", one_device_strategy.OneDeviceStrategy("/cpu:0"),
    required_gpus=None)
tpu_strategy_single_iteration = NamedDistribution(
    "TPUSingleIteration",
    tpu_strategy.TPUStrategy(iterations_per_step=1),
    required_tpu=True)
tpu_strategy = NamedDistribution(
    "TPU", tpu_strategy.TPUStrategy(), required_tpu=True)
# Note that we disable prefetching for testing since prefetching makes
# the input non-deterministic.
mirrored_strategy_with_gpu_and_cpu = NamedDistribution(
    "MirroredCPUAndGPU",
    mirrored_strategy.MirroredStrategy(
        ["/gpu:0", "/cpu:0"], prefetch_on_device=False),
    required_gpus=1)
mirrored_strategy_with_two_gpus = NamedDistribution(
    "Mirrored2GPUs",
    mirrored_strategy.MirroredStrategy(
        ["/gpu:0", "/gpu:1"], prefetch_on_device=False),
    required_gpus=2)
Example #14
    def required_tpu(self):
        return self._required_tpu


# pylint: disable=g-long-lambda
default_strategy = NamedDistribution(
    "Default",
    lambda: distribute_lib._default_distribution_strategy,  # pylint: disable=protected-access
    required_gpus=None)
one_device_strategy = NamedDistribution(
    "OneDeviceCPU",
    lambda: one_device_lib.OneDeviceStrategy("/cpu:0"),
    required_gpus=None)
tpu_strategy_single_iteration = NamedDistribution(
    "TPUSingleIteration",
    lambda: tpu_lib.TPUStrategy(iterations_per_step=1),
    required_tpu=True)
tpu_strategy = NamedDistribution("TPU", tpu_lib.TPUStrategy, required_tpu=True)
# Note that we disable prefetching for testing since prefetching makes
# the input non-deterministic.
mirrored_strategy_with_gpu_and_cpu = NamedDistribution(
    "MirroredCPUAndGPU",
    lambda: mirrored_lib.MirroredStrategy(["/gpu:0", "/cpu:0"],
                                          prefetch_on_device=False),
    required_gpus=1)
mirrored_strategy_with_two_gpus = NamedDistribution(
    "Mirrored2GPUs",
    lambda: mirrored_lib.MirroredStrategy(["/gpu:0", "/gpu:1"],
                                          prefetch_on_device=False),
    required_gpus=2)
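These `NamedDistribution` objects are normally consumed by the test-combinations machinery rather than used directly. A hedged usage sketch, assuming the surrounding `combinations` module with its `combine`/`generate` helpers; the test name and body below are hypothetical:

# Hypothetical test parameterization; `generate`/`combine` live in the
# surrounding combinations module, and the method belongs to a test case class.
@combinations.generate(
    combinations.combine(
        distribution=[one_device_strategy, mirrored_strategy_with_two_gpus],
        mode=["graph"]))
def testSomeOp(self, distribution):
  with distribution.scope():
    ...  # build and run the replicated computation under test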
Example #15
    def strategy(self):
        return self._distribution

    @property
    def required_gpus(self):
        return self._required_gpus

    @property
    def required_tpu(self):
        return self._required_tpu


one_device_strategy = NamedDistribution(
    "OneDeviceCPU", one_device_strategy.OneDeviceStrategy("/cpu:0"), None)
tpu_strategy = NamedDistribution("TPU",
                                 tpu_strategy.TPUStrategy(),
                                 required_tpu=True)
mirrored_strategy_with_gpu_and_cpu = NamedDistribution(
    "MirroredCPUAndGPU",
    mirrored_strategy.MirroredStrategy(["/gpu:0", "/cpu:0"]), 1)
mirrored_strategy_without_prefetch = NamedDistribution(
    "MirroredCPUAndGPUNoPrefetch",
    mirrored_strategy.MirroredStrategy(["/gpu:0", "/cpu:0"],
                                       prefetch_on_device=False), 1)
mirrored_strategy_with_two_gpus = NamedDistribution(
    "Mirrored2GPUs", mirrored_strategy.MirroredStrategy(["/gpu:0", "/gpu:1"]),
    2)

adam_optimizer_v1_fn = NamedObject("AdamV1",
                                   lambda: adam.AdamOptimizer(0.2, epsilon=1))
gradient_descent_optimizer_v1_fn = NamedObject(