def _create_tpu_strategy():
  """Build a TPUStrategy backed by a freshly initialized TPU system.

  Relies on `steps_per_run` and `kwargs` from the enclosing scope.
  """
  tpu_resolver = cluster_resolver.TPUClusterResolver("")
  # The TPU system must be (re)initialized before a strategy can target it.
  tpu_lib.initialize_tpu_system(tpu_resolver)
  return tpu_lib.TPUStrategy(
      tpu_resolver, steps_per_run=steps_per_run, **kwargs)
def testEagerTPUDistributionStrategy(self):
  """Checkpoint save/restore across several training continuations on TPU."""
  # Test is disabled pending the referenced bug.
  self.skipTest("b/121387144")
  num_training_steps = 10
  checkpoint_directory = self.get_temp_dir()
  checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")

  def _train_fn(optimizer, model):
    # One training step on a constant input. NOTE: `root` is resolved
    # late from the enclosing scope -- it is assigned below, before this
    # function is first called.
    input_value = constant_op.constant([[3.]])
    optimizer.minimize(
        functools.partial(model, input_value),
        global_step=root.optimizer_step)

  # Each continuation restores the latest checkpoint, trains further,
  # and saves again; the global step must accumulate across them.
  for training_continuation in range(3):
    strategy = tpu_strategy.TPUStrategy()
    with strategy.scope():
      model = Subclassed()
      optimizer = adam_v1.AdamOptimizer(0.001)
      root = checkpointable_utils.Checkpoint(
          optimizer=optimizer, model=model,
          optimizer_step=training_util.get_or_create_global_step())
      # First iteration: latest_checkpoint is None and restore is a no-op.
      root.restore(checkpoint_management.latest_checkpoint(
          checkpoint_directory))

      for _ in range(num_training_steps):
        strategy.extended.call_for_each_replica(
            functools.partial(_train_fn, optimizer, model))
      root.save(file_prefix=checkpoint_prefix)
      self.assertEqual((training_continuation + 1) * num_training_steps,
                       root.optimizer_step.numpy())
def TPUDistributionStrategy(tpu_cluster_resolver=None):  # pylint: disable=invalid-name
  """Construct a TPUDistributionStrategy."""
  from tensorflow.contrib.distribute.python import tpu_strategy  # pylint: disable=g-import-not-at-top
  # TODO -- remove this when TPUStrategy API is consistent (b/112705069)
  if tpu_cluster_resolver is None:
    tpu_cluster_resolver = tpu_cluster_resolver_lib.TPUClusterResolver('')

  # The constructor's argspec tells the old and new TPUStrategy APIs apart.
  init_args, _, _, _ = tf_inspect.getargspec(tpu_strategy.TPUStrategy.__init__)
  if len(init_args) != 3:
    logging.info('Detected old TPUStrategy API.')
    legacy_strategy = tpu_strategy.TPUStrategy(num_cores_per_host=8)
    legacy_strategy._tpu_cluster_resolver = tpu_cluster_resolver
    return legacy_strategy
  logging.info('Detected new TPUStrategy API.')
  return tpu_strategy.TPUStrategy(tpu_cluster_resolver, steps_per_run=1)
def main(argv):
  """Build, train, and evaluate Keras ResNet-50 on ImageNet data."""
  logging.info('Building Keras ResNet-50 model.')
  model = tf.keras.applications.resnet50.ResNet50(
      include_top=True,
      weights=None,
      input_tensor=None,
      input_shape=None,
      pooling=None,
      classes=NUM_CLASSES)

  num_cores = 8
  batch_size = PER_CORE_BATCH_SIZE * num_cores

  # Only distribute onto TPU when requested; otherwise compile undistributed.
  if FLAGS.use_tpu:
    tpu_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
        tpu=FLAGS.tpu)
    distribution = tpu_lib.TPUStrategy(tpu_resolver, steps_per_run=100)
  else:
    distribution = None

  logging.info('Compiling model.')
  model.compile(
      optimizer=tf.train.GradientDescentOptimizer(learning_rate=1.0),
      loss='sparse_categorical_crossentropy',
      metrics=['sparse_categorical_accuracy'],
      distribute=distribution)

  # TODO(sourabhbajaj): Add support for synthetic dataset.
  if FLAGS.data_dir is None:
    raise ValueError('data_dir must be provided to train the model.')

  train_data = imagenet_input.ImageNetInput(
      is_training=True,
      data_dir=FLAGS.data_dir,
      per_core_batch_size=PER_CORE_BATCH_SIZE)
  eval_data = imagenet_input.ImageNetInput(
      is_training=False,
      data_dir=FLAGS.data_dir,
      per_core_batch_size=PER_CORE_BATCH_SIZE)

  logging.info('Training model using real data in directory "%s".',
               FLAGS.data_dir)
  epochs = 90  # Standard imagenet training regime.
  model.fit(train_data.input_fn(),
            epochs=epochs,
            steps_per_epoch=int(APPROX_IMAGENET_TRAINING_IMAGES / batch_size))

  # Weights can only be saved when h5py is available.
  if HAS_H5PY:
    weights_path = os.path.join(FLAGS.model_dir, WEIGHTS_TXT)
    logging.info('Save weights into %s', weights_path)
    model.save_weights(weights_path, overwrite=True)

  logging.info('Evaluating the model on the validation dataset.')
  eval_score = model.evaluate(
      eval_data.input_fn(),
      steps=int(APPROX_IMAGENET_TEST_IMAGES // batch_size),
      verbose=1)
  logging.info('Evaluation score: %s', eval_score)
def _create_tpu_strategy():
  """Create a TPUStrategy, optionally pinned to a single TPU core.

  Relies on `use_single_core`, `steps_per_run`, and `kwargs` from the
  enclosing scope.
  """
  tpu_resolver = cluster_resolver.TPUClusterResolver("")
  topology = tpu_lib.initialize_tpu_system(tpu_resolver)

  # A single-core device assignment restricts work to one core; None
  # lets TPUStrategy use its default assignment.
  if use_single_core:
    assignment = device_assignment_lib.DeviceAssignment(
        topology,
        core_assignment=device_assignment_lib.SINGLE_CORE_ASSIGNMENT)
  else:
    assignment = None

  return tpu_lib.TPUStrategy(
      tpu_resolver,
      steps_per_run=steps_per_run,
      device_assignment=assignment,
      **kwargs)
def TPUDistributionStrategy(tpu_cluster_resolver=None, num_cores=None):  # pylint: disable=invalid-name
  """Construct a TPUDistributionStrategy."""
  from tensorflow.contrib.distribute.python import tpu_strategy  # pylint: disable=g-import-not-at-top
  # TODO(b/112705069): Remove this when TPUStrategy API is consistent.
  # Kept for (a) backwards compatibility with open sourced releases of
  # TensorFlow and (b) to work around a circular dependency between
  # keras_support and tpu_strategy. Delete once a final version ships and
  # support for the old API is dropped (details in the bug above).
  if tpu_cluster_resolver is None:
    tpu_cluster_resolver = tpu_cluster_resolver_lib.TPUClusterResolver('')

  # The constructor's argspec tells the old and new TPUStrategy APIs apart.
  init_args, _, _, _ = tf_inspect.getargspec(tpu_strategy.TPUStrategy.__init__)
  if len(init_args) != 4:
    logging.info('Detected old TPUStrategy API.')
    legacy_strategy = tpu_strategy.TPUStrategy(num_cores_per_host=8)
    legacy_strategy._tpu_cluster_resolver = tpu_cluster_resolver
    return legacy_strategy
  logging.info('Detected new TPUStrategy API.')
  return tpu_strategy.TPUStrategy(
      tpu_cluster_resolver, steps_per_run=1, num_cores=num_cores)
@property
def required_tpu(self):
  # Whether this named distribution needs a TPU to run.
  return self._required_tpu


# Named distribution-strategy fixtures used to parameterize tests.
# pylint: disable=g-long-lambda
default_strategy = NamedDistribution(
    "Default",
    distribution_strategy_context._get_default_distribution_strategy,  # pylint: disable=protected-access
    required_gpus=None)
one_device_strategy = NamedDistribution(
    "OneDeviceCPU", lambda: one_device_lib.OneDeviceStrategy("/cpu:0"),
    required_gpus=None)
tpu_strategy = NamedDistribution(
    "TPU", lambda: tpu_lib.TPUStrategy(
        TPUClusterResolver(""), steps_per_run=2),
    required_tpu=True)
tpu_strategy_one_step = NamedDistribution(
    "TPUOneStep", lambda: tpu_lib.TPUStrategy(
        TPUClusterResolver(""), steps_per_run=1),
    required_tpu=True)
mirrored_strategy_with_one_cpu = NamedDistribution(
    "Mirrored1CPU",
    lambda: mirrored_lib.MirroredStrategy(["/cpu:0"]))
mirrored_strategy_with_one_gpu = NamedDistribution(
    "Mirrored1GPU",
    lambda: mirrored_lib.MirroredStrategy(["/gpu:0"]),
    required_gpus=1)
# NOTE(review): this statement continues beyond this chunk of the file.
mirrored_strategy_with_gpu_and_cpu = NamedDistribution(
    "MirroredCPUAndGPU",
    lambda: mirrored_lib.MirroredStrategy(["/gpu:0", "/cpu:0"]),
def required_tpu(self): return self._required_tpu # pylint: disable=g-long-lambda default_strategy = NamedDistribution( "Default", distribution_strategy_context._get_default_distribution_strategy, # pylint: disable=protected-access required_gpus=None) one_device_strategy = NamedDistribution( "OneDeviceCPU", lambda: one_device_lib.OneDeviceStrategy("/cpu:0"), required_gpus=None) tpu_strategy = NamedDistribution( "TPU", lambda: tpu_lib.TPUStrategy(TPUClusterResolver(""), steps_per_run=5), required_tpu=True) # Note that we disable prefetching for testing since prefetching makes # the input non-deterministic. mirrored_strategy_with_gpu_and_cpu = NamedDistribution( "MirroredCPUAndGPU", lambda: mirrored_lib.MirroredStrategy(["/gpu:0", "/cpu:0"], prefetch_on_device=False), required_gpus=1) mirrored_strategy_with_two_gpus = NamedDistribution( "Mirrored2GPUs", lambda: mirrored_lib.MirroredStrategy(["/gpu:0", "/gpu:1"], prefetch_on_device=False), required_gpus=2) adam_optimizer_v1_fn = NamedObject("AdamV1",
def main(unused_argv):
  """Starts a ResNet training session."""
  tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
      FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

  # Estimator looks at the master it connects to for MonitoredTrainingSession
  # by reading the `TF_CONFIG` environment variable.
  tf_config_env = {
      'session_master': tpu_cluster_resolver.get_master(),
      'eval_session_master': tpu_cluster_resolver.get_master()
  }
  if tpu_cluster_resolver.cluster_spec():
    tf_config_env['cluster'] = tpu_cluster_resolver.cluster_spec().as_dict()
    tf_config_env['task'] = {'type': 'worker', 'index': 0}
  os.environ['TF_CONFIG'] = json.dumps(tf_config_env)

  # Steps per TPU "run": images per epoch divided by the global batch size.
  steps_per_run_train = _NUM_TRAIN_IMAGES // (
      FLAGS.train_batch_size * FLAGS.num_cores)
  steps_per_run_eval = _NUM_EVAL_IMAGES // (
      FLAGS.eval_batch_size * FLAGS.num_cores)
  # Checkpoint (and evaluate) once per training epoch.
  steps_per_eval = steps_per_run_train

  train_distribution = tpu_lib.TPUStrategy(tpu_cluster_resolver,
                                           steps_per_run=steps_per_run_train)
  eval_distribution = tpu_lib.TPUStrategy(tpu_cluster_resolver,
                                          steps_per_run=steps_per_run_eval)
  config = tf.estimator.RunConfig(
      model_dir=FLAGS.model_dir,
      train_distribute=train_distribution,
      eval_distribute=eval_distribution,
      save_checkpoints_steps=steps_per_eval,
      save_checkpoints_secs=None,
      keep_checkpoint_max=1000)
  resnet_estimator = tf.estimator.Estimator(
      model_fn=model_fn, config=config)

  train_input, eval_input = [imagenet_input.ImageNetInput(
      is_training=is_training,
      data_dir=FLAGS.data_dir,
      transpose_input=True,
      use_bfloat16=True) for is_training in [True, False]]

  # Resume from the last checkpointed global step if one exists.
  try:
    current_step = resnet_estimator.get_variable_value(tf.GraphKeys.GLOBAL_STEP)
  except ValueError:
    current_step = 0

  # Alternate train/eval until the target number of steps is reached.
  # NOTE(review): original line breaks were lost; evaluate/log are placed
  # inside the loop (matching the train-then-eval cadence) -- confirm.
  while current_step < _TRAIN_STEPS:
    next_checkpoint = min(current_step + steps_per_eval, _TRAIN_STEPS)
    resnet_estimator.train(
        input_fn=lambda: train_input.input_fn(  # pylint: disable=g-long-lambda
            {'batch_size': FLAGS.train_batch_size}),
        max_steps=next_checkpoint)
    current_step = next_checkpoint
    eval_results = resnet_estimator.evaluate(
        input_fn=lambda: eval_input.input_fn(  # pylint: disable=g-long-lambda
            {'batch_size': FLAGS.eval_batch_size}),
        steps=_NUM_EVAL_IMAGES // (FLAGS.eval_batch_size * FLAGS.num_cores))
    tf.logging.info('Eval results: %s' % eval_results)
@property
def required_tpu(self):
  # Whether this named distribution needs a TPU to run.
  return self._required_tpu


# Named distribution-strategy fixtures used to parameterize tests.
default_strategy = NamedDistribution(
    "Default",
    distribute_lib._default_distribution_strategy,  # pylint: disable=protected-access
    required_gpus=None)
one_device_strategy = NamedDistribution(
    "OneDeviceCPU", one_device_strategy.OneDeviceStrategy("/cpu:0"),
    required_gpus=None)
tpu_strategy_single_iteration = NamedDistribution(
    "TPUSingleIteration",
    tpu_strategy.TPUStrategy(iterations_per_step=1),
    required_tpu=True)
# NOTE(review): this assignment presumably shadows the `tpu_strategy` module
# used just above; keep it after `tpu_strategy_single_iteration`.
tpu_strategy = NamedDistribution("TPU", tpu_strategy.TPUStrategy(),
                                 required_tpu=True)
# Note that we disable prefetching for testing since prefetching makes
# the input non-deterministic.
mirrored_strategy_with_gpu_and_cpu = NamedDistribution(
    "MirroredCPUAndGPU",
    mirrored_strategy.MirroredStrategy(["/gpu:0", "/cpu:0"],
                                       prefetch_on_device=False),
    required_gpus=1)
# NOTE(review): this statement continues beyond this chunk of the file.
mirrored_strategy_with_two_gpus = NamedDistribution(
    "Mirrored2GPUs",
    mirrored_strategy.MirroredStrategy(["/gpu:0", "/gpu:1"],
                                       prefetch_on_device=False),
def TPUDistributionStrategy(*args, **kw):  # pylint: disable=invalid-name
  """Alias that forwards all arguments to `tpu_strategy.TPUStrategy`."""
  # Deferred, function-scope import -- presumably to avoid an import cycle.
  from tensorflow.contrib.distribute.python import tpu_strategy  # pylint: disable=g-import-not-at-top
  strategy_cls = tpu_strategy.TPUStrategy
  return strategy_cls(*args, **kw)
def required_tpu(self): return self._required_tpu # pylint: disable=g-long-lambda default_strategy = NamedDistribution( "Default", lambda: distribute_lib._default_distribution_strategy, # pylint: disable=protected-access required_gpus=None) one_device_strategy = NamedDistribution( "OneDeviceCPU", lambda: one_device_lib.OneDeviceStrategy("/cpu:0"), required_gpus=None) tpu_strategy = NamedDistribution( "TPU", lambda: tpu_lib.TPUStrategy(TPUClusterResolver("")), required_tpu=True) # Note that we disable prefetching for testing since prefetching makes # the input non-deterministic. mirrored_strategy_with_gpu_and_cpu = NamedDistribution( "MirroredCPUAndGPU", lambda: mirrored_lib.MirroredStrategy(["/gpu:0", "/cpu:0"], prefetch_on_device=False), required_gpus=1) mirrored_strategy_with_two_gpus = NamedDistribution( "Mirrored2GPUs", lambda: mirrored_lib.MirroredStrategy(["/gpu:0", "/gpu:1"], prefetch_on_device=False), required_gpus=2) multi_worker_strategy_with_cpu = NamedDistribution(
@property
def required_tpu(self):
  # Whether this named distribution needs a TPU to run.
  return self._required_tpu


# Named distribution-strategy fixtures used to parameterize tests.
default_strategy = NamedDistribution(
    "Default",
    distribute_lib._default_distribution_strategy,  # pylint: disable=protected-access
    required_gpus=None)
one_device_strategy = NamedDistribution(
    "OneDeviceCPU", one_device_strategy.OneDeviceStrategy("/cpu:0"),
    required_gpus=None)
tpu_strategy_single_iteration = NamedDistribution(
    "TPUSingleIteration",
    tpu_strategy.TPUStrategy(iterations_per_step=1),
    required_tpu=True)
# NOTE(review): this assignment presumably shadows the `tpu_strategy` module
# used just above; keep it after `tpu_strategy_single_iteration`.
tpu_strategy = NamedDistribution(
    "TPU", tpu_strategy.TPUStrategy(), required_tpu=True)
# Note that we disable prefetching for testing since prefetching makes
# the input non-deterministic.
mirrored_strategy_with_gpu_and_cpu = NamedDistribution(
    "MirroredCPUAndGPU",
    mirrored_strategy.MirroredStrategy(
        ["/gpu:0", "/cpu:0"], prefetch_on_device=False),
    required_gpus=1)
mirrored_strategy_with_two_gpus = NamedDistribution(
    "Mirrored2GPUs",
    mirrored_strategy.MirroredStrategy(
        ["/gpu:0", "/gpu:1"], prefetch_on_device=False),
    required_gpus=2)
def required_tpu(self): return self._required_tpu # pylint: disable=g-long-lambda default_strategy = NamedDistribution( "Default", lambda: distribute_lib._default_distribution_strategy, # pylint: disable=protected-access required_gpus=None) one_device_strategy = NamedDistribution( "OneDeviceCPU", lambda: one_device_lib.OneDeviceStrategy("/cpu:0"), required_gpus=None) tpu_strategy_single_iteration = NamedDistribution( "TPUSingleIteration", lambda: tpu_lib.TPUStrategy(iterations_per_step=1), required_tpu=True) tpu_strategy = NamedDistribution("TPU", tpu_lib.TPUStrategy, required_tpu=True) # Note that we disable prefetching for testing since prefetching makes # the input non-deterministic. mirrored_strategy_with_gpu_and_cpu = NamedDistribution( "MirroredCPUAndGPU", lambda: mirrored_lib.MirroredStrategy(["/gpu:0", "/cpu:0"], prefetch_on_device=False), required_gpus=1) mirrored_strategy_with_two_gpus = NamedDistribution( "Mirrored2GPUs", lambda: mirrored_lib.MirroredStrategy(["/gpu:0", "/gpu:1"], prefetch_on_device=False), required_gpus=2)
def strategy(self):
  # The wrapped DistributionStrategy instance.
  return self._distribution

@property
def required_gpus(self):
  # Number of GPUs this named distribution needs (None when no GPU needed).
  return self._required_gpus

@property
def required_tpu(self):
  # Whether this named distribution needs a TPU to run.
  return self._required_tpu


# Named distribution-strategy fixtures used to parameterize tests.
one_device_strategy = NamedDistribution(
    "OneDeviceCPU", one_device_strategy.OneDeviceStrategy("/cpu:0"), None)
tpu_strategy = NamedDistribution("TPU", tpu_strategy.TPUStrategy(),
                                 required_tpu=True)
mirrored_strategy_with_gpu_and_cpu = NamedDistribution(
    "MirroredCPUAndGPU",
    mirrored_strategy.MirroredStrategy(["/gpu:0", "/cpu:0"]), 1)
# Prefetching is disabled here; presumably to keep test input deterministic.
mirrored_strategy_without_prefetch = NamedDistribution(
    "MirroredCPUAndGPUNoPrefetch",
    mirrored_strategy.MirroredStrategy(["/gpu:0", "/cpu:0"],
                                       prefetch_on_device=False), 1)
mirrored_strategy_with_two_gpus = NamedDistribution(
    "Mirrored2GPUs",
    mirrored_strategy.MirroredStrategy(["/gpu:0", "/gpu:1"]), 2)

# Named optimizer fixtures.
adam_optimizer_v1_fn = NamedObject("AdamV1",
                                   lambda: adam.AdamOptimizer(0.2, epsilon=1))
# NOTE(review): this statement continues beyond this chunk of the file.
gradient_descent_optimizer_v1_fn = NamedObject(