Example #1
    def test_sequential_experimental_runs(self):
        resolver = get_tpu_cluster_resolver()
        remote.connect_to_cluster(resolver)
        topology = tpu_strategy_util.initialize_tpu_system(resolver)
        # Computation replicated to all cores.
        device_assignment = device_assignment_lib.DeviceAssignment.build(
            topology, num_replicas=2)
        strategy = tpu_lib.TPUStrategy(resolver,
                                       device_assignment=device_assignment)

        # Computation on the 1st core.
        device_assignment2 = device_assignment_lib.DeviceAssignment.build(
            topology, num_replicas=1)
        strategy2 = tpu_lib.TPUStrategy(resolver,
                                        device_assignment=device_assignment2)

        def computation(x):
            return math_ops.square(x)

        @def_function.function
        def train_step():
            outputs = strategy.experimental_local_results(
                strategy.run(computation, args=([2., 2.], )))
            outputs2 = strategy2.run(computation, args=([outputs[0]], ))
            return outputs2

        self.assertAllEqual([[16., 16.]], train_step())
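        # Each of the two replicas squares [2., 2.] to [4., 4.]; the single-core
        # strategy then squares [outputs[0]] to [[16., 16.]].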
Example #2
    def test_worker_devices_on_subset_cores(self, enable_packed_var):
        resolver = get_tpu_cluster_resolver()
        remote.connect_to_cluster(resolver)
        topology = tpu_strategy_util.initialize_tpu_system(resolver)

        # Strategy for the 1st core.
        device_assignment = device_assignment_lib.DeviceAssignment.build(
            topology, num_replicas=1)
        first_core_strategy = tpu_lib.TPUStrategy(
            resolver, device_assignment=device_assignment)
        first_core_strategy._enable_packed_variable_in_eager_mode = (
            enable_packed_var)

        # Strategy for the 2nd core.
        device_assignment2 = device_assignment_lib.DeviceAssignment(
            topology, [[[0, 0, 0, 1]]])
        second_core_strategy = tpu_lib.TPUStrategy(
            resolver, device_assignment=device_assignment2)
        second_core_strategy._enable_packed_variable_in_eager_mode = (
            enable_packed_var)

        self.assertLen(first_core_strategy.extended.worker_devices, 1)
        self.assertEndsWith(first_core_strategy.extended.worker_devices[0],
                            "device:TPU:0")

        self.assertLen(second_core_strategy.extended.worker_devices, 1)
        self.assertEndsWith(second_core_strategy.extended.worker_devices[0],
                            "device:TPU:1")
Example #3
    def test_computation_on_subset_cores(self):
        resolver = get_tpu_cluster_resolver()
        remote.connect_to_cluster(resolver)
        topology = tpu_strategy_util.initialize_tpu_system(resolver)
        all_core_strategy = tpu_lib.TPUStrategy(resolver)

        with all_core_strategy.scope():
            v = variables.Variable(
                0.0, aggregation=variables.VariableAggregation.MEAN)

        # Computation on the 1st core.
        device_assignment = device_assignment_lib.DeviceAssignment.build(
            topology, num_replicas=1)
        first_core_strategy = tpu_lib.TPUStrategy(
            resolver, device_assignment=device_assignment)

        # Computation on the 2nd core.
        device_assignment2 = device_assignment_lib.DeviceAssignment(
            topology, [[[0, 0, 0, 1]]])
        second_core_strategy = tpu_lib.TPUStrategy(
            resolver, device_assignment=device_assignment2)

        @def_function.function
        def train_step():
            def step_fn():
                return v + 1.0

            all_core_strategy.run(step_fn)
            r1 = first_core_strategy.run(step_fn)
            r2 = second_core_strategy.run(step_fn)
            return r1 + r2

        train_step()
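        # v is never updated, so step_fn returns v + 1.0 == 1.0 under each
        # single-core strategy and r1 + r2 == 2.0.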
        self.assertAllEqual(2., train_step())
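In Examples #2 and #3, the nested list passed directly to DeviceAssignment is a
core_assignment of shape [num_replicas, num_cores_per_replica, topology mesh rank]:
each innermost entry gives the topology coordinates of one core, so [0, 0, 0, 1]
selects the second core of the chip at mesh position (0, 0, 0). As a minimal sketch
(same imports and topology as above; purely illustrative), the explicit assignment
for the first core, which should match what DeviceAssignment.build(topology,
num_replicas=1) selects on a typical single-host topology, looks like:

    # One replica, one core, addressed by its topology coordinates.
    first_core_assignment = device_assignment_lib.DeviceAssignment(
        topology, core_assignment=[[[0, 0, 0, 0]]])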
Example #4
  def setUp(self):
    super(TPUEmbeddingCheckpointTest, self).setUp()
    self.resolver = tpu_cluster_resolver.TPUClusterResolver(
        tpu=FLAGS.tpu, zone=FLAGS.zone, project=FLAGS.project)
    remote.connect_to_cluster(self.resolver)
    tpu_strategy_util.initialize_tpu_system(self.resolver)
    self.strategy = tpu_strategy.TPUStrategy(self.resolver)
    self.num_rows = self.strategy.num_replicas_in_sync

    # These tests use two mid-level API objects, initialized with different
    # values. Both have the same sizes.
    with self.strategy.scope():
      self.first_mid_level_contents = np.ones((self.num_rows, 4))
      self.first_mid_level_optimizer = tpu_embedding_v2_utils.SGD(
          learning_rate=0.1)
      self.first_mid_level = self.build_mid_level(
          self.first_mid_level_contents, self.first_mid_level_optimizer)

      self.second_mid_level_contents = np.ones((self.num_rows, 4)) * 2
      self.second_mid_level_optimizer = tpu_embedding_v2_utils.SGD(
          learning_rate=0.1)
      self.second_mid_level = self.build_mid_level(
          self.second_mid_level_contents, self.second_mid_level_optimizer,
          initialize_tpu_embedding=False)

    self.cpu_mid_level_optimizer = tpu_embedding_v2_utils.SGD(
        learning_rate=0.1)
    self.cpu_mid_level = self.build_mid_level(
        self.second_mid_level_contents, self.cpu_mid_level_optimizer)
Example #5
    def testSummaryWithCustomTrainingLoop(self):
        resolver = tpu_cluster_resolver.TPUClusterResolver('')
        tpu_strategy_util.initialize_tpu_system(resolver)
        strategy = tpu_strategy_lib.TPUStrategy(resolver)

        with strategy.scope():
            model = distribute_strategy_test.get_model()
            model.compile('sgd', 'mse')
            writer = summary_ops_v2.create_file_writer_v2(self.summary_dir)

            @def_function.function
            def custom_function(dataset):
                def _custom_step(features, labels):
                    del labels
                    logits = model(features)
                    with summary_ops_v2.always_record_summaries(
                    ), writer.as_default():
                        summary_ops_v2.scalar('logits',
                                              logits,
                                              step=model.optimizer.iterations)
                    return logits

                iterator = iter(dataset)
                output = strategy.unwrap(
                    strategy.run(_custom_step, args=(next(iterator))))
                return output

            dataset = strategy.experimental_distribute_dataset(
                distribute_strategy_test.get_dataset(strategy))

            custom_function(dataset)
Example #6
    def testEagerTPUDistributionStrategy(self):
        self.skipTest("b/121387144")
        num_training_steps = 10
        checkpoint_directory = self.get_temp_dir()
        checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")

        def _train_fn(optimizer, model):
            input_value = constant_op.constant([[3.]])
            optimizer.minimize(functools.partial(model, input_value),
                               global_step=root.optimizer_step)

        for training_continuation in range(3):
            strategy = tpu_strategy.TPUStrategy()
            with strategy.scope():
                model = Subclassed()
                optimizer = adam_v1.AdamOptimizer(0.001)
                root = trackable_utils.Checkpoint(
                    optimizer=optimizer,
                    model=model,
                    optimizer_step=training_util.get_or_create_global_step())
                root.restore(
                    checkpoint_management.latest_checkpoint(
                        checkpoint_directory))

                for _ in range(num_training_steps):
                    strategy.extended.call_for_each_replica(
                        functools.partial(_train_fn, optimizer, model))
                root.save(file_prefix=checkpoint_prefix)
                self.assertEqual(
                    (training_continuation + 1) * num_training_steps,
                    root.optimizer_step.numpy())
Example #7
    def _create_tpu_strategy():
        global _did_connect_to_cluster

        # These flags will be defined by tpu_test_wrapper.py.
        resolver = tpu_cluster_resolver.TPUClusterResolver(
            tpu=hasattr(FLAGS, "tpu") and FLAGS.tpu or "",
            zone=hasattr(FLAGS, "zone") and FLAGS.zone or None,
            project=hasattr(FLAGS, "project") and FLAGS.project or None,
        )
        # Only connect once per process, rather than per test method.
        if hasattr(FLAGS, "tpu") and FLAGS.tpu and not _did_connect_to_cluster:
            remote.connect_to_cluster(resolver)
            _did_connect_to_cluster = True

        topology = tpu_strategy_util.initialize_tpu_system(resolver)
        device_assignment = None
        if use_single_core:
            device_assignment = device_assignment_lib.DeviceAssignment(
                topology,
                core_assignment=device_assignment_lib.SINGLE_CORE_ASSIGNMENT)

        # Steps per run is only supported in TF 1.x
        if tf2.enabled():
            return tpu_lib.TPUStrategy(resolver, device_assignment, **kwargs)
        else:
            return tpu_lib.TPUStrategyV1(resolver, steps_per_run,
                                         device_assignment, **kwargs)
Example #8
def get_tpu_strategy(enable_packed_var=False):
    resolver = get_tpu_cluster_resolver()
    remote.connect_to_cluster(resolver)
    tpu_strategy_util.initialize_tpu_system(resolver)
    strategy = tpu_lib.TPUStrategy(resolver)
    strategy._enable_packed_variable_in_eager_mode = enable_packed_var
    return strategy
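A short usage sketch for a helper like the one above (hypothetical; it assumes a
reachable TPU and the same TensorFlow test modules imported by the surrounding
examples, e.g. def_function and math_ops):

    strategy = get_tpu_strategy(enable_packed_var=True)

    @def_function.function
    def step():
        # Replicate the computation to every core in the strategy's device assignment.
        return strategy.run(math_ops.square, args=(3.,))

    # One result per replica, e.g. 9.0 from each core.
    print(strategy.experimental_local_results(step()))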
Example #9
  def _get_strategy(self):
    self.resolver = tpu_cluster_resolver.TPUClusterResolver(
        tpu=FLAGS.tpu, zone=FLAGS.zone, project=FLAGS.project)
    remote.connect_to_cluster(self.resolver)
    tpu_strategy_util.initialize_tpu_system(self.resolver)
    strategy = tpu_strategy.TPUStrategy(self.resolver)
    self.num_replicas = strategy.num_replicas_in_sync
    return strategy
Example #10
  def _get_strategy(self):
    self.resolver = tpu_cluster_resolver.TPUClusterResolver(
        tpu=FLAGS.tpu, zone=FLAGS.zone, project=FLAGS.project)
    if hasattr(self.resolver, '_cloud_tpu_client'):
      self.resolver._cloud_tpu_client.configure_tpu_version(
          version='nightly', restart_type='always')
    remote.connect_to_cluster(self.resolver)
    tpu_strategy_util.initialize_tpu_system(self.resolver)
    return tpu_strategy.TPUStrategy(self.resolver)
Example #11
def get_tpu_strategy():
    resolver = tpu_cluster_resolver.TPUClusterResolver(
        tpu=FLAGS.tpu,
        zone=FLAGS.zone,
        project=FLAGS.project,
    )
    remote.connect_to_cluster(resolver)
    tpu_strategy_util.initialize_tpu_system(resolver)
    return tpu_lib.TPUStrategy(resolver)
Example #12
def get_strategy():
    resolver = tpu_cluster_resolver.TPUClusterResolver(tpu="grpc://" +
                                                       os.environ["TPU_IP"])
    remote.connect_to_cluster(resolver)
    topology = tpu_strategy_util.initialize_tpu_system(resolver)
    print("Device coordinates: ", topology.device_coordinates)
    device_assignment = tf.python.tpu.device_assignment.DeviceAssignment.build(
        topology, computation_shape=[1, 1, 1, 1], num_replicas=1)

    return tpu_strategy.TPUStrategy(resolver,
                                    device_assignment=device_assignment)
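A follow-on sketch (hypothetical; it assumes the helper above, a TPU_IP environment
variable pointing at a reachable TPU, and an "import tensorflow as tf" at module
scope):

    strategy = get_strategy()
    print("Replicas:", strategy.num_replicas_in_sync)  # expected: 1 for this assignment

    @tf.function
    def step(x):
        return strategy.run(tf.math.square, args=(x,))

    # Runs on the single core selected by the device assignment above.
    print(strategy.experimental_local_results(step(tf.constant(3.0))))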
Example #13
  def _create_tpu_strategy():
    resolver = tpu_cluster_resolver.TPUClusterResolver("")
    topology = tpu_strategy_util.initialize_tpu_system(resolver)
    device_assignment = None
    if use_single_core:
      device_assignment = device_assignment_lib.DeviceAssignment(
          topology, core_assignment=device_assignment_lib.
          SINGLE_CORE_ASSIGNMENT)

    strategy = tpu_lib.TPUStrategy(resolver, steps_per_run=steps_per_run,
                                   device_assignment=device_assignment,
                                   **kwargs)
    return strategy
Example #14
    def _create_tpu_strategy():
        resolver = tpu_cluster_resolver.TPUClusterResolver("")
        topology = tpu_strategy_util.initialize_tpu_system(resolver)
        device_assignment = None
        if use_single_core:
            device_assignment = device_assignment_lib.DeviceAssignment(
                topology,
                core_assignment=device_assignment_lib.SINGLE_CORE_ASSIGNMENT)

        # Steps per run is only supported in TF 1.x
        if tf2.enabled():
            return tpu_lib.TPUStrategy(resolver, device_assignment, **kwargs)
        else:
            return tpu_lib.TPUStrategyV1(resolver, steps_per_run,
                                         device_assignment, **kwargs)
Example #15
    def _create_tpu_strategy():
        FLAGS = flags.FLAGS  # pylint: disable=invalid-name
        global _did_connect_to_cluster
        global _topology

        try:
            # Attempt to locally discover the TPU. This will fail for Cloud TPU, in
            # which case we fall back to the values passed as flags.
            resolver = tpu_cluster_resolver.TPUClusterResolver()
            did_automatically_resolve = True
        except ValueError:
            did_automatically_resolve = False

            # These flags will be defined by tpu_test_wrapper.py.
            resolver = tpu_cluster_resolver.TPUClusterResolver(
                tpu=hasattr(FLAGS, "tpu") and FLAGS.tpu or "",
                zone=hasattr(FLAGS, "zone") and FLAGS.zone or None,
                project=hasattr(FLAGS, "project") and FLAGS.project or None,
            )

        # Only connect once per process, rather than per test method.
        if not _did_connect_to_cluster:
            if getattr(FLAGS, "tpu", "") or did_automatically_resolve:
                remote.connect_to_cluster(resolver)
                _did_connect_to_cluster = True
            _topology = tpu_strategy_util.initialize_tpu_system(resolver)

        device_assignment = None
        if use_single_core:
            device_assignment = device_assignment_lib.DeviceAssignment(
                _topology,
                core_assignment=device_assignment_lib.SINGLE_CORE_ASSIGNMENT)

        # Steps per run is only supported in TF 1.x
        if tf2.enabled():
            strategy = tpu_lib.TPUStrategy(resolver, device_assignment,
                                           **kwargs)
        else:
            strategy = tpu_lib.TPUStrategyV1(resolver, steps_per_run,
                                             device_assignment, **kwargs)
        strategy._enable_packed_variable_in_eager_mode = enable_packed_variable  # pylint: disable=protected-access
        return strategy
Example #16
    def testV2SummaryWithKerasFit(self):
        resolver = tpu_cluster_resolver.TPUClusterResolver('')
        tpu_strategy_util.initialize_tpu_system(resolver)
        strategy = tpu_strategy_lib.TPUStrategy(resolver)

        with strategy.scope():
            model = CustomModel()
            model.compile('sgd', 'mse')

            dataset = distribute_strategy_test.get_dataset(strategy)
            tensorboard_callback = callbacks.TensorBoard(self.summary_dir,
                                                         update_freq=2)
            model.fit(dataset,
                      steps_per_epoch=10,
                      epochs=1,
                      callbacks=[tensorboard_callback])

            event_files = file_io.get_matching_files_v2(
                os.path.join(self.summary_dir, 'train', 'event*'))
            events_count_dictionary = {
                'custom_model/layer_for_scalar_summary/custom_scalar_summary':
                0,
                'custom_model/layer_for_histogram_summary/custom_histogram_summary':
                0
            }

            for event_file in event_files:
                for e in summary_iterator.summary_iterator(event_file):
                    for v in e.summary.value:
                        if v.tag in events_count_dictionary:
                            events_count_dictionary[v.tag] += 1

            # Since a total of 10 steps are run and summary ops are invoked
            # every 2 batches, we should see a total of 5 event logs.
            self.assertEqual(
                events_count_dictionary[(
                    'custom_model/layer_for_histogram_summary/'
                    'custom_histogram_summary')], 5)
            self.assertEqual(
                events_count_dictionary[
                    'custom_model/layer_for_scalar_summary/custom_scalar_summary'],
                5)
Example #17
def get_tpu_strategy():
    resolver = get_tpu_cluster_resolver()
    remote.connect_to_cluster(resolver)
    tpu_strategy_util.initialize_tpu_system(resolver)
    return tpu_strategy_lib.TPUStrategy(resolver)
Example #18
    def test_cluster_resolver_available(self, enable_packed_var):
        resolver = get_tpu_cluster_resolver()
        remote.connect_to_cluster(resolver)
        tpu_strategy_util.initialize_tpu_system(resolver)
        strategy = tpu_lib.TPUStrategy(resolver)
        self.assertIs(strategy.cluster_resolver, resolver)