def testSimpleSuccessfulRetrieval(self):
  tpu_map = {
      'projects/test-project/locations/us-central1-c/nodes/test-tpu-1': {
          'ipAddress': '10.1.2.3',
          'port': '8470',
          'health': 'HEALTHY'
      }
  }

  resolver = TPUClusterResolver(
      project='test-project',
      zone='us-central1-c',
      tpu=['test-tpu-1'],
      coordinator_name='coordinator',
      coordinator_address='10.128.1.5:10203',
      credentials=None,
      service=self.mock_service_client(tpu_map=tpu_map))

  actual_cluster_spec = resolver.cluster_spec()
  expected_proto = """
  job { name: 'coordinator' tasks { key: 0 value: '10.128.1.5:10203' } }
  job { name: 'worker' tasks { key: 0 value: '10.1.2.3:8470' } }
  """
  self._verifyClusterSpecEquality(actual_cluster_spec, expected_proto)
  self.assertEqual(resolver.master(), 'grpc://10.1.2.3:8470')
def testRetrieveProjectAndZoneFromMetadata(self):
  tpu_map = {
      'projects/test-project/locations/us-central1-c/nodes/test-tpu-1': {
          'ipAddress': '10.1.2.3',
          'port': '8470',
          'health': 'HEALTHY'
      }
  }

  resolver = TPUClusterResolver(
      project=None,
      zone=None,
      tpu=['test-tpu-1'],
      credentials=None,
      service=self.mock_service_client(tpu_map=tpu_map),
      coordinator_name='coordinator')

  actual_cluster_spec = resolver.cluster_spec()
  expected_proto = """
  job { name: 'coordinator' tasks { key: 0 value: '10.128.1.2:%s' } }
  job { name: 'worker' tasks { key: 0 value: '10.1.2.3:8470' } }
  """ % resolver._coordinator_port
  self._verifyClusterSpecEquality(actual_cluster_spec, str(expected_proto))
  self.assertEqual(resolver.master(), 'grpc://10.1.2.3:8470')
def testNewNetworkEndpointFormat(self):
  tpu_map = {
      'projects/test-project/locations/us-central1-c/nodes/test-tpu-1': {
          'health': 'HEALTHY',
          'networkEndpoints': [{
              'ipAddress': '10.2.3.4',
              'port': 8470,
          }]
      }
  }

  resolver = TPUClusterResolver(
      project='test-project',
      zone='us-central1-c',
      tpu='test-tpu-1',
      coordinator_name='coordinator',
      coordinator_address='10.128.1.5:10203',
      credentials=None,
      service=self.mock_service_client(tpu_map=tpu_map))

  actual_cluster_spec = resolver.cluster_spec()
  expected_proto = """
  job { name: 'coordinator' tasks { key: 0 value: '10.128.1.5:10203' } }
  job { name: 'worker' tasks { key: 0 value: '10.2.3.4:8470' } }
  """
  self._verifyClusterSpecEquality(actual_cluster_spec, expected_proto)
  self.assertEqual('grpc://10.2.3.4:8470', resolver.master())
def testGkeEnvironmentForPod(self):
  os.environ['KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS'] = ('grpc://10.120.27.5:8470,'
                                                   'grpc://10.120.27.6:8470,'
                                                   'grpc://10.120.27.7:8470,'
                                                   'grpc://10.120.27.8:8470')

  self.assertIn('KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS', os.environ)
  self.assertTrue(TPUClusterResolver._inGke())
  self.assertEqual(
      compat.as_bytes('grpc://10.120.27.5:8470,'
                      'grpc://10.120.27.6:8470,'
                      'grpc://10.120.27.7:8470,'
                      'grpc://10.120.27.8:8470'),
      compat.as_bytes(TPUClusterResolver._gkeEndpoints()))

  resolver = TPUClusterResolver()
  self.assertEqual(
      compat.as_bytes('grpc://10.120.27.5:8470'),
      compat.as_bytes(resolver.master()))
  actual_cluster_spec = resolver.cluster_spec()
  expected_proto = """
  job {
    name: 'worker'
    tasks { key: 0 value: '10.120.27.5:8470' }
    tasks { key: 1 value: '10.120.27.6:8470' }
    tasks { key: 2 value: '10.120.27.7:8470' }
    tasks { key: 3 value: '10.120.27.8:8470' }
  }
  """
  self._verifyClusterSpecEquality(actual_cluster_spec, expected_proto)

  del os.environ['KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS']
def testNumAcceleratorsRetryFailure(self, mock_list_devices,
                                    mock_eager_list_devices):
  resolver = TPUClusterResolver(tpu='')
  mock_list_devices.side_effect = errors.DeadlineExceededError(
      None, None, 'timeout')
  mock_eager_list_devices.side_effect = errors.DeadlineExceededError(
      None, None, 'timeout')
  with self.assertRaises(RuntimeError):
    resolver.num_accelerators()
def verifyShouldResolve(self, tpu, should_resolve):
  resolver = TPUClusterResolver(
      project='test-project',
      zone='us-central1-c',
      tpu=tpu,
      coordinator_name=None,
      credentials=None,
      service=self.mock_service_client(tpu_map={}))
  self.assertEqual(should_resolve, resolver._shouldResolve(),
                   "TPU: '%s'" % tpu)
def testVerifySameCoreCount(self):
  self.assertEqual(
      TPUClusterResolver._verify_and_return_same_core_count(
          {0: [0, 1, 2, 3, 4, 5, 6, 7]}), 8)
  self.assertEqual(
      TPUClusterResolver._verify_and_return_same_core_count(
          {0: [0, 1], 1: [2, 3]}), 2)
  with self.assertRaises(RuntimeError):
    TPUClusterResolver._verify_and_return_same_core_count(
        {0: [0], 1: [1, 2]})
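# Illustrative sketch, not part of the original test file: the behaviour the
# test above exercises can be written as a small helper that checks every
# task exposes the same number of cores and returns that shared count. The
# helper name `same_core_count` is hypothetical.
def same_core_count(device_map):
  counts = {len(cores) for cores in device_map.values()}
  if len(counts) != 1:
    raise RuntimeError('TPU tasks expose differing core counts: %s' % counts)
  return counts.pop()

# e.g. same_core_count({0: [0, 1], 1: [2, 3]}) == 2, while
# same_core_count({0: [0], 1: [1, 2]}) raises RuntimeError.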
def testPodResolution(self):
  tpu_map = {
      'projects/test-project/locations/us-central1-c/nodes/test-tpu-1': {
          'health': 'HEALTHY',
          'networkEndpoints': [
              {'ipAddress': '10.2.3.4', 'port': 8470},
              {'ipAddress': '10.2.3.5', 'port': 8470},
              {'ipAddress': '10.2.3.6', 'port': 8470},
              {'ipAddress': '10.2.3.7', 'port': 8470},
          ]
      }
  }

  resolver = TPUClusterResolver(
      tpu='test-tpu-1',
      credentials=None,
      service=self.mock_service_client(tpu_map=tpu_map),
      coordinator_name='coordinator')

  actual_cluster_spec = resolver.cluster_spec()
  expected_proto = """
  job { name: 'coordinator', tasks { key: 0 value: '10.128.1.2:%s' } }
  job {
    name: 'worker'
    tasks { key: 0 value: '10.2.3.4:8470' }
    tasks { key: 1 value: '10.2.3.5:8470' }
    tasks { key: 2 value: '10.2.3.6:8470' }
    tasks { key: 3 value: '10.2.3.7:8470' }
  }
  """ % resolver._coordinator_port
  self._verifyClusterSpecEquality(actual_cluster_spec, str(expected_proto))
  self.assertEqual(resolver.master(), 'grpc://10.2.3.4:8470')
def initialize_tpu_system(cluster_resolver=None):
  """Initialize the TPU devices in a separate session and graph.

  Args:
    cluster_resolver: A tf.contrib.cluster_resolver.TPUClusterResolver,
        which provides information about the TPU cluster.
  Returns:
    The tf.contrib.tpu.Topology object for the topology of the TPU cluster.
  """
  if cluster_resolver is None:
    cluster_resolver = TPUClusterResolver("")
  master = cluster_resolver.master()

  logging.info("Initializing the TPU system.")

  if context.executing_eagerly():
    # This function looks as it is for the following non-intuitive reasons.
    # tpu.initialize_system creates a dummy op whose sole purpose is to trigger
    # DistributedTPURewritePass. This pass actually adds real ops that
    # initialize the TPU system. Thus, we can't simply run tpu.initialize_system
    # eagerly. We need to wrap it in defun and trigger the rewrite passes on it.
    # The easiest way to trigger a rewrite is to run the function with
    # TPUPartitionedCallOp.
    @function.defun
    def _tpu_init_fn():
      return tpu.initialize_system()

    # We can't call _tpu_init_fn normally (because it contains just a dummy op,
    # see above) but need to define it to get it added to eager context
    # and get its assigned name.
    # pylint: disable=protected-access
    graph_func = _tpu_init_fn._get_concrete_function_internal()
    func_name = compat.as_str(graph_func._inference_function.name)
    # pylint: enable=protected-access

    output = tpu_functional_ops.TPUPartitionedCall(
        args=[], device_ordinal=0, Tout=[dtypes.string], f=func_name)
    serialized_topology = output[0].numpy()
  else:
    session_config = config_pb2.ConfigProto(allow_soft_placement=True)
    with ops.Graph().as_default():
      with session_lib.Session(config=session_config, target=master) as sess:
        serialized_topology = sess.run(tpu.initialize_system())

  logging.info("Finished initializing TPU system.")
  return topology.Topology(serialized=serialized_topology)
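# Illustrative usage sketch, not part of the original module: the intended
# call pattern is to build a resolver once, initialize the TPU system, and
# reuse the returned topology. The TPU name 'my-tpu' and the helper name
# `_example_initialize` are placeholders and assume a reachable Cloud TPU.
def _example_initialize(tpu_name='my-tpu'):
  resolver = TPUClusterResolver(tpu=tpu_name)
  tpu_topology = initialize_tpu_system(resolver)
  logging.info("TPU topology has %d tasks.", tpu_topology.num_tasks)
  return tpu_topology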
def testNotReadyCloudTpu(self):
  tpu_map = {
      'projects/test-project/locations/us-central1-c/nodes/test-tpu-1': {
          'ipAddress': '10.1.2.3',
          'port': '8470',
          'state': 'CREATING'
      }
  }

  resolver = TPUClusterResolver(
      project=None,
      zone=None,
      tpu='test-tpu-1',
      coordinator_name=None,
      credentials=None,
      service=self.mock_service_client(tpu_map=tpu_map))

  with self.assertRaises(RuntimeError):
    resolver.cluster_spec()
def testNumAcceleratorsSuccess(self, mock_list_devices):
  device_names = [
      '/job:tpu_worker/task:0/device:TPU:0',
      '/job:tpu_worker/task:1/device:TPU:1',
      '/job:tpu_worker/task:2/device:TPU:0',
      '/job:tpu_worker/task:3/device:TPU:1',
      '/job:tpu_worker/task:0/device:TPU:4',
      '/job:tpu_worker/task:1/device:TPU:5',
      '/job:tpu_worker/task:2/device:TPU:4',
      '/job:tpu_worker/task:3/device:TPU:5',
  ]
  device_list = [
      session._DeviceAttributes(name, 'TPU', 1024, 0) for name in device_names
  ]
  mock_list_devices.return_value = device_list

  resolver = TPUClusterResolver(tpu='')
  self.assertEqual(resolver.num_accelerators(), 2)
def initialize_tpu_system(cluster_resolver=None):
  """Initialize the TPU devices in a separate session and graph.

  Args:
    cluster_resolver: A tf.contrib.cluster_resolver.TPUClusterResolver,
        which provides information about the TPU cluster.
  Returns:
    The tf.contrib.tpu.Topology object for the topology of the TPU cluster.
  """
  if cluster_resolver is None:
    cluster_resolver = TPUClusterResolver("")
  master = cluster_resolver.master()

  logging.info("Initializing the TPU system.")
  session_config = config_pb2.ConfigProto(allow_soft_placement=True)
  with ops.Graph().as_default():
    with session_lib.Session(config=session_config, target=master) as sess:
      serialized_topology = sess.run(tpu.initialize_system())
  logging.info("Finished initializing TPU system.")

  return topology.Topology(serialized=serialized_topology)
def __init__(self,
             container_strategy,
             tpu_cluster_resolver=None,
             steps_per_run=None,
             device_assignment=None):
  super(TPUExtended, self).__init__(container_strategy)

  if tpu_cluster_resolver is None:
    tpu_cluster_resolver = TPUClusterResolver("")

  if steps_per_run is None:
    # TODO(frankchn): Warn when we are being used by DS/Keras and this is
    # not specified.
    steps_per_run = 1

  self._tpu_function_cache = weakref.WeakKeyDictionary()
  self._tpu_cluster_resolver = tpu_cluster_resolver
  self._tpu_metadata = get_tpu_system_metadata(self._tpu_cluster_resolver)
  self._device_assignment = device_assignment

  # TODO(jhseu): Switch to DeviceAssignment to support pods and model
  # parallelism.
  self._tpu_devices = [
      d.name for d in self._tpu_metadata.devices if "device:TPU:" in d.name
  ]

  self._host_device = device_util.get_host_for_device(self._tpu_devices[0])

  # Only create variables for the number of replicas we're running.
  self._tpu_devices = self._tpu_devices[:self._num_replicas_in_sync]
  self._device_map = values.ReplicaDeviceMap(self._tpu_devices)

  # Preload the data onto the TPUs.
  input_worker_devices = collections.OrderedDict()
  for tpu_device in self._tpu_devices:
    host_device = device_util.get_host_for_device(tpu_device)
    input_worker_devices.setdefault(host_device, [])
    input_worker_devices[host_device].append(tpu_device)
  self._input_workers = input_lib.InputWorkers(
      self._device_map, tuple(input_worker_devices.items()))

  # TODO(sourabhbajaj): Remove this once performance of running one step
  # at a time is comparable to multiple steps.
  self.steps_per_run = steps_per_run
  self._require_static_shapes = True

  self.experimental_enable_get_next_as_optional = True
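# Standalone sketch, not part of the original class: the per-host grouping
# performed above simply buckets each TPU device under its host device. The
# helper name and the device strings below are hypothetical.
import collections

def _group_devices_by_host(tpu_devices, host_for_device):
  grouped = collections.OrderedDict()
  for tpu_device in tpu_devices:
    grouped.setdefault(host_for_device(tpu_device), []).append(tpu_device)
  return grouped

# e.g. _group_devices_by_host(
#     ['/job:worker/task:0/device:TPU:0', '/job:worker/task:0/device:TPU:1',
#      '/job:worker/task:1/device:TPU:0'],
#     lambda d: d.rsplit('/device:', 1)[0] + '/device:CPU:0')
# groups the two task:0 TPUs under task:0's CPU host and the task:1 TPU under
# task:1's CPU host, preserving insertion order.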
def testRetrieveProjectAndZoneFromMetadataNoCoordinator(self):
  tpu_map = {
      'projects/test-project/locations/us-central1-c/nodes/test-tpu-1': {
          'ipAddress': '10.1.2.3',
          'port': '8470',
          'health': 'HEALTHY'
      }
  }

  resolver = TPUClusterResolver(
      project=None,
      zone=None,
      tpu=['test-tpu-1'],
      coordinator_name=None,
      credentials=None,
      service=self.mock_service_client(tpu_map=tpu_map))

  actual_cluster_spec = resolver.cluster_spec()
  expected_proto = """
  job { name: 'worker' tasks { key: 0 value: '10.1.2.3:8470' } }
  """
  self._verifyClusterSpecEquality(actual_cluster_spec, expected_proto)
  self.assertEqual(resolver.master(), 'grpc://10.1.2.3:8470')
def testGetDeviceDictAndCoresWithCPUsAndGPUs(self):
  device_names = [
      '/job:tpu_worker/task:0/device:CPU:0',
      '/job:tpu_worker/task:1/device:CPU:0',
      '/job:tpu_worker/task:2/device:CPU:0',
      '/job:tpu_worker/task:3/device:CPU:0',
      '/job:tpu_worker/task:0/device:GPU:1',
      '/job:tpu_worker/task:1/device:GPU:1',
      '/job:tpu_worker/task:2/device:GPU:1',
      '/job:tpu_worker/task:3/device:GPU:1',
  ]
  device_list = [
      session._DeviceAttributes(name, 'XLA', 1024, 0) for name in device_names
  ]

  device_dict, num_cores = TPUClusterResolver._get_device_dict_and_cores(
      device_list)
  self.assertEqual(num_cores, 0)
  self.assertEqual(device_dict, {})
def testOverrideTaskTypeAndIndexAndGetMaster(self):
  tpu_map = {
      'projects/test-project/locations/us-central1-c/nodes/test-tpu-1': {
          'health': 'HEALTHY',
          'networkEndpoints': [
              {'ipAddress': '10.2.3.4', 'port': 8470},
              {'ipAddress': '10.2.3.5', 'port': 8470},
              {'ipAddress': '10.2.3.6', 'port': 8470},
              {'ipAddress': '10.2.3.7', 'port': 8470},
          ]
      }
  }

  resolver = TPUClusterResolver(
      project='test-project',
      zone='us-central1-c',
      tpu='test-tpu-1',
      coordinator_name=None,
      credentials=None,
      service=self.mock_service_client(tpu_map=tpu_map))

  self.assertEqual(resolver.master(), 'grpc://10.2.3.4:8470')

  resolver.task_type = 'worker'
  resolver.task_id = 3
  self.assertEqual(resolver.master(), 'grpc://10.2.3.7:8470')

  self.assertEqual(
      resolver.master(task_type='worker', task_id=2, rpc_layer='test'),
      'test://10.2.3.6:8470')
def testGetDeviceDictAndCoresWithTPUs(self):
  device_names = [
      '/job:tpu_worker/task:0/device:TPU:0',
      '/job:tpu_worker/task:1/device:TPU:1',
      '/job:tpu_worker/task:2/device:TPU:0',
      '/job:tpu_worker/task:3/device:TPU:1',
      '/job:tpu_worker/task:0/device:TPU:4',
      '/job:tpu_worker/task:1/device:TPU:5',
      '/job:tpu_worker/task:2/device:TPU:4',
      '/job:tpu_worker/task:3/device:TPU:5',
  ]
  device_list = [
      session._DeviceAttributes(name, 'TPU', 1024, 0) for name in device_names
  ]

  device_details = TPUClusterResolver._get_device_dict_and_cores(device_list)
  self.assertEqual(device_details.total_cores, 8)
  self.assertEqual(device_details.device_map, {
      '0': ['0', '4'],
      '1': ['1', '5'],
      '2': ['0', '4'],
      '3': ['1', '5']
  })
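# Standalone sketch, not part of the original tests: the per-task core map
# checked above can be built by parsing the task index and TPU core index out
# of each device name. The helper name `build_device_map` is hypothetical.
import re

def build_device_map(device_names):
  device_map = {}
  total_cores = 0
  for name in device_names:
    match = re.search(r'/task:(\d+)/device:TPU:(\d+)$', name)
    if match:
      device_map.setdefault(match.group(1), []).append(match.group(2))
      total_cores += 1
  return device_map, total_cores

# e.g. build_device_map(['/job:w/task:0/device:TPU:0',
#                        '/job:w/task:0/device:TPU:4']) == ({'0': ['0', '4']}, 2)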
def shutdown_tpu_system(cluster_resolver=None):
  """Shuts down the TPU devices.

  This will clear all caches, even those that are maintained through sequential
  calls to tf.tpu.experimental.initialize_tpu_system, such as the compilation
  cache.

  Args:
    cluster_resolver: A tf.distribute.cluster_resolver.TPUClusterResolver,
        which provides information about the TPU cluster.

  Raises:
    RuntimeError: If no TPU devices found for eager execution or if run in a
        tf.function.
  """
  job = None
  if cluster_resolver is None:
    # If no cluster resolver is specified, and running eagerly, execute the
    # init ops in the current device scope.
    if context.executing_eagerly():
      curr_device = device.DeviceSpec.from_string(
          context.context().device_name)
      if curr_device.job is not None:
        job = "{}/replica:0/task:0".format(curr_device.job)

    cluster_resolver = TPUClusterResolver("")
  assert isinstance(cluster_resolver, TPUClusterResolver)

  tpu_name = compat.as_text(cluster_resolver._tpu)  # pylint: disable=protected-access
  if tpu_name not in _INITIALIZED_TPU_SYSTEMS:
    logging.warning("You are shutting down a TPU system %s that has not been "
                    "initialized.", tpu_name)

  logging.info("Shutting down the TPU system: %s", tpu_name)

  if context.executing_eagerly():
    # This function looks as it is for the following non-intuitive reasons.
    # tpu.shutdown_system creates a dummy op whose sole purpose is to trigger
    # DistributedTPURewritePass. This pass actually adds real ops that
    # shutdown the TPU system. Thus, we can't simply run tpu.shutdown_system
    # eagerly. We need to wrap it in defun and trigger the rewrite passes on it.
    if tpu_name not in _LOCAL_MASTERS:
      # Explicitly place the tpu.shutdown_system in the first worker to
      # avoid the output node match multiple devices error.
      job = "{}/replica:0/task:0".format(cluster_resolver.get_job_name())

    @function.defun
    def _tpu_shutdown_fn():
      tpu.shutdown_system(job=job)

    # The TPU_SYSTEM device must match the device used in tpu.shutdown_system
    # exactly, otherwise you can get errors if there are multiple TPU_SYSTEM
    # devices available.
    with ops.device(tpu._tpu_system_device_name(job)):  # pylint: disable=protected-access
      _tpu_shutdown_fn()

    # Clear out the eager context caches since the memory is invalid now.
    logging.info("Clearing out eager caches")
    context.context()._clear_caches()  # pylint: disable=protected-access
  elif not ops.executing_eagerly_outside_functions():
    master = cluster_resolver.master()
    cluster_spec = cluster_resolver.cluster_spec()

    session_config = config_pb2.ConfigProto(allow_soft_placement=True)
    if cluster_spec:
      session_config.cluster_def.CopyFrom(cluster_spec.as_cluster_def())

    with ops.Graph().as_default():
      with session_lib.Session(config=session_config, target=master) as sess:
        sess.run(tpu.shutdown_system())
  else:
    raise RuntimeError("shutdown_tpu_system is not supported within "
                       "tf.functions.")

  logging.info("Finished shutting down TPU system.")
  if tpu_name in _INITIALIZED_TPU_SYSTEMS:
    del _INITIALIZED_TPU_SYSTEMS[tpu_name]
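# Illustrative lifecycle sketch, not part of the original module: initialize
# the TPU system once per process, run work, then shut it down to drop
# compilation caches. The TPU name 'my-tpu' and the helper name
# `_example_reset_tpu` are placeholders.
def _example_reset_tpu(tpu_name='my-tpu'):
  resolver = TPUClusterResolver(tpu=tpu_name)
  initialize_tpu_system(resolver)
  # ... build and run TPU computations here ...
  shutdown_tpu_system(resolver)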
def testLocalhostMaster(self):
  resolver = TPUClusterResolver(tpu='localhost:12345')
  self.assertEqual('localhost:12345', resolver.master())
def testNumAcceleratorsRetryFailure(self, mock_list_devices):
  resolver = TPUClusterResolver(tpu='')
  mock_list_devices.side_effect = errors.DeadlineExceededError(
      None, None, 'timeout')
  with self.assertRaises(RuntimeError):
    resolver.num_accelerators()
def testNoCallComputeMetadata(self):
  tpu_cluster_resolver = TPUClusterResolver(tpu='/bns/foo/bar')
  self.assertEqual(
      compat.as_bytes('/bns/foo/bar'), tpu_cluster_resolver.master())
  self.assertEqual(None, tpu_cluster_resolver.cluster_spec())
def testEnvironmentAndRpcDetectionForGrpcString(self):
  resolver = TPUClusterResolver(tpu='grpc://10.1.2.3:8470')
  self.assertEqual(resolver.environment, '')
  self.assertEqual(resolver.rpc_layer, 'grpc')
  self.assertEqual(resolver.master(), 'grpc://10.1.2.3:8470')
def initialize_tpu_system(cluster_resolver=None):
  """Initialize the TPU devices.

  Args:
    cluster_resolver: A tf.distribute.cluster_resolver.TPUClusterResolver,
        which provides information about the TPU cluster.
  Returns:
    The tf.tpu.Topology object for the topology of the TPU cluster.

  Raises:
    RuntimeError: If no TPU devices found for eager execution.
  """
  if cluster_resolver is None:
    cluster_resolver = TPUClusterResolver("")
  assert isinstance(cluster_resolver, TPUClusterResolver)

  tpu_name = compat.as_text(cluster_resolver._tpu)  # pylint: disable=protected-access
  if tpu_name in _INITIALIZED_TPU_SYSTEMS:
    logging.warning("TPU system %s has already been initialized. "
                    "Reinitializing the TPU can cause previously created "
                    "variables on TPU to be lost.", tpu_name)

  logging.info("Initializing the TPU system.")

  if context.executing_eagerly():
    # This function looks as it is for the following non-intuitive reasons.
    # tpu.initialize_system creates a dummy op whose sole purpose is to trigger
    # DistributedTPURewritePass. This pass actually adds real ops that
    # initialize the TPU system. Thus, we can't simply run tpu.initialize_system
    # eagerly. We need to wrap it in defun and trigger the rewrite passes on it.
    @function.defun
    def _tpu_init_fn():
      return tpu.initialize_system()

    tpu_devices = sorted(
        [x for x in context.list_devices() if "device:TPU:" in x])
    if not tpu_devices:
      raise RuntimeError("Could not find any TPU devices")

    # Replace the remote TPU device with the remote TPU_SYSTEM system device.
    # As in the remote TPU device case, we will try to compile it instead of
    # running through optimization passes and TF Executor, but TPU_SYSTEM
    # should work.
    tpu_system_device = tpu_devices[0].replace("TPU", "TPU_SYSTEM")

    with ops.device(tpu_system_device):
      output = _tpu_init_fn()
    serialized_topology = output.numpy()
  else:
    master = cluster_resolver.master()
    session_config = config_pb2.ConfigProto(allow_soft_placement=True)
    with ops.Graph().as_default():
      with session_lib.Session(config=session_config, target=master) as sess:
        serialized_topology = sess.run(tpu.initialize_system())

  logging.info("Finished initializing TPU system.")

  tpu_topology = topology.Topology(serialized=serialized_topology)
  _INITIALIZED_TPU_SYSTEMS[tpu_name] = tpu_topology

  return tpu_topology
def testEnvironmentDiscoveryUrl(self):
  os.environ['TPU_API_DISCOVERY_URL'] = 'https://{api}.internal/{apiVersion}'
  self.assertEqual('https://{api}.internal/{apiVersion}',
                   TPUClusterResolver._environmentDiscoveryUrl())
def testIsNotRunningInGce(self):
  self.assertFalse(TPUClusterResolver._isRunningInGCE())
def testIsRunningInGce(self):
  self.assertTrue(TPUClusterResolver._isRunningInGCE())
def testCheckRunningInGceWithNoTpuName(self):
  with self.assertRaisesRegexp(RuntimeError, '.*Google Cloud.*'):
    TPUClusterResolver(tpu='')
def testNoCallComputeMetadata(self):
  resolver = TPUClusterResolver(tpu='/bns/foo/bar')
  self.assertEqual('/bns/foo/bar', resolver.master())
  self.assertEqual(None, resolver.cluster_spec())
def __init__(self,
             container_strategy,
             tpu_cluster_resolver=None,
             steps_per_run=None,
             device_assignment=None):
  super(TPUExtended, self).__init__(container_strategy)

  if tpu_cluster_resolver is None:
    tpu_cluster_resolver = TPUClusterResolver("")

  if steps_per_run is None:
    # TODO(frankchn): Warn when we are being used by DS/Keras and this is
    # not specified.
    steps_per_run = 1

  # `self._tpu_function_cache` is a dict of `tf.function`s, thus if a
  # `tf.function` is passed into `strategy.run` in eager mode, the
  # `tf.function` won't get retraced.
  self._tpu_function_cache = weakref.WeakKeyDictionary()

  self._tpu_cluster_resolver = tpu_cluster_resolver
  self._tpu_metadata = self._tpu_cluster_resolver.get_tpu_system_metadata()
  self._device_assignment = device_assignment

  tpu_devices_flat = [
      d.name for d in self._tpu_metadata.devices if "device:TPU:" in d.name
  ]

  # `self._tpu_devices` is a two-dimensional NumPy array of strings. It is
  # indexed using `[replica_id][logical_device_id]`.
  if device_assignment is None:
    self._tpu_devices = np.array(
        [[d] for d in tpu_devices_flat], dtype=object)
  else:
    job_name = device_spec.DeviceSpecV2.from_string(tpu_devices_flat[0]).job

    tpu_devices = []
    for replica_id in range(device_assignment.num_replicas):
      replica_devices = []

      for logical_core in range(device_assignment.num_cores_per_replica):
        replica_devices.append(
            device_util.canonicalize(
                device_assignment.tpu_device(
                    replica=replica_id,
                    logical_core=logical_core,
                    job=job_name)))

      tpu_devices.append(replica_devices)
    self._tpu_devices = np.array(tpu_devices, dtype=object)

  self._host_device = device_util.get_host_for_device(self._tpu_devices[0][0])

  # Preload the data onto the TPUs. Currently we always preload onto logical
  # device 0 for each replica.
  # TODO(cjfj): Create `InputWorkers` lazily, allowing users to place the
  # input onto a different logical device?
  self._device_input_worker_devices = collections.OrderedDict()
  self._host_input_worker_devices = collections.OrderedDict()
  for tpu_device in self._tpu_devices[:, 0]:
    host_device = device_util.get_host_for_device(tpu_device)
    self._device_input_worker_devices.setdefault(host_device, [])
    self._device_input_worker_devices[host_device].append(tpu_device)
    self._host_input_worker_devices.setdefault(host_device, [])
    self._host_input_worker_devices[host_device].append(host_device)

  # TODO(sourabhbajaj): Remove this once performance of running one step
  # at a time is comparable to multiple steps.
  self.steps_per_run = steps_per_run
  self._require_static_shapes = True

  self.experimental_enable_get_next_as_optional = True

  self._logical_device_stack = [0]

  if context.executing_eagerly():
    # In async remote eager, we want to sync the executors before exiting the
    # program.
    def async_wait():
      if context.context()._context_handle is not None:  # pylint: disable=protected-access
        context.async_wait()
    atexit.register(async_wait)

  # Flag to turn on VariablePolicy
  self._use_var_policy = False
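# Standalone sketch, not part of the original class: with a device assignment,
# `self._tpu_devices` above becomes a [num_replicas, num_cores_per_replica]
# array of device strings indexed by [replica_id][logical_device_id]. The
# helper name, replica counts, and device strings below are hypothetical.
import numpy as np

def _example_replica_device_array(num_replicas=2, num_cores_per_replica=2):
  tpu_devices = [[
      '/job:worker/replica:0/task:0/device:TPU:%d' % (
          replica * num_cores_per_replica + core)
      for core in range(num_cores_per_replica)
  ] for replica in range(num_replicas)]
  return np.array(tpu_devices, dtype=object)

# _example_replica_device_array()[1][0] is the first logical core of replica 1.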
def initialize_tpu_system(cluster_resolver=None):
  """Initialize the TPU devices.

  Args:
    cluster_resolver: A tf.distribute.cluster_resolver.TPUClusterResolver,
        which provides information about the TPU cluster.
  Returns:
    The tf.tpu.Topology object for the topology of the TPU cluster.

  Raises:
    RuntimeError: If no TPU devices found for eager execution or if run in a
        tf.function.
  """
  job = None
  if cluster_resolver is None:
    # If no cluster resolver is specified, and running eagerly, execute the
    # init ops in the current device scope.
    if context.executing_eagerly():
      curr_device = device.DeviceSpec.from_string(
          context.context().device_name)
      if curr_device.job is not None:
        job = "{}/replica:0/task:0".format(curr_device.job)

    cluster_resolver = TPUClusterResolver("")
  assert isinstance(cluster_resolver, TPUClusterResolver)

  tpu_name = compat.as_text(cluster_resolver._tpu)  # pylint: disable=protected-access
  if tpu_name in _INITIALIZED_TPU_SYSTEMS:
    logging.warning(
        "TPU system %s has already been initialized. "
        "Reinitializing the TPU can cause previously created "
        "variables on TPU to be lost.", tpu_name)

  logging.info("Initializing the TPU system: %s", tpu_name)

  if context.executing_eagerly():
    # This function looks as it is for the following non-intuitive reasons.
    # tpu.initialize_system creates a dummy op whose sole purpose is to trigger
    # DistributedTPURewritePass. This pass actually adds real ops that
    # initialize the TPU system. Thus, we can't simply run tpu.initialize_system
    # eagerly. We need to wrap it in defun and trigger the rewrite passes on it.
    if tpu_name not in _LOCAL_MASTERS:
      # Explicitly place the tpu.initialize_system in the first worker to
      # avoid the output node match multiple devices error.
      job = "{}/replica:0/task:0".format(cluster_resolver.get_job_name())

    @function.defun
    def _tpu_init_fn():
      # In TF1, we usually close chips when compilation fails to clear the data
      # in infeed. In TF2, we don't need to do this because infeed is no longer
      # used, so user can recover from TPU compilation failures more smoothly.
      return tpu.initialize_system(
          job=job, compilation_failure_closes_chips=False)

    # The TPU_SYSTEM device must match the device used in tpu.initialize_system
    # exactly, otherwise you can get errors if there are multiple TPU_SYSTEM
    # devices available.
    with ops.device(tpu._tpu_system_device_name(job)):  # pylint: disable=protected-access
      output = _tpu_init_fn()

    # Clear out the eager context caches since the memory is invalid now.
    logging.info("Clearing out eager caches")
    context.context()._clear_caches()  # pylint: disable=protected-access

    serialized_topology = output.numpy()

    # TODO(b/134094971): Remove this when lazy tensor copy in multi-device
    # function has been implemented.
    context.context().mirroring_policy = context.MIRRORING_ALL
  elif not ops.executing_eagerly_outside_functions():
    master = cluster_resolver.master()
    cluster_spec = cluster_resolver.cluster_spec()

    session_config = config_pb2.ConfigProto(allow_soft_placement=True)
    if cluster_spec:
      session_config.cluster_def.CopyFrom(cluster_spec.as_cluster_def())

    with ops.Graph().as_default():
      with session_lib.Session(config=session_config, target=master) as sess:
        serialized_topology = sess.run(tpu.initialize_system())
  else:
    raise RuntimeError("initialize_tpu_system is not supported within "
                       "tf.functions.")

  logging.info("Finished initializing TPU system.")

  tpu_topology = topology.Topology(serialized=serialized_topology)
  _INITIALIZED_TPU_SYSTEMS[tpu_name] = tpu_topology

  return tpu_topology
def __init__(self,
             container_strategy,
             tpu_cluster_resolver=None,
             steps_per_run=None,
             device_assignment=None):
  super(TPUExtended, self).__init__(container_strategy)

  if tpu_cluster_resolver is None:
    tpu_cluster_resolver = TPUClusterResolver("")

  if steps_per_run is None:
    # TODO(frankchn): Warn when we are being used by DS/Keras and this is
    # not specified.
    steps_per_run = 1

  self._tpu_function_cache = weakref.WeakKeyDictionary()

  self._tpu_cluster_resolver = tpu_cluster_resolver
  self._tpu_metadata = get_tpu_system_metadata(self._tpu_cluster_resolver)
  self._device_assignment = device_assignment

  self._tpu_devices = [
      d.name for d in self._tpu_metadata.devices if "device:TPU:" in d.name
  ]

  # Only create variables for the number of replicas we're running.
  if device_assignment is not None:
    job_name = device_spec.DeviceSpecV2.from_string(self._tpu_devices[0]).job

    self._tpu_devices = []
    for replica_id in range(device_assignment.num_replicas):
      tpu_device = device_assignment.tpu_device(
          replica=replica_id, logical_core=0, job=job_name)
      tpu_device = device_util.canonicalize(tpu_device)
      self._tpu_devices.append(tpu_device)

  self._host_device = device_util.get_host_for_device(self._tpu_devices[0])

  self._device_map = values.ReplicaDeviceMap(self._tpu_devices)

  # Preload the data onto the TPUs.
  input_worker_devices = collections.OrderedDict()
  for tpu_device in self._tpu_devices:
    host_device = device_util.get_host_for_device(tpu_device)
    input_worker_devices.setdefault(host_device, [])
    input_worker_devices[host_device].append(tpu_device)
  self._input_workers = input_lib.InputWorkers(
      self._device_map, tuple(input_worker_devices.items()))

  # TODO(sourabhbajaj): Remove this once performance of running one step
  # at a time is comparable to multiple steps.
  self.steps_per_run = steps_per_run
  self._require_static_shapes = True

  # TPUStrategy handles the graph replication in TF-XLA bridge, so we don't
  # need to retrace functions for each device.
  self._retrace_functions_for_each_device = False

  self.experimental_enable_get_next_as_optional = True
  self.experimental_enable_dynamic_batch_size = True
def testEnvironmentAndRpcDetectionForGoogle(self):
  resolver = TPUClusterResolver(tpu='/bns/ab/cd/ef')
  self.assertEqual(resolver.environment, 'google')
  self.assertEqual(resolver.rpc_layer, None)
def initialize_tpu_system(cluster_resolver=None):
  """Initialize the TPU devices.

  Args:
    cluster_resolver: A tf.distribute.cluster_resolver.TPUClusterResolver,
        which provides information about the TPU cluster.
  Returns:
    The tf.tpu.Topology object for the topology of the TPU cluster.

  Raises:
    RuntimeError: If no TPU devices found for eager execution.
  """
  if cluster_resolver is None:
    cluster_resolver = TPUClusterResolver("")
  assert isinstance(cluster_resolver, TPUClusterResolver)

  tpu_name = compat.as_text(cluster_resolver._tpu)  # pylint: disable=protected-access
  if tpu_name in _INITIALIZED_TPU_SYSTEMS:
    logging.warning("TPU system %s has already been initialized. "
                    "Reinitializing the TPU can cause previously created "
                    "variables on TPU to be lost.", tpu_name)

  logging.info("Initializing the TPU system: %s", tpu_name)

  if context.executing_eagerly():
    # This function looks as it is for the following non-intuitive reasons.
    # tpu.initialize_system creates a dummy op whose sole purpose is to trigger
    # DistributedTPURewritePass. This pass actually adds real ops that
    # initialize the TPU system. Thus, we can't simply run tpu.initialize_system
    # eagerly. We need to wrap it in defun and trigger the rewrite passes on it.
    job = None
    if tpu_name not in _LOCAL_MASTERS:
      # Explicitly place the tpu.initialize_system in the first worker to
      # avoid the output node match multiple devices error.
      job = "{}/replica:0/task:0".format(cluster_resolver.get_job_name())

    @function.defun
    def _tpu_init_fn():
      return tpu.initialize_system(job=job)

    # The TPU_SYSTEM device must match the device used in tpu.initialize_system
    # exactly, otherwise you can get errors if there are multiple TPU_SYSTEM
    # devices available.
    with ops.device(tpu._tpu_system_device_name(job)):  # pylint: disable=protected-access
      output = _tpu_init_fn()

    # Clear out the eager context caches since the memory is invalid now.
    logging.info("Clearing out eager caches")
    context.context()._clear_caches()  # pylint: disable=protected-access

    serialized_topology = output.numpy()
  else:
    master = cluster_resolver.master()
    cluster_spec = cluster_resolver.cluster_spec()

    session_config = config_pb2.ConfigProto(allow_soft_placement=True)
    if cluster_spec:
      session_config.cluster_def.CopyFrom(cluster_spec.as_cluster_def())

    with ops.Graph().as_default():
      with session_lib.Session(config=session_config, target=master) as sess:
        serialized_topology = sess.run(tpu.initialize_system())

  logging.info("Finished initializing TPU system.")

  tpu_topology = topology.Topology(serialized=serialized_topology)
  _INITIALIZED_TPU_SYSTEMS[tpu_name] = tpu_topology

  return tpu_topology
def __init__(self,
             container_strategy,
             tpu_cluster_resolver=None,
             steps_per_run=None,
             device_assignment=None):
  super(TPUExtended, self).__init__(container_strategy)

  if tpu_cluster_resolver is None:
    tpu_cluster_resolver = TPUClusterResolver("")

  if steps_per_run is None:
    # TODO(frankchn): Warn when we are being used by DS/Keras and this is
    # not specified.
    steps_per_run = 1

  self._tpu_function_cache = weakref.WeakKeyDictionary()

  self._tpu_cluster_resolver = tpu_cluster_resolver
  self._tpu_metadata = get_tpu_system_metadata(self._tpu_cluster_resolver)
  self._device_assignment = device_assignment

  tpu_devices_flat = [
      d.name for d in self._tpu_metadata.devices if "device:TPU:" in d.name
  ]

  # `self._tpu_devices` is a two-dimensional NumPy array of strings. It is
  # indexed using `[replica_id][logical_device_id]`.
  if device_assignment is None:
    self._tpu_devices = np.array(
        [[d] for d in tpu_devices_flat], dtype=object)
  else:
    job_name = device_spec.DeviceSpecV2.from_string(tpu_devices_flat[0]).job

    tpu_devices = []
    for replica_id in range(device_assignment.num_replicas):
      replica_devices = []

      for logical_core in range(device_assignment.num_cores_per_replica):
        replica_devices.append(
            device_util.canonicalize(
                device_assignment.tpu_device(
                    replica=replica_id,
                    logical_core=logical_core,
                    job=job_name)))

      tpu_devices.append(replica_devices)
    self._tpu_devices = np.array(tpu_devices, dtype=object)

  self._host_device = device_util.get_host_for_device(self._tpu_devices[0][0])

  # Preload the data onto the TPUs. Currently we always preload onto logical
  # device 0 for each replica.
  # TODO(cjfj): Create `InputWorkers` lazily, allowing users to place the
  # input onto a different logical device?
  input_worker_devices = collections.OrderedDict()
  for tpu_device in self._tpu_devices[:, 0]:
    host_device = device_util.get_host_for_device(tpu_device)
    input_worker_devices.setdefault(host_device, [])
    input_worker_devices[host_device].append(tpu_device)
  self._input_worker_devices = tuple(input_worker_devices.items())
  self._input_workers_obj = None

  # TODO(sourabhbajaj): Remove this once performance of running one step
  # at a time is comparable to multiple steps.
  self.steps_per_run = steps_per_run
  self._require_static_shapes = True

  # TPUStrategy handles the graph replication in TF-XLA bridge, so we don't
  # need to retrace functions for each device.
  self._retrace_functions_for_each_device = False

  self.experimental_enable_get_next_as_optional = True
  self.experimental_enable_dynamic_batch_size = True
  self._prefetch_on_host = False

  self._logical_device_stack = [0]
def __init__(self,
             container_strategy,
             tpu_cluster_resolver=None,
             steps_per_run=None,
             device_assignment=None):
  super(TPUExtended, self).__init__(container_strategy)

  if tpu_cluster_resolver is None:
    tpu_cluster_resolver = TPUClusterResolver("")

  if steps_per_run is None:
    # TODO(frankchn): Warn when we are being used by DS/Keras and this is
    # not specified.
    steps_per_run = 1

  self._tpu_cluster_resolver = tpu_cluster_resolver
  self._tpu_metadata = get_tpu_system_metadata(self._tpu_cluster_resolver)
  self._device_assignment = device_assignment

  # Device assignment is currently only supported for 1 core case.
  if self._device_assignment:
    assert isinstance(self._device_assignment,
                      device_assignment_lib.DeviceAssignment)
    if self._device_assignment.num_replicas != 1:
      raise ValueError("Device assignment is only supported for a single "
                       "core single replica case currently.")
    if self._device_assignment.num_cores_per_replica != 1:
      raise ValueError("Device assignment is only supported for a single "
                       "core single replica case currently.")
    if not all(self._device_assignment.core_assignment[0][0] == [0, 0, 0]):
      raise ValueError("Device assignment is only supported for a single "
                       "core single replica case currently.")

  # TODO(jhseu): Switch to DeviceAssignment to support pods and model
  # parallelism.
  self._tpu_devices = [
      d.name for d in self._tpu_metadata.devices if "device:TPU:" in d.name
  ]

  self._host_device = device_util.get_host_for_device(self._tpu_devices[0])

  # Only create variables for the number of replicas we're running.
  self._tpu_devices = self._tpu_devices[:self._num_replicas_in_sync]
  self._device_map = values.ReplicaDeviceMap(self._tpu_devices)

  # Preload the data onto the TPUs.
  input_worker_devices = collections.OrderedDict()
  for tpu_device in self._tpu_devices:
    host_device = device_util.get_host_for_device(tpu_device)
    input_worker_devices.setdefault(host_device, [])
    input_worker_devices[host_device].append(tpu_device)
  self._input_workers = input_lib.InputWorkers(
      self._device_map, tuple(input_worker_devices.items()))

  # TODO(sourabhbajaj): Remove this once performance of running one step
  # at a time is comparable to multiple steps.
  self.steps_per_run = steps_per_run
  self._require_static_shapes = True
def initialize_tpu_system(cluster_resolver=None):
  """Initialize the TPU devices.

  Args:
    cluster_resolver: A tf.distribute.cluster_resolver.TPUClusterResolver,
        which provides information about the TPU cluster.
  Returns:
    The tf.tpu.Topology object for the topology of the TPU cluster.

  Raises:
    RuntimeError: If no TPU devices found for eager execution.
  """
  if cluster_resolver is None:
    cluster_resolver = TPUClusterResolver("")
  assert isinstance(cluster_resolver, TPUClusterResolver)

  tpu_name = compat.as_text(cluster_resolver._tpu)  # pylint: disable=protected-access
  if tpu_name in _INITIALIZED_TPU_SYSTEMS:
    logging.warning("TPU system %s has already been initialized. "
                    "Reinitializing the TPU can cause previously created "
                    "variables on TPU to be lost.", tpu_name)

  logging.info("Initializing the TPU system.")

  if context.executing_eagerly():
    # This function looks as it is for the following non-intuitive reasons.
    # tpu.initialize_system creates a dummy op whose sole purpose is to trigger
    # DistributedTPURewritePass. This pass actually adds real ops that
    # initialize the TPU system. Thus, we can't simply run tpu.initialize_system
    # eagerly. We need to wrap it in defun and trigger the rewrite passes on it.
    # The easiest way to trigger a rewrite is to run the function with
    # TPUPartitionedCallOp.
    @function.defun
    def _tpu_init_fn():
      return tpu.initialize_system()

    # We can't call _tpu_init_fn normally (because it contains just a dummy op,
    # see above) but need to define it to get it added to eager context
    # and get its assigned name.
    # pylint: disable=protected-access
    graph_func = _tpu_init_fn._get_concrete_function_internal()
    func_name = compat.as_str(graph_func._inference_function.name)
    # pylint: enable=protected-access

    tpu_devices = sorted(
        [x for x in context.list_devices() if "device:TPU:" in x])
    if not tpu_devices:
      raise RuntimeError("Could not find any TPU devices")

    with ops.device(device_util.get_host_for_device(tpu_devices[0])):
      output = tpu_functional_ops.TPUPartitionedCall(
          args=[], device_ordinal=0, Tout=[dtypes.string], f=func_name)
    serialized_topology = output[0].numpy()
  else:
    master = cluster_resolver.master()
    session_config = config_pb2.ConfigProto(allow_soft_placement=True)
    with ops.Graph().as_default():
      with session_lib.Session(config=session_config, target=master) as sess:
        serialized_topology = sess.run(tpu.initialize_system())

  logging.info("Finished initializing TPU system.")

  tpu_topology = topology.Topology(serialized=serialized_topology)
  _INITIALIZED_TPU_SYSTEMS[tpu_name] = tpu_topology

  return tpu_topology