Esempio n. 1
0
    def testNumAcceleratorsFilterTasks(self, mock_list_devices,
                                       mock_eager_list_devices):
        """num_accelerators honors the task_type/task_id filter."""
        specs = [
            ("/job:worker1/task:0/device:TPU:0", "TPU"),
            ("/job:worker1/task:0/device:TPU:1", "TPU"),
            ("/job:worker1/task:0/device:GPU:0", "GPU"),
            ("/job:worker1/task:0/device:GPU:1", "GPU"),
            ("/job:worker2/task:1/device:TPU:2", "TPU"),
            ("/job:worker2/task:2/device:TPU:3", "TPU"),
            ("/job:worker2/task:3/device:GPU:2", "GPU"),
            ("/job:worker2/task:4/device:GPU:3", "GPU"),
        ]
        devices = [LogicalDevice(name, dev_type) for name, dev_type in specs]
        mock_eager_list_devices.return_value = devices
        # Session-mode listing hands back _DeviceAttributes records instead.
        mock_list_devices.return_value = [
            session._DeviceAttributes(d.name, d.device_type, 1024, 0)
            for d in devices
        ]

        resolver = MockBaseClusterResolver()
        # worker1/task 0 owns two TPUs and two GPUs.
        self.assertEqual(
            resolver.num_accelerators(task_type="worker1", task_id=0),
            {"TPU": 2, "GPU": 2})
        # worker2 tasks 3 and 4 each own exactly one GPU.
        self.assertEqual(
            resolver.num_accelerators(task_type="worker2", task_id=3),
            {"GPU": 1})
        self.assertEqual(
            resolver.num_accelerators(task_type="worker2", task_id=4),
            {"GPU": 1})
Esempio n. 2
0
    def testGetDeviceDictAndCoresWithTPUs(self):
        """TPU devices are grouped per task with their core ids collected."""
        device_names = [
            '/job:tpu_worker/task:0/device:TPU:0',
            '/job:tpu_worker/task:1/device:TPU:1',
            '/job:tpu_worker/task:2/device:TPU:0',
            '/job:tpu_worker/task:3/device:TPU:1',
            '/job:tpu_worker/task:0/device:TPU:4',
            '/job:tpu_worker/task:1/device:TPU:5',
            '/job:tpu_worker/task:2/device:TPU:4',
            '/job:tpu_worker/task:3/device:TPU:5',
        ]
        device_list = []
        for name in device_names:
            device_list.append(session._DeviceAttributes(name, 'TPU', 1024, 0))

        device_details = tpu_cluster_resolver._get_device_dict_and_cores(
            device_list)
        # Four tasks with two cores each -> eight cores in total.
        self.assertEqual(device_details.total_cores, 8)
        expected_map = {
            '0': ['0', '4'],
            '1': ['1', '5'],
            '2': ['0', '4'],
            '3': ['1', '5'],
        }
        self.assertEqual(device_details.device_map, expected_map)
  def testNumAcceleratorsFilterTasks(self, mock_list_devices,
                                     mock_eager_list_devices):
    """num_accelerators filters devices by task_type and task_id.

    Fix: derive each device type by parsing the device name instead of the
    brittle fixed slice ``name[27:30]``, which silently yields garbage for
    any job or task name of a different length.
    """
    device_names = [
        "/job:worker1/task:0/device:TPU:0",
        "/job:worker1/task:0/device:TPU:1",
        "/job:worker1/task:0/device:GPU:0",
        "/job:worker1/task:0/device:GPU:1",
        "/job:worker2/task:1/device:TPU:2",
        "/job:worker2/task:2/device:TPU:3",
        "/job:worker2/task:3/device:GPU:2",
        "/job:worker2/task:4/device:GPU:3",
    ]
    # "/job:<j>/task:<t>/device:<TYPE>:<i>" -> the type is the
    # second-to-last colon-separated field, whatever the job name length.
    device_list = [
        session._DeviceAttributes(name, name.split(":")[-2], 1024, 0)
        for name in device_names
    ]
    mock_eager_list_devices.return_value = device_names
    mock_list_devices.return_value = device_list

    resolver = MockBaseClusterResolver()
    self.assertEqual(resolver.num_accelerators(task_type="worker1", task_id=0),
                     {"TPU": 2, "GPU": 2})
    self.assertEqual(resolver.num_accelerators(task_type="worker2", task_id=3),
                     {"GPU": 1})
    self.assertEqual(resolver.num_accelerators(task_type="worker2", task_id=4),
                     {"GPU": 1})
Esempio n. 4
0
    def testNumAcceleratorsSuccess(self, mock_list_devices,
                                   mock_eager_list_devices):
        """num_accelerators reports per-host TPU cores for a READY node."""
        device_names = [
            '/job:tpu_worker/task:0/device:TPU:0',
            '/job:tpu_worker/task:1/device:TPU:1',
            '/job:tpu_worker/task:2/device:TPU:0',
            '/job:tpu_worker/task:3/device:TPU:1',
            '/job:tpu_worker/task:0/device:TPU:4',
            '/job:tpu_worker/task:1/device:TPU:5',
            '/job:tpu_worker/task:2/device:TPU:4',
            '/job:tpu_worker/task:3/device:TPU:5',
        ]
        devices = [LogicalDevice(name, 'TPU') for name in device_names]
        mock_eager_list_devices.return_value = devices
        mock_list_devices.return_value = [
            session._DeviceAttributes(d.name, d.device_type, 1024, 0)
            for d in devices
        ]

        # One READY, HEALTHY node exposing four worker endpoints.
        endpoints = [
            {'ipAddress': '10.2.3.4', 'port': 8470},
            {'ipAddress': '10.2.3.5', 'port': 8470},
            {'ipAddress': '10.2.3.6', 'port': 8470},
            {'ipAddress': '10.2.3.7', 'port': 8470},
        ]
        tpu_map = {
            'projects/test-project/locations/us-central1-c/nodes/test-tpu-1': {
                'state': 'READY',
                'health': 'HEALTHY',
                'networkEndpoints': endpoints,
            }
        }

        cluster_resolver = resolver.TPUClusterResolver(
            project='test-project',
            zone='us-central1-c',
            tpu='test-tpu-1',
            service=self.mock_service_client(tpu_map=tpu_map))
        # Each task owns two cores (e.g. task 0 has cores 0 and 4).
        self.assertEqual(cluster_resolver.num_accelerators(), {'TPU': 2})
  def testNumAcceleratorsFilterSuccess(self, mock_list_devices):
    """With only the session-mode listing mocked, the count comes back 0."""
    device_names = [
        "/job:worker/task:0/device:TPU:0",
        "/job:worker/task:0/device:TPU:1",
        "/job:worker/task:0/device:TPU:2",
        "/job:worker/task:0/device:TPU:3",
    ]
    device_list = []
    for name in device_names:
      device_list.append(session._DeviceAttributes(name, "TPU", 1024, 0))
    mock_list_devices.return_value = device_list

    resolver = MockBaseClusterResolver()
    # NOTE(review): expects the scalar 0 rather than a dict — presumably the
    # resolver short-circuits here; confirm against MockBaseClusterResolver.
    self.assertEqual(resolver.num_accelerators(), 0)
    def testNumAcceleratorsFilterSuccess(self, mock_list_devices):
        """Mocking only session-mode listing yields a zero accelerator count."""
        tpu_names = [
            "/job:worker/task:0/device:TPU:0",
            "/job:worker/task:0/device:TPU:1",
            "/job:worker/task:0/device:TPU:2",
            "/job:worker/task:0/device:TPU:3",
        ]
        mock_list_devices.return_value = [
            session._DeviceAttributes(name, "TPU", 1024, 0)
            for name in tpu_names
        ]

        resolver = MockBaseClusterResolver()
        # NOTE(review): asserts the scalar 0, not a dict; confirm this matches
        # MockBaseClusterResolver's num_accelerators contract.
        self.assertEqual(resolver.num_accelerators(), 0)
Esempio n. 7
0
def get_accelerator_devices(master, config_proto):
    """Returns accelerator devices given a master and a configuration."""
    if not context.executing_eagerly():
        # Graph mode: ask the remote master for its full device list.
        with ops.Graph().as_default():
            with session.Session(master, config=config_proto) as s:
                devices = s.list_devices()
        return devices
    # Eager mode: drop CPUs and wrap the rest in _DeviceAttributes so both
    # code paths hand back the same record type.
    return [
        session._DeviceAttributes(d.name, d.device_type, 0, 0)  # pylint: disable=protected-access
        for d in config.list_logical_devices()
        if d.device_type not in ('CPU', 'XLA_CPU')
    ]
Esempio n. 8
0
    def testNumAcceleratorsSuccess(self, mock_list_devices,
                                   mock_eager_list_devices):
        """Four GPUs on a single worker are reported as {"GPU": 4}."""
        gpu_names = [
            "/job:worker/task:0/device:GPU:0",
            "/job:worker/task:0/device:GPU:1",
            "/job:worker/task:0/device:GPU:2",
            "/job:worker/task:0/device:GPU:3",
        ]
        devices = [LogicalDevice(name, "GPU") for name in gpu_names]
        mock_eager_list_devices.return_value = devices
        # Session-mode listing mirrors the same devices as _DeviceAttributes.
        mock_list_devices.return_value = [
            session._DeviceAttributes(d.name, d.device_type, 1024, 0)
            for d in devices
        ]

        resolver = MockBaseClusterResolver()
        self.assertEqual(resolver.num_accelerators(), {"GPU": 4})
Esempio n. 9
0
def list_devices(session):
    """Returns _DeviceAttributes for every device visible to `session`."""
    with errors.raise_exception_on_not_ok_status() as status:
        # Pick the C API listing call that matches how the session was built.
        if session._created_with_new_api:
            raw_device_list = tf_session.TF_SessionListDevices(
                session._session, status)
        else:
            raw_device_list = tf_session.TF_DeprecatedSessionListDevices(
                session._session, status)
        size = tf_session.TF_DeviceListCount(raw_device_list)
        device_list = []
        for idx in range(size):
            name = tf_session.TF_DeviceListName(raw_device_list, idx, status)
            device_type = tf_session.TF_DeviceListType(raw_device_list, idx,
                                                       status)
            # Memory is deliberately reported as 0; the C call
            # (TF_DeviceListMemoryBytes) is left disabled here.
            device_list.append(_DeviceAttributes(name, device_type, 0))
        tf_session.TF_DeleteDeviceList(raw_device_list)
        return device_list
  def testNumAcceleratorsSuccess(self, mock_list_devices):
    """Eight TPU cores across four tasks -> two cores reported."""
    device_names = [
        '/job:tpu_worker/task:0/device:TPU:0',
        '/job:tpu_worker/task:1/device:TPU:1',
        '/job:tpu_worker/task:2/device:TPU:0',
        '/job:tpu_worker/task:3/device:TPU:1',
        '/job:tpu_worker/task:0/device:TPU:4',
        '/job:tpu_worker/task:1/device:TPU:5',
        '/job:tpu_worker/task:2/device:TPU:4',
        '/job:tpu_worker/task:3/device:TPU:5',
    ]
    mock_list_devices.return_value = [
        session._DeviceAttributes(name, 'TPU', 1024, 0)
        for name in device_names
    ]

    resolver = cluster_resolver.TPUClusterResolver(tpu='')
    # Each of the four tasks exposes two cores, hence the scalar 2.
    self.assertEqual(resolver.num_accelerators(), 2)
Esempio n. 11
0
def get_accelerator_devices(master, config_proto):
  """Returns accelerator devices given a master and a configuration."""
  if not context.executing_eagerly():
    # Graph mode: query the remote master's device list over a session.
    with ops.Graph().as_default():
      with session.Session(master, config=config_proto) as s:
        devices = s.list_devices()
    return devices
  # Eager mode: the device list arrives as plain name strings; recover each
  # device type from the name (falling back to 'GPU') and drop CPUs.
  devices = []
  for name in context.list_devices():
    match = DEVICE_TYPE_REGEX.match(name)
    device_type = match.group(1) if match else 'GPU'
    if device_type in ('CPU', 'XLA_CPU'):
      continue
    devices.append(session._DeviceAttributes(name, device_type, 0, 0))  # pylint: disable=protected-access
  return devices
Esempio n. 12
0
    def testGetDeviceDictAndCoresWithCPUsAndGPUs(self):
        """Non-TPU devices contribute no cores and an empty device map."""
        devices = [
            '/job:tpu_worker/task:0/device:CPU:0',
            '/job:tpu_worker/task:1/device:CPU:0',
            '/job:tpu_worker/task:2/device:CPU:0',
            '/job:tpu_worker/task:3/device:CPU:0',
            '/job:tpu_worker/task:0/device:GPU:1',
            '/job:tpu_worker/task:1/device:GPU:1',
            '/job:tpu_worker/task:2/device:GPU:1',
            '/job:tpu_worker/task:3/device:GPU:1',
        ]
        device_list = [
            session._DeviceAttributes(name, 'XLA', 1024, 0)
            for name in devices
        ]

        device_dict, num_cores = (
            resolver.TPUClusterResolver._get_device_dict_and_cores(device_list))
        self.assertEqual(num_cores, 0)
        self.assertEqual(device_dict, {})
Esempio n. 13
0
    def testNumAcceleratorsSuccess(self, mock_list_devices):
        """Eight TPU cores spread over four tasks -> two cores reported."""
        tpu_devices = [
            '/job:tpu_worker/task:0/device:TPU:0',
            '/job:tpu_worker/task:1/device:TPU:1',
            '/job:tpu_worker/task:2/device:TPU:0',
            '/job:tpu_worker/task:3/device:TPU:1',
            '/job:tpu_worker/task:0/device:TPU:4',
            '/job:tpu_worker/task:1/device:TPU:5',
            '/job:tpu_worker/task:2/device:TPU:4',
            '/job:tpu_worker/task:3/device:TPU:5',
        ]
        device_list = []
        for name in tpu_devices:
            device_list.append(session._DeviceAttributes(name, 'TPU', 1024, 0))
        mock_list_devices.return_value = device_list

        resolver = cluster_resolver.TPUClusterResolver(tpu='')
        # Each of the four tasks owns two cores, hence the scalar 2.
        self.assertEqual(resolver.num_accelerators(), 2)
    def testNumAcceleratorsFilterTasksByEnvVar(self, mock_list_devices,
                                               mock_eager_list_devices):
        """num_accelerators defaults to the task named in TF_CONFIG."""
        os.environ['TF_CONFIG'] = """
    {
      "cluster": {
        "worker1": ["w10:2222"],
        "worker2": ["w21:2222", "w22:2222", "w23:2222", "w24:2222"]
      },
      "rpc_layer": "grpc",
      "task": {
        "type": "worker1",
        "index": "0"
      }
    }
    """

        device_specs = [
            ('/job:worker1/task:0/device:TPU:0', 'TPU'),
            ('/job:worker1/task:0/device:TPU:1', 'TPU'),
            ('/job:worker1/task:0/device:GPU:0', 'GPU'),
            ('/job:worker1/task:0/device:GPU:1', 'GPU'),
            ('/job:worker2/task:1/device:TPU:2', 'TPU'),
            ('/job:worker2/task:2/device:TPU:3', 'TPU'),
            ('/job:worker2/task:3/device:GPU:2', 'GPU'),
            ('/job:worker2/task:4/device:GPU:3', 'GPU'),
        ]
        devices = [LogicalDevice(name, kind) for name, kind in device_specs]
        mock_eager_list_devices.return_value = devices
        mock_list_devices.return_value = [
            session._DeviceAttributes(d.name, d.device_type, 1024, 0)
            for d in devices
        ]

        resolver = TFConfigClusterResolver()

        # With no arguments the resolver reads its own task from TF_CONFIG
        # (worker1 / index 0), which owns two TPUs and two GPUs.
        self.assertEqual(resolver.num_accelerators(), {'TPU': 2, 'GPU': 2})

        # Explicit task_type/task_id arguments still override TF_CONFIG.
        self.assertEqual(
            resolver.num_accelerators(task_type='worker2', task_id=3),
            {'GPU': 1})
  def testGetDeviceDictAndCoresWithCPUsAndGPUs(self):
    """CPU/GPU-only device lists produce zero cores and an empty map."""
    device_names = [
        '/job:tpu_worker/task:0/device:CPU:0',
        '/job:tpu_worker/task:1/device:CPU:0',
        '/job:tpu_worker/task:2/device:CPU:0',
        '/job:tpu_worker/task:3/device:CPU:0',
        '/job:tpu_worker/task:0/device:GPU:1',
        '/job:tpu_worker/task:1/device:GPU:1',
        '/job:tpu_worker/task:2/device:GPU:1',
        '/job:tpu_worker/task:3/device:GPU:1',
    ]
    device_list = []
    for name in device_names:
      device_list.append(session._DeviceAttributes(name, 'XLA', 1024, 0))

    device_dict, num_cores = tpu_cluster_resolver._get_device_dict_and_cores(
        device_list)
    self.assertEqual(num_cores, 0)
    self.assertEqual(device_dict, {})
    def testNumAcceleratorsMultiDeviceSuccess(self, mock_list_devices,
                                              mock_eager_list_devices):
        """Counts TPUs and GPUs on the same worker.

        Fix: parse the device type out of the name rather than the brittle
        fixed slice ``name[26:29]``, which is only correct for one specific
        job-name length.
        """
        device_names = [
            "/job:worker/task:0/device:TPU:0",
            "/job:worker/task:0/device:TPU:1",
            "/job:worker/task:0/device:TPU:2",
            "/job:worker/task:0/device:TPU:3",
            "/job:worker/task:0/device:GPU:0",
            "/job:worker/task:0/device:GPU:1",
            "/job:worker/task:0/device:GPU:2",
            "/job:worker/task:0/device:GPU:3",
        ]
        # "/job:<j>/task:<t>/device:<TYPE>:<i>" -> type is the penultimate
        # colon-separated field, independent of the job name length.
        device_list = [
            session._DeviceAttributes(name, name.split(":")[-2], 1024, 0)
            for name in device_names
        ]
        mock_eager_list_devices.return_value = device_names
        mock_list_devices.return_value = device_list

        resolver = MockBaseClusterResolver()
        self.assertEqual(resolver.num_accelerators(), {"TPU": 4, "GPU": 4})
  def testNumAcceleratorsMultiDeviceSuccess(self, mock_list_devices,
                                            mock_eager_list_devices):
    """Counts TPUs and GPUs on the same worker.

    Fix: derive the device type by parsing the name instead of the brittle
    fixed slice ``name[26:29]``, which breaks for any other job-name length.
    """
    device_names = [
        "/job:worker/task:0/device:TPU:0",
        "/job:worker/task:0/device:TPU:1",
        "/job:worker/task:0/device:TPU:2",
        "/job:worker/task:0/device:TPU:3",
        "/job:worker/task:0/device:GPU:0",
        "/job:worker/task:0/device:GPU:1",
        "/job:worker/task:0/device:GPU:2",
        "/job:worker/task:0/device:GPU:3",
    ]
    # "/job:<j>/task:<t>/device:<TYPE>:<i>" -> the type is the
    # second-to-last colon-separated field.
    device_list = [
        session._DeviceAttributes(name, name.split(":")[-2], 1024, 0)
        for name in device_names
    ]
    mock_eager_list_devices.return_value = device_names
    mock_list_devices.return_value = device_list

    resolver = MockBaseClusterResolver()
    self.assertEqual(resolver.num_accelerators(), {"TPU": 4, "GPU": 4})
  def testGetDeviceDictAndCoresWithTPUs(self):
    """Eight TPU cores map onto four tasks, two core ids apiece."""
    device_names = [
        '/job:tpu_worker/task:0/device:TPU:0',
        '/job:tpu_worker/task:1/device:TPU:1',
        '/job:tpu_worker/task:2/device:TPU:0',
        '/job:tpu_worker/task:3/device:TPU:1',
        '/job:tpu_worker/task:0/device:TPU:4',
        '/job:tpu_worker/task:1/device:TPU:5',
        '/job:tpu_worker/task:2/device:TPU:4',
        '/job:tpu_worker/task:3/device:TPU:5',
    ]
    device_list = []
    for name in device_names:
      device_list.append(session._DeviceAttributes(name, 'TPU', 1024, 0))

    device_details = tpu_cluster_resolver._get_device_dict_and_cores(
        device_list)
    self.assertEqual(device_details.total_cores, 8)
    expected_map = {
        '0': ['0', '4'],
        '1': ['1', '5'],
        '2': ['0', '4'],
        '3': ['1', '5'],
    }
    self.assertEqual(device_details.device_map, expected_map)
Esempio n. 19
0
def _query_tpu_system_metadata(master_address,
                               cluster_def=None,
                               query_topology=False):
    """Automatically detects the TPU system metadata in the system.

    Args:
      master_address: Address of the TensorFlow master to query (unused in
        eager mode, where the local context's device list is used instead).
      cluster_def: Optional cluster definition passed into the session config
        and, when `query_topology` is set, into `_obtain_topology`.
      query_topology: If True, additionally queries the TPU topology; this
        requires at least one TPU core to be visible.

    Returns:
      A `_TPUSystemMetadata` record with the total core count, host count,
      cores per host, optional topology, and a sorted device tuple.

    Raises:
      ValueError: If the master cannot be reached after all retries.
      RuntimeError: If hosts expose unequal TPU core counts, or if
        `query_topology` is True but no TPU cores were found.
    """
    tpu_core_count = 0
    devices = []
    # Maps host id -> list of TPU core ids found on that host.
    device_dict = collections.defaultdict(list)

    if context.executing_eagerly():
        # Eager mode: devices come back as plain name strings.
        device_names = context.list_devices()
        devices = []

        # We want the output type to match in both eager and session mode
        for name in device_names:
            device_match = _DEVICE_TYPE_REGEX.match(name)
            device_type = 'CPU'
            if device_match:
                device_type = device_match.group(1)
            devices.append(
                session_lib._DeviceAttributes(name, device_type, 0, 0))  # pylint: disable=protected-access
    else:
        # Session mode: connect to the master and list its devices, retrying
        # on deadline timeouts (the TPU worker may still be scheduling).
        # TODO(b/120564445): Replace with standard library for retries.
        retry_count = 1
        while True:
            logging.info(
                'Querying Tensorflow master (%s) for TPU system metadata.',
                master_address)
            try:
                with ops.Graph().as_default():
                    with session_lib.Session(
                            master_address,
                            config=get_session_config_with_timeout(
                                _PINGING_MASTER_TIMEOUT_IN_MS,
                                cluster_def)) as sess:
                        devices = sess.list_devices()
                        break
            except errors.DeadlineExceededError:
                msg = (
                    'Failed to connect to the Tensorflow master. The TPU worker may '
                    'not be ready (still scheduling) or the Tensorflow master '
                    'address is incorrect: got (%s).' % (master_address))

                # TODO(xiejw): For local or grpc master we might not need retry logic
                # here.
                if retry_count <= _RETRY_TIMES:
                    logging.warning('%s', msg)
                    logging.warning('Retrying (%d/%d).', retry_count,
                                    _RETRY_TIMES)
                    retry_count += 1
                else:
                    raise ValueError(msg)

    # Bucket TPU cores by host id and count them.
    for device in devices:
        match = _TPU_DEVICE_REG.match(device.name)
        if match:
            host_id = match.group(1)
            core_id = match.group(2)
            device_dict[host_id].append(core_id)
            tpu_core_count += 1

    num_of_cores_per_host = 0
    if tpu_core_count:
        # Every host must expose the same number of cores.
        num_cores_per_host_set = set(
            [len(core_ids) for core_ids in device_dict.values()])
        if len(num_cores_per_host_set) != 1:
            raise RuntimeError(
                'TPU cores on each host is not same. This should not happen!. '
                'devices: {}'.format(devices))
        num_of_cores_per_host = num_cores_per_host_set.pop()

    topology = None
    if query_topology:
        if not tpu_core_count:
            raise RuntimeError(
                'Cannot find any TPU cores in the system (master address {}). '
                'This usually means the master address is incorrect or the '
                'TPU worker has some problems. Available devices: {}'.format(
                    master_address, devices))

        topology = _obtain_topology(master_address, cluster_def)

    # We sort the metadata devices so that downstream users get a sorted list
    # for creating mirrored variables correctly.
    def _sort_key(device):
        spec = tf_device.DeviceSpec.from_string(device.name)
        return (spec.job, spec.replica, spec.task, spec.device_type,
                spec.device_index)

    devices = tuple(sorted(devices, key=_sort_key))

    metadata = _TPUSystemMetadata(num_cores=tpu_core_count,
                                  num_hosts=len(device_dict),
                                  num_of_cores_per_host=num_of_cores_per_host,
                                  topology=topology,
                                  devices=devices)

    if tpu_core_count:
        logging.info('Found TPU system:')
        logging.info('*** Num TPU Cores: %d', metadata.num_cores)
        logging.info('*** Num TPU Workers: %d', metadata.num_hosts)
        logging.info('*** Num TPU Cores Per Worker: %d',
                     metadata.num_of_cores_per_host)
        for device in metadata.devices:
            logging.info('*** Available Device: %s', device)
    else:
        logging.info('Failed to find TPU: %s', metadata)
    return metadata