Example #1
    def __init__(self,
                 container_strategy,
                 tpu_cluster_resolver,
                 steps_per_run,
                 num_cores=None):
        super(TPUExtended, self).__init__(container_strategy)
        self._tpu_cluster_resolver = tpu_cluster_resolver
        self._tpu_metadata = get_tpu_system_metadata(
            self._tpu_cluster_resolver)
        # TODO(sourabhbajaj): Change this from num_cores to metadata_override
        self._num_cores_override = num_cores

        # TODO(jhseu): Switch to DeviceAssignment to support pods and model
        # parallelism.
        device_map = {
            d.name: i
            for i, d in enumerate(self._tpu_metadata.devices)
            if "device:TPU:" in d.name
        }
        self._device_index = values.PerReplica(device_map)
        self._host_device = self.get_host_cpu_device(0)
        self._tpu_devices = sorted(device_map.keys())
        # Only create variables for the number of replicas we're running.
        self._tpu_devices = self._tpu_devices[:self._num_replicas_in_sync]

        # TODO(sourabhbajaj): Remove this once performance of running one step
        # at a time is comparable to multiple steps.
        self.steps_per_run = steps_per_run

        self._require_static_shapes = True
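The snippet above builds a device-name-to-index map from the TPU system metadata and wraps it in values.PerReplica. Below is a TF-free sketch of that filtering pattern, using hypothetical device names in place of the real tpu_metadata.devices list:

class FakeDevice(object):  # hypothetical stand-in for a device description
    def __init__(self, name):
        self.name = name

devices = [
    FakeDevice("/job:worker/replica:0/task:0/device:CPU:0"),
    FakeDevice("/job:worker/replica:0/task:0/device:TPU:0"),
    FakeDevice("/job:worker/replica:0/task:0/device:TPU:1"),
]

# Keep only TPU devices, mapping each device name to its enumeration index.
device_map = {
    d.name: i
    for i, d in enumerate(devices)
    if "device:TPU:" in d.name
}
tpu_devices = sorted(device_map.keys())  # the per-replica TPU device names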
Example #2
    def _initialize_local(self, num_gpus, devices):
        """Initializes the object for local training."""
        self._cluster_spec = None
        # Convert `num_gpus` into `devices`; the caller should not specify both.
        if devices is None:
            if num_gpus is None:
                num_gpus = context.num_gpus()
            if num_gpus == 0:
                devices = ["/device:CPU:0"]
            else:
                devices = ["/device:GPU:%d" % d for d in range(num_gpus)]
        elif num_gpus is not None:
            raise ValueError(
                "Must only specify one of `devices` and `num_gpus`.")
        self._num_gpus = num_gpus
        # TODO(yuefengz): consider setting the default device.

        assert devices, "Must specify at least one device."
        assert len(set(devices)) == len(devices), (
            "No duplicates allowed in `devices` argument.")
        # TODO(josh11b): Require at least 2 devices?
        self._devices = [device_util.resolve(d) for d in devices]
        self._canonical_device_set = set(self._devices)
        self._device_index = values.PerReplica(
            {d: i
             for i, d in enumerate(devices)})
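For reference, the num_gpus-to-devices conversion above reduces to the following standalone sketch (num_gpus=2 is a hypothetical value; the real code falls back to context.num_gpus()):

num_gpus = 2  # hypothetical; normally queried from the runtime
devices = (["/device:CPU:0"] if num_gpus == 0
           else ["/device:GPU:%d" % d for d in range(num_gpus)])
device_index = {d: i for i, d in enumerate(devices)}
# device_index == {'/device:GPU:0': 0, '/device:GPU:1': 1}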
Example #3
  def testContainsIndexedSlices_PerReplica(self):
    t0 = math_ops._as_indexed_slices(
        constant_op.constant([[1., 2.], [0, 0], [3., 4.]]))
    t1 = math_ops._as_indexed_slices(
        constant_op.constant([[0., 0.], [5, 6], [7., 8.]]))
    per_replica = value_lib.PerReplica({"/gpu:0": t0, "/cpu:0": t1})
    self.assertTrue(cross_tower_utils.contains_indexed_slices(per_replica))
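The test above only asserts that a PerReplica holding IndexedSlices is detected. The following TF-free stand-in sketches that kind of recursive check; IndexedSlicesLike and the traversal are illustrative, not the real cross_tower_utils.contains_indexed_slices implementation:

import collections

IndexedSlicesLike = collections.namedtuple("IndexedSlicesLike",
                                           ["values", "indices"])

def contains_indexed_slices_like(value):
    if isinstance(value, IndexedSlicesLike):
        return True
    if isinstance(value, dict):  # e.g. a device -> value mapping
        return any(contains_indexed_slices_like(v) for v in value.values())
    if isinstance(value, (list, tuple)):
        return any(contains_indexed_slices_like(v) for v in value)
    return False

per_replica_like = {"/gpu:0": IndexedSlicesLike([[1., 2.]], [0]),
                    "/cpu:0": IndexedSlicesLike([[3., 4.]], [2])}
assert contains_indexed_slices_like(per_replica_like)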
Example #4
    def map(self, map_over, fn, *args, **kwargs):
        # TODO(josh11b): In eager mode, use one thread per device.
        index = {}
        for i, m in enumerate(map_over):
            d = self._devices[i % len(self._devices)]
            with ops.device(d):
                l = index.get(d, [])
                l.append(
                    fn(m, *values.select_device_mirrored(d, args),
                       **values.select_device_mirrored(d, kwargs)))
                index[d] = l
        # TODO(josh11b): Need a values.regroup equivalent that handles MapOutput
        # in addition to PerReplica data.
        return values.PerReplica(
            {k: values.MapOutput(v)
             for k, v in index.items()})
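The map method above distributes work round-robin across devices and groups the outputs per device. A TF-free sketch of just that grouping logic, with a hypothetical device list and a trivial stand-in for fn:

devices = ["/gpu:0", "/gpu:1"]  # hypothetical device list
map_over = ["a", "b", "c", "d", "e"]

index = {}
for i, m in enumerate(map_over):
    d = devices[i % len(devices)]
    index.setdefault(d, []).append(m.upper())  # stand-in for fn(m, ...)
# index == {'/gpu:0': ['A', 'C', 'E'], '/gpu:1': ['B', 'D']}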
Example #5
def _make_tensor_into_per_replica(input_tensor):
  """Converts a single tensor into a PerReplica object."""
  if isinstance(input_tensor, (tuple, list)):
    raise ValueError("Cannot convert `input_tensor` to a `PerReplica` object, "
                     "got %r but expected a object that is not a tuple or list."
                     % (input_tensor,))
  if isinstance(input_tensor, value_lib.PerReplica):
    return input_tensor

  try:
    device = input_tensor.device
  except AttributeError:
    raise ValueError("Cannot convert `input_tensor` to a `PerReplica` object "
                     "because it doesn't have device set.")

  return value_lib.PerReplica({device: input_tensor})
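Illustrative behavior of the helper above, using a stand-in object with a device attribute (FakeTensor is hypothetical; the real inputs are tensors or value_lib.PerReplica instances):

class FakeTensor(object):  # hypothetical stand-in for a placed tensor
    def __init__(self, device):
        self.device = device

t = FakeTensor("/device:GPU:0")
# A bare tensor is wrapped into a single-entry device -> tensor mapping,
# mirroring value_lib.PerReplica({t.device: t}); a PerReplica input is
# returned unchanged, and a tuple/list raises ValueError.
wrapped = {t.device: t}
assert wrapped == {"/device:GPU:0": t}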
Example #6
  def testSimpleReduceWithIndexedSlices(self):
    devices = ["/cpu:0", "/gpu:0"]
    t0 = _make_indexed_slices([[1., 2.]], [1], [5, 2], devices[0])
    t1 = _make_indexed_slices([[3., 4.], [5., 6.]], [1, 3], [5, 2], devices[1])
    per_replica = value_lib.PerReplica({devices[0]: t0, devices[1]: t1})
    result = cross_tower_ops_lib._simple_reduce(
        per_replica, devices[0], math_ops.add_n, vs.VariableAggregation.SUM)

    # Test that the result is semantically equal to both the concatenated
    # IndexedSlices and the version in which duplicate indices are summed.
    total_with_dups = _make_indexed_slices(
        [[1., 2.], [3., 4.], [5., 6.]], [1, 1, 3], [5, 2], devices[0])
    total_without_dups = _make_indexed_slices(
        [[4., 6.], [5., 6.]], [1, 3], [5, 2], devices[0])
    self._assert_indexed_slices_equal(total_with_dups, result)
    self._assert_indexed_slices_equal(total_without_dups, result)
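A small numpy check (not TF) of why the two expected values in the test above are interchangeable: densifying either IndexedSlices representation into the [5, 2] shape yields the same array, because duplicate indices accumulate:

import numpy as np

def densify(values, indices, dense_shape):
    out = np.zeros(dense_shape)
    for v, i in zip(values, indices):
        out[i] += v  # duplicate indices are summed
    return out

with_dups = densify([[1., 2.], [3., 4.], [5., 6.]], [1, 1, 3], [5, 2])
without_dups = densify([[4., 6.], [5., 6.]], [1, 3], [5, 2])
assert np.allclose(with_dups, without_dups)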
Example #7
def _make_per_replica(values, devices, regroup=False):
    devices = cross_tower_ops_lib.get_devices_from(devices)
    assert len(values) == len(devices)

    # We simulate the result of regroup called on PerReplica which strips the
    # PerReplica wrapper if it has only one value.
    if len(values) == 1 and regroup:
        with ops.device(devices[0]):
            placed_v = array_ops.identity(values[0])
        return placed_v

    index = {}
    for d, v in zip(devices, values):
        with ops.device(d):
            placed_v = array_ops.identity(v)
        index[d] = placed_v
    return value_lib.PerReplica(index)
Example #8
    def _initialize_multi_worker(self, num_gpus, cluster_spec):
        """Initializes the object for multi-worker training."""
        cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_spec)
        self._cluster_spec = cluster_spec

        self._workers = []
        for job in ["chief", "worker"]:
            for task in range(len(cluster_spec.as_dict().get(job, []))):
                self._workers.append("/job:%s/task:%d" % (job, task))

        if num_gpus is None:
            raise ValueError(
                "`num_gpus` is required if `cluster_spec` is given.")
        if num_gpus > 0:
            self._worker_devices = [(worker, [
                device_util.canonicalize(worker + "/device:GPU:%d" % gpu)
                for gpu in range(num_gpus)
            ]) for worker in self._workers]
        else:
            self._worker_devices = [
                (worker, [device_util.canonicalize(worker, "/device:CPU:0")])
                for worker in self._workers
            ]

        devices = nest.flatten([l for _, l in self._worker_devices])

        # Setting `_default_device` will add a device scope inside
        # distribution.scope. We set the default device to the first worker.
        # When users specify a device under distribution.scope via
        #   with tf.device("/cpu:0"):
        #     ...
        # their ops will end up on the CPU device of the first worker, e.g.
        # "/job:worker/task:0/device:CPU:0". Note this is not used in replica
        # mode.
        self._default_device = self._workers[0]

        assert devices, "Must specify at least one device."
        assert len(set(devices)) == len(devices), (
            "No duplicates allowed in `devices` argument.")
        # TODO(josh11b): Require at least 2 devices?
        self._devices = [device_util.resolve(d) for d in devices]
        self._canonical_device_set = set(self._devices)
        self._device_index = values.PerReplica(
            {d: i
             for i, d in enumerate(devices)})
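A TF-free sketch of the worker and device enumeration performed above, using a hypothetical cluster spec with one chief, two workers, and num_gpus=2:

cluster = {"chief": ["host0:2222"], "worker": ["host1:2222", "host2:2222"]}
num_gpus = 2

workers = []
for job in ["chief", "worker"]:
    for task in range(len(cluster.get(job, []))):
        workers.append("/job:%s/task:%d" % (job, task))

worker_devices = [
    (w, ["%s/device:GPU:%d" % (w, g) for g in range(num_gpus)])
    for w in workers
]
# workers == ['/job:chief/task:0', '/job:worker/task:0', '/job:worker/task:1']
# Each worker contributes num_gpus replica devices to the flattened list.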
Example #9
    def testIndexedSlicesAllReduce(self, cross_tower_ops_instance, reduce_op,
                                   batch_reduce):
        devices = ["/cpu:0", "/gpu:0"]
        dense_shape = [5, 2]
        t0 = _make_indexed_slices([[1., 2.]], [1], dense_shape, devices[0])
        t1 = _make_indexed_slices([[3., 4.], [5., 6.]], [1, 3], dense_shape,
                                  devices[1])
        per_replica = value_lib.PerReplica({devices[0]: t0, devices[1]: t1})

        if batch_reduce:
            result = cross_tower_ops_instance.batch_reduce(
                reduce_op, [(per_replica, devices)])
        else:
            result = cross_tower_ops_instance.reduce(reduce_op, per_replica,
                                                     devices)

        total_indices_with_dups = [1, 1, 3]
        total_indices_without_dups = [1, 3]

        if reduce_op == reduce_util.ReduceOp.SUM:
            total_values_with_dups = [[1., 2.], [3., 4.], [5., 6.]]
            total_values_without_dups = [[4., 6.], [5., 6.]]
        else:
            assert reduce_op == reduce_util.ReduceOp.MEAN
            total_values_with_dups = [[0.5, 1.], [1.5, 2.], [2.5, 3.]]
            total_values_without_dups = [[2., 3.], [2.5, 3.]]

        total_mirrored_with_dups = _make_mirrored_indexed_slices(
            devices, total_values_with_dups, total_indices_with_dups,
            dense_shape)
        total_mirrored_without_dups = _make_mirrored_indexed_slices(
            devices, total_values_without_dups, total_indices_without_dups,
            dense_shape)

        # Test that the result is semantically equal to both the concatenated
        # IndexedSlices and the version in which duplicate indices are summed.
        if batch_reduce:
            total_mirrored_with_dups = [total_mirrored_with_dups]
            total_mirrored_without_dups = [total_mirrored_without_dups]

        self._assert_values_equal(total_mirrored_with_dups, result)
        self._assert_values_equal(total_mirrored_without_dups, result)
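Quick arithmetic check (numpy, not TF) for the MEAN branch above: averaging across the two replicas is the element-wise sum divided by the number of replicas:

import numpy as np

concatenated = np.array([[1., 2.], [3., 4.], [5., 6.]])  # values with dup indices
deduped = np.array([[4., 6.], [5., 6.]])                 # dup index 1 summed
assert np.allclose(concatenated / 2, [[0.5, 1.], [1.5, 2.], [2.5, 3.]])
assert np.allclose(deduped / 2, [[2., 3.], [2.5, 3.]])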
Example #10
    def __init__(self, tpu_cluster_resolver, steps_per_run, num_cores=None):
        """Initializes the TPUStrategy object.

    Args:
      tpu_cluster_resolver: A tf.contrib.cluster_resolver.TPUClusterResolver,
          which provides information about the TPU cluster.
      steps_per_run: Number of steps to run on device before returning to the
          host. Note that this can have side-effects on performance, hooks,
          metrics, summaries etc.
          This parameter is only used when Distribution Strategy is used with
          estimator or keras.
      num_cores: Number of cores to use on the TPU. If None specified, then
          auto-detect the cores and topology of the TPU system.
    """
        super(TPUStrategy, self).__init__()

        self._tpu_cluster_resolver = tpu_cluster_resolver
        self._tpu_metadata = get_tpu_system_metadata(
            self._tpu_cluster_resolver)
        # TODO(sourabhbajaj): Change this from num_cores to metadata_override
        self._num_cores_override = num_cores

        # TODO(jhseu): Switch to DeviceAssignment to support pods and model
        # parallelism.
        device_map = {
            d.name: i
            for i, d in enumerate(self._tpu_metadata.devices)
            if "device:TPU:" in d.name
        }
        self._device_index = values.PerReplica(device_map)
        self._host_device = self.get_host_cpu_device(0)
        self._tpu_devices = sorted(device_map.keys())
        # Only create variables for the number of replicas we're running.
        self._tpu_devices = self._tpu_devices[:self.num_replicas_in_sync]

        # TODO(sourabhbajaj): Remove this once performance of running one step
        # at a time is comparable to multiple steps.
        self.steps_per_run = steps_per_run

        self._require_static_shapes = True