    def _reduce(self, aggregation, value, destinations):
        graph = ops.get_default_graph()
        cf_context = graph._get_control_flow_context()  # pylint: disable=protected-access
        # If we're inside the ReplicateContext, reduction should be done using
        # CrossReplicaSum while outside we can directly use an add_n op.
        while cf_context:
            if isinstance(cf_context, tpu.TPUReplicateContext):
                if aggregation == vs.VariableAggregation.MEAN:
                    # TODO(jhseu):  Revisit once we support model-parallelism.
                    value *= (1. / self.num_towers)
                return tpu_ops.cross_replica_sum(value)
            cf_context = cf_context.outer_context

        # Validate that the destination is the same as the host device.
        # Note we don't do this when in replicate context as the reduction is
        # performed on the TPU device itself.
        devices = cross_tower_ops_lib.get_devices_from(destinations)
        if len(devices) == 1:
            assert device_util.canonicalize(
                devices[0]) == device_util.canonicalize(self._host)
        else:
            raise ValueError(
                'Multiple devices are not supported for TPUStrategy')

        output = math_ops.add_n(value)
        if aggregation == vs.VariableAggregation.MEAN:
            return output * (1. / len(value))
        return output
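Side note on the MEAN branch above: inside the replicate context the mean is produced by pre-scaling each replica's contribution by 1/num_towers and then summing across replicas, so no extra division is needed after the CrossReplicaSum. A pure-Python sketch of that identity (values are hypothetical):

# Sketch of the MEAN-via-CrossReplicaSum identity; plain Python, hypothetical values.
per_replica_values = [2.0, 4.0, 6.0]
num_towers = len(per_replica_values)
scaled = [v * (1. / num_towers) for v in per_replica_values]
mean_across_replicas = sum(scaled)  # plays the role of cross_replica_sum here
assert mean_across_replicas == sum(per_replica_values) / num_towers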
Example #2
    def __call__(self, op):
        if get_tf_version_tuple() >= (1, 8):
            from tensorflow.python.training.device_util import canonicalize
        else:

            def canonicalize(name):  # tensorflow/tensorflow#11484
                return tf.DeviceSpec.from_string(name).to_string()

        if op.device:
            return op.device
        if op.type not in ['Variable', 'VariableV2']:
            return canonicalize(self.worker_device)

        device_index, _ = min(enumerate(self.ps_sizes),
                              key=operator.itemgetter(1))
        device_name = self.ps_devices[device_index]
        var_size = op.outputs[0].get_shape().num_elements()
        if var_size is None:
            logger.warn(
                "[LeastLoadedDeviceSetter] Shape of variable {} is not fully defined!"
                .format(op.name))
            var_size = 0

        self.ps_sizes[device_index] += var_size

        return canonicalize(device_name)
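A minimal usage sketch for the device setter above. The enclosing class is not shown in the snippet, so the LeastLoadedDeviceSetter(worker_device, ps_devices) constructor and the TF 1.x graph-mode calls below are assumptions:

import tensorflow as tf

ps_devices = ["/job:ps/task:0/cpu:0", "/job:ps/task:1/cpu:0"]   # hypothetical cluster
worker_device = "/job:worker/task:0/device:GPU:0"
setter = LeastLoadedDeviceSetter(worker_device, ps_devices)     # assumed constructor

with tf.device(setter):
    # Variables go to whichever ps device currently holds the fewest parameters;
    # every other op is pinned to the worker device (see __call__ above).
    w = tf.get_variable("w", shape=[1024, 1024])
    y = tf.matmul(tf.zeros([8, 1024]), w)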
Example #3
  def _reduce(self, aggregation, value, destinations):
    if values._enclosing_tpu_context() is not None:  # pylint: disable=protected-access
      if aggregation == vs.VariableAggregation.MEAN:
        # TODO(jhseu):  Revisit once we support model-parallelism.
        value *= (1. / self.num_towers)
      elif aggregation != vs.VariableAggregation.SUM:
        raise NotImplementedError(
            "Currently only support sum & mean in TPUStrategy.")
      return tpu_ops.cross_replica_sum(value)

    # Validate that the destination is the same as the host device.
    # Note we don't do this when in replicate context as the reduction is
    # performed on the TPU device itself.
    devices = cross_tower_ops_lib.get_devices_from(destinations)
    if len(devices) == 1:
      assert device_util.canonicalize(devices[0]) == device_util.canonicalize(
          self.get_host_cpu_device(0))
    else:
      raise ValueError('Multiple devices are not supported for TPUStrategy')

    if aggregation == vs.VariableAggregation.ONLY_FIRST_TOWER:
      return value[0]
    output = math_ops.add_n(value)
    if aggregation == vs.VariableAggregation.MEAN:
      return output * (1. / len(value))
    return output
  def _reduce(self, aggregation, value, destinations):
    graph = ops.get_default_graph()
    cf_context = graph._get_control_flow_context()  # pylint: disable=protected-access
    # If we're inside the ReplicateContext, reduction should be done using
    # CrossReplicaSum while outside we can directly use an add_n op.
    while cf_context:
      if isinstance(cf_context, tpu.TPUReplicateContext):
        if aggregation == vs.VariableAggregation.MEAN:
          # TODO(jhseu):  Revisit once we support model-parallelism.
          value *= (1. / self.num_towers)
        return tpu_ops.cross_replica_sum(value)
      cf_context = cf_context.outer_context

    # Validate that the destination is the same as the host device.
    # Note we don't do this when in replicate context as the reduction is
    # performed on the TPU device itself.
    devices = cross_tower_ops_lib.get_devices_from(destinations)
    if len(devices) == 1:
      assert device_util.canonicalize(devices[0]) == device_util.canonicalize(
          self._host)
    else:
      raise ValueError('Multiple devices are not supported for TPUStrategy')

    output = math_ops.add_n(value)
    if aggregation == vs.VariableAggregation.MEAN:
      return output * (1. / len(value))
    return output
Example #5
    def _reduce(self, aggregation, value, destinations):
        if values._enclosing_tpu_context() is not None:  # pylint: disable=protected-access
            if aggregation == vs.VariableAggregation.MEAN:
                # TODO(jhseu):  Revisit once we support model-parallelism.
                value *= (1. / self.num_towers)
            elif aggregation != vs.VariableAggregation.SUM:
                raise NotImplementedError(
                    "Currently only support sum & mean in TPUStrategy.")
            return tpu_ops.cross_replica_sum(value)

        # Validate that the destination is the same as the host device.
        # Note we don't do this when in replicate context as the reduction is
        # performed on the TPU device itself.
        devices = cross_tower_ops_lib.get_devices_from(destinations)
        if len(devices) == 1:
            assert device_util.canonicalize(
                devices[0]) == device_util.canonicalize(
                    self.get_host_cpu_device(0))
        else:
            raise ValueError(
                'Multiple devices are not supported for TPUStrategy')

        if aggregation == vs.VariableAggregation.ONLY_FIRST_TOWER:
            return value[0]
        output = math_ops.add_n(value)
        if aggregation == vs.VariableAggregation.MEAN:
            return output * (1. / len(value))
        return output
 def testCanonicalizeWithoutDefaultDevice(self):
     self.assertEqual(device_util.canonicalize("/cpu:0"),
                      "/replica:0/task:0/device:CPU:0")
     self.assertEqual(device_util.canonicalize("/job:worker/cpu:0"),
                      "/job:worker/replica:0/task:0/device:CPU:0")
     self.assertEqual(device_util.canonicalize("/job:worker/task:1/cpu:0"),
                      "/job:worker/replica:0/task:1/device:CPU:0")
Example #7
    def _reduce_to(self, reduce_op, value, destinations):
        if values._enclosing_tpu_context() is not None:  # pylint: disable=protected-access
            if reduce_op == reduce_util.ReduceOp.MEAN:
                # TODO(jhseu):  Revisit once we support model-parallelism.
                value *= (1. / self._num_replicas_in_sync)
            elif reduce_op != reduce_util.ReduceOp.SUM:
                raise NotImplementedError(
                    "Currently only support sum & mean in TPUStrategy.")
            return tpu_ops.cross_replica_sum(value)

        # Validate that the destination is the same as the host device.
        # Note we don't do this when in replicate context as the reduction is
        # performed on the TPU device itself.
        devices = cross_device_ops_lib.get_devices_from(destinations)
        if len(devices) == 1:
            assert device_util.canonicalize(
                devices[0]) == device_util.canonicalize(self._host_device)
        else:
            raise ValueError(
                "Multiple devices are not supported for TPUStrategy")

        output = math_ops.add_n(value)
        if reduce_op == reduce_util.ReduceOp.MEAN:
            return output * (1. / len(value))
        return output
def _get_devices_from(destinations):
  if isinstance(destinations, value_lib.DistributedValues):
    return list(destinations.devices)
  elif isinstance(destinations, six.string_types):
    return [device_util.canonicalize(destinations)]
  else:
    return [
        device_util.canonicalize(destination) for destination in destinations
    ]
Example #9
def _get_devices_from(destinations):
  if isinstance(destinations, value_lib.DistributedValues):
    return list(destinations.devices)
  elif isinstance(destinations, six.string_types):
    return [device_util.canonicalize(destinations)]
  else:
    return [
        device_util.canonicalize(destination) for destination in destinations
    ]
 def testCanonicalizeWithDefaultDevice(self):
   self.assertEqual(
       device_util.canonicalize("/job:worker/task:1/cpu:0", default="/gpu:0"),
       "/job:worker/replica:0/task:1/device:CPU:0")
   self.assertEqual(
       device_util.canonicalize("/job:worker/task:1", default="/gpu:0"),
       "/job:worker/replica:0/task:1/device:GPU:0")
   self.assertEqual(
       device_util.canonicalize("/cpu:0", default="/job:worker"),
       "/job:worker/replica:0/task:0/device:CPU:0")
 def testCanonicalizeWithoutDefaultDevice(self):
   self.assertEqual(
       device_util.canonicalize("/cpu:0"),
       "/job:localhost/replica:0/task:0/device:CPU:0")
   self.assertEqual(
       device_util.canonicalize("/job:worker/cpu:0"),
       "/job:worker/replica:0/task:0/device:CPU:0")
   self.assertEqual(
       device_util.canonicalize("/job:worker/task:1/cpu:0"),
       "/job:worker/replica:0/task:1/device:CPU:0")
Example #12
    def __init__(self,
                 devices=None,
                 num_gpus=None,
                 cross_tower_ops=None,
                 prefetch_on_device=None):
        super(MirroredStrategy, self).__init__()
        # Convert `num_gpus` into `devices`, shouldn't specify both.
        if devices is None:
            if num_gpus is None:
                num_gpus = context.num_gpus()
            devices = ["/device:GPU:%d" % d for d in range(num_gpus)]
        elif num_gpus is not None:
            raise ValueError(
                "Must only specify one of `devices` and `num_gpus`.")

        assert devices, "Must specify at least one device."
        assert len(set(devices)) == len(devices), (
            "No duplicates allowed in `devices` argument.")
        # TODO(josh11b): Require at least 2 devices?
        self._devices = devices
        self._canonical_device_set = set(
            [device_util.canonicalize(d) for d in devices])
        self._device_index = values.PerDevice(
            dict((d, i) for i, d in enumerate(devices)))
        self._cross_tower_ops = cross_tower_ops
        self._prefetch_on_device = prefetch_on_device
  def __init__(self,
               devices=None,
               num_gpus=None,
               cross_tower_ops=None,
               prefetch_on_device=None):
    super(MirroredStrategy, self).__init__()
    # Convert `num_gpus` into `devices`, shouldn't specify both.
    if devices is None:
      if num_gpus is None:
        num_gpus = context.num_gpus()
      devices = ["/device:GPU:%d" % d for d in range(num_gpus)]
    elif num_gpus is not None:
      raise ValueError("Must only specify one of `devices` and `num_gpus`.")

    assert devices, "Must specify at least one device."
    assert len(set(devices)) == len(devices), (
        "No duplicates allowed in `devices` argument.")
    # TODO(josh11b): Require at least 2 devices?
    self._devices = devices
    self._canonical_device_set = set(
        [device_util.canonicalize(d) for d in devices])
    self._device_index = values.PerDevice(
        dict((d, i) for i, d in enumerate(devices)))
    self._cross_tower_ops = cross_tower_ops
    self._prefetch_on_device = prefetch_on_device
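A minimal usage sketch for this contrib-era constructor; the import path is an assumption based on the tests further down this page, which refer to mirrored_strategy.MirroredStrategy:

from tensorflow.contrib.distribute.python import mirrored_strategy  # assumed path

# Either let the strategy enumerate the local GPUs...
dist = mirrored_strategy.MirroredStrategy(num_gpus=2)
# ...or pass explicit device strings, but never both at once.
dist = mirrored_strategy.MirroredStrategy(devices=["/device:GPU:0", "/device:GPU:1"])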
    def _initialize_multi_worker(self, num_gpus, cluster_spec):
        """Initializes the object for multi-worker training."""
        cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_spec)
        self._cluster_spec = cluster_spec

        self._workers = []
        for job in ["chief", "worker"]:
            for task in range(len(cluster_spec.as_dict().get(job, []))):
                self._workers.append("/job:%s/task:%d" % (job, task))

        if num_gpus is None:
            raise ValueError(
                "`num_gpus` is required if `cluster_spec` is given.")
        if num_gpus > 0:
            self._worker_device_map = {
                worker: [
                    device_util.canonicalize(worker + "/device:GPU:%d" % gpu)
                    for gpu in range(num_gpus)
                ]
                for worker in self._workers
            }
        else:
            self._worker_device_map = {
                worker: [device_util.canonicalize(worker, "/device:CPU:0")]
                for worker in self._workers
            }

        devices = nest.flatten(self._worker_device_map)

        # Setting `_default_device` will add a device scope in the
        # distribution.scope. We set the default device to the first worker. When
        # users specify device under distribution.scope by
        #   with tf.device("/cpu:0"):
        #     ...
        # their ops will end up on the cpu device of its first worker, e.g.
        # "/job:worker/task:0/device:CPU:0". Note this is not used in tower mode.
        self._default_device = self._workers[0]

        assert devices, "Must specify at least one device."
        assert len(set(devices)) == len(devices), (
            "No duplicates allowed in `devices` argument.")
        # TODO(josh11b): Require at least 2 devices?
        self._devices = [device_util.resolve(d) for d in devices]
        self._canonical_device_set = set(self._devices)
        self._device_index = values.PerDevice(
            {d: i
             for i, d in enumerate(devices)})
  def _initialize_multi_worker(self, num_gpus, cluster_spec):
    """Initializes the object for multi-worker training."""
    cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_spec)
    self._cluster_spec = cluster_spec

    self._workers = []
    for job in ["chief", "worker"]:
      for task in range(len(cluster_spec.as_dict().get(job, []))):
        self._workers.append("/job:%s/task:%d" % (job, task))

    if num_gpus is None:
      raise ValueError("`num_gpus` is required if `cluster_spec` is given.")
    if num_gpus > 0:
      self._worker_device_map = {
          worker: [
              device_util.canonicalize(worker + "/device:GPU:%d" % gpu)
              for gpu in range(num_gpus)
          ] for worker in self._workers
      }
    else:
      self._worker_device_map = {
          worker: [device_util.canonicalize(worker, "/device:CPU:0")]
          for worker in self._workers
      }

    devices = nest.flatten(self._worker_device_map)

    # Setting `_default_device` will add a device scope in the
    # distribution.scope. We set the default device to the first worker. When
    # users specify device under distribution.scope by
    #   with tf.device("/cpu:0"):
    #     ...
    # their ops will end up on the cpu device of its first worker, e.g.
    # "/job:worker/task:0/device:CPU:0". Note this is not used in tower mode.
    self._default_device = self._workers[0]

    assert devices, "Must specify at least one device."
    assert len(set(devices)) == len(devices), (
        "No duplicates allowed in `devices` argument.")
    # TODO(josh11b): Require at least 2 devices?
    self._devices = [device_util.resolve(d) for d in devices]
    self._canonical_device_set = set(self._devices)
    self._device_index = values.PerDevice(
        {d: i for i, d in enumerate(devices)})
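For intuition, the worker device map the code above would build for a hypothetical two-worker cluster with num_gpus=2 (canonical strings follow the canonicalize tests elsewhere on this page):

# Hypothetical result of _initialize_multi_worker(num_gpus=2, cluster_spec=...)
# for cluster_spec = {"worker": ["host1:2222", "host2:2222"]}:
worker_device_map = {
    "/job:worker/task:0": ["/job:worker/replica:0/task:0/device:GPU:0",
                           "/job:worker/replica:0/task:0/device:GPU:1"],
    "/job:worker/task:1": ["/job:worker/replica:0/task:1/device:GPU:0",
                           "/job:worker/replica:0/task:1/device:GPU:1"],
}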
Example #16
 def _is_per_device(self, result, expected, klass=values.PerDevice):
   self.assertIsInstance(result, klass)
   # We canonicalize the devices to match the device strings returned
   # by PerDevice, which also does device string canonicalization.
   devices = [device_util.canonicalize(_device_str(i))
              for i in range(len(expected))]
   self.assertEqual(set(devices), set(result.devices))
   for i, d in enumerate(devices):
     self.assertEqual(expected[i], result.get(d))
     self.assertEqual(expected[i], result.get(_device_str(i)))
Example #17
 def _is_per_device(self, result, expected, klass=values.PerDevice):
   self.assertIsInstance(result, klass)
   # We canonicalize the devices to match the device strings returned
   # by PerDevice, which also does device string canonicalization.
   devices = [device_util.canonicalize(_device_str(i))
              for i in range(len(expected))]
   self.assertEqual(set(devices), set(result.devices))
   for i, d in enumerate(devices):
     self.assertEqual(expected[i], result.get(d))
     self.assertEqual(expected[i], result.get(_device_str(i)))
Example #18
def choose_the_best(devices, session_config=None):
    """Find the best subclass of CrossTowerOps given a tensorflow session.

  Args:
    devices: a list of devices passed for distribute strategy.
    session_config: a TensorFlow session config or None. If None, the decision
      will be based on all local devices.

  Returns:
    a subclass of CrossTowerOps.
  """
    requested_devices = set([device_util.canonicalize(d) for d in devices])
    machine_devices = device_lib.list_local_devices(
        session_config=session_config)
    using_devices = []
    for d in machine_devices:
        if device_util.canonicalize(d.name) in requested_devices:
            using_devices.append(d)
        else:
            logging.info(
                "Device is available but not used by distribute strategy: %s",
                d.name)

    if len(using_devices) != len(requested_devices):
        logging.warning(
            "Not all devices in distribute strategy are visible to "
            "TensorFlow sessions.")
        return ReductionToOneDeviceCrossTowerOps()

    if any([d.device_type.lower() != "gpu" for d in using_devices]):
        logging.warning(
            "Some devices in DistributionStrategy are not GPUs; falling back "
            "to reducing to one device.")
        return ReductionToOneDeviceCrossTowerOps()

    device_links = [[] for _ in range(len(using_devices))]
    for i, device in enumerate(using_devices):
        for link in device.locality.links.link:
            device_links[i].append(link.device_id)

    return _choose_all_reduce_algorithm(device_links)
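A hedged usage sketch: choosing the cross-tower ops for a MirroredStrategy from the locally visible devices. The module paths are assumptions based on the contrib-era layout:

from tensorflow.contrib.distribute.python import cross_tower_ops as cross_tower_ops_lib  # assumed
from tensorflow.contrib.distribute.python import mirrored_strategy                       # assumed

devices = ["/device:GPU:0", "/device:GPU:1"]
best_ops = cross_tower_ops_lib.choose_the_best(devices)  # may fall back to reduction-to-one-device
dist = mirrored_strategy.MirroredStrategy(devices=devices, cross_tower_ops=best_ops)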
Example #19
  def _call_and_check(self, model_fn, inputs, expected_result, defuns,
                      two_variables=False):
    cpu_dev = device_util.canonicalize("CPU:0")
    gpu_dev = device_util.canonicalize("GPU:0")
    devices = [cpu_dev, gpu_dev]
    dist = mirrored_strategy.MirroredStrategy(devices)

    with dist.scope():
      mock_model = MockModel(two_variables)
      self.evaluate(variables.global_variables_initializer())

      result = dist.call_for_each_tower(model_fn, mock_model, *inputs,
                                        run_concurrently=False)
      for device in devices:
        device_result = values.select_device(device, result)
        device_expected_result = values.select_device(device, expected_result)
        self.assertAllClose(device_expected_result,
                            self.evaluate(device_result))

      for defun in defuns:
        self.assertEqual(set(mock_model.variables), set(defun.variables))
  def _call_and_check(self, model_fn, inputs, expected_result, defuns,
                      two_variables=False):
    cpu_dev = device_util.canonicalize("CPU:0")
    gpu_dev = device_util.canonicalize("GPU:0")
    devices = [cpu_dev, gpu_dev]
    dist = mirrored_strategy.MirroredStrategy(devices)

    with dist.scope():
      mock_model = MockModel(two_variables)
      self.evaluate(variables.global_variables_initializer())

      result = dist.call_for_each_tower(model_fn, mock_model, *inputs,
                                        run_concurrently=False)
      for device in devices:
        device_result = values.select_device(device, result)
        device_expected_result = values.select_device(device, expected_result)
        self.assertAllClose(device_expected_result,
                            self.evaluate(device_result))

      for defun in defuns:
        self.assertEqual(set(mock_model.variables), set(defun.variables))
Example #21
    def __call__(self, op):
        if get_tf_version_tuple() >= (1, 8):
            from tensorflow.python.training.device_util import canonicalize
        else:
            def canonicalize(name):    # tensorflow/tensorflow#11484
                return tf.DeviceSpec.from_string(name).to_string()

        if op.device:
            return op.device
        if op.type not in ['Variable', 'VariableV2']:
            return canonicalize(self.worker_device)

        device_index, _ = min(enumerate(
            self.ps_sizes), key=operator.itemgetter(1))
        device_name = self.ps_devices[device_index]
        var_size = op.outputs[0].get_shape().num_elements()
        if var_size is None:
            logger.warn("[LeastLoadedDeviceSetter] Shape of variable {} is not fully defined!".format(op.name))
            var_size = 0

        self.ps_sizes[device_index] += var_size

        return canonicalize(device_name)
def choose_the_best(devices, session_config=None):
  """Find the best subclass of CrossTowerOps given a tensorflow session.

  Args:
    devices: a list of devices passed for distribute strategy.
    session_config: a TensorFlow session config or None. If None, the decision
      will be based on all local devices.

  Returns:
    a subclass of CrossTowerOps.
  """
  requested_devices = set([device_util.canonicalize(d) for d in devices])
  machine_devices = device_lib.list_local_devices(session_config=session_config)
  using_devices = []
  for d in machine_devices:
    if device_util.canonicalize(d.name) in requested_devices:
      using_devices.append(d)
    else:
      logging.info(
          "Device is available but not used by distribute strategy: %s", d.name)

  if len(using_devices) != len(requested_devices):
    logging.warning("Not all devices in distribute strategy are visible by "
                    "TensorFlow sessions.")
    return ReductionToOneDeviceCrossTowerOps()

  if any([d.device_type.lower() != "gpu" for d in using_devices]):
    logging.warning("Not all devices in DistributionStrategy are visible to "
                    "TensorFlow session.")
    return ReductionToOneDeviceCrossTowerOps()

  device_links = [[] for _ in range(len(using_devices))]
  for i, device in enumerate(using_devices):
    for link in device.locality.links.link:
      device_links[i].append(link.device_id)

  return _choose_all_reduce_algorithm(device_links)
  def _reduce_to(self, reduce_op, value, destinations):
    if values._enclosing_tpu_context() is not None:  # pylint: disable=protected-access
      if reduce_op == reduce_util.ReduceOp.MEAN:
        # TODO(jhseu):  Revisit once we support model-parallelism.
        value *= (1. / self._num_replicas_in_sync)
      elif reduce_op != reduce_util.ReduceOp.SUM:
        raise NotImplementedError(
            "Currently only support sum & mean in TPUStrategy.")
      return tpu_ops.cross_replica_sum(value)

    # Validate that the destination is the same as the host device.
    # Note we don't do this when in replicate context as the reduction is
    # performed on the TPU device itself.
    devices = cross_device_ops_lib.get_devices_from(destinations)
    if len(devices) == 1:
      assert device_util.canonicalize(devices[0]) == device_util.canonicalize(
          self._host_device)
    else:
      raise ValueError("Multiple devices are not supported for TPUStrategy")

    output = math_ops.add_n(value)
    if reduce_op == reduce_util.ReduceOp.MEAN:
      return output * (1. / len(value))
    return output
Example #24
 def get(self, device=None):
   """Returns the value for the current device or raises a ValueError."""
   if device is None:
     tower_context = distribute_lib.get_tower_context()
     if tower_context:
       device = tower_context.device
     else:
       device = distribute_lib.get_update_device()
       if device is None:
         device = device_util.current()
   device = device_util.canonicalize(device)
   try:
     return self._index[device]
   except KeyError:
     raise ValueError("Device %s not found in %s (current device %s)" %
                      (device, self._index.keys(), device_util.current()))
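A small sketch of why keys are canonicalized before the _index lookup in get(): two spellings of the same device collapse to one canonical string. The import path for device_util is an assumption taken from Example #2 above:

from tensorflow.python.training import device_util  # path assumed from Example #2

index = {device_util.canonicalize("/gpu:0"): "value_on_gpu_0"}
# A differently spelled but equivalent device string finds the same entry.
assert index[device_util.canonicalize("/device:GPU:0")] == "value_on_gpu_0"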
Example #25
 def on_device(self, device):
     device = device_util.canonicalize(device)
     return device in self._index
Example #26
 def _get_cross_tower(self):
   device = device_util.canonicalize(device_util.current())
   if device in self._index:
     return self._index[device]
   return list(self._index.values())[0]
Example #27
 def _get_cross_tower(self):
     device = device_util.canonicalize(device_util.current())
     if device in self._index:
         return self._index[device]
     return list(self._index.values())[0]
Example #28
 def on_device(self, device):
   device = device_util.canonicalize(device)
   return device in self._index
Example #29
 def __init__(self, index):
   self._index = {device_util.canonicalize(key): value
                  for key, value in six.iteritems(index)}
    def __init__(self,
                 devices=None,
                 num_gpus=None,
                 cluster_spec=None,
                 cross_tower_ops=None,
                 prefetch_on_device=None):
        super(MirroredStrategy, self).__init__()

        if cluster_spec:
            if devices is not None:
                raise ValueError(
                    "Specifying devices when `cluster_spec` is also given "
                    "is not supported in MirroredStrategy.")

            # TODO(yuefengz): use the utility method to normalize cluster_spec.
            if isinstance(cluster_spec, (dict, cluster_pb2.ClusterDef)):
                cluster_spec = server_lib.ClusterSpec(cluster_spec)
            elif not isinstance(cluster_spec, server_lib.ClusterSpec):
                raise ValueError(
                    "`cluster_spec` should be a dict, a `tf.train.ClusterSpec`, or a "
                    "`tf.train.ClusterDef` object")
            self._cluster_spec = cluster_spec

            self._workers = []
            for job in sorted(cluster_spec.jobs):
                for task in range(cluster_spec.num_tasks(job)):
                    self._workers.append("/job:%s/task:%d" % (job, task))

            if num_gpus is None:
                raise ValueError(
                    "`num_gpus` is required if `cluster_spec` is given.")
            self._num_gpus = num_gpus
            if num_gpus > 0:
                self._worker_device_map = {
                    worker: [
                        device_util.canonicalize(worker +
                                                 "/device:GPU:%d" % gpu)
                        for gpu in range(num_gpus)
                    ]
                    for worker in self._workers
                }
            else:
                self._worker_device_map = {
                    worker:
                    [device_util.canonicalize(worker, "/device:CPU:0")]
                    for worker in self._workers
                }
            devices = nest.flatten(self._worker_device_map)

            # Setting `_default_device` will add a device scope in the
            # distribution.scope. We set the default device to the first worker. When
            # users specify device under distribution.scope by
            #   with tf.device("/cpu:0"):
            #     ...
            # their ops will end up on the cpu device of its first worker, e.g.
            # "/job:worker/task:0/device:CPU:0". Note this is not used in tower mode.
            self._default_device = self._workers[0]
        else:
            self._cluster_spec = None
            # Convert `num_gpus` into `devices`, shouldn't specify both.
            if devices is None:
                if num_gpus is None:
                    num_gpus = context.num_gpus()
                devices = ["/device:GPU:%d" % d for d in range(num_gpus)]
            elif num_gpus is not None:
                raise ValueError(
                    "Must only specify one of `devices` and `num_gpus`.")
            # TODO(yuefengz): consider setting the default device.

        assert devices, "Must specify at least one device."
        assert len(set(devices)) == len(devices), (
            "No duplicates allowed in `devices` argument.")
        # TODO(josh11b): Require at least 2 devices?
        self._devices = [device_util.resolve(d) for d in devices]
        self._canonical_device_set = set(self._devices)
        self._device_index = values.PerDevice(
            {d: i
             for i, d in enumerate(devices)})
        self._cross_tower_ops = cross_tower_ops
        self._prefetch_on_device = prefetch_on_device
Example #31
  def __init__(self,
               num_gpus_per_worker=1,
               worker_job_name=None,
               num_workers=None,
               cluster=None,
               cross_tower_ops=None,
               prefetch_on_device=None):
    """Initialize the strategy object.

    Args:
      num_gpus_per_worker: number of GPUs per worker. If it is zero, the local
        CPU will be used.
      worker_job_name: the job name for `worker`, typically just 'worker'.
      num_workers: the number of workers. If it is 0, it degenerates into a
        single-worker MirroredStrategy.
      cluster: a `tf.train.ClusterSpec` object or a dict that can be used to
        construct a `tf.train.ClusterSpec` object or a `tf.train.ClusterDef`
        protocol buffer. It is an alternative way to initialize this object.
      cross_tower_ops: the cross tower ops to use. If None, a default one will
        be used. If the `configure` method is called, the best one for the
        configuration will be chosen.
      prefetch_on_device: a boolean to specify whether to prefetch input to
        each worker's devices.

    Raises:
      ValueError: if got an unexpected `cluster`.
    """
    if cluster is None:
      self._workers = [
          '/job:%s/task:%d' % (worker_job_name, task_index)
          for task_index in range(num_workers)
      ]
    else:
      if isinstance(cluster, (dict, cluster_pb2.ClusterDef)):
        cluster_spec = server_lib.ClusterSpec(cluster)
      elif isinstance(cluster, server_lib.ClusterSpec):
        cluster_spec = cluster
      else:
        raise ValueError(
            "`cluster_spec` should be a dict, a `tf.train.ClusterSpec`, or a "
            "`tf.train.ClusterDef` object")

      self._workers = []
      for job in sorted(cluster_spec.jobs):
        for task in range(cluster_spec.num_tasks(job)):
          self._workers.append('/job:%s/task:%d' % (job, task))

    self._num_gpus_per_worker = num_gpus_per_worker
    if num_gpus_per_worker > 0:
      self._worker_device_map = {
          worker: [
              device_util.canonicalize(worker + '/device:GPU:%d' % gpu)
              for gpu in range(num_gpus_per_worker)
          ] for worker in self._workers
      }
    else:
      self._worker_device_map = {
          worker: [device_util.canonicalize(worker, '/device:CPU:0')]
          for worker in self._workers
      }
    self._devices = nest.flatten(self._worker_device_map)

    super(MultiWorkerMirroredStrategy, self).__init__(
        devices=self._devices, prefetch_on_device=prefetch_on_device)

    # Setting `_default_device` will add a device scope in the
    # distribution.scope. We set the default device to the first worker. When
    # users specify device under distribution.scope by
    #   with tf.device("/cpu:0"):
    #     ...
    # their ops will end up on the cpu device of its first worker, e.g.
    # "/job:worker/task:0/device:CPU:0". Note this is not used in tower mode.
    self._default_device = self._workers[0]
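A minimal construction sketch for the strategy above; host addresses and GPU count are hypothetical, and the class is the MultiWorkerMirroredStrategy defined here:

cluster = {"worker": ["host1:2222", "host2:2222"]}   # hypothetical worker addresses
dist = MultiWorkerMirroredStrategy(num_gpus_per_worker=2, cluster=cluster)
# self._workers becomes ["/job:worker/task:0", "/job:worker/task:1"], and each
# worker contributes two canonicalized GPU devices to the mirrored device list.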
Example #32
 def _get_cross_tower(self):
     device = device_util.canonicalize(device_util.current())
     if device in self._index:
         return array_ops.identity(self._index[device])
     return array_ops.identity(self._primary_var)
Example #33
 def __init__(self, index):
     self._index = {
         device_util.canonicalize(key): value
         for key, value in six.iteritems(index)
     }
      def model_fn():
        if 'CPU' in compute_device:
          tower_compute_device = '/device:CPU:0'
        else:
          tower_compute_device = (
              '/device:GPU:%d' %
              distribution_strategy_context.get_tower_context().tower_id)
        tower_compute_device = device_util.canonicalize(tower_compute_device)

        if 'CPU' in variable_device:
          tower_variable_device = '/device:CPU:0'
        else:
          tower_variable_device = (
              '/device:GPU:%d' %
              distribution_strategy_context.get_tower_context().tower_id)
        tower_variable_device = device_util.canonicalize(tower_variable_device)

        a = constant_op.constant(1.0)
        b = constant_op.constant(2.0)
        c = a + b
        self.assertEqual(a.device, tower_compute_device)
        self.assertEqual(b.device, tower_compute_device)
        self.assertEqual(c.device, tower_compute_device)

        # The device scope is ignored for variables but not for normal ops.
        with ops.device('/device:GPU:2'):
          x = variable_scope.get_variable(
              'x', initializer=10.0,
              aggregation=variable_scope.VariableAggregation.SUM)
          x_add = x.assign_add(c)
          e = a + c
        self.assertEqual(
            device_util.canonicalize(x.device), tower_variable_device)
        self.assertEqual(x_add.device, x.device)
        self.assertEqual(e.device, device_util.canonicalize('/device:GPU:2'))

        # The colocate_vars_with can override the distribution's device.
        with d.colocate_vars_with(x):
          y = variable_scope.get_variable(
              'y', initializer=20.0,
              aggregation=variable_scope.VariableAggregation.SUM)
        # We add an identity here to avoid complaints about summing
        # non-distributed values.
        y_add = y.assign_add(array_ops.identity(x_add))
        self.assertEqual(
            device_util.canonicalize(y.device), tower_variable_device)
        self.assertEqual(y_add.device, y.device)
        self.assertEqual(y.device, x.device)

        z = variable_scope.get_variable(
            'z', initializer=10.0,
            aggregation=variable_scope.VariableAggregation.SUM)
        self.assertEqual(
            device_util.canonicalize(z.device), tower_variable_device)

        with ops.control_dependencies([y_add]):
          # We add an identity here to avoid complaints about summing
          # non-distributed values.
          z_add = z.assign_add(array_ops.identity(y))
        with ops.control_dependencies([z_add]):
          f = z + c
        self.assertEqual(f.device, tower_compute_device)

        # The device scope would merge with the default worker device.
        with ops.device('/CPU:1'):
          g = e + 1.0
        self.assertEqual(g.device, device_util.canonicalize('/device:CPU:1'))

        # The ops.colocate_with will be ignored when defining a variable but not
        # for a normal tensor.
        with ops.colocate_with(x):
          u = variable_scope.get_variable('u', initializer=30.0)
          h = f + 1.0
        self.assertEqual(
            device_util.canonicalize(u.device), tower_variable_device)
        self.assertEqual(device_util.canonicalize(x.device), h.device)
        return y_add, z_add, f
Example #35
def _devices_match(d1, d2):
    return device_util.canonicalize(d1) == device_util.canonicalize(d2)
Example #36
def _devices_match(d1, d2):
  return device_util.canonicalize(d1) == device_util.canonicalize(d2)
Example #37
 def _get_cross_tower(self):
   device = device_util.canonicalize(device_util.current())
   if device in self._index:
     return array_ops.identity(self._index[device])
   return array_ops.identity(self._primary_var)