Code Example #1
    def _do_batch_all_reduce_dense(self, reduce_op, per_replica_values):
        """All-reduce across all workers in a batch."""

        logging.log_first_n(
            logging.INFO, "Collective batch_all_reduce: %d all-reduces, "
            "num_workers = %d" % (len(per_replica_values), self._num_workers),
            10)

        chunked_gv = self._make_gradient_chunks(per_replica_values,
                                                self._all_reduce_merge_scope)

        reduced_gv_list = []
        for chunk in chunked_gv:
            with ops.name_scope("allreduce"):
                for grad_and_vars in chunk:
                    # Gradients for the same variable but from different devices.
                    scaled_grads = [g for g, _ in grad_and_vars]
                    collective_reduced = cross_device_utils.build_collective_reduce(
                        scaled_grads, self._num_workers, self._collective_keys,
                        "Add", "Id")
                    result = []
                    for (_, v), g in zip(grad_and_vars, collective_reduced):
                        result.append([g, v])
                    reduced_gv_list.append(result)

        new_device_grads = [list(x) for x in zip(*reduced_gv_list)]
        return _ungroup_and_make_mirrored(
            new_device_grads,
            per_replica_values[0],
            reduce_op,
            num_between_graph_workers=self._num_workers)
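
Examples #1 and #2 delegate the chunking to `self._make_gradient_chunks`, which is not shown here. The sketch below is a minimal stand-alone reading of that step, assuming it simply slices the per-variable (gradient, variable) groups into runs of `_all_reduce_merge_scope` entries, which is what the inline chunking in Example #3 does explicitly; the helper name and toy data are illustrative only.

def make_gradient_chunks(grouped_by_var, merge_scope):
    """Slice per-variable (grad, var) groups into chunks of `merge_scope` entries."""
    return [
        grouped_by_var[i:i + merge_scope]
        for i in range(0, len(grouped_by_var), merge_scope)
    ]

# Toy data: three variables, each with gradients from two devices.
grouped_by_var = [
    [("g0_dev0", "v0"), ("g0_dev1", "v0")],
    [("g1_dev0", "v1"), ("g1_dev1", "v1")],
    [("g2_dev0", "v2"), ("g2_dev1", "v2")],
]
print(make_gradient_chunks(grouped_by_var, merge_scope=2))
# Two chunks: the first holds the v0 and v1 groups, the second holds the v2 group.
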
Code Example #2
  def _do_batch_all_reduce_dense(self, reduce_op, per_replica_values):
    """All-reduce across all workers in a batch."""

    logging.log_first_n(
        logging.INFO, "Collective batch_all_reduce: %d all-reduces, "
        "num_workers = %d" % (len(per_replica_values), self._num_workers), 10)

    chunked_gv = self._make_gradient_chunks(per_replica_values,
                                            self._all_reduce_merge_scope)

    reduced_gv_list = []
    for chunk in chunked_gv:
      with ops.name_scope("allreduce"):
        for grad_and_vars in chunk:
          # Gradients for the same variable but from different devices.
          scaled_grads = [g for g, _ in grad_and_vars]
          collective_reduced = cross_device_utils.build_collective_reduce(
              scaled_grads, self._num_workers, self._collective_keys, "Add",
              "Id")
          result = []
          for (_, v), g in zip(grad_and_vars, collective_reduced):
            result.append([g, v])
          reduced_gv_list.append(result)

    new_device_grads = [list(x) for x in zip(*reduced_gv_list)]
    return _ungroup_and_make_mirrored(
        new_device_grads,
        per_replica_values[0],
        reduce_op,
        num_between_graph_workers=self._num_workers)
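
For context, here is a minimal sketch of the kind of user-level code that ends up exercising a collective batched all-reduce like the methods above. The internal methods in these examples come from different TensorFlow releases; the driver below assumes a TF 2.x installation and a multi-worker setup with TF_CONFIG already exported on every worker, and is not taken from the examples themselves.

import tensorflow as tf

# Each worker builds the same strategy; gradients are aggregated across
# workers with collective all-reduce under the hood.
strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()

with strategy.scope():
    model = tf.keras.Sequential([tf.keras.layers.Dense(1)])
    model.compile(optimizer="sgd", loss="mse")

# model.fit(dataset) would then trigger the batched all-reduce of the
# per-replica gradients on every training step.
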
Code Example #3
    def _batch_all_reduce(self, reduce_op, per_replica_values):
        """All-reduce across all workers in a batch."""
        if context.executing_eagerly():
            raise ValueError(
                "Eager execution with collective ops is not supported yet.")

        logging.log_first_n(
            logging.INFO,
            "Collective All-reduce invoked with batches size = %d, "
            "num_workers = %d" % (len(per_replica_values), self._num_workers),
            10)

        grouped_by_device = _group_value_by_device(per_replica_values)

        grouped_by_var = list(zip(*grouped_by_device))
        # grouped_by_var is grouped by variables and takes the following format:
        # [((grad0_gpu0, v0_gpu0), (grad0_gpu1, v0_gpu1), (grad0_gpu2, v0_gpu2) ..),
        #  ((grad1_gpu0, v1_gpu0), (grad1_gpu1, v1_gpu1), (grad1_gpu2, v1_gpu2) ..),
        #  ((grad2_gpu0, v2_gpu0), (grad2_gpu1, v2_gpu1), (grad2_gpu2, v2_gpu2) ..),
        #  ...
        # ]
        chunked_gv = [
            grouped_by_var[x:x + self._all_reduce_merge_scope] for x in range(
                0, len(grouped_by_var), self._all_reduce_merge_scope)
        ]

        reduced_gv_list = []
        for chunk in chunked_gv:
            with ops.name_scope("allreduce"):
                for grad_and_vars in chunk:
                    scaled_grads = [g for g, _ in grad_and_vars]
                    collective_reduced = cross_device_utils.build_collective_reduce(
                        scaled_grads, self._num_workers, self._collective_keys,
                        "Add", "Id")
                    result = []
                    for (_, v), g in zip(grad_and_vars, collective_reduced):
                        result.append([g, v])
                    reduced_gv_list.append(result)

        new_device_grads = [list(x) for x in zip(*reduced_gv_list)]
        return _ungroup_and_make_mirrored(
            new_device_grads,
            per_replica_values[0].devices,
            reduce_op,
            num_between_graph_workers=self._num_workers)
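
Example #3 documents the layout produced by `_group_value_by_device` followed by `zip(*grouped_by_device)` in its inline comment. The snippet below reproduces that transposition on plain Python tuples so the per-device and per-variable layouts are easy to compare; the gradient and variable names are placeholders, not real tensors.

# Per-device layout: one list per device, holding every variable's
# (gradient, variable) pair on that device.
grouped_by_device = [
    [("grad0_gpu0", "v0_gpu0"), ("grad1_gpu0", "v1_gpu0")],  # GPU 0
    [("grad0_gpu1", "v0_gpu1"), ("grad1_gpu1", "v1_gpu1")],  # GPU 1
]

# Transpose to the per-variable layout used for the batched all-reduce:
# one tuple per variable, holding that variable's pair from every device.
grouped_by_var = list(zip(*grouped_by_device))
# [(('grad0_gpu0', 'v0_gpu0'), ('grad0_gpu1', 'v0_gpu1')),
#  (('grad1_gpu0', 'v1_gpu0'), ('grad1_gpu1', 'v1_gpu1'))]
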
Code Example #4
  def _batch_all_reduce(self, reduce_op, per_replica_values):
    """All-reduce across all workers in a batch."""
    if context.executing_eagerly():
      raise ValueError(
          "Eager execution with collective ops is not supported yet.")

    logging.log_first_n(
        logging.INFO, "Collective All-reduce invoked with batch size = %d, "
        "num_workers = %d" % (len(per_replica_values), self._num_workers), 10)

    grouped_by_device = _group_value_by_device(per_replica_values)

    grouped_by_var = list(zip(*grouped_by_device))
    # grouped_by_var is grouped by variables and takes the following format:
    # [((grad0_gpu0, v0_gpu0), (grad0_gpu1, v0_gpu1), (grad0_gpu2, v0_gpu2) ..),
    #  ((grad1_gpu0, v1_gpu0), (grad1_gpu1, v1_gpu1), (grad1_gpu2, v1_gpu2) ..),
    #  ((grad2_gpu0, v2_gpu0), (grad2_gpu1, v2_gpu1), (grad2_gpu2, v2_gpu2) ..),
    #  ...
    # ]
    chunked_gv = [
        grouped_by_var[x:x + self._all_reduce_merge_scope]
        for x in range(0, len(grouped_by_var), self._all_reduce_merge_scope)
    ]

    reduced_gv_list = []
    for chunk in chunked_gv:
      with ops.name_scope("allreduce"):
        for grad_and_vars in chunk:
          scaled_grads = [g for g, _ in grad_and_vars]
          collective_reduced = cross_device_utils.build_collective_reduce(
              scaled_grads, self._num_workers, self._collective_keys, "Add",
              "Id")
          result = []
          for (_, v), g in zip(grad_and_vars, collective_reduced):
            result.append([g, v])
          reduced_gv_list.append(result)

    new_device_grads = [list(x) for x in zip(*reduced_gv_list)]
    return _ungroup_and_make_mirrored(
        new_device_grads,
        per_replica_values[0].devices,
        reduce_op,
        num_between_graph_workers=self._num_workers)
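
In Examples #3 and #4 the per-variable results collected in `reduced_gv_list` are transposed back with `zip(*reduced_gv_list)` so that `new_device_grads` is indexed by device again. The toy round trip below mimics that regrouping, with Python's built-in `sum` standing in for `cross_device_utils.build_collective_reduce`; the device names and gradient values are invented for illustration.

# One entry per variable; each entry pairs that variable's gradient with its
# copy on every device (here: two devices, numeric gradients).
grouped_by_var = [
    [(1.0, "v0/dev0"), (3.0, "v0/dev1")],
    [(2.0, "v1/dev0"), (4.0, "v1/dev1")],
]

reduced_gv_list = []
for grad_and_vars in grouped_by_var:
    # Stand-in for the collective: every device receives the summed gradient.
    reduced = sum(g for g, _ in grad_and_vars)
    reduced_gv_list.append([[reduced, v] for _, v in grad_and_vars])

# Transpose back so the outer index is the device again.
new_device_grads = [list(x) for x in zip(*reduced_gv_list)]
# new_device_grads[0] -> [[4.0, 'v0/dev0'], [6.0, 'v1/dev0']]  (device 0)
# new_device_grads[1] -> [[4.0, 'v0/dev1'], [6.0, 'v1/dev1']]  (device 1)
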
Code Example #5
    def _do_batch_all_reduce_dense(self, reduce_op, per_replica_values,
                                   experimental_hints):
        """All-reduce across all workers in a batch."""

        batch_size = len(per_replica_values)
        # Pass self._communication to the runtime as a communication hint.
        communication = self._communication.value
        # For now, we use NCCL only when batch_size > 1.
        # TODO(b/132575814): switch to NCCL for all collectives when communication
        # is NCCL.
        if self._communication == CollectiveCommunication.NCCL and batch_size == 1:
            communication = CollectiveCommunication.AUTO.value

        # Reverse the lists so that there's a better chance that values follow
        # the order in which they are calculated (e.g. when they're gradients),
        # so as to overlap calculation with communication. However, this may
        # not be optimal for cases like gradients of complicated non-sequential
        # models.
        #
        # Note that we reverse the list before packing so that the first pack
        # won't be too small, since it's more likely for the first few packs to
        # have long queuing time due to concurrent intense computation.
        #
        # TODO(b/147393503): explore solutions for optimal ordering.
        packs = cross_device_utils.pack_by_size(
            list(reversed(per_replica_values)),
            experimental_hints.bytes_per_pack)

        if batch_size > 1:
            logging.info(
                "Collective batch_all_reduce: %d all-reduces, num_devices = %d, "
                "group_size = %d, communication_hint = %s, num_packs = %d",
                batch_size, len(self._devices), self._group_size,
                communication, len(packs))
        else:
            logging.log_first_n(
                logging.INFO, "Collective batch_all_reduce: %d all-reduces, "
                "num_devices = %d, group_size = %d, communication_hint = %s, "
                "num_packs = %d" %
                (batch_size, len(self._devices), self._group_size,
                 communication, len(packs)), 10)

        reduced_values = []
        with self._lock:
            for pack in packs:
                # By placing all CollectiveReduce ops in a pack under a single
                # name scope, we ensure they will be picked up by the
                # `ScopedAllocator` grappler optimizer and packed into a single
                # all-reduce.
                with ops.name_scope("allreduce"):
                    for per_replica in pack:
                        # Add control dependencies per device from the last gradients to the
                        # current set, in order to serialize NCCL launches.
                        if (communication == CollectiveCommunication.NCCL.value
                                and reduced_values):
                            control_inputs = list(reduced_values[-1])
                        else:
                            control_inputs = None
                        reduced_values.append(
                            cross_device_utils.build_collective_reduce(
                                per_replica.values,
                                self._devices,
                                self._group_size,
                                self._collective_keys,
                                "Add",
                                "Id",
                                communication,
                                control_inputs,
                                executors=self._executors,
                                timeout=experimental_hints.timeout_seconds))

        for e in self._executors:
            e.wait()

        mirrored = []
        # Reverse the order of reduced values to recover the order in the input.
        for value in reversed(reduced_values):
            if reduce_op == reduce_util.ReduceOp.MEAN:
                for i, v in enumerate(value):
                    with ops.device(v.device):
                        value[i] = v / self._group_size
            mirrored.append(
                distribute_utils.regroup(value, wrap_class=value_lib.Mirrored))
        return mirrored
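
Example #5 reverses the per-replica values, packs them under an approximate byte budget (`experimental_hints.bytes_per_pack`), runs the collective reduces pack by pack, and finally reverses the results back into input order, dividing by the group size when `reduce_op` is `ReduceOp.MEAN`. The stand-alone sketch below walks through that bookkeeping on plain floats; the greedy packing is only an assumption about what `cross_device_utils.pack_by_size` does conceptually, and the group size and tensor byte sizes are made-up numbers.

GROUP_SIZE = 4          # number of replicas participating in the all-reduce
BYTES_PER_PACK = 16     # illustrative byte budget per pack

# One entry per gradient, in the order the gradients are produced. The float
# stands for the value the collective sum would return; the int is a made-up
# tensor size in bytes, used only for packing.
grads = [("g0", 8.0, 8), ("g1", 12.0, 8), ("g2", 20.0, 4)]

# Reverse so the list better follows the order in which gradients are
# computed, then greedily pack until the byte budget would be exceeded.
packs, current, current_bytes = [], [], 0
for _name, summed, nbytes in reversed(grads):
    if current and current_bytes + nbytes > BYTES_PER_PACK:
        packs.append(current)
        current, current_bytes = [], 0
    current.append(summed)
    current_bytes += nbytes
if current:
    packs.append(current)
# packs == [[20.0, 12.0], [8.0]]   i.e. [[g2, g1], [g0]]

# The all-reduces run pack by pack; afterwards the results are reversed back
# into input order and divided by the group size for ReduceOp.MEAN.
reduced_values = [v for pack in packs for v in pack]
mirrored = [v / GROUP_SIZE for v in reversed(reduced_values)]
print(mirrored)   # [2.0, 3.0, 5.0]  -> g0, g1, g2
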