Example #1
def group_hierarchical_nccl_all_reduce(ts):
    names = [t.name for t in ts if t is not None]

    def reduce_op_name(name):
        return 'reduce_' + name

    def bcast_op_name(name):
        return 'bcast_' + name

    reduce_names = map_maybe(lambda t: reduce_op_name(t.name), ts)
    bcast_names = map_maybe(lambda t: bcast_op_name(t.name), ts)

    def all_reduce(args):
        i, t = args
        return _scheduled_hierarchical_nccl_all_reduce(
            t, op_names=[reduce_names[i], bcast_names[i]])

    t_names = list(sorted(names))
    all_op_names = list([reduce_op_name(name) for name in t_names] +
                        [bcast_op_name(name) for name in t_names])

    with tf.control_dependencies([
            _start_nccl_scheduler(all_op_names, scope='local'),
    ]):
        return map_maybe(all_reduce, enumerate(ts))
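The scheduling logic above indexes reduce_names and bcast_names positionally, so it relies on map_maybe returning a list that preserves positions and applies the function only to non-None entries. A minimal sketch of such a helper under that assumption (the actual KungFu helper may differ):

def map_maybe(f, xs):
    """Apply f to every non-None element of xs, keeping None entries in place."""
    return [f(x) if x is not None else None for x in xs]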
Example #2
    def apply_gradients(self, apply_grads_func, grads_and_vars, **kwargs):
        gradients, variables = list(zip(*grads_and_vars))

        if self._nccl:
            # FIXME: KungFu currently schedules NCCL operations in the order
            # of the given gradients, which can be sub-optimal compared to the
            # topological order of the dataflow graph. We work around this by
            # fusing all gradients into one tensor. Ideally we would obtain
            # the optimal topological order from TensorFlow.
            if self._nccl_fusion:
                fused_grad = fuse(gradients)
                summed_fused_gradients = self._group_all_reduce_fn(
                    [fused_grad])
                summed_gradients = defuse(summed_fused_gradients[0],
                                          [g.shape for g in gradients])
            else:
                summed_gradients = self._group_all_reduce_fn(gradients)
        else:
            if self._monitor:
                summed_gradients = map_maybe(lambda g: monitored_all_reduce(g, []), gradients)
                # with tf.control_dependencies(summed_gradients):
                #     return calc_stats()
            else:
                summed_gradients = self._group_all_reduce_fn(gradients)

        np = tf.cast(self._num_workers, tf.float32)
        reduced_grads = map_maybe(lambda g: g / np, summed_gradients)

        # We need to re-zip gradients and variables because grads_and_vars can only be unzipped once.
        reduced_grads_and_vars = zip(reduced_grads, variables)

        return apply_grads_func(reduced_grads_and_vars, **kwargs)
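fuse and defuse are not defined in these snippets; from the way they are used (a single fused tensor goes through one all-reduce and is then split back using the original gradient shapes) they behave like flatten-and-concatenate and its inverse. A rough sketch under that assumption, not KungFu's actual implementation:

import tensorflow as tf

def fuse(tensors):
    # Flatten each tensor to 1-D and concatenate into one buffer, so a single
    # NCCL all-reduce can cover all gradients.
    return tf.concat([tf.reshape(t, [-1]) for t in tensors], axis=0)

def defuse(fused, shapes):
    # Split the buffer back into per-gradient pieces and restore their shapes.
    sizes = [int(s.num_elements()) for s in shapes]
    return [
        tf.reshape(part, s) for part, s in zip(tf.split(fused, sizes), shapes)
    ]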
Example #3
def group_nccl_all_reduce(ts):
    """Create a list of all_reduce operators for given tensor list, using NCCL."""
    names = [t.name for t in ts if t is not None]
    if len(names) == 1:
        return map_maybe(_nccl_all_reduce, ts)  # exactly one of ts is not None
    else:
        names = list(sorted(names))
        with tf.control_dependencies([
                _start_nccl_scheduler(names, scope='global'),
        ]):
            return map_maybe(_scheduled_nccl_all_reduce, ts)
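A hedged usage sketch for the function above: given a list of gradient tensors, it returns a matching list of all-reduced tensors, which the caller typically averages. The names model_grads and num_workers below are placeholders, not part of the source:

import tensorflow as tf

summed_grads = group_nccl_all_reduce(model_grads)
avg_grads = map_maybe(lambda g: g / tf.cast(num_workers, tf.float32),
                      summed_grads)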
Example #4
def group_nccl_all_reduce(ts):
    """Create a list of all_reduce operators for given tensor list, using NCCL."""
    names = [t.name for t in ts if t is not None]
    if len(names) == 1:
        return map_maybe(_nccl_all_reduce, ts)  # exactly one of ts is not None
    else:
        print("WARNING: Please fuse %d tensors before using NCCL." %
              len(names))
        names = list(sorted(names))  # FIXME: use topsort
        import tensorflow as tf
        with tf.control_dependencies([
                _start_nccl_scheduler(names),
        ]):
            return map_maybe(_scheduled_nccl_all_reduce, ts)
Example #5
    def apply_gradients(self, apply_grads_func, grads_and_vars, **kwargs):
        gradients, variables = list(zip(*grads_and_vars))
        # logging.info("apply gradients is called here------------")
        if self._reshape_strategy:
            # logging.info("reshape on")
            reshape_strategy(1)
        else:
            # logging.info("reshape called with int 0")
            reshape_strategy(0)

        if self._nccl:
            # FIXME: KungFu currently schedules NCCL operations in the order
            # of the given gradients, which can be sub-optimal compared to the
            # topological order of the dataflow graph. We work around this by
            # fusing all gradients into one tensor. Ideally we would obtain
            # the optimal topological order from TensorFlow.
            if self._nccl_fusion:
                fused_grad = fuse(gradients)
                summed_fused_gradients = group_nccl_all_reduce([fused_grad])
                summed_gradients = defuse(summed_fused_gradients[0],
                                          [g.shape for g in gradients])
            else:
                summed_gradients = group_nccl_all_reduce(gradients)
        else:
            summed_gradients = group_all_reduce(gradients)

        reduced_grads = map_maybe(lambda g: g / self._num_workers,
                                  summed_gradients)

        # We need to re-zip gradients and variables because grads_and_vars can only be unzipped once.
        reduced_grads_and_vars = zip(reduced_grads, variables)

        return apply_grads_func(reduced_grads_and_vars, **kwargs)
Example #6
    def _ssgd(self, apply_grads_func, gradients, variables, **kwargs):
        sum_grads = group_all_reduce(gradients)
        avg_grads = map_maybe(lambda g: g / self._num_workers, sum_grads)

        # We need to re-zip gradients and variables because grads_and_vars can only be unzipped once.
        grads_and_vars = zip(avg_grads, variables)

        return apply_grads_func(grads_and_vars, **kwargs)
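In both apply_gradients and _ssgd, dividing the all-reduced sum by self._num_workers turns the summed gradients into the cross-worker average. A tiny worked example of that arithmetic with illustrative values:

import tensorflow as tf

# Two workers produce gradients 0.2 and 0.4 for the same variable.
summed = tf.constant(0.2) + tf.constant(0.4)  # what group_all_reduce yields
averaged = summed / 2.0                       # 0.3, the mean gradient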
Example #7
    def _monitor(self, grads, reduced_grads):
        square_grads = [tf.square(g) for g in grads]
        summed_square_grads = group_all_reduce(square_grads)
        reduced_square_grads = map_maybe(lambda g: g / self._num_workers,
                                         summed_square_grads)
        grad_variances = [
            square_grad - tf.square(grad)
            for square_grad, grad in zip(reduced_square_grads, reduced_grads)
        ]
        variances = [
            tf.norm(grad_variance) for grad_variance in grad_variances
        ]
        summed_variance = tf.reduce_sum(variances)
        return tf.print('Variance:', summed_variance)
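The monitor above estimates the per-element gradient variance across workers via the identity Var[g] = E[g^2] - (E[g])^2, where both expectations are the all-reduced sums divided by self._num_workers. A small self-contained check of the same computation with illustrative values:

import tensorflow as tf

# Two workers' gradients for one 2-element tensor (illustrative values).
worker_grads = [tf.constant([1.0, 3.0]), tf.constant([2.0, 5.0])]
num_workers = float(len(worker_grads))

mean_grad = tf.add_n(worker_grads) / num_workers                 # E[g]
mean_square = tf.add_n([tf.square(g)
                        for g in worker_grads]) / num_workers    # E[g^2]
variance = mean_square - tf.square(mean_grad)                    # [0.25, 1.0]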
Example #8
    def apply_gradients(self, apply_grads_func, grads_and_vars, **kwargs):
        grads, variables = list(zip(*grads_and_vars))

        # Synchronization logic
        summed_grads = group_all_reduce(grads)
        reduced_grads = map_maybe(lambda g: g / self._num_workers,
                                  summed_grads)

        # Monitoring logic
        monitor_grads_op = tf.cond(
            tf.equal(tf.mod(self._step, self._interval), 0),
            lambda: self._monitor(grads, reduced_grads), lambda: tf.no_op())

        with tf.control_dependencies([monitor_grads_op]):
            return apply_grads_func(zip(reduced_grads, variables), **kwargs)
Example #9
    def apply_gradients(self, grads_and_vars, **kwargs):
        gradients, variables = list(zip(*grads_and_vars))

        if self._nccl:
            if self._nccl_fusion:
                fused_grad = fuse(gradients)
                summed_fused_gradients = group_nccl_all_reduce([fused_grad])
                summed_gradients = defuse(summed_fused_gradients[0],
                                          [g.shape for g in gradients])
            else:
                summed_gradients = group_nccl_all_reduce(gradients)
        else:
            summed_gradients = group_all_reduce(gradients)

        reduced_grads = map_maybe(lambda g: g / self._num_workers,
                                  summed_gradients)
        reduced_grads_and_vars = zip(reduced_grads, variables)
        return self._optimizer.apply_gradients(reduced_grads_and_vars,
                                               **kwargs)
Example #10
def group_all_reduce(ts):
    """Create a list of all_reduce operators for given tensor list."""
    return map_maybe(all_reduce, ts)
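Because map_maybe passes None entries through untouched, group_all_reduce can accept gradient lists in which some variables received no gradient. An illustrative call (g0 and g2 stand for arbitrary gradient tensors, not names from the source):

# The None entry (e.g. an unused variable) stays None in the output,
# while the other gradients are all-reduced.
summed = group_all_reduce([g0, None, g2])
# -> [all_reduce(g0), None, all_reduce(g2)]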