Example #1
    def apply_gradients(self, apply_grads_func, grads_and_vars, **kwargs):
        gradients, variables = list(zip(*grads_and_vars))

        if self._nccl:
            # FIXME: We have a limitation that KungFu schedules NCCL operations
            # in the order of the given gradients. This order is sub-optimal
            # compared to the topological sorting order of the dataflow. We work
            # around this issue by fusing all gradients. We still need to figure
            # out how to get the optimal topological sorting order from TensorFlow.
            if self._nccl_fusion:
                fused_grad = fuse(gradients)
                summed_fused_gradients = self._group_all_reduce_fn(
                    [fused_grad])
                summed_gradients = defuse(summed_fused_gradients[0],
                                          [g.shape for g in gradients])
            else:
                summed_gradients = self._group_all_reduce_fn(gradients)
        else:
            if self._monitor:
                summed_gradients = map_maybe(lambda g: monitored_all_reduce(g, []), gradients)
                # with tf.control_dependencies(summed_gradients):
                #     return calc_stats()
            else:
                summed_gradients = self._group_all_reduce_fn(gradients)

        np = tf.cast(self._num_workers, tf.float32)
        reduced_grads = map_maybe(lambda g: g / np, summed_gradients)

        # We need to re-zip gradients and variables, as grads_and_vars can only be unzipped once.
        reduced_grads_and_vars = zip(reduced_grads, variables)

        return apply_grads_func(reduced_grads_and_vars, **kwargs)
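
The fusion branch above relies on fuse packing every gradient tensor into one flat buffer so that a single all-reduce covers all gradients, and on defuse splitting the reduced buffer back into the original shapes. Below is a minimal sketch of that round-trip in plain TensorFlow; the helpers toy_fuse and toy_defuse are illustrative stand-ins, not KungFu's implementation.

import tensorflow as tf

def toy_fuse(tensors):
    # Flatten every tensor to 1-D and concatenate into a single buffer.
    return tf.concat([tf.reshape(t, [-1]) for t in tensors], axis=0)

def toy_defuse(fused, shapes):
    # Split the buffer by each original element count, then restore the shapes.
    sizes = [tf.TensorShape(s).num_elements() for s in shapes]
    return [tf.reshape(part, tf.TensorShape(s).as_list())
            for part, s in zip(tf.split(fused, sizes), shapes)]

grads = [tf.ones([3, 4]), tf.ones([5])]
restored = toy_defuse(toy_fuse(grads), [g.shape for g in grads])  # same shapes as grads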
Example #2
    def apply_gradients(self, apply_grads_func, grads_and_vars, **kwargs):
        gradients, variables = list(zip(*grads_and_vars))
        # logging.info("apply gradients is called here------------")
        if self._reshape_strategy:
            # logging.info("reshape on")
            reshape_strategy(1)
        else:
            # logging.info("reshape called with int 0")
            reshape_strategy(0)

        if self._nccl:
            # FIXME: We have a limitation that KungFu schedules NCCL operations
            # in the order of the given gradients. This order is sub-optimal
            # compared to the topological sorting order of the dataflow. We work
            # around this issue by fusing all gradients. We still need to figure
            # out how to get the optimal topological sorting order from TensorFlow.
            if self._nccl_fusion:
                fused_grad = fuse(gradients)
                summed_fused_gradients = group_nccl_all_reduce([fused_grad])
                summed_gradients = defuse(summed_fused_gradients[0],
                                          [g.shape for g in gradients])
            else:
                summed_gradients = group_nccl_all_reduce(gradients)
        else:
            summed_gradients = group_all_reduce(gradients)

        reduced_grads = map_maybe(lambda g: g / self._num_workers,
                                  summed_gradients)

        # We need to re-zip gradients and variables, as grads_and_vars can only be unzipped once.
        reduced_grads_and_vars = zip(reduced_grads, variables)

        return apply_grads_func(reduced_grads_and_vars, **kwargs)
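
For context, apply_grads_func is normally the wrapped optimizer's own apply_gradients, so the all-reduce runs first and the local update is applied to the averaged gradients. The sketch below shows how a distributed-optimizer wrapper might delegate to the method above; the attribute names _optimizer and _algo are assumptions for illustration, not necessarily KungFu's actual field names.

class _DistributedOptimizerWrapper(object):
    def __init__(self, optimizer, algo):
        self._optimizer = optimizer  # the plain TensorFlow optimizer being wrapped
        self._algo = algo            # object providing apply_gradients as above

    def apply_gradients(self, grads_and_vars, **kwargs):
        # Pass the base optimizer's apply_gradients in as apply_grads_func.
        return self._algo.apply_gradients(self._optimizer.apply_gradients,
                                          grads_and_vars, **kwargs)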
Example #3
    def _build_request_and_save_ops(self, target, variables):
        var_fused = fuse(variables)
        save_model_op = save_variable(var_fused)
        other_peer_var_fused = request_variable_with_template(
            target, var_fused)
        other_peer_vars = defuse(other_peer_var_fused,
                                 [v.shape for v in variables])
        self._save_model_op = save_model_op  # save for _get_initializer_op
        return other_peer_vars, save_model_op
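
The peer variables returned above are typically folded back into the local model. The following hypothetical method, assumed to live in the same class, averages each local variable with its requested peer copy; it illustrates how the returned ops might be wired together and is not KungFu's exact update rule.

    def _build_pair_average_op(self, target, variables):
        other_peer_vars, save_model_op = self._build_request_and_save_ops(
            target, variables)
        # Move each local variable halfway towards the requested peer copy.
        assign_ops = [
            tf.compat.v1.assign(v, 0.5 * (v + other))
            for v, other in zip(variables, other_peer_vars)
        ]
        with tf.control_dependencies(assign_ops):
            return tf.group(save_model_op)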
Example #4
    def _build_request_ops(self, target, variables):
        if self._fuse_requests:
            var_fused = fuse(variables)
            other_peer_var_fused = request_variable(
                target,
                version=None,
                name=self._fused_model_name,
                shape=var_fused.shape,
                dtype=var_fused.dtype)
            return defuse(other_peer_var_fused, [v.shape for v in variables])
        else:
            return [
                request_variable_with_template(target, v) for v in variables
            ]
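
Usage note: with _fuse_requests enabled, the whole model is fetched from the target peer in a single request; otherwise one request is issued per variable. A hypothetical call site inside the same class might look as follows; the target rank 0 and the assign loop are illustrative assumptions.

    def _build_sync_op(self, variables):
        # Fetch the model from peer 0 and overwrite the local variables with it.
        peer_vars = self._build_request_ops(target=0, variables=variables)
        return tf.group(*[
            tf.compat.v1.assign(v, p) for v, p in zip(variables, peer_vars)
        ])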