Code Example #1
File: grad_noise_scale.py Project: zeta1999/KungFu
    def _monitor(self, grads, reduced_grads):
        # Only the master node is doing the global monitoring.
        noise_op = global_noise_scale(self._device_batch_size,
                                      self._global_batch_size, fuse(grads),
                                      fuse(reduced_grads))

        print_op = tf.print('Gradient Noise Scale:', noise_op)
        return print_op
Code Example #2
    def _monitor(self, grads, reduced_grads):
        self._noise_op = global_noise_scale(self._device_batch_size,
                                            self._global_batch_size,
                                            fuse(grads), fuse(reduced_grads))

        print_op = tf.print('Gradient Noise Scale:', self._noise_op)

        with tf.control_dependencies([print_op]):
            return tf.no_op()
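
Examples #1 and #2 differ only in how the print side effect is exposed: #2 wraps tf.print in a tf.control_dependencies block, so running the returned tf.no_op() forces the print to execute. Below is a minimal self-contained sketch of that pattern, using graph mode via tf.compat.v1 and a constant standing in for the real noise-scale tensor (both are assumptions for illustration, not KungFu code):

import tensorflow as tf

tf.compat.v1.disable_eager_execution()

fake_noise = tf.constant(42.0)  # stands in for the global_noise_scale(...) tensor
print_op = tf.print('Gradient Noise Scale:', fake_noise)

# The returned no_op carries no value, but executing it forces print_op
# to run first because of the control dependency.
with tf.control_dependencies([print_op]):
    monitor_op = tf.no_op()

with tf.compat.v1.Session() as sess:
    sess.run(monitor_op)  # prints "Gradient Noise Scale: 42"
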
Code Example #3
    def apply_gradients(self, apply_grads_func, grads_and_vars, **kwargs):
        gradients, variables = list(zip(*grads_and_vars))

        if self._nccl:
            # FIXME: We have a limitation that KungFu schedules NCCL operations
            # in the order of the given gradients. This order is sub-optimal
            # compared to the topological order of the dataflow graph. We work
            # around this issue by fusing all gradients. We still need to
            # figure out how to get the optimal topological order from
            # TensorFlow.
            if self._nccl_fusion:
                fused_grad = fuse(gradients)
                summed_fused_gradients = self._group_all_reduce_fn(
                    [fused_grad])
                summed_gradients = defuse(summed_fused_gradients[0],
                                          [g.shape for g in gradients])
            else:
                summed_gradients = self._group_all_reduce_fn(gradients)
        else:
            if self._monitor:
                summed_gradients = map_maybe(
                    lambda g: monitored_all_reduce(g, []), gradients)
                # with tf.control_dependencies(summed_gradients):
                #     return calc_stats()
            else:
                summed_gradients = self._group_all_reduce_fn(gradients)

        num_workers = tf.cast(self._num_workers, tf.float32)
        reduced_grads = map_maybe(lambda g: g / num_workers, summed_gradients)

        # We need to re-zip gradients and variables, as grads_and_vars can only be unzipped once.
        reduced_grads_and_vars = zip(reduced_grads, variables)

        return apply_grads_func(reduced_grads_and_vars, **kwargs)
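
The fusion workaround in the FIXME above rests on fuse and defuse: flatten every gradient into one contiguous buffer so that a single collective operation covers all of them, then split the result back into the original shapes. KungFu ships its own implementations; the following is a hedged reconstruction of the idea in plain TensorFlow, not the library's actual code:

import tensorflow as tf

def fuse(tensors):
    # Flatten each tensor and concatenate into one 1-D buffer, so a
    # single all-reduce can cover every gradient at once.
    return tf.concat([tf.reshape(t, [-1]) for t in tensors], axis=0)

def defuse(fused, shapes):
    # Split the buffer by element counts, then restore original shapes.
    sizes = [s.num_elements() for s in shapes]
    parts = tf.split(fused, sizes)
    return [tf.reshape(p, s) for p, s in zip(parts, shapes)]

grads = [tf.ones([2, 3]), tf.zeros([4])]
restored = defuse(fuse(grads), [g.shape for g in grads])
assert all(a.shape == b.shape for a, b in zip(grads, restored))
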
Code Example #4
    def apply_gradients(self, apply_grads_func, grads_and_vars, **kwargs):
        gradients, variables = list(zip(*grads_and_vars))
        # logging.info("apply gradients is called here------------")
        if self._reshape_strategy:
            # logging.info("reshape on")
            reshape_strategy(1)
        else:
            # logging.info("reshape called with int 0")
            reshape_strategy(0)

        if self._nccl:
            # FIXME: We have a limitation that KungFu schedules NCCL operations
            # in the order of the given gradients. This order is sub-optimal
            # compared to the topological order of the dataflow graph. We work
            # around this issue by fusing all gradients. We still need to
            # figure out how to get the optimal topological order from
            # TensorFlow.
            if self._nccl_fusion:
                fused_grad = fuse(gradients)
                summed_fused_gradients = group_nccl_all_reduce([fused_grad])
                summed_gradients = defuse(summed_fused_gradients[0],
                                          [g.shape for g in gradients])
            else:
                summed_gradients = group_nccl_all_reduce(gradients)
        else:
            summed_gradients = group_all_reduce(gradients)

        reduced_grads = map_maybe(lambda g: g / self._num_workers,
                                  summed_gradients)

        # We need to re-zip gradients and variables, as grads_and_vars can only be unzipped once.
        reduced_grads_and_vars = zip(reduced_grads, variables)

        return apply_grads_func(reduced_grads_and_vars, **kwargs)
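
In both apply_gradients variants the collective returns an element-wise sum over workers, and dividing by the worker count turns that sum into the synchronous data-parallel mean. A tiny numpy simulation of this reduction (the all-reduce is faked by a local sum, purely for illustration):

import numpy as np

num_workers = 4
per_worker_grads = [np.full(3, fill_value=float(i)) for i in range(num_workers)]

summed = np.sum(per_worker_grads, axis=0)  # what the sum all-reduce yields
reduced = summed / num_workers             # the averaged gradient

assert np.allclose(reduced, np.mean(per_worker_grads, axis=0))
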
Code Example #5
File: grad_noise_scale.py Project: zuston/KungFu
    def _monitor(self, grads, reduced_grads):
        # Only the master node is doing the global monitoring.
        noise_op = global_noise_scale(self._device_batch_size,
                                      self._global_batch_size,
                                      fuse(grads),
                                      fuse(reduced_grads),
                                      alpha=self._alpha)
        if self._verbose:
            print_op = tf.print('Gradient Noise Scale:', noise_op)
            return print_op
        else:
            gns = create_global_variable(GraphKeys.GRADIENT_NOISE_SCALE,
                                         shape=[],
                                         dtype=tf.float32)
            with tf.control_dependencies([gns.assign(noise_op)]):
                monitor_op = tf.no_op()
                return monitor_op
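
global_noise_scale compares the gradient from the small device batch against the averaged global-batch gradient. Assuming it follows the two-batch estimator of McCandlish et al., "An Empirical Model of Large-Batch Training" (an assumption; the KungFu source is authoritative), the estimated quantities would be:

% Two-batch gradient noise scale estimator (McCandlish et al., 2018),
% with B_small the device batch size and B_big the global batch size.
|\mathcal{G}|^2 \approx \frac{B_{big}\,|G_{big}|^2 - B_{small}\,|G_{small}|^2}{B_{big} - B_{small}},
\qquad
\mathcal{S} \approx \frac{|G_{small}|^2 - |G_{big}|^2}{1/B_{small} - 1/B_{big}},
\qquad
B_{simple} = \frac{\mathcal{S}}{|\mathcal{G}|^2}.

The alpha argument in this variant plausibly controls an exponential moving average over these noisy per-step estimates, which the paper also recommends; again, this is inferred rather than confirmed from the source.
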
Code Example #6
    def _build_request_and_save_ops(self, target, variables):
        var_fused = fuse(variables)
        save_model_op = save_variable(var_fused)
        other_peer_var_fused = request_variable_with_template(
            target, var_fused)
        other_peer_vars = defuse(other_peer_var_fused,
                                 [v.shape for v in variables])
        self._save_model_op = save_model_op  # save for _get_initializer_op
        return other_peer_vars, save_model_op
Code Example #7
    def _build_request_ops(self, target, variables):
        if self._fuse_requests:
            var_fused = fuse(variables)
            other_peer_var_fused = request_variable(
                target,
                version=None,
                name=self._fused_model_name,
                shape=var_fused.shape,
                dtype=var_fused.dtype)
            return defuse(other_peer_var_fused, [v.shape for v in variables])
        else:
            return [
                request_variable_with_template(target, v) for v in variables
            ]
Code Example #8
    def _build_save_op(self, variables):
        if self._fuse_requests:
            var_fused = fuse(variables)
            return save_variable(var_fused, name=self._fused_model_name)
        else:
            return tf.group([save_variable(v) for v in variables])
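
Examples #7 and #8 show the same fusion trade-off on the request path: with _fuse_requests enabled, a peer's whole model is saved and fetched as a single buffer in one round trip, instead of one request per variable. Below is a hedged plain-Python illustration of why that matters; the dict stands in for a peer's variable store and nothing here is actual KungFu API:

import numpy as np

store = {}  # stands in for a peer's variable store

def save(name, value):
    store[name] = value.copy()

def request(name):
    return store[name].copy()  # one simulated network round trip

variables = [np.ones(3), np.zeros(5), np.full(2, 7.0)]

# Unfused: len(variables) round trips.
for i, v in enumerate(variables):
    save('v%d' % i, v)
fetched = [request('v%d' % i) for i in range(len(variables))]

# Fused: a single round trip for the concatenated buffer.
save('fused', np.concatenate([v.ravel() for v in variables]))
flat = request('fused')
sizes = [v.size for v in variables]
offsets = np.cumsum([0] + sizes)
defused = [flat[offsets[i]:offsets[i + 1]].reshape(variables[i].shape)
           for i in range(len(variables))]
assert all(np.array_equal(a, b) for a, b in zip(variables, defused))
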