Example #1
    def __init__(self, optimizer, interval, name=None, use_locking=False):
        super(AdaptiveSGDOptimizer, self).__init__(optimizer,
                                                   name,
                                                   use_locking=use_locking)
        # Record the cluster topology once, at construction time.
        self._num_workers = current_cluster_size()
        self._rank = current_rank()
        self._step = tf.Variable(0, trainable=False, dtype=tf.int32)
        self._interval = interval
Example #2
    def __init__(self,
                 optimizer,
                 nccl=False,
                 nccl_fusion=True,
                 name=None,
                 use_locking=False):
        super(SynchronousSGDOptimizer, self).__init__(optimizer,
                                                      name,
                                                      use_locking=use_locking)
        # The worker count is used to average the summed gradients.
        self._num_workers = current_cluster_size()
        self._nccl = nccl
        self._nccl_fusion = nccl_fusion
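The wrapping pattern is the same for every optimizer class in these examples: build a plain TensorFlow optimizer, then pass it to the KungFu wrapper, which reads the cluster size in its constructor as shown above. A minimal sketch, assuming the import path used in KungFu's published examples (verify it against your installed version; `loss` is a stand-in for your model's loss tensor):

import tensorflow as tf
from kungfu.tensorflow.optimizers import SynchronousSGDOptimizer

# Gradients are aggregated across current_cluster_size() peers on every
# step before the wrapped optimizer applies them.
opt = SynchronousSGDOptimizer(tf.train.GradientDescentOptimizer(0.1))
train_op = opt.minimize(loss)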
Example #3
    def __init__(self,
                 optimizer,
                 name=None,
                 monitor_interval=1,
                 use_locking=False):
        super(SyncSGDWithGradVarianceOptimizer,
              self).__init__(optimizer, name, use_locking=use_locking)
        self._num_workers = current_cluster_size()
        # counter() is a KungFu op whose value increases by one each time it
        # is evaluated.
        self._step = counter()

        # The gradient variance is refreshed every `monitor_interval` steps.
        self._interval = monitor_interval
        self._summed_variance = None
        self._variances = None
Example #4
    def __init__(self,
                 optimizer,
                 device_batch_size,
                 name=None,
                 monitor_interval=1,
                 use_locking=False):
        super(SyncSGDWithGradNoiseScaleOptimizer,
              self).__init__(optimizer, name, use_locking=use_locking)
        self._num_workers = current_cluster_size()
        self._step = counter()

        self._interval = monitor_interval
        # The global batch size (device batch size x number of workers) is
        # needed to estimate the gradient noise scale.
        self._device_batch_size = tf.cast(device_batch_size, dtype=tf.float32)
        self._global_batch_size = self._device_batch_size * self._num_workers
        self._noise_op = None
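Examples #3 and #4 share a monitoring pattern: the expensive statistic (gradient variance or noise scale) is refreshed only every `monitor_interval` steps. A standalone sketch of that gating in plain TF1 (`every_n_steps` and its arguments are illustrative names, not KungFu API):

import tensorflow as tf

def every_n_steps(step, interval, make_op):
    # Trigger make_op() only when step % interval == 0; returns a bool
    # tensor indicating whether the monitored op ran this step.
    def do():
        with tf.control_dependencies([make_op()]):
            return tf.constant(True)
    def skip():
        return tf.constant(False)
    return tf.cond(tf.equal(step % interval, 0), do, skip)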
Example #5
    def apply_gradients(self, grads_and_vars, **kwargs):
        """Calls this same method on the underlying optimizer."""
        np, rank = current_cluster_size(), current_rank()
        # Pick a random peer and request its current model variables.
        target = get_random_peer(np, rank)
        variables = [v for _g, v in grads_and_vars]
        other_peer_vars, save_model_op = self._build_request_and_save_ops(
            target, variables)

        # Average each local variable with the peer's copy.
        assign_ops = [
            tf.assign(v, 0.5 * (v + other_v))
            for v, other_v in zip(variables, other_peer_vars)
        ]

        apply_op = self._optimizer.apply_gradients(grads_and_vars, **kwargs)

        # The returned op completes only after the peer averaging, the
        # gradient update and the model save have all run.
        with tf.control_dependencies(assign_ops):
            with tf.control_dependencies([apply_op]):
                with tf.control_dependencies([save_model_op]):
                    return tf.group(apply_op)
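Note that assign_ops, apply_op and save_model_op are all created before the nested with blocks, so the nesting imposes no order among them; it only makes the returned group wait for all three. A standalone TF1 sketch of this dependency pattern (no KungFu names assumed):

import tensorflow as tf

x = tf.Variable(0.0)
y = tf.Variable(0.0)
update_x = tf.assign_add(x, 1.0)
update_y = tf.assign_add(y, 1.0)
# step_op has no inputs of its own, but inherits control dependencies on
# both updates from the enclosing scopes.
with tf.control_dependencies([update_x]):
    with tf.control_dependencies([update_y]):
        step_op = tf.group()

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(step_op)
    print(sess.run([x, y]))  # [1.0, 1.0]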
Example #6
def all_reduce_benchmark(sizes, dtype=tf.float32):
    # Requires `import time`, `import tensorflow as tf` and KungFu's
    # group_all_reduce/current_cluster_size ops; _tensor_size, show_size and
    # show_rate are helpers defined alongside this benchmark.
    xs = [tf.Variable(tf.ones([n], dtype)) for n in sizes]
    tot_size = sum(_tensor_size(x) for x in xs)
    np = current_cluster_size()
    # Factor used to convert the tensor size into an equivalent all-reduce
    # data rate across np peers.
    multiplier = 4 * (np - 1)
    print('all reduce total size: %s among %d peers' %
          (show_size(tot_size), np))
    ys = group_all_reduce(xs)
    init = tf.global_variables_initializer()
    warmup_steps = 5
    bench_steps = 10
    with tf.Session() as sess:
        sess.run(init)
        for step in range(warmup_steps):
            sess.run(ys)
        for step in range(bench_steps):
            t0 = time.time()
            sess.run(ys)
            d = time.time() - t0
            print('step %d, took %.2fs, equivalent data rate: %s' %
                  (step, d, show_rate(tot_size * multiplier, d)))
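A hypothetical driver for the benchmark (the size list and script name are illustrative; the peers come from the KungFu launcher):

if __name__ == '__main__':
    # e.g. kungfu-run -np 4 python all_reduce_bench.py
    all_reduce_benchmark([2 ** n for n in range(10, 25, 2)])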
Example #7
def fake_get_shard_info(use_kungfu):
    # Returns (shard_index, num_shards): the KungFu rank and cluster size
    # when enabled, otherwise the single-process default (0, 1).
    if use_kungfu:
        from kungfu.tensorflow.v1.ops import current_cluster_size, current_rank
        return current_rank(), current_cluster_size()
    return 0, 1
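The returned (shard_index, num_shards) pair maps directly onto tf.data sharding. A minimal sketch (the file names are hypothetical):

import tensorflow as tf

shard_index, num_shards = fake_get_shard_info(use_kungfu=True)
ds = tf.data.TFRecordDataset(['train-%03d.tfrecord' % i for i in range(64)])
# Each worker keeps every num_shards-th record, starting at shard_index.
ds = ds.shard(num_shards, shard_index)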
Example #8
def show_info_example():
    rank = current_rank()
    np = current_cluster_size()
    print('rank=%d, np=%d' % (rank, np))
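Each peer prints its own rank only when the script is started through the KungFu launcher; a plain python invocation reports rank=0, np=1. For example, with four local workers (the script name is hypothetical):

kungfu-run -np 4 python show_info.py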
Example #9
    # (tail of a checkpoint-restore helper elided above; it returns the
    # global step recovered from the checkpoint string)
    return gs


# Placeholders driving the cluster-resize op.
ckpt = tf.placeholder(tf.string)
new_size = tf.placeholder(tf.int32)
resize_op = resize_cluster(ckpt, new_size)

init = tf.global_variables_initializer()

# barrier_op = barrier()

with tf.Session() as sess:
    sess.run(init)

    # Recover the global step after a (re)start; t0 is set in code elided
    # from this excerpt.
    init_gs = restore(get_init_checkpoint())
    np = current_cluster_size()
    init_np = get_cluster_size(init_gs, cluster_size_schedule, np)
    if np != init_np:
        print(
            '[W] init cluster size (np=%d) is not consistent with schedule (np=%d)'
            % (np, init_np))

    print('restored from %d, np=%d, init_np=%d, start took %s' %
          (init_gs, np, init_np, show_duration(time.time() - t0)))

    for gs in range(init_gs, max_step):
        t0 = time.time()
        v = sess.run(y)
        print('step %d, result: %d, np=%d, took %s' %
              (gs, v, np, show_duration(time.time() - t0)))
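The excerpt builds resize_op but the loop shown never runs it; in an elastic schedule the driver triggers it when the scheduled size changes. A hypothetical sketch reusing the names above (resize_interval is invented for illustration):

resize_interval = 100  # hypothetical: consult the schedule every 100 steps
for gs in range(init_gs, max_step):
    v = sess.run(y)
    if gs % resize_interval == 0:
        scheduled_np = get_cluster_size(gs, cluster_size_schedule, np)
        if scheduled_np != np:
            # checkpoint the step and ask KungFu to resize the cluster
            sess.run(resize_op, feed_dict={ckpt: str(gs), new_size: scheduled_np})
            np = scheduled_np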
Example #10
    def __init__(self, optimizer, name=None, use_locking=False):
        super(SynchronousAveragingOptimizer,
              self).__init__(optimizer, name, use_locking=use_locking)
        self._num_workers = current_cluster_size()
        self._rank = current_rank()