Example 1
    def __init__(self, device_batch_size, monitor_interval=1):
        self._num_workers = current_cluster_size()
        self._step = counter()

        self._interval = monitor_interval
        self._device_batch_size = tf.cast(device_batch_size, dtype=tf.float32)
        self._global_batch_size = self._device_batch_size * self._num_workers
Example 2
    def apply_gradients(self, apply_grads_func, grads_and_vars, **kwargs):
        np, rank = current_cluster_size(), current_rank()
        target = get_random_peer(np, rank)
        gradients, variables = list(zip(*grads_and_vars))

        init_store_op = tf.cond(tf.equal(self._step, 0),
                                lambda: self.init_store(variables), tf.no_op)
        with tf.control_dependencies([init_store_op]):
            other_peer_vars = self._build_request_ops(target, variables)

        save_model_op = self._build_save_op(variables)

        assign_ops = [
            _tf_assign(v, 0.5 * (v + other_v))
            for v, other_v in zip(variables, other_peer_vars)
        ]

        # Re-zip gradients and variables because grads_and_vars can only be unzipped once.
        new_grads_and_vars = zip(gradients, variables)
        apply_op = apply_grads_func(new_grads_and_vars, **kwargs)

        with tf.control_dependencies(assign_ops):
            with tf.control_dependencies([apply_op]):
                with tf.control_dependencies([save_model_op]):
                    return tf.group(apply_op)
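
The snippet above calls a get_random_peer helper that is not shown in this excerpt. Below is a minimal, hypothetical sketch of such a helper, assuming it only needs to sample a peer rank inside a TF1 graph (the name and arguments come from the call site above, not from a documented KungFu API):

import tensorflow as tf

def get_random_peer(cluster_size, self_rank):
    # Re-sampled on every session run: draw a random rank and, if we drew
    # our own rank, step to the next one modulo the cluster size.
    t = tf.random_uniform([], minval=0, maxval=cluster_size, dtype=tf.int32)
    return tf.cond(tf.equal(t, self_rank),
                   lambda: tf.mod(t + 1, cluster_size),
                   lambda: tf.identity(t))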
Example 3
    def __init__(self, monitor_interval=1):
        self._num_workers = current_cluster_size()
        self._step = counter()

        self._interval = monitor_interval
        self._summed_variance = None
        self._variances = None
Example 4
    def __init__(self, optimizer, interval, name=None, use_locking=False):
        super(AdaptiveSGDOptimizer, self).__init__(optimizer,
                                                   name,
                                                   use_locking=use_locking)
        self._num_workers = current_cluster_size()
        self._rank = current_rank()
        self._step = tf.Variable(0, trainable=False, dtype=tf.int32)
        self._interval = interval
Example 5
    def __init__(
        self,
        reshape_strategy,
        nccl=False,
        nccl_fusion=True,
    ):
        self._reshape_strategy = reshape_strategy
        self._nccl = nccl
        self._nccl_fusion = nccl_fusion
        self._num_workers = current_cluster_size()
Example 6
def log_final_result(value, error):
    from kungfu.tensorflow.ops import current_rank, current_cluster_size
    if current_rank() != 0:
        return
    attrs = {
        'np': current_cluster_size(),
        'strategy': os.getenv('KUNGFU_ALLREDUCE_STRATEGY'),
        'bs': args.batch_size,
        'model': args.model,
        'kf-opt': args.kf_optimizer,
    }
    log_detailed_result(value, error, attrs)
Example 7
def test_group_all_gather():
    from kungfu import current_cluster_size, current_rank
    from kungfu.tensorflow.ops import all_gather
    rank = current_rank()
    np = current_cluster_size()
    sizes = [i + 1 for i in range(5)]
    xs = [(rank + 1) * tf.Variable(tf.ones([n], tf.int32)) for n in sizes]
    ys = [all_gather(x) for x in xs]
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for i, y in enumerate(ys):
            v = sess.run(y)
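            # all_gather concatenates one (i + 1)-element tensor of value
            # (rank + 1) from each peer, so the expected total is
            # (1 + 2 + ... + np) * (i + 1) = np * (np + 1) / 2 * (i + 1).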
            assert (v.sum() == (np + 1) * np / 2 * (i + 1))
Example 8
    def after_run(self, run_context, run_values):
        self._step += 1
        np = current_cluster_size()
        self._trained_samples += self._local_batch_size * np
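        # The resize op yields (changed, keep): whether cluster membership
        # changed, and whether this worker is still part of the new cluster.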
        changed, keep = run_context.session.run(self._resize_op)
        if not keep:
            run_context.request_stop()
            self._exit_reason = 'change cluster'
            return
        if changed:
            self._need_sync = True
        if self._trained_samples >= self._total_samples:
            self._exit_reason = 'finished'
            run_context.request_stop()
Example 9
def log_final_result(value, error):
    if current_rank() != 0:
        return
    attrs = {
        'framework': 'kungfu',
        'np': current_cluster_size(),
        'strategy': os.getenv('KUNGFU_ALLREDUCE_STRATEGY'),
        'bs': args.batch_size,
        'model': args.model,
        'xla': args.xla,
        'kf-opt': args.kf_optimizer,
        'fuse': args.fuse,
        'nvlink': os.getenv('KUNGFU_ALLOW_NVLINK'),
        'data': 'disk' if args.data_dir else 'memory',
    }
    log_detailed_result(value, error, attrs)
Example 10
def test_consensus():
    from kungfu import current_cluster_size, current_rank
    from kungfu.tensorflow.ops import consensus

    np = current_cluster_size()
    rank = current_rank()

    x = tf.Variable(rank, dtype=tf.int32)
    consensus_check = consensus(x)

    init = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init)
        v = sess.run(consensus_check)

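        # Each peer holds x = rank, so the values agree only when np == 1.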
        assert v == (np == 1)
Example 11
def test_monitored_all_reduce():
    def gen_tree(n, r):
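        # Build a star topology: every node's parent is the root r
        # (the root keeps pointing to itself).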
        tree = [i for i in range(n)]
        for i in range(n):
            if i != r:
                tree[i] = r
        return tree

    from kungfu.tensorflow.ops import monitored_all_reduce, current_cluster_size
    np = current_cluster_size()
    init_tree = gen_tree(np, 0)

    tree = tf.Variable(init_tree, dtype=tf.int32)
    x = tf.Variable(tf.ones([16, 1024, 1024], dtype=tf.int64))
    y = monitored_all_reduce(x, tree)
    init = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init)
        v = sess.run(y)
        assert (v.sum() == np * 16 * 1024 * 1024)
Example 12
def all_reduce_benchmark(sizes, dtype=tf.float32):
    xs = [tf.Variable(tf.ones([n], dtype)) for n in sizes]
    tot_size = sum(_tensor_size(x) for x in xs)
    np = current_cluster_size()
    multiplier = 4 * (np - 1)
    print('all reduce total size: %s among %d peers' %
          (show_size(tot_size), np))
    ys = group_all_reduce(xs)
    init = tf.global_variables_initializer()
    warmup_steps = 5
    bench_steps = 10
    with tf.Session() as sess:
        sess.run(init)
        for step in range(warmup_steps):
            sess.run(ys)
        for step in range(bench_steps):
            t0 = time.time()
            sess.run(ys)
            d = time.time() - t0
            print('step %d, took %.2fs, equivalent data rate: %s' %
                  (step, d, show_rate(tot_size * multiplier, d)))
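
Note that the 4 * (np - 1) multiplier only scales the payload into the "equivalent data rate" reported by show_rate; whether it reflects actual bytes on the wire depends on how _tensor_size counts (elements vs. bytes) and on the all-reduce algorithm used, neither of which appears in this excerpt.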
Example 13
    def __init__(self):
        self._num_workers = current_cluster_size()
Example 14
def get_cluster_size(method):
    if method == 'HOROVOD':
        import horovod.tensorflow as hvd
        return hvd.size()
    else:
        return current_cluster_size()
Example 15
    def __init__(self, change_step, alpha):
        self._num_workers = current_cluster_size()
        self._alpha = alpha
        self._change_step = change_step
        self._global_step = tf.train.get_or_create_global_step()
Example 16
    def __init__(self, alpha):
        self._num_workers = current_cluster_size()
        self._alpha = alpha
Example 17
def show_info_example():
    rank = current_rank()
    np = current_cluster_size()
    print('rank=%d, np=%d' % (rank, np))
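
For reference, examples like this are normally launched with KungFu's kungfu-run tool, e.g. kungfu-run -np 4 python3 show_info.py (the script name is hypothetical); the launcher starts one process per worker and is what gives current_rank() and current_cluster_size() their values.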
Example 18

def restore(checkpoint):
    gs = int(checkpoint)
    return gs


new_size = tf.placeholder(tf.int32)
resize_op = resize_cluster_from_url()

init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)

    np = current_cluster_size()

    gs_place = tf.placeholder(dtype=tf.int64, shape=())
    sync_step_op = all_reduce(gs_place, op='max')
    should_sync = True
    gs = 0
    while gs < max_step:
        if should_sync:
            new_gs = sess.run(sync_step_op, feed_dict={gs_place: gs})
            print('sync step: %d -> %d' % (gs, new_gs))
            gs = new_gs
            should_sync = False
        t0 = time.time()
        v = sess.run(y)
        print('step %d, result: %d, np=%d, took %s' %
              (gs, v, np, show_duration(time.time() - t0)))
Example 19
def fake_get_shard_info(use_kungfu):
    if use_kungfu:
        from kungfu.tensorflow.ops import current_cluster_size, current_rank
        return current_rank(), current_cluster_size()
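    # Without KungFu, present a single-shard view: rank 0 of a cluster of size 1.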
    return 0, 1
Example 20
    def __init__(self, nccl=False, nccl_fusion=True):
        self._nccl = nccl
        self._nccl_fusion = nccl_fusion
        self._num_workers = current_cluster_size()