def run_driver():
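    # connect to an existing Ray cluster via its Redis address (pre-1.0 Ray API)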
    ray.init(redis_address=args.ip)

    worker = Worker.remote()
    ps = ParameterServer.remote()
    log = util.FileLogger('out')
    log(f"Worker ip {ray.get(worker.ip.remote())}")
    log(f"PS ip {ray.get(ps.ip.remote())}")
    log(f"Driver ip {socket.gethostbyname(socket.gethostname())}")

    time_list = []
    for i in range(args.iters):
        start_time = time.perf_counter()
        results = []
        for layer_idx in range(args.num_layers):
            grad = worker.compute_gradient.remote(layer_idx)
            results.append(ps.receive.remote(grad, layer_idx))
        # wait for every per-layer update, not just the first one to finish,
        # matching the num_returns usage in the multi-PS variant below
        ray.wait(results, num_returns=len(results))
        elapsed_time_ms = (time.perf_counter() - start_time) * 1000
        time_list.append(elapsed_time_ms)
        rate = gradient_size / (elapsed_time_ms / 1000) / 1e6
        log(f'{i:03d}/{args.iters} sent {gradient_size/1e6:.0f} MBs in '
            f'{elapsed_time_ms:.1f} ms: {rate:.2f} MB/second')

    min_ = np.min(time_list)
    median = np.median(time_list)
    mean = np.mean(time_list)

    def gbps(time_ms):
        return 8 * gradient_size / (time_ms / 1000) / 1e9

    log(f"Gbps: mean: {gbps(mean):8.2f}, median: {gbps(median):8.2f}, max: {gbps(min_):8.2f}")
Example #2
def run_driver():
    ray.init(redis_address=args.ip)

    worker = Worker.remote()
    ps = ParameterServer.remote()
    log = util.FileLogger('out')
    log(f"Worker ip {ray.get(worker.ip.remote())}")
    log(f"PS ip {ray.get(ps.ip.remote())}")
    log(f"Driver ip {socket.gethostbyname(socket.gethostname())}")

    time_list = []
    for i in range(args.iters):
        start_time = time.perf_counter()
        grads = worker.compute_gradients.remote()
        result = ps.receive.remote(grads)
        ray.wait([result])
        elapsed_time_ms = (time.perf_counter() - start_time) * 1000
        time_list.append(elapsed_time_ms)
        rate = args.size_mb / (elapsed_time_ms / 1000)
        log('%03d/%d sent %d MBs in %.1f ms: %.2f MB/second' %
            (i, args.iters, args.size_mb, elapsed_time_ms, rate))

    min_ = np.min(time_list)  # renamed to avoid shadowing the built-in min
    median = np.median(time_list)
    log(f"min: {min_:8.2f}, median: {median:8.2f}, mean: {np.mean(time_list):8.2f}")
Example #3
def run_worker():

    from mpi4py import MPI
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()

    # only rank 0 writes a visible log; other ranks discard their output
    if rank == 0:
        log = util.FileLogger('/tmp/out')
    else:
        log = util.FileLogger('/dev/null', mirror=False)

    time_list = []
    dim = args.size_mb * 250 * 1000  # 1 MB is 250k float32 values
    dtype = np.float32
    data = np.ones(dim, dtype=dtype) * (rank + 1)
    for i in range(args.iters):
        start_time = time.perf_counter()
        if rank == 0:
            comm.Send(data, dest=1, tag=13)
        else:
            data = np.empty(dim, dtype=dtype)
            comm.Recv(data, source=0, tag=13)

        end_time = time.perf_counter()

        elapsed_time_ms = (end_time - start_time) * 1000
        time_list.append(elapsed_time_ms)
        rate = args.size_mb / (elapsed_time_ms / 1000)
        log(f'{rank} {i:03d}/{args.iters:d} sent {args.size_mb:d} MBs in {elapsed_time_ms:.1f}'
            f' ms: {rate:.2f} MB/second')

    min_ = np.min(time_list)
    median = np.median(time_list)

    log(f"min: {min_:8.2f}, median: {median:8.2f}, mean: {np.mean(time_list):8.2f}")
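This worker is launched with one process per MPI rank; a hypothetical two-rank launch (the script name is illustrative, the flags match the args fields the code reads):

mpirun -np 2 python mpi_benchmark.py --size_mb 100 --iters 10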
Example #4
def run_sender():
  param_size = 250 * 1000 * args.size_mb // args.shards  # 1MB is 250k integers
  log = util.FileLogger('out')
  grads_array = []
  with tf.device('/job:chief/task:0'):
    for i in range(args.shards):
      grads = tf.Variable(tf.ones([param_size]))
      grads_array.append(grads)

  params_array = []
  add_op_array = []
  with tf.device('/job:receiver/task:0'):
    for i in range(args.shards):
      params = tf.Variable(tf.ones([param_size]))
      # assigning the chief's shard into the receiver's variable forces the
      # tensor across the network
      add_op = params.assign(grads_array[i]).op
      params_array.append(params)
      add_op_array.append(add_op)
    add_op = tf.group(*add_op_array)
    
  server = _launch_server('chief')
  sess = tf.Session(server.target)
  # sess.run sometimes fails right after startup with UnavailableError: OS
  # Error, so retry initialization until it succeeds
  while True:
    try:
      sess.run(tf.global_variables_initializer())
      break
    except Exception as e:
      log(f"initialization failed with {e}, retrying in 1 second")
      time.sleep(1)

  time_list = []
  for i in range(args.iters):
    start_time = time.perf_counter()
    sess.run(add_op)
    elapsed_time_ms = (time.perf_counter() - start_time) * 1000
    time_list.append(elapsed_time_ms)
    rate = args.size_mb / (elapsed_time_ms / 1000)
    log('%03d/%d sent %d MBs in %.1f ms: %.2f MB/second' % (
      i, args.iters, args.size_mb, elapsed_time_ms, rate))

  min_ = np.min(time_list)
  median = np.median(time_list)

  log(f"min: {min_:8.2f}, median: {median:8.2f}, mean: {np.mean(time_list):8.2f}")
Example #5
def worker():
    """ Initialize the distributed environment. """

    import torch
    import torch.distributed as dist
    from torch.multiprocessing import Process
    import numpy as np

    print("Initializing distributed pytorch")
    os.environ['MASTER_ADDR'] = str(args.master_addr)
    os.environ['MASTER_PORT'] = str(args.master_port)
    # Use TCP backend. Gloo needs nightly, where it currently fails with
    #     dist.init_process_group('gloo', rank=args.rank,
    #   AttributeError: module 'torch.distributed' has no attribute 'init_process_group'
    dist.init_process_group('tcp', rank=args.rank, world_size=args.size)

    tensor = torch.ones(args.size_mb * 250 * 1000) * (args.rank + 1)
    time_list = []
    outfile = 'out' if args.rank == 0 else '/dev/null'
    log = util.FileLogger(outfile)
    for i in range(args.iters):
        # print('before: rank ', args.rank, ' has data ', tensor[0])

        start_time = time.perf_counter()
        if args.rank == 0:
            dist.send(tensor=tensor, dst=1)
        else:
            dist.recv(tensor=tensor, src=0)

        elapsed_time_ms = (time.perf_counter() - start_time) * 1000
        time_list.append(elapsed_time_ms)
        # print('after: rank ', args.rank, ' has data ', tensor[0])
        rate = args.size_mb / (elapsed_time_ms / 1000)

        log('%03d/%d added %d MBs in %.1f ms: %.2f MB/second' %
            (i, args.iters, args.size_mb, elapsed_time_ms, rate))

    min_ = np.min(time_list)
    median = np.median(time_list)
    log(f"min: {min_:8.2f}, median: {median:8.2f}, mean: {np.mean(time_list):8.2f}")
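worker() is run once per rank, with both processes pointed at the same rendezvous address; a hypothetical two-machine launch (script name and address illustrative, flags matching the args fields the code reads):

python p2p_benchmark.py --rank 0 --size 2 --master_addr 10.0.0.1 --master_port 6000 --size_mb 100 --iters 10
python p2p_benchmark.py --rank 1 --size 2 --master_addr 10.0.0.1 --master_port 6000 --size_mb 100 --iters 10

Example #6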
def main():
    global log
    if args.role == "launcher":
        launcher()
    elif args.role == "worker":
        np.random.seed(args.seed)
        torch.manual_seed(args.seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(args.seed)

        log = util.FileLogger(args.logdir + f'/worker-{util.get_global_rank()}', mirror=(args.local_rank == 0))

        torch.cuda.set_device(args.local_rank)
        #      test_p2p()
        if args.method == 'optimize':
            test_optimize()
        elif args.method == 'allreduce':
            test_allreduce()
        else:
            assert False, 'unknown arg'
    else:
        assert False, "Unknown role " + args.role
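launcher(), test_optimize(), and test_allreduce() are defined elsewhere in this example's source. As a rough sketch of the allreduce path only, assuming main() has already initialized the default process group and the global log (names and sizes are hypothetical, mirroring the timing loops above):

import time

import torch
import torch.distributed as dist


def test_allreduce():
    # time repeated allreduces on one GPU tensor
    tensor = torch.ones(args.size_mb * 250 * 1000).cuda()
    for i in range(args.iters):
        start_time = time.perf_counter()
        dist.all_reduce(tensor)
        torch.cuda.synchronize()
        elapsed_time_ms = (time.perf_counter() - start_time) * 1000
        log(f'{i:03d}/{args.iters} allreduce in {elapsed_time_ms:.1f} ms')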
Example #7
def run_driver():
    ray.init(redis_address=args.ip)

    worker_actors = [Worker.remote() for _ in range(args.num_workers)]
    ps_actors = [ParameterServer.remote() for _ in range(args.num_ps)]

    log = util.FileLogger('out')

    time_list = []
    for i in range(args.iters):
        start_time = time.perf_counter()
        grads_list = []
        for actor in worker_actors:
            result = actor.compute_gradients.remote()
            # with a single PS, compute_gradients returns one ObjectID rather
            # than a list of per-PS shards, so wrap it for transpose() below
            if args.num_ps == 1:
                grads_list.append([result])
            else:
                grads_list.append(result)

        updates = []
        for ps, shards in zip(ps_actors, transpose(grads_list)):
            updates.append(ps.receive.remote(*shards))

        ray.wait(updates, num_returns=args.num_ps)

        elapsed_time_ms = (time.perf_counter() - start_time) * 1000
        time_list.append(elapsed_time_ms)
        # aggregate rate: all workers together sent size_mb * num_workers MBs
        rate = args.size_mb * args.num_workers / (elapsed_time_ms / 1000)
        log('%03d/%d sent %d MBs in %.1f ms: %.2f MB/second' %
            (i, args.iters, args.size_mb * args.num_workers, elapsed_time_ms,
             rate))

    min_ = np.min(time_list)
    median = np.median(time_list)
    log(f"min: {min_:8.2f}, median: {median:8.2f}, mean: {np.mean(time_list):8.2f}")
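transpose() regroups the per-worker gradient lists into per-parameter-server shard lists, so each PS receives its shard from every worker. A minimal sketch:

def transpose(list_of_lists):
    # [[w0_s0, w0_s1], [w1_s0, w1_s1]] -> [[w0_s0, w1_s0], [w0_s1, w1_s1]]
    return [list(shards) for shards in zip(*list_of_lists)]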