def run_driver():
    """Benchmark per-layer gradient transfer worker -> parameter server via Ray.

    Launches one Worker and one ParameterServer actor, then for each
    iteration submits `args.num_layers` gradient shards and measures the
    wall-clock time until ALL shards have been received.  Logs per-iteration
    MB/s and a final Gbps summary (mean/median/max) to the 'out' file.
    """
    ray.init(redis_address=args.ip)
    worker = Worker.remote()
    ps = ParameterServer.remote()
    log = util.FileLogger('out')
    log(f"Worker ip {ray.get(worker.ip.remote())}")
    log(f"PS ip {ray.get(ps.ip.remote())}")
    log(f"Driver ip {socket.gethostbyname(socket.gethostname())}")

    time_list = []
    for i in range(args.iters):
        start_time = time.perf_counter()
        results = []
        for layer_idx in range(args.num_layers):
            grad = worker.compute_gradient.remote(layer_idx)
            results.append(ps.receive.remote(grad, layer_idx))
        # BUG FIX: ray.wait defaults to num_returns=1, which unblocks as soon
        # as the FIRST shard is received and makes the timing meaningless.
        # Wait for every shard, as the multi-PS driver in this file does.
        ray.wait(results, num_returns=len(results))
        elapsed_time_ms = (time.perf_counter() - start_time) * 1000
        time_list.append(elapsed_time_ms)
        rate = gradient_size / (elapsed_time_ms / 1000) / 1e6  # MB/s
        log(f'{i:03d}/{args.iters} sent {gradient_size/1e6:.0f} MBs in '
            f'{elapsed_time_ms:.1f} ms: {rate:.2f} MB/second')

    min_ = np.min(time_list)
    median = np.median(time_list)
    mean = np.mean(time_list)

    def gbps(time_ms):
        # gradient_size bytes moved in time_ms milliseconds -> gigabits/sec.
        return 8 * gradient_size / (time_ms / 1000) / 1e9

    log(f"Gbps: mean: {gbps(mean):8.2f}, median: {gbps(median):8.2f}, max: {gbps(min_):8.2f}")
def run_driver():
    """Benchmark single-shot gradient transfer worker -> parameter server via Ray.

    One Worker actor produces a full gradient tensor per iteration; one
    ParameterServer actor receives it.  Per-iteration MB/s and a
    min/median/mean summary (ms) are logged to the 'out' file.
    """
    ray.init(redis_address=args.ip)
    worker = Worker.remote()
    ps = ParameterServer.remote()
    log = util.FileLogger('out')
    log(f"Worker ip {ray.get(worker.ip.remote())}")
    log(f"PS ip {ray.get(ps.ip.remote())}")
    log(f"Driver ip {socket.gethostbyname(socket.gethostname())}")

    time_list = []
    for i in range(args.iters):
        start_time = time.perf_counter()
        grads = worker.compute_gradients.remote()
        result = ps.receive.remote(grads)
        ray.wait([result])
        elapsed_time_ms = (time.perf_counter() - start_time) * 1000
        time_list.append(elapsed_time_ms)
        rate = args.size_mb / (elapsed_time_ms / 1000)  # MB/s
        log('%03d/%d sent %d MBs in %.1f ms: %.2f MB/second' % (i, args.iters, args.size_mb, elapsed_time_ms, rate))

    # Renamed from `min`, which shadowed the builtin of the same name.
    min_time = np.min(time_list)
    median = np.median(time_list)
    log(f"min: {min_time:8.2f}, median: {median:8.2f}, mean: {np.mean(time_list):8.2f}")
def run_worker():
    """Benchmark point-to-point MPI transfer between two ranks.

    Rank 0 sends an `args.size_mb`-MB float32 array to rank 1 each
    iteration; rank 1 receives into a fresh buffer.  Only rank 0 logs to
    /tmp/out (other ranks log to /dev/null).  Per-iteration MB/s and a
    min/median/mean summary (ms) are logged.
    """
    from mpi4py import MPI
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    # Only rank 0 produces visible output; other ranks discard their logs.
    if rank == 0:
        log = util.FileLogger('/tmp/out')
    else:
        log = util.FileLogger('/dev/null', mirror=False)

    time_list = []
    dim = args.size_mb * 250 * 1000  # 1 MB is 250k float32 elements
    dtype = np.float32
    data = np.ones(dim, dtype=dtype) * (rank + 1)
    for i in range(args.iters):
        start_time = time.perf_counter()
        if rank == 0:
            comm.Send(data, dest=1, tag=13)
        else:
            data = np.empty(dim, dtype=dtype)
            comm.Recv(data, source=0, tag=13)
        end_time = time.perf_counter()

        elapsed_time_ms = (end_time - start_time) * 1000
        time_list.append(elapsed_time_ms)
        rate = args.size_mb / (elapsed_time_ms / 1000)  # MB/s
        log(f'{rank} {i:03d}/{args.iters:d} sent {args.size_mb:d} MBs in {elapsed_time_ms:.1f}'
            f' ms: {rate:.2f} MB/second')

    # Renamed from `min`, which shadowed the builtin of the same name.
    min_time = np.min(time_list)
    median = np.median(time_list)
    log(f"min: {min_time:8.2f}, median: {median:8.2f}, mean: {np.mean(time_list):8.2f}")
def run_sender():
    """Benchmark variable transfer between two TF jobs (chief -> receiver).

    Builds `args.shards` source variables on /job:chief and matching
    destination variables on /job:receiver, then times a grouped
    cross-device assign per iteration.  Per-iteration MB/s and a
    min/median/mean summary (ms) are logged to the 'out' file.
    """
    param_size = 250 * 1000 * args.size_mb // args.shards  # 1MB is 250k integers
    log = util.FileLogger('out')

    # Source variables live on the chief job.
    grads_array = []
    with tf.device('/job:chief/task:0'):
        for shard_idx in range(args.shards):
            shard = tf.Variable(tf.ones([param_size]))
            grads_array.append(shard)

    # Destination variables live on the receiver job; each assign pulls one
    # shard across the network.
    params_array = []
    add_op_array = []
    with tf.device('/job:receiver/task:0'):
        for shard_idx in range(args.shards):
            params = tf.Variable(tf.ones([param_size]))
            add_op = params.assign(grads_array[shard_idx]).op
            params_array.append(params)
            add_op_array.append(add_op)
        add_op = tf.group(*add_op_array)

    server = _launch_server('chief')
    sess = tf.Session(server.target)
    # NOTE(review): .run has been seen to fail with UnavailableError: OS Error
    # right after startup; a retry loop may be needed here.
    sess.run(tf.global_variables_initializer())

    time_list = []
    for i in range(args.iters):
        start_time = time.perf_counter()
        sess.run(add_op)
        elapsed_time_ms = (time.perf_counter() - start_time) * 1000
        time_list.append(elapsed_time_ms)
        rate = args.size_mb / (elapsed_time_ms / 1000)  # MB/s
        log('%03d/%d sent %d MBs in %.1f ms: %.2f MB/second' % (
            i, args.iters, args.size_mb, elapsed_time_ms, rate))

    # Renamed from `min`, which shadowed the builtin of the same name.
    min_time = np.min(time_list)
    median = np.median(time_list)
    log(f"min: {min_time:8.2f}, median: {median:8.2f}, mean: {np.mean(time_list):8.2f}")
def worker():
    """Benchmark point-to-point tensor transfer with torch.distributed (TCP backend).

    Rank 0 sends an `args.size_mb`-MB tensor to rank 1 each iteration.
    Only rank 0 logs to 'out' (other ranks log to /dev/null).  Per-iteration
    MB/s and a min/median/mean summary (ms) are logged.
    """
    import torch
    import torch.distributed as dist
    from torch.multiprocessing import Process
    import numpy as np

    print("Initializing distributed pytorch")
    os.environ['MASTER_ADDR'] = str(args.master_addr)
    os.environ['MASTER_PORT'] = str(args.master_port)
    # Use TCP backend.  Gloo needs nightly, where it currently fails with
    # AttributeError: module 'torch.distributed' has no attribute 'init_process_group'
    dist.init_process_group('tcp', rank=args.rank, world_size=args.size)

    tensor = torch.ones(args.size_mb * 250 * 1000) * (args.rank + 1)  # 1 MB is 250k floats
    time_list = []
    outfile = 'out' if args.rank == 0 else '/dev/null'
    log = util.FileLogger(outfile)
    for i in range(args.iters):
        start_time = time.perf_counter()
        if args.rank == 0:
            dist.send(tensor=tensor, dst=1)
        else:
            dist.recv(tensor=tensor, src=0)
        elapsed_time_ms = (time.perf_counter() - start_time) * 1000
        time_list.append(elapsed_time_ms)
        rate = args.size_mb / (elapsed_time_ms / 1000)  # MB/s
        log('%03d/%d added %d MBs in %.1f ms: %.2f MB/second' % (i, args.iters, args.size_mb, elapsed_time_ms, rate))

    # Renamed from `min`, which shadowed the builtin of the same name.
    min_time = np.min(time_list)
    median = np.median(time_list)
    log(f"min: {min_time:8.2f}, median: {median:8.2f}, mean: {np.mean(time_list):8.2f}")
def main():
    """Dispatch on --role: run the launcher, or seed/configure and run a worker test.

    Raises:
        ValueError: on an unknown --role or --method.  (Previously used
        `assert False`, which is stripped under `python -O` and would have
        silently fallen through.)
    """
    global log
    if args.role == "launcher":
        launcher()
    elif args.role == "worker":
        # Seed every RNG source so worker runs are reproducible.
        np.random.seed(args.seed)
        torch.manual_seed(args.seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(args.seed)

        # Only local rank 0 mirrors its log to stdout.
        log = util.FileLogger(args.logdir + f'/worker-{util.get_global_rank()}',
                              mirror=(args.local_rank == 0))
        torch.cuda.set_device(args.local_rank)

        if args.method == 'optimize':
            test_optimize()
        elif args.method == 'allreduce':
            test_allreduce()
        else:
            raise ValueError(f'unknown method {args.method!r}')
    else:
        raise ValueError("Unknown role " + args.role)
def run_driver():
    """Benchmark sharded gradient transfer from many workers to many parameter servers via Ray.

    Each iteration every worker computes gradients (one shard per PS when
    num_ps > 1); shard j from every worker is sent to PS j, and the driver
    waits for all parameter servers to acknowledge.  Per-iteration MB/s and
    a min/median/mean summary (ms) are logged to the 'out' file.
    """
    ray.init(redis_address=args.ip)
    worker_actors = [Worker.remote() for _ in range(args.num_workers)]
    ps_actors = [ParameterServer.remote() for _ in range(args.num_ps)]
    log = util.FileLogger('out')

    time_list = []
    for i in range(args.iters):
        start_time = time.perf_counter()
        grads_list = []
        for actor in worker_actors:
            result = actor.compute_gradients.remote()
            if args.num_ps == 1:
                # Single return value; wrap so grads_list is uniformly a
                # list of per-PS shard lists.
                grads_list.append([result])
            else:
                grads_list.append(result)

        # transpose: per-worker shard lists -> per-PS lists of worker shards.
        updates = []
        for ps, shards in zip(ps_actors, transpose(grads_list)):
            updates.append(ps.receive.remote(*shards))

        ray.wait(updates, num_returns=args.num_ps)
        elapsed_time_ms = (time.perf_counter() - start_time) * 1000
        time_list.append(elapsed_time_ms)
        rate = args.size_mb / (elapsed_time_ms / 1000)  # MB/s
        log('%03d/%d sent %d MBs in %.1f ms: %.2f MB/second' % (i, args.iters, args.size_mb * args.num_workers,
                                                                elapsed_time_ms, rate))

    # Renamed from `min`, which shadowed the builtin of the same name.
    min_time = np.min(time_list)
    median = np.median(time_list)
    log(f"min: {min_time:8.2f}, median: {median:8.2f}, mean: {np.mean(time_list):8.2f}")