def talk_to_client(client_ep):
    """Run the server side of a unidirectional bandwidth benchmark.

    For each message size 2**i (i in [0, max_msg_log)) this posts
    ``window_size`` concurrent sends per iteration, busy-polls them to
    completion, runs a 10% untimed warmup, times ``max_iters`` iterations,
    and prints one "size<TAB><TAB>GB/s" row.  On completion it frees the
    buffer, destroys the endpoint, clears ``cb_not_done`` so the caller's
    progress loop can exit, and stops the ucp server.

    Parameters
    ----------
    client_ep : ucp endpoint for the connected client (project type).
    """
    global args
    global cb_not_done

    msg_log = max_msg_log
    iters = max_iters
    comm_ep = client_ep

    # One reusable registered buffer, sized for the largest message.
    send_buffer_region = ucp.buffer_region()
    if args.mem_type == 'cuda':
        send_buffer_region.alloc_cuda(1 << msg_log)
    else:
        send_buffer_region.alloc_host(1 << msg_log)

    def _send_window(msg_len):
        """Post window_size sends of msg_len bytes; poll until all complete."""
        pending = []
        for _ in range(window_size):
            send_msg = ucp.ucp_msg(send_buffer_region)
            pending.append(comm_ep.send(send_msg, msg_len))
        while pending:
            # Rebuild the list instead of the original remove()-while-
            # iterating: mutating a list mid-iteration skips elements and
            # each remove() is a linear scan (O(n^2) overall).
            pending = [ft for ft in pending if not ft.done()]

    print("{}\t\t{}".format("Size (bytes)", "Uni-Bandwidth (GB/s)"))

    for i in range(msg_log):
        msg_len = 2 ** i
        warmup_iters = int(0.1 * iters)

        # Untimed warmup with the same traffic pattern as the timed loop.
        for _ in range(warmup_iters):
            _send_window(msg_len)

        start = time.time()
        for _ in range(iters):
            _send_window(msg_len)
        end = time.time()

        elapsed = end - start
        bw = (iters * window_size * msg_len) / elapsed / 1e9  # GB/s
        print("{}\t\t{}".format(msg_len, bw))

    if args.mem_type == 'cuda':
        send_buffer_region.free_cuda()
    else:
        send_buffer_region.free_host()

    ucp.destroy_ep(client_ep)
    cb_not_done = False  # signal the caller's progress loop to stop
    ucp.stop_server()
async def talk_to_client(client_ep):
    """Run the server side of a bidirectional bandwidth benchmark.

    Sweeps message sizes 2**0 .. 2**(max_msg_log - 1).  For each size it
    awaits windows of ``window_size`` concurrent sends (a 10% warmup pass
    first, then ``max_iters`` timed iterations) and prints the achieved
    bandwidth in GB/s, before freeing the buffer, destroying the endpoint,
    and stopping the ucp server.
    """
    global args

    msg_log = max_msg_log
    iters = max_iters
    comm_ep = client_ep

    # Single registered buffer, large enough for the biggest message.
    send_buffer_region = ucp.buffer_region()
    if args.mem_type == 'cuda':
        send_buffer_region.alloc_cuda(1 << msg_log)
    else:
        send_buffer_region.alloc_host(1 << msg_log)

    print("{}\t\t{}".format("Size (bytes)", "Bi-Bandwidth (GB/s)"))

    for exponent in range(msg_log):
        msg_len = 2 ** exponent
        warmup_iters = int(0.1 * iters)

        # Warmup rounds — identical traffic pattern, not timed.
        for _ in range(warmup_iters):
            window = [
                comm_ep.send(ucp.ucp_msg(send_buffer_region), msg_len)
                for _ in range(window_size)
            ]
            await asyncio.wait(window)

        start = time.time()
        for _ in range(iters):
            window = [
                comm_ep.send(ucp.ucp_msg(send_buffer_region), msg_len)
                for _ in range(window_size)
            ]
            await asyncio.wait(window)
        elapsed = time.time() - start

        bw = (iters * window_size * msg_len) / elapsed
        bw = bw / 1e9  # GB/s
        print("{}\t\t{}".format(msg_len, bw))

    if args.mem_type == 'cuda':
        send_buffer_region.free_cuda()
    else:
        send_buffer_region.free_host()

    ucp.destroy_ep(client_ep)
    ucp.stop_server()
async def talk_to_client(client_ep):
    """Run the server side of a send/recv latency (ping-pong) benchmark.

    For each message size 2**i (i in [0, max_msg_log)) this performs one
    send followed by one receive per iteration — a 10% untimed warmup,
    then ``max_iters`` timed iterations — and prints the one-way latency
    in microseconds (half the averaged round-trip time).  Buffers, the
    endpoint, and the ucp server are released on completion.

    Parameters
    ----------
    client_ep : ucp endpoint for the connected client (project type).
    """
    global args

    msg_log = max_msg_log
    iters = max_iters

    # Reusable send/recv buffers sized for the largest message.
    send_buffer_region = ucp.buffer_region()
    recv_buffer_region = ucp.buffer_region()
    if args.mem_type == 'cuda':
        send_buffer_region.alloc_cuda(1 << msg_log)
        recv_buffer_region.alloc_cuda(1 << msg_log)
    else:
        send_buffer_region.alloc_host(1 << msg_log)
        recv_buffer_region.alloc_host(1 << msg_log)

    send_msg = ucp.ucp_msg(send_buffer_region)
    recv_msg = ucp.ucp_msg(recv_buffer_region)

    print("{}\t\t{}".format("Size (bytes)", "Latency (us)"))

    for i in range(msg_log):
        msg_len = 2 ** i
        warmup_iters = int(0.1 * iters)

        # Untimed warmup ping-pongs.  The original bound each awaited
        # result to send_req/recv_req but never read them — dropped.
        for _ in range(warmup_iters):
            await client_ep.send_fast(send_msg, msg_len)
            await client_ep.recv_fast(recv_msg, msg_len)

        start = time.time()
        for _ in range(iters):
            await client_ep.send_fast(send_msg, msg_len)
            await client_ep.recv_fast(recv_msg, msg_len)
        end = time.time()

        # Half the round trip, averaged over iters, converted to microseconds.
        lat = ((end - start) / 2 / iters) * 1000000
        print("{}\t\t{}".format(msg_len, lat))

    if args.mem_type == 'cuda':
        send_buffer_region.free_cuda()
        recv_buffer_region.free_cuda()
    else:
        send_buffer_region.free_host()
        recv_buffer_region.free_host()

    ucp.destroy_ep(client_ep)
    ucp.stop_server()