def talk_to_client(client_ep):
    """Measure unidirectional send bandwidth over ``client_ep``.

    For each message size ``2 ** i`` with ``i`` in ``range(max_msg_log)``,
    ``int(0.1 * max_iters)`` warm-up windows are sent first, followed by
    ``max_iters`` timed windows.  A window posts ``window_size`` sends and
    then spins until every returned future reports ``done()``.  One result
    line per message size is printed with the bandwidth in GB/s.

    The send buffer (``1 << max_msg_log`` in size) is allocated in CUDA or
    host memory depending on ``args.mem_type`` and freed before returning.
    Finally the endpoint is destroyed, the module-level ``cb_not_done`` flag
    is cleared and the UCP server is stopped.

    Args:
        client_ep: Endpoint used for every send; destroyed on completion.
    """
    global cb_not_done

    msg_log = max_msg_log
    iters = max_iters
    comm_ep = client_ep
    use_cuda = args.mem_type == 'cuda'

    send_buffer_region = ucp.buffer_region()
    if use_cuda:
        send_buffer_region.alloc_cuda(1 << msg_log)
    else:
        send_buffer_region.alloc_host(1 << msg_log)

    def send_window(msg_len):
        """Post one window of sends and spin until all of them are done."""
        pending = []
        for _ in range(window_size):
            # Keep the message bound to a name while its send is in flight.
            send_msg = ucp.ucp_msg(send_buffer_region)
            pending.append(comm_ep.send(send_msg, msg_len))
        # Rebuild the list on every pass rather than removing entries while
        # iterating over it, which skips the element after each removal.
        while pending:
            pending = [ft for ft in pending if not ft.done()]

    print("{}\t\t{}".format("Size (bytes)", "Uni-Bandwidth (GB/s)"))

    warmup_iters = int(0.1 * iters)
    for i in range(msg_log):
        msg_len = 2 ** i

        for _ in range(warmup_iters):
            send_window(msg_len)

        # perf_counter is monotonic and intended for measuring intervals;
        # wall-clock time can jump if the system clock is adjusted.
        start = time.perf_counter()
        for _ in range(iters):
            send_window(msg_len)
        elapsed = time.perf_counter() - start

        bw = (iters * window_size * msg_len) / elapsed
        bw = bw / 1e9  # GB/s
        print("{}\t\t{}".format(msg_len, bw))

    if use_cuda:
        send_buffer_region.free_cuda()
    else:
        send_buffer_region.free_host()

    ucp.destroy_ep(client_ep)
    cb_not_done = False
    ucp.stop_server()
async def talk_to_server(ip, port):
    """Drive windowed send/receive traffic against the server at ``ip:port``.

    For each message size ``2 ** i`` with ``i`` in ``range(max_msg_log)``,
    ``int(0.1 * max_iters) + max_iters`` windows are exchanged.  A window
    posts ``window_size`` send/receive pairs and then awaits all of them
    with ``asyncio.wait``.  This side reports no measurements.

    Send and receive buffers (``1 << max_msg_log`` in size each) are
    allocated in CUDA or host memory depending on ``args.mem_type`` and
    freed before the endpoint is destroyed.

    Args:
        ip: Address passed to ``ucp.get_endpoint``.
        port: Port passed to ``ucp.get_endpoint``.
    """
    msg_log = max_msg_log
    use_cuda = args.mem_type == 'cuda'

    server_ep = ucp.get_endpoint(ip, port)

    send_buffer_region = ucp.buffer_region()
    recv_buffer_region = ucp.buffer_region()
    if use_cuda:
        send_buffer_region.alloc_cuda(1 << msg_log)
        recv_buffer_region.alloc_cuda(1 << msg_log)
    else:
        send_buffer_region.alloc_host(1 << msg_log)
        recv_buffer_region.alloc_host(1 << msg_log)

    # Warm-up and measured rounds are identical on this side, so they are
    # run as one sequence; the total number of windows is unchanged.
    rounds = int(0.1 * max_iters) + max_iters

    for i in range(msg_log):
        msg_len = 2 ** i
        for _ in range(rounds):
            pending_list = []
            for _ in range(window_size):
                send_msg = ucp.ucp_msg(send_buffer_region)
                recv_msg = ucp.ucp_msg(recv_buffer_region)
                send_ft = server_ep.send(send_msg, msg_len)
                recv_ft = server_ep.recv(recv_msg, msg_len)
                pending_list.append(send_ft)
                pending_list.append(recv_ft)
            await asyncio.wait(pending_list)

    if use_cuda:
        send_buffer_region.free_cuda()
        recv_buffer_region.free_cuda()
    else:
        send_buffer_region.free_host()
        recv_buffer_region.free_host()

    ucp.destroy_ep(server_ep)
def talk_to_server(ip, port):
    """Run the blocking receive-then-send loop against the server at ``ip:port``.

    For each message size ``2 ** i`` with ``i`` in ``range(max_msg_log)``,
    ``int(0.1 * max_iters)`` warm-up exchanges are followed by ``max_iters``
    exchanges that use messages created up front.  Each exchange posts a
    receive, blocks on its ``result()``, then posts a send and blocks on
    that.  This side reports no measurements.

    Send and receive buffers (``1 << max_msg_log`` in size each) are
    allocated in CUDA or host memory depending on ``args.mem_type`` and
    freed before the endpoint is destroyed.

    Args:
        ip: Address passed to ``ucp.get_endpoint``.
        port: Port passed to ``ucp.get_endpoint``.
    """
    msg_log = max_msg_log
    iters = max_iters
    use_cuda = args.mem_type == 'cuda'

    server_ep = ucp.get_endpoint(ip, port)

    send_buffer_region = ucp.buffer_region()
    recv_buffer_region = ucp.buffer_region()
    if use_cuda:
        send_buffer_region.alloc_cuda(1 << msg_log)
        recv_buffer_region.alloc_cuda(1 << msg_log)
    else:
        send_buffer_region.alloc_host(1 << msg_log)
        recv_buffer_region.alloc_host(1 << msg_log)

    warmup_iters = int(0.1 * iters)

    for i in range(msg_log):
        msg_len = 2 ** i

        for _ in range(warmup_iters):
            send_msg = ucp.ucp_msg(send_buffer_region)
            recv_msg = ucp.ucp_msg(recv_buffer_region)
            recv_req = server_ep.recv(recv_msg, msg_len)
            recv_req.result()
            send_req = server_ep.send(send_msg, msg_len)
            send_req.result()

        # Build every message before the main loop so that loop only issues
        # and completes transfers.
        msg_pairs = [
            (ucp.ucp_msg(send_buffer_region), ucp.ucp_msg(recv_buffer_region))
            for _ in range(iters)
        ]

        for send_msg, recv_msg in msg_pairs:
            recv_req = server_ep.recv(recv_msg, msg_len)
            recv_req.result()
            send_req = server_ep.send(send_msg, msg_len)
            send_req.result()

    if use_cuda:
        send_buffer_region.free_cuda()
        recv_buffer_region.free_cuda()
    else:
        send_buffer_region.free_host()
        recv_buffer_region.free_host()

    ucp.destroy_ep(server_ep)
async def talk_to_server(ip, port):
    """Exchange one host-memory message with the server at ``ip:port``.

    A single receive is awaited first, then a single send, both of length
    ``1 << max_msg_log``.  With ``args.blind_recv`` set, no receive buffer
    is allocated and ``ep.recv_future()`` is awaited instead of
    ``ep.recv``.  With ``args.check_data`` set, the send message (and the
    receive message, when one exists) is filled via ``set_mem(1, ...)``
    beforehand, and afterwards the value of ``check_mem(0, ...)`` on the
    received object is printed as the error count.

    Args:
        ip: Address passed to ``ucp.get_endpoint``.
        port: Port passed to ``ucp.get_endpoint``.
    """
    blind = args.blind_recv
    verify = args.check_data

    banner_parts = ["in talk_to_server"]
    if blind:
        banner_parts.append(" + blind recv")
    if verify:
        banner_parts.append(" + data validity check")
    print("".join(banner_parts))

    length = 1 << max_msg_log

    ep = ucp.get_endpoint(ip, port)

    send_buffer_region = ucp.buffer_region()
    send_buffer_region.alloc_host(length)
    send_msg = ucp.ucp_msg(send_buffer_region)

    # The receive side only gets its own buffer for a non-blind receive.
    recv_buffer_region = None
    recv_msg = None
    if not blind:
        recv_buffer_region = ucp.buffer_region()
        recv_buffer_region.alloc_host(length)
        recv_msg = ucp.ucp_msg(recv_buffer_region)

    if verify:
        send_msg.set_mem(1, length)
        if not blind:
            recv_msg.set_mem(1, length)

    if blind:
        recv_req = await ep.recv_future()
    else:
        recv_req = await ep.recv(recv_msg, length)

    # The send result is not inspected; it is only kept bound until return.
    send_req = await ep.send(send_msg, length)

    if verify:
        errs = recv_req.check_mem(0, length)
        print("num errs: " + str(errs))

    send_buffer_region.free_host()
    if not blind:
        recv_buffer_region.free_host()

    ucp.destroy_ep(ep)
    print("done with talk_to_server")
async def talk_to_client(client_ep):
    """Measure windowed send bandwidth over ``client_ep`` using asyncio.

    For each message size ``2 ** i`` with ``i`` in ``range(max_msg_log)``,
    ``int(0.1 * max_iters)`` warm-up windows are sent first, followed by
    ``max_iters`` timed windows.  A window posts ``window_size`` sends and
    awaits all of them with ``asyncio.wait``.  One result line per message
    size is printed with the bandwidth in GB/s, computed from the bytes
    this side sent.

    The send buffer (``1 << max_msg_log`` in size) is allocated in CUDA or
    host memory depending on ``args.mem_type`` and freed before returning.
    Finally the endpoint is destroyed and the UCP server is stopped.

    Args:
        client_ep: Endpoint used for every send; destroyed on completion.
    """
    msg_log = max_msg_log
    iters = max_iters
    comm_ep = client_ep
    use_cuda = args.mem_type == 'cuda'

    send_buffer_region = ucp.buffer_region()
    if use_cuda:
        send_buffer_region.alloc_cuda(1 << msg_log)
    else:
        send_buffer_region.alloc_host(1 << msg_log)

    async def send_window(msg_len):
        """Post one window of sends and wait for all of them."""
        pending_list = []
        for _ in range(window_size):
            # Keep the message bound to a name while its send is in flight.
            send_msg = ucp.ucp_msg(send_buffer_region)
            pending_list.append(comm_ep.send(send_msg, msg_len))
        await asyncio.wait(pending_list)

    print("{}\t\t{}".format("Size (bytes)", "Bi-Bandwidth (GB/s)"))

    warmup_iters = int(0.1 * iters)
    for i in range(msg_log):
        msg_len = 2 ** i

        for _ in range(warmup_iters):
            await send_window(msg_len)

        # perf_counter is monotonic and intended for measuring intervals;
        # wall-clock time can jump if the system clock is adjusted.
        start = time.perf_counter()
        for _ in range(iters):
            await send_window(msg_len)
        elapsed = time.perf_counter() - start

        bw = (iters * window_size * msg_len) / elapsed
        bw = bw / 1e9  # GB/s
        print("{}\t\t{}".format(msg_len, bw))

    if use_cuda:
        send_buffer_region.free_cuda()
    else:
        send_buffer_region.free_host()

    ucp.destroy_ep(client_ep)
    ucp.stop_server()
async def talk_to_server(ip, port):
    """Exchange one CUDA-memory message with the server at ``ip:port``.

    A single receive is awaited first, then a single send, both of length
    ``1 << max_msg_log``.  With ``args.blind_recv`` set, no receive buffer
    is allocated and ``ep.recv_future()`` is awaited.  Otherwise
    ``args.use_fast`` selects ``recv_fast`` over ``recv``; it likewise
    selects ``send_fast`` over ``send`` for the send.

    Args:
        ip: Address passed to ``ucp.get_endpoint``.
        port: Port passed to ``ucp.get_endpoint``.
    """
    length = 1 << max_msg_log
    blind = args.blind_recv
    fast = args.use_fast

    banner_parts = ["in talk_to_server"]
    if blind:
        banner_parts.append(" + blind recv")
    if fast:
        banner_parts.append(" + using fast ops")
    print("".join(banner_parts))

    ep = ucp.get_endpoint(ip, port)
    print("got endpoint")

    send_buffer_region = ucp.buffer_region()
    send_buffer_region.alloc_cuda(length)

    # The receive side only gets its own buffer for a non-blind receive.
    recv_buffer_region = None
    recv_msg = None
    if not blind:
        recv_buffer_region = ucp.buffer_region()
        recv_buffer_region.alloc_cuda(length)
        recv_msg = ucp.ucp_msg(recv_buffer_region)

    send_msg = ucp.ucp_msg(send_buffer_region)

    # Results are not inspected; they are only kept bound until return.
    if blind:
        recv_req = await ep.recv_future()
    else:
        recv_op = ep.recv_fast if fast else ep.recv
        recv_req = await recv_op(recv_msg, length)

    send_op = ep.send_fast if fast else ep.send
    send_req = await send_op(send_msg, length)

    send_buffer_region.free_cuda()
    if not blind:
        recv_buffer_region.free_cuda()

    ucp.destroy_ep(ep)
    print("passed talk_to_server")
def talk_to_server(ip, port):
    """Post windowed receives against the server at ``ip:port``.

    For each message size ``2 ** i`` with ``i`` in ``range(max_msg_log)``,
    ``int(0.1 * max_iters) + max_iters`` windows are processed.  A window
    posts ``window_size`` receives and then spins until every returned
    future reports ``done()``.  This side reports no measurements.

    The receive buffer (``1 << max_msg_log`` in size) is allocated in CUDA
    or host memory depending on ``args.mem_type`` and freed before the
    endpoint is destroyed.

    Args:
        ip: Address passed to ``ucp.get_endpoint``.
        port: Port passed to ``ucp.get_endpoint``.
    """
    msg_log = max_msg_log
    use_cuda = args.mem_type == 'cuda'

    server_ep = ucp.get_endpoint(ip, port)

    recv_buffer_region = ucp.buffer_region()
    if use_cuda:
        recv_buffer_region.alloc_cuda(1 << msg_log)
    else:
        recv_buffer_region.alloc_host(1 << msg_log)

    # Warm-up and measured rounds are identical on this side, so they are
    # run as one sequence; the total number of windows is unchanged.
    rounds = int(0.1 * max_iters) + max_iters

    for i in range(msg_log):
        msg_len = 2 ** i
        for _ in range(rounds):
            pending = []
            for _ in range(window_size):
                # Keep the message bound to a name while its receive is
                # outstanding.
                recv_msg = ucp.ucp_msg(recv_buffer_region)
                pending.append(server_ep.recv(recv_msg, msg_len))
            # Rebuild the list on every pass rather than removing entries
            # while iterating over it, which skips the element after each
            # removal.
            while pending:
                pending = [ft for ft in pending if not ft.done()]

    if use_cuda:
        recv_buffer_region.free_cuda()
    else:
        recv_buffer_region.free_host()

    ucp.destroy_ep(server_ep)
async def talk_to_client(client_ep):
    """Measure send/receive latency over ``client_ep`` with the fast ops.

    For each message size ``2 ** i`` with ``i`` in ``range(max_msg_log)``,
    ``int(0.1 * max_iters)`` warm-up exchanges are followed by
    ``max_iters`` timed ones.  An exchange awaits ``send_fast`` and then
    ``recv_fast`` on one pre-built pair of messages.  The printed latency
    is half of the average time per exchange, in microseconds.

    Send and receive buffers (``1 << max_msg_log`` in size each) are
    allocated in CUDA or host memory depending on ``args.mem_type`` and
    freed before returning.  Finally the endpoint is destroyed and the UCP
    server is stopped.

    Args:
        client_ep: Endpoint used for every transfer; destroyed on completion.
    """
    msg_log = max_msg_log
    iters = max_iters
    use_cuda = args.mem_type == 'cuda'

    send_buffer_region = ucp.buffer_region()
    recv_buffer_region = ucp.buffer_region()
    if use_cuda:
        send_buffer_region.alloc_cuda(1 << msg_log)
        recv_buffer_region.alloc_cuda(1 << msg_log)
    else:
        send_buffer_region.alloc_host(1 << msg_log)
        recv_buffer_region.alloc_host(1 << msg_log)

    # One message object per direction is reused for every exchange.
    send_msg = ucp.ucp_msg(send_buffer_region)
    recv_msg = ucp.ucp_msg(recv_buffer_region)

    print("{}\t\t{}".format("Size (bytes)", "Latency (us)"))

    warmup_iters = int(0.1 * iters)
    for i in range(msg_log):
        msg_len = 2 ** i

        for _ in range(warmup_iters):
            await client_ep.send_fast(send_msg, msg_len)
            await client_ep.recv_fast(recv_msg, msg_len)

        # perf_counter is monotonic and intended for measuring intervals;
        # wall-clock time can jump if the system clock is adjusted.
        start = time.perf_counter()
        for _ in range(iters):
            await client_ep.send_fast(send_msg, msg_len)
            await client_ep.recv_fast(recv_msg, msg_len)
        elapsed = time.perf_counter() - start

        # Halve the per-exchange time for a one-way figure, then scale
        # seconds to microseconds.
        lat = ((elapsed / 2) / iters) * 1000000
        print("{}\t\t{}".format(msg_len, lat))

    if use_cuda:
        send_buffer_region.free_cuda()
        recv_buffer_region.free_cuda()
    else:
        send_buffer_region.free_host()
        recv_buffer_region.free_host()

    ucp.destroy_ep(client_ep)
    ucp.stop_server()
async def talk_to_server(ip, port):
    """Run the receive-then-send loop against the server at ``ip:port``.

    For each message size ``2 ** i`` with ``i`` in ``range(max_msg_log)``,
    ``int(0.1 * max_iters) + max_iters`` exchanges are performed.  An
    exchange awaits ``server_ep.recv_future()`` (no receive buffer is
    supplied by this side) and then awaits a send of the current size.
    This side reports no measurements.

    The send buffer (``1 << max_msg_log`` in size) is allocated in CUDA or
    host memory depending on ``args.mem_type`` and freed before the
    endpoint is destroyed.

    Args:
        ip: Address passed to ``ucp.get_endpoint``.
        port: Port passed to ``ucp.get_endpoint``.
    """
    msg_log = max_msg_log
    use_cuda = args.mem_type == 'cuda'

    server_ep = ucp.get_endpoint(ip, port)

    send_buffer_region = ucp.buffer_region()
    if use_cuda:
        send_buffer_region.alloc_cuda(1 << msg_log)
    else:
        send_buffer_region.alloc_host(1 << msg_log)

    # One send message is reused for every exchange.
    send_msg = ucp.ucp_msg(send_buffer_region)

    # Warm-up and measured rounds are identical on this side, so they are
    # run as one sequence; the total number of exchanges is unchanged.
    rounds = int(0.1 * max_iters) + max_iters

    for i in range(msg_log):
        msg_len = 2 ** i
        for _ in range(rounds):
            recv_req = server_ep.recv_future()
            await recv_req
            await server_ep.send(send_msg, msg_len)

    if use_cuda:
        send_buffer_region.free_cuda()
    else:
        send_buffer_region.free_host()

    ucp.destroy_ep(server_ep)
def talk_to_client(client_ep):
    """Measure blocking send/receive latency with an issue/progress split.

    For each message size ``2 ** i`` with ``i`` in ``range(max_msg_log)``:

    * ``int(0.1 * max_iters)`` warm-up exchanges post a send and a receive
      together and then block on both results.
    * ``max_iters`` timed exchanges, using messages created up front, post
      a send and block on it, then post a receive and block on it.

    Three values are printed per size, each halved, averaged per exchange
    and expressed in microseconds: the total loop time ("Latency"), the
    time spent inside the ``send``/``recv`` calls ("Issue") and the time
    spent inside the ``result()`` calls ("Progress").

    Send and receive buffers (``1 << max_msg_log`` in size each) are
    allocated in CUDA or host memory depending on ``args.mem_type`` and
    freed before returning.  Finally the endpoint is destroyed and the
    module-level ``cb_not_done`` flag is cleared.

    Args:
        client_ep: Endpoint used for every transfer; destroyed on completion.
    """
    global cb_not_done

    msg_log = max_msg_log
    iters = max_iters
    comm_ep = client_ep
    use_cuda = args.mem_type == 'cuda'

    # perf_counter is monotonic and intended for measuring short intervals;
    # binding it locally keeps the lookup out of the timed loop.
    clock = time.perf_counter

    send_buffer_region = ucp.buffer_region()
    recv_buffer_region = ucp.buffer_region()
    if use_cuda:
        send_buffer_region.alloc_cuda(1 << msg_log)
        recv_buffer_region.alloc_cuda(1 << msg_log)
    else:
        send_buffer_region.alloc_host(1 << msg_log)
        recv_buffer_region.alloc_host(1 << msg_log)

    print("{}\t\t{}\t\t{}\t\t{}".format("Size (bytes)", "Latency (us)",
                                        "Issue (us)", "Progress (us)"))

    warmup_iters = int(0.1 * iters)
    for i in range(msg_log):
        msg_len = 2 ** i

        for _ in range(warmup_iters):
            send_msg = ucp.ucp_msg(send_buffer_region)
            recv_msg = ucp.ucp_msg(recv_buffer_region)
            send_req = comm_ep.send(send_msg, msg_len)
            recv_req = comm_ep.recv(recv_msg, msg_len)
            send_req.result()
            recv_req.result()

        # Build every message before timing starts so the timed loop only
        # issues and completes transfers.
        msg_pairs = [
            (ucp.ucp_msg(send_buffer_region), ucp.ucp_msg(recv_buffer_region))
            for _ in range(iters)
        ]

        issue_lat = 0
        progress_lat = 0
        start = clock()
        for send_msg, recv_msg in msg_pairs:
            # Consecutive timestamps bracket each phase, so every clock read
            # both ends one phase and starts the next.
            t0 = clock()
            send_req = comm_ep.send(send_msg, msg_len)
            t1 = clock()
            send_req.result()
            t2 = clock()
            recv_req = comm_ep.recv(recv_msg, msg_len)
            t3 = clock()
            recv_req.result()
            t4 = clock()
            issue_lat += (t1 - t0) + (t3 - t2)
            progress_lat += (t2 - t1) + (t4 - t3)
        end = clock()

        # Halve for a one-way figure, average per exchange, and scale
        # seconds to microseconds.
        lat = (((end - start) / 2) / iters) * 1000000
        issue_lat = ((issue_lat / 2) / iters) * 1000000
        progress_lat = ((progress_lat / 2) / iters) * 1000000
        print("{}\t\t{}\t\t{}\t\t{}".format(msg_len, lat, issue_lat,
                                            progress_lat))

    if use_cuda:
        send_buffer_region.free_cuda()
        recv_buffer_region.free_cuda()
    else:
        send_buffer_region.free_host()
        recv_buffer_region.free_host()

    ucp.destroy_ep(client_ep)
    cb_not_done = False