# Command-line options declared in this part of the script (the parser itself
# and the --server option are created earlier in the file).
parser.add_argument('-p', '--port', help='enter server port number', required=False)
parser.add_argument('-i', '--intra_node', action='store_true')
parser.add_argument('-m', '--mem_type', help='host/cuda (default = host)', required=False)
args = parser.parse_args()

# Role selection: without a --server address this process acts as the server;
# with one, it acts as a client and connects to that address.
server = args.server is None
init_str = "" if server else args.server

# Polled by the server progress loop below. Nothing in this part of the script
# clears it -- presumably the connection callback does; verify against
# talk_to_client.
cb_not_done = True

# --port is optional on the command line, but the client path cannot work
# without it. Validate it up front so the user gets a normal argparse usage
# error instead of a TypeError/ValueError traceback from int() after UCP has
# already been initialised.
client_port = None
if not server:
    if args.port is None:
        parser.error("-p/--port is required when -s/--server is given")
    try:
        client_port = int(args.port)
    except ValueError:
        parser.error("invalid port number: %r" % (args.port,))

## initiate ucp
ucp.init()

if server:
    if args.intra_node:
        ucp.set_cuda_dev(1)
    ucp.start_server(talk_to_client, is_coroutine=False)
    # Keep driving UCP until the flag is cleared.
    while cb_not_done:
        ucp.progress()
else:
    talk_to_server(init_str.encode(), client_port)
async def run():
    """Drive the client side of one endpoint-error scenario.

    ``q1``, ``q2`` and ``error_type`` are not parameters; they are read from
    the enclosing scope. ``q1`` delivers messages from the remote worker
    (first its worker address, later the ``"disconnected"`` confirmation) and
    ``q2`` carries the local ``"ready"`` notification back to it.

    Scenarios, selected by ``error_type``:

    * ``"unreachable"`` -- creating the endpoint (or progressing it
      afterwards) must raise ``UCXError``.
    * ``"timeout_send"`` -- after the remote worker has shut down, a send
      must raise ``UCXError`` matching "Endpoint timeout".
    * anything else (the recv-timeout case) -- a recv prepared before the
      remote shutdown must raise ``UCXCanceled`` when awaited.
    """
    # The server hands over its worker address via multiprocessing.Queue.
    peer_address = ucp.get_ucx_address_from_buffer(q1.get())

    if error_type == "unreachable":
        with pytest.raises(
            ucp.exceptions.UCXError,
            match="Destination is unreachable|Endpoint timeout",
        ):
            # Two outcomes are possible here:
            #   1. With TCP, endpoint creation raises
            #      "Destination is unreachable" right away.
            #   2. With rc/ud, endpoint creation succeeds and
            #      "Endpoint timeout" is raised only once UCX_UD_TIMEOUT
            #      seconds have elapsed, so UCP must be progressed until
            #      the error surfaces.
            ep = await ucp.create_endpoint_from_worker_address(peer_address)

            began = time.monotonic()
            while not ep._ep.raise_on_error():
                ucp.progress()
                # Give up after one second so the test cannot hang; leaving
                # the block without an exception makes pytest.raises fail.
                elapsed = time.monotonic() - began
                if elapsed >= 1.0:
                    return
        return

    # Remaining scenarios need a live endpoint to the remote worker first.
    #
    # timeout_send:
    #   - tell the remote worker the local endpoint is ready for it to
    #     shut down;
    #   - wait for the remote worker to shut down and confirm;
    #   - try to send a message.
    #
    # timeout_recv:
    #   - prepare ep.recv;
    #   - tell the remote worker the local endpoint is ready for it to
    #     shut down;
    #   - wait for the remote worker to shut down and confirm;
    #   - wait for the recv to complete.
    ep = await ucp.create_endpoint_from_worker_address(peer_address)

    if error_type == "timeout_send":
        q2.put("ready")
        peer_state = q1.get()
        assert peer_state == "disconnected"

        with pytest.raises(ucp.exceptions.UCXError, match="Endpoint timeout"):
            payload = np.zeros(10)
            send_op = ep.send(payload, tag=0, force_tag=True)
            await asyncio.wait_for(send_op, timeout=1.0)
        return

    with pytest.raises(ucp.exceptions.UCXCanceled):
        recv_buffer = np.empty(10)
        # Built before the handshake, awaited only after the remote side has
        # confirmed its shutdown.
        pending_recv = asyncio.wait_for(
            ep.recv(recv_buffer, tag=0, force_tag=True), timeout=3.0
        )

        q2.put("ready")
        peer_state = q1.get()
        assert peer_state == "disconnected"

        await pending_recv