Ejemplo n.º 1
0
# Port of the server to connect to. In the code visible here it is only
# read on the client path, which is why it is not marked as required.
parser.add_argument('-p',
                    '--port',
                    help='enter server port number',
                    required=False)
parser.add_argument('-i', '--intra_node', action='store_true')
# NOTE(review): the help text advertises "host" as the default, but no
# default= is configured, so args.mem_type is None when the flag is omitted.
parser.add_argument('-m',
                    '--mem_type',
                    help='host/cuda (default = host)',
                    required=False)
args = parser.parse_args()

## initiate ucp
# No server address on the command line means this process acts as the
# server; otherwise it is a client connecting to the given address.
server = args.server is None
init_str = "" if server else args.server
cb_not_done = True

if not server:
    # Validate the port before touching UCX: --port is optional for the
    # parser, so a client started without it (or with a non-numeric value)
    # would otherwise die with a bare TypeError/ValueError after ucp.init().
    try:
        server_port = int(args.port)
    except (TypeError, ValueError):
        parser.error('client mode needs a numeric -p/--port, got {!r}'.format(
            args.port))

ucp.init()
if server:
    if args.intra_node:
        # NOTE(review): device index 1 is hard-coded for the intra-node
        # server - confirm this matches the intended GPU layout.
        ucp.set_cuda_dev(1)
    ucp.start_server(talk_to_client, is_coroutine=False)
    # Keep driving UCX progress until cb_not_done becomes falsy; presumably
    # the talk_to_client callback clears it (not visible in this section).
    while cb_not_done:
        ucp.progress()
else:
    talk_to_server(init_str.encode(), server_port)
    async def run():
        """Drive the local side of one endpoint-error scenario.

        ``error_type`` (from the enclosing scope) selects the scenario:
        ``"unreachable"``, ``"timeout_send"``, or any other value, which is
        handled as the receive case. ``q1`` delivers messages from the remote
        worker and ``q2`` carries this side's ``"ready"`` signal back to it.
        """
        # The remote worker publishes its address through q1.
        worker_addr = ucp.get_ucx_address_from_buffer(q1.get())

        if error_type == "unreachable":
            with pytest.raises(
                    ucp.exceptions.UCXError,
                    match="Destination is unreachable|Endpoint timeout"):
                # Two outcomes are accepted here:
                #  - over TCP, endpoint creation itself fails right away with
                #    "Destination is unreachable";
                #  - over rc/ud, creation succeeds and "Endpoint timeout" only
                #    surfaces once UCX_UD_TIMEOUT seconds have elapsed, so UCP
                #    has to be progressed until the error shows up.
                endpoint = await ucp.create_endpoint_from_worker_address(
                    worker_addr)

                began = time.monotonic()
                while not endpoint._ep.raise_on_error():
                    ucp.progress()

                    # Give up after one second rather than spinning forever.
                    if time.monotonic() - began >= 1.0:
                        return
            return

        # Remaining scenarios need a live endpoint to the remote worker.
        #
        # timeout_send:
        #   1. tell the remote worker this endpoint is ready for its shutdown;
        #   2. wait for the remote worker to confirm it has shut down;
        #   3. try to send a message.
        #
        # receive case (anything else):
        #   1. prepare the ep.recv call;
        #   2. tell the remote worker this endpoint is ready for its shutdown;
        #   3. wait for the remote worker to confirm it has shut down;
        #   4. await the receive.
        endpoint = await ucp.create_endpoint_from_worker_address(worker_addr)

        if error_type == "timeout_send":
            q2.put("ready")

            shutdown_ack = q1.get()
            assert shutdown_ack == "disconnected"

            with pytest.raises(ucp.exceptions.UCXError,
                               match="Endpoint timeout"):
                outgoing = ep_send = endpoint.send(np.zeros(10),
                                                   tag=0,
                                                   force_tag=True)
                await asyncio.wait_for(ep_send, timeout=1.0)
            return

        with pytest.raises(ucp.exceptions.UCXCanceled):
            buf = np.empty(10)
            pending_recv = endpoint.recv(buf, tag=0, force_tag=True)
            guarded_recv = asyncio.wait_for(pending_recv, timeout=3.0)

            q2.put("ready")

            shutdown_ack = q1.get()
            assert shutdown_ack == "disconnected"

            await guarded_recv