Code example #1
import os

import torch.distributed.rpc as rpc
from torch.distributed.rpc import TensorPipeRpcBackendOptions

# get_name, get_parameter_server_rank, run_master and the USE_CUDA_RPC key are
# helpers defined elsewhere in the benchmark.


def run_benchmark(rank, model, data, config):
    world_size = config.trainer_count + config.ps_count + 1
    os.environ['MASTER_ADDR'] = config.master_addr
    os.environ['MASTER_PORT'] = config.master_port
    rpc_backend_options = TensorPipeRpcBackendOptions()
    rpc_backend_options.init_method = config.rpc_init_method
    if rank == world_size - 1:
        # master rank = trainer_count + parameter_server_count (the last rank)
        run_master(rank, model, data, config, rpc_backend_options)
    elif rank >= config.trainer_count:
        # parameter_servers = [trainer_count, trainer_count + parameter_server_count)
        rpc.init_rpc(get_name(rank, config),
                     rank=rank,
                     world_size=world_size,
                     rpc_backend_options=rpc_backend_options)
    else:
        # trainers = [0, trainer_count)
        trainer_config = config.trainer_config
        ps_config = config.ps_config
        if (USE_CUDA_RPC in trainer_config and trainer_config[USE_CUDA_RPC]
                and USE_CUDA_RPC in ps_config and ps_config[USE_CUDA_RPC]
                and config.ps_count > 0):
            ps_rank = get_parameter_server_rank(rank, config)
            ps_name = get_name(ps_rank, config)
            rpc_backend_options.set_device_map(ps_name, {rank: ps_rank})
        trainer_name = get_name(rank, config)
        rpc.init_rpc(trainer_name,
                     rank=rank,
                     world_size=world_size,
                     rpc_backend_options=rpc_backend_options)
    rpc.shutdown()
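For context, a minimal launcher for run_benchmark might look like the sketch below. The SimpleNamespace-based config and all of its values are illustrative assumptions; the real benchmark builds its config object elsewhere, but the field names mirror the ones run_benchmark reads above.

import torch.multiprocessing as mp
from types import SimpleNamespace

def launch_benchmark(model, data):
    # Illustrative config; field names follow those used by run_benchmark above.
    config = SimpleNamespace(
        trainer_count=2,
        ps_count=1,
        master_addr="localhost",
        master_port="29500",
        rpc_init_method="tcp://localhost:29501",
        trainer_config={},
        ps_config={},
    )
    # One process per trainer and parameter server, plus one master (the last rank).
    world_size = config.trainer_count + config.ps_count + 1
    mp.spawn(run_benchmark, args=(model, data, config), nprocs=world_size, join=True)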
Code example #2
import torch
import torch.distributed as dist
import torch.distributed.rpc as rpc
from torch.distributed.rpc import TensorPipeRpcBackendOptions

# NUM_EMBEDDINGS, EMBEDDING_DIM and _run_trainer are defined elsewhere in the
# example script.


def run_worker(rank, world_size):
    r"""
    A wrapper function that initializes RPC, calls the function, and shuts down
    RPC.
    """

    # We need to use different port numbers in TCP init_method for init_rpc and
    # init_process_group to avoid port conflicts.
    rpc_backend_options = TensorPipeRpcBackendOptions()
    rpc_backend_options.init_method = "tcp://localhost:29501"

    # Rank 2 is the master, rank 3 is the parameter server, and ranks 0 and 1 are trainers.
    if rank == 2:
        rpc.init_rpc(
            "master",
            rank=rank,
            world_size=world_size,
            rpc_backend_options=rpc_backend_options,
        )

        # Build the embedding table on the ps.
        emb_rref = rpc.remote(
            "ps",
            torch.nn.EmbeddingBag,
            args=(NUM_EMBEDDINGS, EMBEDDING_DIM),
            kwargs={"mode": "sum"},
        )

        # Run the training loop on trainers.
        futs = []
        for trainer_rank in [0, 1]:
            trainer_name = "trainer{}".format(trainer_rank)
            fut = rpc.rpc_async(trainer_name,
                                _run_trainer,
                                args=(emb_rref, trainer_rank))
            futs.append(fut)

        # Wait for all training to finish.
        for fut in futs:
            fut.wait()
    elif rank <= 1:
        # Initialize process group for Distributed DataParallel on trainers.
        dist.init_process_group(backend="gloo",
                                rank=rank,
                                world_size=2,
                                init_method="tcp://localhost:29500")

        # Initialize RPC.
        trainer_name = "trainer{}".format(rank)
        rpc.init_rpc(
            trainer_name,
            rank=rank,
            world_size=world_size,
            rpc_backend_options=rpc_backend_options,
        )

        # Trainer just waits for RPCs from master.
    else:
        rpc.init_rpc(
            "ps",
            rank=rank,
            world_size=world_size,
            rpc_backend_options=rpc_backend_options,
        )
        # The parameter server does nothing here; it just waits for RPCs from
        # the master and trainers.

    # Block until all RPCs finish.
    rpc.shutdown()
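A minimal driver for this wrapper, as a sketch: the world size of 4 matches the rank layout in the comments above (two trainers, one parameter server, one master).

import torch.multiprocessing as mp

if __name__ == "__main__":
    # 2 trainers (ranks 0 and 1), 1 master (rank 2), 1 parameter server (rank 3).
    world_size = 4
    mp.spawn(run_worker, args=(world_size,), nprocs=world_size, join=True)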
Code example #3
import torch
import torch.distributed as dist
import torch.distributed.rpc as rpc
from torch.distributed.rpc import BackendType, TensorPipeRpcBackendOptions

# NUM_TRAINERS, NUM_PS, NUM_EMBEDDINGS, EMBEDDING_DIM and the helpers
# _run_trainer, _print_header and _print_benchmark are defined elsewhere in
# the benchmark script.


def run_worker(rank, world_size):
    r"""
    A wrapper function that initializes RPC, calls the function, and shuts down
    RPC.
    """

    # Using different port numbers in TCP init_method for init_rpc and
    # init_process_group to avoid port conflicts.
    rpc_backend_options = TensorPipeRpcBackendOptions()
    rpc_backend_options.init_method = "tcp://localhost:29500"

    # Rank NUM_TRAINERS + NUM_PS (16 here): the master.
    if rank == (NUM_TRAINERS + NUM_PS):

        rpc.init_rpc(
            "master",
            rank=rank,
            world_size=world_size,
            backend=BackendType.TENSORPIPE,  # type: ignore[attr-defined]
            # Use the same init_method as the other ranks so they all
            # rendezvous at the same address.
            rpc_backend_options=rpc_backend_options)

        # Build the embedding tables on the parameter servers.
        emb_rref_list = []
        for index in range(NUM_PS):
            ps_name = "ps{}".format(index)
            emb_rref = rpc.remote(
                ps_name,
                torch.nn.EmbeddingBag,
                args=(NUM_EMBEDDINGS, EMBEDDING_DIM),
                kwargs={"mode": "sum"},
            )
            emb_rref_list.append(emb_rref)

        # Run training loop on the trainers.
        futs = []
        for trainer_rank in range(NUM_TRAINERS):
            trainer_name = "trainer{}".format(trainer_rank)
            fut = rpc.rpc_async(trainer_name,
                                _run_trainer,
                                args=(emb_rref_list, trainer_rank))
            futs.append(fut)

        _print_header()

        measurements_all_trainers = []
        batch_size_all_trainers = 0
        # Wait for all training to finish.
        for fut in futs:
            rank, measurements, batch_size = fut.wait()
            _print_benchmark("Trainer{}".format(rank), batch_size,
                             measurements)
            batch_size_all_trainers += batch_size
            measurements_all_trainers.append(measurements)

        _print_benchmark("All", batch_size_all_trainers,
                         measurements_all_trainers)

    # Ranks 0-7: trainers.
    elif 0 <= rank < NUM_TRAINERS:

        # Initialize process group for Distributed DataParallel on trainers.
        dist.init_process_group(
            backend=dist.Backend.GLOO,
            rank=rank,
            world_size=NUM_TRAINERS,
            init_method="tcp://localhost:29501",
        )

        # Initialize RPC. Trainer just waits for RPCs from master.
        trainer_name = "trainer{}".format(rank)
        rpc.init_rpc(
            trainer_name,
            rank=rank,
            world_size=world_size,
            rpc_backend_options=rpc_backend_options,
        )

    # Ranks 8-15: parameter servers.
    elif NUM_TRAINERS <= rank < NUM_TRAINERS + NUM_PS:
        ps_name = "ps{}".format(rank - NUM_TRAINERS)
        rpc.init_rpc(
            ps_name,
            rank=rank,
            world_size=world_size,
            backend=BackendType.TENSORPIPE,  # type: ignore[attr-defined]
            rpc_backend_options=rpc_backend_options,
        )
        # The parameter server does nothing here; it just waits for RPCs from
        # the master and trainers.

    # Block until all RPCs finish.
    rpc.shutdown()
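The constants this example relies on (NUM_TRAINERS, NUM_PS, NUM_EMBEDDINGS, EMBEDDING_DIM) are module-level settings in the benchmark; a plausible launcher, sketched here with illustrative values, would be:

import torch.multiprocessing as mp

NUM_TRAINERS = 8
NUM_PS = 8
NUM_EMBEDDINGS = 100   # illustrative value
EMBEDDING_DIM = 16     # illustrative value

if __name__ == "__main__":
    # 8 trainers (ranks 0-7), 8 parameter servers (ranks 8-15), 1 master (rank 16).
    world_size = NUM_TRAINERS + NUM_PS + 1
    mp.spawn(run_worker, args=(world_size,), nprocs=world_size, join=True)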