Example #1
import os

import torch.distributed.rpc as rpc
from torch.distributed.rpc import TensorPipeRpcBackendOptions

# run_master, get_name, get_parameter_server_rank and the USE_CUDA_RPC key are
# defined elsewhere in the benchmark module this example is taken from.


def run_benchmark(rank, model, data, config):

    world_size = config.trainer_count + config.ps_count + 1
    os.environ['MASTER_ADDR'] = config.master_addr
    os.environ['MASTER_PORT'] = config.master_port
    rpc_backend_options = TensorPipeRpcBackendOptions()
    rpc_backend_options.init_method = config.rpc_init_method
    if rank == world_size - 1:
        # master = the single last rank, trainer_count + ps_count
        run_master(rank, model, data, config, rpc_backend_options)
    elif rank >= config.trainer_count:
        # parameter_servers = [trainer_count, trainer_count + parameter_server_count)
        rpc.init_rpc(get_name(rank, config),
                     rank=rank,
                     world_size=world_size,
                     rpc_backend_options=rpc_backend_options)
    else:
        # trainers = [0, trainer_count)
        trainer_config = config.trainer_config
        ps_config = config.ps_config
        if (USE_CUDA_RPC in trainer_config and trainer_config[USE_CUDA_RPC]
                and USE_CUDA_RPC in ps_config and ps_config[USE_CUDA_RPC]
                and config.ps_count > 0):
            ps_rank = get_parameter_server_rank(rank, config)
            ps_name = get_name(ps_rank, config)
            rpc_backend_options.set_device_map(ps_name, {rank: ps_rank})
        trainer_name = get_name(rank, config)
        rpc.init_rpc(trainer_name,
                     rank=rank,
                     world_size=world_size,
                     rpc_backend_options=rpc_backend_options)
    rpc.shutdown()
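The function above handles one rank only; a typical entry point spawns one process per rank. The sketch below is a minimal, hypothetical launcher (launch_benchmark is not part of the benchmark): it assumes a config object exposing the fields read above (trainer_count, ps_count, master_addr, master_port, rpc_init_method, ...), which the real benchmark builds from its own configuration.

import torch.multiprocessing as mp


def launch_benchmark(model, data, config):
    # One process per trainer, one per parameter server, plus the master.
    world_size = config.trainer_count + config.ps_count + 1
    # mp.spawn passes the process index (the rank) as the first argument.
    mp.spawn(run_benchmark,
             args=(model, data, config),
             nprocs=world_size,
             join=True)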
Example #2
import os

import torch
import torch.distributed.rpc as rpc
from torch.distributed.rpc import TensorPipeRpcBackendOptions

# run_master, get_name and get_cuda_server_rank are defined elsewhere in the
# benchmark module this example is taken from.


def run_benchmark(rank, model, data, args, config):

    torch.manual_seed(args.torch_seed)
    torch.cuda.manual_seed_all(args.cuda_seed)
    torch.backends.cudnn.benchmark = True
    torch.backends.cudnn.deterministic = True

    world_size = args.ntrainer + args.ncudatrainer + args.nserver + args.ncudaserver + 1
    os.environ['MASTER_ADDR'] = args.master_addr
    os.environ['MASTER_PORT'] = args.master_port
    rpc_backend_options = TensorPipeRpcBackendOptions(
        rpc_timeout=args.rpc_timeout)
    if rank == world_size - 1:
        # master = the single last rank, ntrainer + ncudatrainer + nserver + ncudaserver
        run_master(rank, model, data, args, config, rpc_backend_options)
    elif rank >= args.ntrainer + args.ncudatrainer:
        # parameter_servers = [ntrainer + ncudatrainer, ntrainer + ncudatrainer + nserver + ncudaserver)
        rpc.init_rpc(get_name(rank, args),
                     rank=rank,
                     world_size=world_size,
                     rpc_backend_options=rpc_backend_options)
    else:
        # trainers = [0, ntrainer + ncudatrainer)
        if rank >= args.ntrainer:
            server_rank = get_cuda_server_rank(args, rank)
            server_name = get_name(server_rank, args)
            rpc_backend_options.set_device_map(server_name,
                                               {rank: server_rank})
        trainer_name = get_name(rank, args)
        rpc.init_rpc(trainer_name,
                     rank=rank,
                     world_size=world_size,
                     rpc_backend_options=rpc_backend_options)
    rpc.shutdown()
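The set_device_map call is what makes the trainer-to-server RPCs CUDA-aware: the dict {rank: server_rank} maps a CUDA device index on the caller to a device index on the named peer, so tensor arguments are moved GPU-to-GPU instead of bouncing through CPU memory. The following self-contained sketch of that mechanism is not part of the benchmark; it assumes a machine with at least two CUDA devices and uses hypothetical worker names ("trainer0", "server0") and a hypothetical port.

import os

import torch
import torch.distributed.rpc as rpc
import torch.multiprocessing as mp
from torch.distributed.rpc import TensorPipeRpcBackendOptions


def _report_device(t):
    # Runs on the callee; returns the device the RPC argument arrived on.
    return str(t.device)


def _worker(rank):
    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = "29502"
    options = TensorPipeRpcBackendOptions()
    if rank == 0:
        # Caller: tensors on cuda:0 here should arrive on cuda:1 at "server0".
        options.set_device_map("server0", {0: 1})
        rpc.init_rpc("trainer0", rank=0, world_size=2,
                     rpc_backend_options=options)
        device = rpc.rpc_sync("server0", _report_device,
                              args=(torch.ones(2, device="cuda:0"),))
        print("tensor arrived on", device)  # expected: cuda:1
    else:
        rpc.init_rpc("server0", rank=1, world_size=2,
                     rpc_backend_options=options)
    rpc.shutdown()


if __name__ == "__main__":
    mp.spawn(_worker, nprocs=2)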
Example #3
import os

import torch
import torch.distributed.rpc as rpc
from torch.distributed.rpc import TensorPipeRpcBackendOptions

# load_extra_configs, run_master, get_name and get_cuda_server_rank are
# defined elsewhere in the benchmark module this example is taken from.


def run_benchmark(rank, args, data):
    r"""
    A function that runs the benchmark.
    Args:
        rank (int): process number in the world
        args (parser): configuration args
        data (list): training samples
    """

    config = load_extra_configs(args)

    torch.manual_seed(args.torch_seed)
    torch.cuda.manual_seed_all(args.cuda_seed)
    torch.backends.cudnn.benchmark = True
    torch.backends.cudnn.deterministic = True

    world_size = args.ntrainer + args.ncudatrainer + args.nserver + args.ncudaserver + 1
    os.environ['MASTER_ADDR'] = args.master_addr
    os.environ['MASTER_PORT'] = args.master_port
    rpc_backend_options = TensorPipeRpcBackendOptions(rpc_timeout=args.rpc_timeout)
    if rank == world_size - 1:
        # master = the single last rank, ntrainer + ncudatrainer + nserver + ncudaserver
        run_master(rank, data, args, config, rpc_backend_options)
    elif rank >= args.ntrainer + args.ncudatrainer:
        # parameter_servers = [ntrainer + ncudatrainer, ntrainer + ncudatrainer + nserver + ncudaserver)
        rpc.init_rpc(get_name(rank, args),
                     rank=rank,
                     world_size=world_size,
                     rpc_backend_options=rpc_backend_options)
    else:
        # trainers = [0, ntrainer + ncudatrainer)
        if rank >= args.ntrainer:
            server_rank = get_cuda_server_rank(args, rank)
            server_name = get_name(server_rank, args)
            rpc_backend_options.set_device_map(server_name,
                                               {rank: server_rank})
        trainer_name = get_name(rank, args)
        rpc.init_rpc(trainer_name,
                     rank=rank,
                     world_size=world_size,
                     rpc_backend_options=rpc_backend_options)
    rpc.shutdown()
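Examples #2 and #3 lean on the helpers get_name and get_cuda_server_rank, which are not shown. The sketch below is one plausible implementation, written only to make the rank layout in the comments concrete (CPU trainers, then CUDA trainers, then CPU servers, then CUDA servers, then the master); the real benchmark's helpers may differ in detail.

# Hypothetical versions of the helpers used above, for illustration only.

def get_name(rank, args):
    # Trainers occupy [0, ntrainer + ncudatrainer), parameter servers the next
    # nserver + ncudaserver ranks, and the master takes the last rank.
    num_trainers = args.ntrainer + args.ncudatrainer
    num_servers = args.nserver + args.ncudaserver
    if rank < num_trainers:
        return "trainer{}".format(rank)
    elif rank < num_trainers + num_servers:
        return "ps{}".format(rank - num_trainers)
    return "master"


def get_cuda_server_rank(args, rank):
    # Spread the CUDA trainers evenly over the CUDA parameter servers, which
    # are assumed to sit after the CPU servers in the rank order.
    cuda_trainer_index = rank - args.ntrainer
    trainers_per_server = args.ncudatrainer // args.ncudaserver
    cuda_server_index = cuda_trainer_index // trainers_per_server
    return args.ntrainer + args.ncudatrainer + args.nserver + cuda_server_index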
Example #4
import torch
import torch.distributed as dist
import torch.distributed.rpc as rpc
from torch.distributed.rpc import TensorPipeRpcBackendOptions

# NUM_EMBEDDINGS, EMBEDDING_DIM and _run_trainer are defined elsewhere in the
# tutorial script this example is taken from.


def run_worker(rank, world_size):
    r"""
    A wrapper function that initializes RPC, calls the function, and shuts down
    RPC.
    """

    # We need to use different port numbers in TCP init_method for init_rpc and
    # init_process_group to avoid port conflicts.
    rpc_backend_options = TensorPipeRpcBackendOptions()
    rpc_backend_options.init_method = "tcp://localhost:29501"

    # Rank 2 is the master, rank 3 is the parameter server, and ranks 0 and 1
    # are the trainers.
    if rank == 2:
        rpc.init_rpc(
            "master",
            rank=rank,
            world_size=world_size,
            rpc_backend_options=rpc_backend_options,
        )

        # Build the embedding table on the ps.
        emb_rref = rpc.remote(
            "ps",
            torch.nn.EmbeddingBag,
            args=(NUM_EMBEDDINGS, EMBEDDING_DIM),
            kwargs={"mode": "sum"},
        )

        # Run the training loop on trainers.
        futs = []
        for trainer_rank in [0, 1]:
            trainer_name = "trainer{}".format(trainer_rank)
            fut = rpc.rpc_async(trainer_name,
                                _run_trainer,
                                args=(emb_rref, trainer_rank))
            futs.append(fut)

        # Wait for all training to finish.
        for fut in futs:
            fut.wait()
    elif rank <= 1:
        # Initialize process group for Distributed DataParallel on trainers.
        dist.init_process_group(backend="gloo",
                                rank=rank,
                                world_size=2,
                                init_method="tcp://localhost:29500")

        # Initialize RPC.
        trainer_name = "trainer{}".format(rank)
        rpc.init_rpc(
            trainer_name,
            rank=rank,
            world_size=world_size,
            rpc_backend_options=rpc_backend_options,
        )

        # The trainer just waits for RPCs from the master.
    else:
        rpc.init_rpc(
            "ps",
            rank=rank,
            world_size=world_size,
            rpc_backend_options=rpc_backend_options,
        )
        # The parameter server does nothing; it just waits for RPCs from the
        # trainers and the master.

    # block until all rpcs finish
    rpc.shutdown()
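A minimal way to drive this example is sketched below under the assumption of small embedding-table constants; the tutorial this snippet comes from defines its own NUM_EMBEDDINGS, EMBEDDING_DIM and _run_trainer, which are not shown here.

import torch.multiprocessing as mp

# Assumed values for illustration; the original tutorial sets its own.
NUM_EMBEDDINGS = 100
EMBEDDING_DIM = 16

if __name__ == "__main__":
    # 2 trainers (ranks 0-1) + 1 master (rank 2) + 1 parameter server (rank 3).
    world_size = 4
    mp.spawn(run_worker, args=(world_size,), nprocs=world_size, join=True)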
Example #5
import torch
import torch.distributed as dist
import torch.distributed.rpc as rpc
from torch.distributed.rpc import BackendType, TensorPipeRpcBackendOptions

# NUM_TRAINERS, NUM_PS, NUM_EMBEDDINGS, EMBEDDING_DIM, _run_trainer,
# _print_header and _print_benchmark are defined elsewhere in the benchmark
# script this example is taken from.


def run_worker(rank, world_size):
    r"""
   A wrapper function that initializes RPC, calls the function, and shuts down
   RPC.
   """

    # Using different port numbers in TCP init_method for init_rpc and
    # init_process_group to avoid port conflicts.
    rpc_backend_options = TensorPipeRpcBackendOptions()
    rpc_backend_options.init_method = "tcp://localhost:29500"

    # Rank 16: the master (NUM_TRAINERS + NUM_PS).
    if rank == (NUM_TRAINERS + NUM_PS):

        rpc.init_rpc(
            "master",
            rank=rank,
            backend=BackendType.TENSORPIPE,  # type: ignore[attr-defined]
            world_size=world_size,
            rpc_backend_options=rpc_backend_options,
        )

        # Build the embedding tables on the parameter servers.
        emb_rref_list = []
        for index in range(NUM_PS):
            ps_name = "ps{}".format(index)
            emb_rref = rpc.remote(
                ps_name,
                torch.nn.EmbeddingBag,
                args=(NUM_EMBEDDINGS, EMBEDDING_DIM),
                kwargs={"mode": "sum"},
            )
            emb_rref_list.append(emb_rref)

        # Run training loop on the trainers.
        futs = []
        for trainer_rank in range(NUM_TRAINERS):
            trainer_name = "trainer{}".format(trainer_rank)
            fut = rpc.rpc_async(trainer_name,
                                _run_trainer,
                                args=(emb_rref_list, trainer_rank))
            futs.append(fut)

        _print_header()

        measurements_all_trainers = []
        batch_size_all_trainers = 0
        # Wait for all training to finish.
        for fut in futs:
            trainer_rank, measurements, batch_size = fut.wait()
            _print_benchmark("Trainer{}".format(trainer_rank), batch_size,
                             measurements)
            batch_size_all_trainers += batch_size
            measurements_all_trainers.append(measurements)

        _print_benchmark("All", batch_size_all_trainers,
                         measurements_all_trainers)

    # Ranks 0-7: trainers.
    elif rank >= 0 and rank < NUM_TRAINERS:

        # Initialize process group for Distributed DataParallel on trainers.
        dist.init_process_group(
            backend=dist.Backend.GLOO,
            rank=rank,
            world_size=NUM_TRAINERS,
            init_method="tcp://localhost:29501",
        )

        # Initialize RPC. Trainer just waits for RPCs from master.
        trainer_name = "trainer{}".format(rank)
        rpc.init_rpc(
            trainer_name,
            rank=rank,
            world_size=world_size,
            rpc_backend_options=rpc_backend_options,
        )

    # Ranks 8-15: parameter servers.
    elif rank >= NUM_TRAINERS and rank < NUM_TRAINERS + NUM_PS:
        ps_name = "ps{}".format(rank - NUM_TRAINERS)
        rpc.init_rpc(
            ps_name,
            rank=rank,
            world_size=world_size,
            backend=BackendType.TENSORPIPE,  # type: ignore[attr-defined]
            rpc_backend_options=rpc_backend_options,
        )
        # The parameter server does nothing; it just waits for RPCs from the
        # trainers and the master.

    # block until all rpcs finish
    rpc.shutdown()
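This variant scales the same pattern up to many trainers and parameter servers. The launch below is a sketch with assumed constants that match the rank comments above (8 trainers, 8 parameter servers, master at rank 16); the original benchmark script defines its own constants and argument parsing.

import torch.multiprocessing as mp

# Assumed values matching the rank comments above, for illustration only.
NUM_TRAINERS = 8
NUM_PS = 8
NUM_EMBEDDINGS = 100
EMBEDDING_DIM = 16

if __name__ == "__main__":
    world_size = NUM_TRAINERS + NUM_PS + 1  # rank 16 is the master
    mp.spawn(run_worker, args=(world_size,), nprocs=world_size, join=True)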