def run(rank, world_size):
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '29500'
    options = rpc.ProcessGroupRpcBackendOptions(
        num_send_recv_threads=16,
        rpc_timeout=0  # infinite timeout
    )
    if rank != 0:
        rpc.init_rpc(
            f"trainer{rank}",
            rank=rank,
            world_size=world_size,
            rpc_backend_options=options
        )
        # trainer passively waiting for ps to kick off training iterations
    else:
        rpc.init_rpc(
            "ps",
            rank=rank,
            world_size=world_size,
            rpc_backend_options=options
        )
        run_ps([f"trainer{r}" for r in range(1, world_size)])

    # block until all rpcs finish
    rpc.shutdown()
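
The launcher for run() is not shown above. A minimal sketch, assuming the usual torch.multiprocessing.spawn pattern and a hypothetical world size of 4 (rank 0 as the parameter server, ranks 1-3 as trainers), could look like this:

import torch.multiprocessing as mp

if __name__ == "__main__":
    world_size = 4  # assumed: one parameter server plus three trainers
    mp.spawn(run, args=(world_size,), nprocs=world_size, join=True)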
Example #2
def bench_mpi(args):
    guess_rank = int(os.environ["OMPI_COMM_WORLD_RANK"])
    os.environ["UCX_NET_DEVICES"] = best_device_map[guess_rank]

    torch.distributed.init_process_group(backend="mpi")
    os.environ["MASTER_ADDR"] = args.host
    os.environ["MASTER_PORT"] = "10639"
    if args.socket_name:
        os.environ["GLOO_SOCKET_IFNAME"] = args.socket_name
        os.environ["TP_SOCKET_IFNAME"] = args.socket_name
    init_method = f"tcp://{os.environ['MASTER_ADDR']}:{os.environ['MASTER_PORT']}"
    rank = torch.distributed.get_rank()
    world_size = torch.distributed.get_world_size()
    torch.cuda.set_device(rank % torch.cuda.device_count())

    rpc.init_rpc(
        f"Test{rank}",
        rank=rank,
        world_size=world_size,
        backend=rpc.BackendType.PROCESS_GROUP,
        rpc_backend_options=rpc.ProcessGroupRpcBackendOptions(
            rpc_timeout=20, init_method=init_method),
    )

    initialize_model_parallel(1, world_size)
    init_random_seed(0)

    run_mp_worker(args, world_size)

    rpc.shutdown()
    torch.distributed.destroy_process_group()
Example #3
    def test_init_pg_and_rpc_with_same_socket(self):
        addr = DEFAULT_HOSTNAME
        port = common.find_free_port()

        os.environ["MASTER_ADDR"] = addr
        os.environ["MASTER_PORT"] = str(port)

        # We internally use a multi-tenant TCP store. Both PG and RPC should successfully
        # initialize even when using the same socket address.

        dist.init_process_group(
            backend="gloo",
            init_method="env://",
            rank=0,
            world_size=1,
        )

        backend_opts = rpc.ProcessGroupRpcBackendOptions(
            init_method=f"tcp://{addr}:{port}"
        )
        rpc.init_rpc(
            name="worker0",
            rank=0,
            world_size=1,
            rpc_backend_options=backend_opts,
        )

        rpc.shutdown()
Example #4
def benchmark_multiprocess(rank, world_size, args):

    init_method_pgroup = "tcp://localhost:{}".format(MPI_PORT)
    # TODO(anj-s): Add regression benchmarks for nccl as well.
    torch.distributed.init_process_group(
        backend="gloo", rank=rank, world_size=world_size, init_method=init_method_pgroup
    )

    torch.cuda.set_device(rank % torch.cuda.device_count())
    # TODO(anj-s): Move to TensorPipeRpcBackendOptions.
    rpc.init_rpc(
        f"Test{rank}",
        rank=rank,
        world_size=world_size,
        backend=rpc.BackendType.PROCESS_GROUP,
        rpc_backend_options=rpc.ProcessGroupRpcBackendOptions(
            rpc_timeout=20, init_method="tcp://localhost:{}".format(RPC_PORT)
        ),
    )
    initialize_model_parallel(1, world_size)
    init_random_seed(0)
    run_mp_worker(args, world_size)

    rpc.shutdown()
    torch.distributed.destroy_process_group()
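
The TODO above asks for a move to TensorPipeRpcBackendOptions. A minimal sketch of that call, assuming PyTorch >= 1.6 where TensorPipe is the default RPC backend (so the explicit backend argument can be dropped) and reusing the same RPC_PORT constant, might be:

rpc.init_rpc(
    f"Test{rank}",
    rank=rank,
    world_size=world_size,
    rpc_backend_options=rpc.TensorPipeRpcBackendOptions(
        rpc_timeout=20,
        init_method="tcp://localhost:{}".format(RPC_PORT),
    ),
)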
Example #5
def run(i):
    global rank
    rank = i
    if i == world_size - 1:
        time.sleep(4)
        print("Process {} delayed start.".format(i))
    rpc.init_rpc("Rank" + str(i),
                 rank=i,
                 world_size=world_size,
                 rpc_backend_options=rpc.ProcessGroupRpcBackendOptions(
                     init_method="env://",
                     rpc_timeout=rpc.timedelta(seconds=2),
                     num_send_recv_threads=4))
    if i == 0:
        time.sleep(2)
        print("Process 0 exit.")
        exit(-1)
    t = time.time()
    reqs = []
    for j in range(messages):
        for r in range(world_size):
            reqs.append(rpc.rpc_async("Rank{}".format(r), test, args=()))
        for req, idx in zip(reqs, range(world_size)):
            try:
                print("{} Received from {} : {}".format(rank, idx, req.wait()))
            except RuntimeError:
                print("An error ocurred while {} receiving results from {}".
                      format(rank, idx))
        reqs.clear()
    print(time.time() - t)
    rpc.shutdown(graceful=False)
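
The snippet relies on module-level world_size, messages, rank, and a test function that are not shown, and the env:// init method expects MASTER_ADDR/MASTER_PORT to be set. A hypothetical sketch of those missing pieces might be:

import os

os.environ.setdefault("MASTER_ADDR", "localhost")
os.environ.setdefault("MASTER_PORT", "29501")  # hypothetical port

world_size = 4
messages = 10
rank = -1  # overwritten at the top of run()

def test():
    # Trivial payload so the loop mostly measures RPC round-trip overhead.
    return rank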
Example #6
    def __init__(self,
                 name: str,
                 rank: int = -1,
                 world_size: int = None,
                 init_method: str = "tcp://localhost:9100",
                 rpc_timeout: float = 60,
                 rpc_threads: int = 8):
        """
        Args:
            name: A unique name to identify current process.
            rank: A unique rank of the current process. You do not need to specify
                it if you are using `torch.distributed.launch` or `torchelastic`.
            world_size: Size of the distributed world. You do not need to specify
                it if you are using `torch.distributed.launch` or `torchelastic`.
            init_method:  Backend initialization method.
            rpc_timeout:  Global rpc call timeout in seconds.
            rpc_threads:  Rpc recv/send thread num.
        """
        self.world_size = world_size
        self.rank = rank
        self.name = name
        self.groups = {}
        self.group_create_signals = {}

        # "<rank-number>" is used as the unique name.
        rpc.init_rpc(self.name,
                     rank=rank,
                     world_size=world_size,
                     rpc_backend_options=rpc.ProcessGroupRpcBackendOptions(
                         init_method=init_method,
                         num_send_recv_threads=rpc_threads,
                         rpc_timeout=timedelta(seconds=rpc_timeout)
                     ))

        # get rank-name mapping
        self.rank_name_map = {}
        for wi in rpc._get_current_rpc_agent().get_worker_infos():
            self.rank_name_map[wi.id] = wi.name

        # Start role dispatching.
        self.started = True
        self.rpc_timeout = rpc_timeout

        # map for paired values and registered services
        self.value_lut = {}
        self.service_lut = {}
        self.lut_lock = Lock()
        self.lut_manager = self.rank_name_map[0]
Example #7
def run(i):
    rpc.init_rpc("Rank" + str(i),
                 rank=i,
                 world_size=world_size,
                 rpc_backend_options=rpc.ProcessGroupRpcBackendOptions(
                     init_method="env://",
                     rpc_timeout=rpc.timedelta(seconds=60),
                     num_send_recv_threads=4))
    t = time.time()
    reqs = []
    for j in range(messages):
        for r in range(world_size):
            reqs.append(rpc.rpc_async("Rank{}".format(r), test, args=()))
    for req in reqs:
        req.wait()
    print(time.time() - t)
    rpc.shutdown()
Example #8
def bench_mpi(args):
    guess_rank = int(os.environ["OMPI_COMM_WORLD_RANK"])
    world_size = int(os.environ["OMPI_COMM_WORLD_SIZE"])
    local_rank = int(os.environ["OMPI_COMM_WORLD_LOCAL_RANK"])
    os.environ["UCX_NET_DEVICES"] = best_device_map[local_rank]

    os.environ["MASTER_ADDR"] = args.host
    os.environ["MASTER_PORT"] = "10638"
    if args.socket_name:
        os.environ["GLOO_SOCKET_IFNAME"] = args.socket_name
        os.environ["TP_SOCKET_IFNAME"] = args.socket_name

    torch.distributed.init_process_group(backend="gloo",
                                         rank=guess_rank,
                                         world_size=world_size)

    os.environ["MASTER_ADDR"] = args.host
    os.environ["MASTER_PORT"] = "10639"
    init_method = f"tcp://{os.environ['MASTER_ADDR']}:{os.environ['MASTER_PORT']}"
    rank = torch.distributed.get_rank()
    world_size = torch.distributed.get_world_size()

    rpc.init_rpc(
        f"Test{rank}",
        rank=rank,
        world_size=world_size,
        backend=rpc.BackendType.PROCESS_GROUP,
        rpc_backend_options=rpc.ProcessGroupRpcBackendOptions(
            rpc_timeout=20, init_method=init_method),
    )

    backends = {
        "model_parallel_backend": "nccl",
        "pipeline_backend": "mpi",
        "ddp_backend": "nccl"
    }

    initialize_model_parallel(1, world_size, **backends)
    init_random_seed(0)

    run_mp_worker(args, world_size)

    rpc.shutdown()
    torch.distributed.destroy_process_group()
Example #9
def run_worker(rank, world_size, num_split):
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '29500'
    options = rpc.ProcessGroupRpcBackendOptions(num_send_recv_threads=256)

    if rank == 0:
        rpc.init_rpc("master",
                     rank=rank,
                     world_size=world_size,
                     rpc_backend_options=options)
        run_master(num_split)
    else:
        rpc.init_rpc(f"worker{rank}",
                     rank=rank,
                     world_size=world_size,
                     rpc_backend_options=options)
        pass

    # block until all rpcs finish
    rpc.shutdown()
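
The spawning code is not part of the snippet. A minimal sketch, assuming a hypothetical world size of 3 and a sweep over several split counts, could be:

import torch.multiprocessing as mp

if __name__ == "__main__":
    world_size = 3  # assumed: one master plus two workers
    for num_split in [1, 2, 4, 8]:
        mp.spawn(run_worker, args=(world_size, num_split), nprocs=world_size, join=True)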
Example #10
    def _init_torch_rpc_pg(
        self,
        master_addr,
        master_port,
        worker_idx,
        worker_num,
    ):
        # https://github.com/pytorch/pytorch/issues/55615
        # [BC-Breaking][RFC] Retire ProcessGroup Backend for RPC #55615
        str_init_method = "tcp://" + str(master_addr) + ":" + str(master_port)
        logging.info("str_init_method = {}".format(str_init_method))
        options = rpc.ProcessGroupRpcBackendOptions(
            num_send_recv_threads=4, init_method=str_init_method)
        rpc.init_rpc(
            WORKER.format(worker_idx),
            backend=dist.rpc.BackendType.PROCESS_GROUP,
            rank=worker_idx,
            world_size=worker_num,
            rpc_backend_options=options,
        )
        # torch.distributed.rpc.init_rpc('worker', rank=self.global_rank, world_size=self.world_size)
        logging.info("_init_rpc_with_process_group finished.")
Example #11
def dist_init(rank: int,
              world_size: int,
              filename: str,
              filename_rpc: str = "") -> bool:
    """
    Initialize torch distributed, based on a temporary file shared across ranks, which makes it possible for unrelated
    tests to be run concurrently.

    Return False if not enough GPUs are present in the system.

    .. warning:: This limits the use case to all ranks being on the same node.
    """

    try:
        torch.distributed.rpc.shutdown()
    except Exception:
        pass

    print(f"dist init r={rank}, world={world_size}")

    os.environ["WORLD_SIZE"] = str(world_size)
    os.environ["RANK"] = str(rank)
    url = "file://" + filename
    url_rpc = "file://" + filename_rpc

    if torch_version() >= (1, 6, 0):
        backend = "nccl" if torch.cuda.is_available() else "gloo"
        if backend == "nccl" and torch.cuda.device_count() < world_size:
            logging.warning(
                "Requested world size cannot be reached on this machine, not enough GPUs"
            )
            return False

        torch.distributed.init_process_group(backend=backend,
                                             rank=rank,
                                             world_size=world_size,
                                             init_method=url)

        rpc.init_rpc(
            f"Test{rank}",
            rank=rank,
            world_size=world_size,
            backend=rpc.BackendType.TENSORPIPE,
            rpc_backend_options=rpc.TensorPipeRpcBackendOptions(
                init_method=url_rpc),
        )

    else:
        if world_size > 1:
            # TensorPipe is not available in Torch 1.5
            rpc.init_rpc(
                name=f"Test{rank}",
                rank=rank,
                world_size=world_size,
                rpc_backend_options=rpc.ProcessGroupRpcBackendOptions(
                    init_method=url_rpc),
            )
        elif torch.cuda.is_available():
            torch.distributed.init_process_group(backend="nccl",
                                                 rank=rank,
                                                 world_size=world_size,
                                                 init_method=url)
        else:
            return False

    if torch.cuda.is_available() and torch.cuda.device_count():
        torch.cuda.set_device(rank % torch.cuda.device_count())

    return True
Example #12
def dist_init(rank: int,
              world_size: int,
              filename: str,
              filename_rpc: str = "") -> bool:
    """
    Initialize torch distributed, based on a temporary file shared across ranks, which makes it possible for unrelated
    tests to be run concurrently.

    Return False if not enough GPUs are present in the system.

    .. warning:: This limits the use case to all ranks being on the same node.
    """

    try:
        torch.distributed.rpc.shutdown()
    except Exception:
        pass

    print(f"dist init r={rank}, world={world_size}")

    os.environ["WORLD_SIZE"] = str(world_size)
    os.environ["RANK"] = str(rank)
    url = "file://" + filename
    url_rpc = "file://" + filename_rpc

    if torch_version() >= (1, 6, 0):
        backend = "nccl" if torch.cuda.is_available() else "gloo"
        if backend == "nccl" and torch.cuda.device_count() < world_size:
            logging.warning(
                "Requested world size cannot be reached on this machine, not enough GPUs"
            )
            return False

        torch.distributed.init_process_group(backend=backend,
                                             rank=rank,
                                             world_size=world_size,
                                             init_method=url)

        tp_options = {"init_method": url_rpc}
        # Workaround for bug in torch v1.8.0. Should be fixed in v1.8.1
        if torch_version() == (1, 8, 0):
            if torch.cuda.is_available():
                # Workaround for https://github.com/pytorch/pytorch/issues/53844
                tp_options["_transports"] = ["ibv", "uv"]  # type: ignore
            else:
                # Workaround for https://github.com/pytorch/pytorch/issues/54266
                tp_options["_channels"] = [
                    "mpt_uv", "basic", "cuda_ipc", "cuda_gdr", "cuda_xth",
                    "cuda_basic"
                ]  # type: ignore

        rpc.init_rpc(
            f"Test{rank}",
            rank=rank,
            world_size=world_size,
            backend=rpc.BackendType.TENSORPIPE,
            rpc_backend_options=rpc.TensorPipeRpcBackendOptions(**tp_options),
        )

    else:
        if world_size > 1:
            # TensorPipe is not available in Torch 1.5
            rpc.init_rpc(
                name=f"Test{rank}",
                rank=rank,
                world_size=world_size,
                rpc_backend_options=rpc.ProcessGroupRpcBackendOptions(
                    init_method=url_rpc),
            )
        elif torch.cuda.is_available():
            torch.distributed.init_process_group(backend="nccl",
                                                 rank=rank,
                                                 world_size=world_size,
                                                 init_method=url)
        else:
            return False

    if torch.cuda.is_available() and torch.cuda.device_count():
        torch.cuda.set_device(rank % torch.cuda.device_count())

    return True
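
A hedged usage sketch: since dist_init rendezvouses over shared files, a caller typically creates two temporary files and spawns one process per rank. The _demo_worker name and the test body are hypothetical.

import tempfile

import torch.multiprocessing as mp

def _demo_worker(rank: int, world_size: int, filename: str, filename_rpc: str) -> None:
    if not dist_init(rank, world_size, filename, filename_rpc):
        return  # not enough GPUs for the requested world size
    # ... exercise the model / RPC calls under test here ...
    rpc.shutdown()
    torch.distributed.destroy_process_group()

if __name__ == "__main__":
    world_size = 2
    file_pg = tempfile.mkstemp()[1]   # rendezvous file for the process group
    file_rpc = tempfile.mkstemp()[1]  # rendezvous file for RPC
    mp.spawn(_demo_worker, args=(world_size, file_pg, file_rpc), nprocs=world_size, join=True)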
Example #13
    def __init__(self,
                 world_size: int,
                 current_rank: int,
                 roles: Dict[str, Tuple[type, int]],
                 init_method: str = "tcp://localhost:9100",
                 rpc_timeout: int = 60,
                 rpc_threads: int = 4,
                 rpc_role_dispatcher: Any = None):
        """
        Args:
            world_size:   Size of distributed world.
            current_rank: A unique rank of current process.
            roles: A list of roles executed by all processes.
            init_method:  Backend initialization method.
            rpc_timeout:  Global rpc call timeout in seconds.
            rpc_threads:  Rpc recv/send thread num.
            rpc_role_dispatcher: Rpc role dispatch, by default it is
                :class:`~machin.parallel.distributed.\
RoleDispatcherElection` and uses :class:`machin.parallel.\
distributed.ElectionGroupStableRpc` as its internal election implementation.
        """
        self.world_size = world_size
        self.role_dict = roles
        # Maps role Tuple[str, int] to threads
        self.role_threads = {}

        self.current_rank = current_rank
        self.ranks = [i for i in range(world_size)]
        self.real_names = ["{}".format(i) for i in range(world_size)]
        self.groups = {}
        if rpc_role_dispatcher is not None:
            self.rpc_role_dispatcher = rpc_role_dispatcher
        else:
            role_names = list(roles.keys())
            role_nums = [val[1] for val in roles.values()]
            self.rpc_role_dispatcher = RoleDispatcherElection(
                current_rank, world_size, role_names, role_nums,
                ElectionGroupStableRpc(name="global",
                                       member_ranks=self.ranks,
                                       rank=current_rank,
                                       timeout=rpc_timeout))

        # "<rank-number>" is used as the unique name.
        rpc.init_rpc("{}".format(self.current_rank),
                     rank=current_rank,
                     world_size=world_size,
                     rpc_backend_options=rpc.ProcessGroupRpcBackendOptions(
                         init_method=init_method,
                         num_send_recv_threads=rpc_threads,
                         rpc_timeout=timedelta(seconds=rpc_timeout)))

        # Start role dispatching.
        self.rpc_role_dispatcher.start()
        while True:
            self.rpc_role_dispatcher.get_role_update_cond().wait()
            for role in self.rpc_role_dispatcher.get_roles():
                if role not in self.role_threads:
                    role_class = self.role_dict[role[0]][0]
                    role_thread = Thread(target=_exec_role,
                                         args=(role_class(role[1]), ))
                    role_thread.start()
                    self.role_threads[role] = role_thread