def _create_nccl_pg(self, name_prefix):
    """Construct a TorchScript-class ``ProcessGroupNCCL`` for this rank.

    A fresh TCP store is created with ``jit_class=True`` and the group is
    registered under a unique name derived from ``name_prefix``.

    Args:
        name_prefix: Prefix handed to ``unique_process_group_name``.

    Returns:
        The ``torch.classes.dist_c10d.ProcessGroupNCCL`` instance built from
        the store, ``self.rank`` and ``self.world_size``.
    """
    store = create_tcp_store(jit_class=True)
    dist_c10d = torch.classes.dist_c10d
    # NOTE(review): the positional (0, True) arguments are presumably a
    # timeout and a high-priority-stream flag -- confirm against the binding.
    nccl_options = dist_c10d.ProcessGroupNCCLOptions(0, True)
    pg_name = unique_process_group_name(name_prefix)
    return dist_c10d.ProcessGroupNCCL(
        store,
        self.rank,
        self.world_size,
        nccl_options,
        pg_name,
    )
def __init__(self):
    """Initialise the module and attach an NCCL process group as ``self.pg``.

    The group is created through the ``dist_c10d`` frontend with a fresh
    TCP store (``jit_class=True``) and a unique name based on
    ``"module_member_process_group"``.
    """
    super(TestModule, self).__init__()
    store = create_tcp_store(jit_class=True)
    pg_name = unique_process_group_name("module_member_process_group")
    frontend = torch.classes.dist_c10d.frontend()
    # NOTE(review): leading (1, 0) look like world size 1 / rank 0; the
    # meaning of the empty list and trailing 0 is not visible here -- confirm.
    self.pg = frontend.new_process_group_helper(
        1, 0, [], "nccl", store, pg_name, 0
    )
def _multi_worker_helper(self, world_size):
    """Start a server store and create one client per worker index, in order.

    Args:
        world_size: Number of clients to create. The value ``-1`` is replaced
            by a random count in ``[5, 10]`` *after* the server store has
            been created with the original value.
    """
    host = DEFAULT_HOSTNAME
    store = create_tcp_store(host, world_size, wait_for_workers=False)
    store.set("key", "value")
    store_port = store.port

    # -1 is the sentinel for "pick a random number of workers".
    if world_size == -1:
        world_size = random.randint(5, 10)

    for worker_index in range(world_size):
        self._create_client(worker_index, host, store_port, world_size)
def _multi_worker_helper(self, world_size):
    """Start a server store and create clients sequentially against it.

    Args:
        world_size: Forwarded unchanged to the store and to every client.
            When it is falsy (e.g. ``None`` or ``0``) a single client with
            index 0 is still created.
    """
    host = DEFAULT_HOSTNAME
    store = create_tcp_store(host, world_size, wait_for_workers=False)
    store.set("key", "value")
    store_port = store.port

    # Fall back to one client when no world size was given.
    client_count = world_size or 1
    for client_index in range(client_count):
        self._create_client(client_index, host, store_port, world_size)
def test_frontend_singleton(self):
    """Check that a group created via one frontend handle is visible via another.

    A process group is registered through the first ``frontend()`` handle,
    looked up by name through the second, and the name reported by the
    second handle is compared with the one used at creation.
    """
    first_frontend = torch.classes.dist_c10d.frontend()
    second_frontend = torch.classes.dist_c10d.frontend()

    store = create_tcp_store(jit_class=True)
    group_name = unique_process_group_name("singleton_test_process_group")

    # The binding is otherwise unused; it is kept so the created object's
    # lifetime spans the whole test.
    created_pg = first_frontend.new_process_group_helper(
        self.world_size, self.rank, [], "nccl", store, group_name, 0
    )

    looked_up_pg = second_frontend.get_process_group_by_name(group_name)
    reported_name = second_frontend.get_name_of_process_group(looked_up_pg)
    self.assertEqual(reported_name, group_name)
def _multi_worker_helper(self, world_size):
    """Start a server store and run each client in its own process.

    Every child runs ``self._create_client`` and receives a shared queue.
    After all children have been joined, whatever is on the queue is
    gathered into a newline-terminated report.

    Args:
        world_size: Number of client processes. The value ``-1`` spawns a
            random count in ``[3, 5]``; the original value is still what is
            passed to the store and to each client.

    Raises:
        RuntimeError: If any child exited with a non-zero exit code; the
            message is the concatenation of the queued messages.
    """
    host = DEFAULT_HOSTNAME
    store = create_tcp_store(host, world_size, wait_for_workers=False)
    store.set("key", "value")
    store_port = store.port

    error_queue = mp.Queue()
    if world_size == -1:
        worker_count = random.randint(3, 5)
    else:
        worker_count = world_size

    workers = []
    for worker_index in range(worker_count):
        worker = mp.Process(
            target=self._create_client,
            args=(worker_index, host, store_port, world_size, error_queue),
        )
        workers.append(worker)
        worker.start()

    for worker in workers:
        worker.join()

    # Drain everything the children put on the queue.
    report_lines = []
    while not error_queue.empty():
        report_lines.append(error_queue.get() + "\n")

    if any(worker.exitcode != 0 for worker in workers):
        raise RuntimeError("".join(report_lines))
def _create_nccl_pg_as_base_process_group(self, name):
    """Create an NCCL process group through the ``dist_c10d`` frontend.

    Args:
        name: Name passed to the frontend for the new process group.

    Returns:
        Whatever ``new_process_group_helper`` returns for the ``"nccl"``
        backend, built from a fresh TCP store, ``self.world_size`` and
        ``self.rank``.
    """
    store = create_tcp_store(jit_class=True)
    frontend = torch.classes.dist_c10d.frontend()
    process_group = frontend.new_process_group_helper(
        self.world_size, self.rank, [], "nccl", store, name, 0
    )
    return process_group
def setUp(self):
    """Prepare the fixture: a TCP store with a 300-second timeout and a prefix."""
    super(PrefixTCPStoreTest, self).setUp()
    self.prefix = "test_prefix"
    self.tcpstore = create_tcp_store()
    store_timeout = timedelta(seconds=300)
    self.tcpstore.set_timeout(store_timeout)
def _create_store(self):
    """Return a new TCP store whose timeout has been set to 300 seconds."""
    tcp_store = create_tcp_store()
    store_timeout = timedelta(seconds=300)
    tcp_store.set_timeout(store_timeout)
    return tcp_store