Example #1
    def _worker_setup(self, process_idx: int):
        # re-apply the global seed inside the newly spawned worker process
        reset_seed()
        # derive this process's global/local ranks from its spawn index
        self.set_world_ranks(process_idx)
        # set warning rank
        rank_zero_only.rank = self.global_rank
        # join the torch.distributed process group
        init_dist_connection(self.cluster_environment,
                             self.torch_distributed_backend, self.global_rank,
                             self.world_size)
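For context, `init_dist_connection` roughly amounts to publishing the rendezvous address provided by the cluster environment and then joining the `torch.distributed` process group. The sketch below is an approximation of that behavior, not Lightning's actual implementation; the function name and the `main_address`/`main_port` parameters are placeholders standing in for whatever the cluster environment supplies.

    import os
    import torch.distributed

    def init_dist_connection_sketch(main_address: str, main_port: int,
                                    backend: str, global_rank: int, world_size: int) -> None:
        # Publish rank 0's rendezvous address, then join the process group (once).
        os.environ.setdefault("MASTER_ADDR", main_address)
        os.environ.setdefault("MASTER_PORT", str(main_port))
        if not torch.distributed.is_initialized():
            torch.distributed.init_process_group(backend, rank=global_rank, world_size=world_size)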
Example #2
    def _worker_setup(self, process_idx: int):
        # re-apply the global seed inside the newly spawned worker process
        reset_seed()
        # derive this process's global/local ranks from its spawn index
        self.set_world_ranks(process_idx)
        # set warning rank
        rank_zero_only.rank = self.global_rank
        # resolve the backend string ("nccl", "gloo", ...) before joining the process group
        self._process_group_backend = self._get_process_group_backend()
        init_dist_connection(self.cluster_environment,
                             self._process_group_backend, self.global_rank,
                             self.world_size)
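Example #2 differs from Example #1 only in how the backend string is obtained: it is resolved by `_get_process_group_backend()` instead of being read from `self.torch_distributed_backend`. A plausible, simplified version of such a helper is sketched below; the real method may also honor a user-supplied backend or an environment variable, so treat this purely as an illustration.

    import torch

    def get_process_group_backend_sketch() -> str:
        # Prefer NCCL when CUDA devices are available, otherwise fall back to Gloo.
        return "nccl" if torch.cuda.is_available() else "gloo"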
Example #3
    def setup_environment(self) -> None:
        reset_seed()
        # set warning rank
        rank_zero_only.rank = self.global_rank
        self._process_group_backend = self._get_process_group_backend()
        assert self.cluster_environment is not None
        init_dist_connection(self.cluster_environment,
                             self._process_group_backend)
        super().setup_environment()
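The `# set warning rank` step matters because `rank_zero_only` is a decorator that consults its module-level `rank` attribute to decide whether the wrapped function (typically a warning or logging call) should run at all. A minimal re-implementation of that pattern, not Lightning's actual code, looks roughly like this:

    import functools

    def rank_zero_only(fn):
        # Run the wrapped function only on the process whose global rank is 0.
        @functools.wraps(fn)
        def wrapped_fn(*args, **kwargs):
            if getattr(rank_zero_only, "rank", 0) == 0:
                return fn(*args, **kwargs)
            return None
        return wrapped_fn

    rank_zero_only.rank = 0  # overwritten with self.global_rank during setup, as in the examples above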
Example #4
    def setup_distributed(self):
        log.detail(f"{self.__class__.__name__}: setting up distributed...")
        reset_seed()

        # determine which process we are and world size
        self.set_world_ranks()

        # set warning rank
        rank_zero_only.rank = self.global_rank

        self._process_group_backend = self._get_process_group_backend()
        init_dist_connection(self.cluster_environment,
                             self._process_group_backend)
Example #5
    def setup_distributed(self):
        reset_seed()

        # determine which process we are and world size
        self.set_world_ranks()

        # set warning rank
        rank_zero_only.rank = self.global_rank

        # set up server using proc 0's ip address
        # try to init for 20 times at max in case ports are taken
        # where to store ip_table
        init_dist_connection(self.cluster_environment,
                             self.torch_distributed_backend)
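Outside of Lightning, the same sequence can be reproduced with plain PyTorch. The script below is a minimal, self-contained sketch (the address, port, seed, and process count are arbitrary choices): each spawned worker seeds itself, takes its rank from the spawn index, and joins the process group, mirroring the `_worker_setup`/`setup_distributed` methods above.

    import os
    import torch
    import torch.distributed as dist
    import torch.multiprocessing as mp

    def worker(process_idx: int, world_size: int) -> None:
        torch.manual_seed(42)                              # reset_seed() analogue
        os.environ.setdefault("MASTER_ADDR", "127.0.0.1")  # rank 0's address
        os.environ.setdefault("MASTER_PORT", "12910")
        backend = "nccl" if torch.cuda.is_available() else "gloo"
        # init_dist_connection() analogue: join the process group as rank `process_idx`
        dist.init_process_group(backend, rank=process_idx, world_size=world_size)
        print(f"rank {dist.get_rank()} of {dist.get_world_size()} is up")
        dist.destroy_process_group()

    if __name__ == "__main__":
        mp.spawn(worker, args=(2,), nprocs=2)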