def __init__(self, args): super().__init__() self.num_workers = distributed_utils.get_data_parallel_world_size() expert_centroids = torch.empty(self.num_workers, args.decoder_embed_dim) torch.nn.init.orthogonal_(expert_centroids, gain=0.1) self.register_parameter("expert_centroids", torch.nn.Parameter(expert_centroids)) self.expert_network = nn.Sequential(*([BaseSublayer(args) for _ in range(args.base_sublayers)])) self.expert_id = distributed_utils.get_data_parallel_rank() self.shuffle = args.base_shuffle self.cpp = self.load_assignment() # Add a special attribute to the expert parameters, so we know not to sync their gradients for param in self.expert_network.parameters(): param.expert = True
def __init__(self, cfg: TruncatedBPTTLMConfig): super().__init__(cfg) if cfg.data_parallel_rank is None or cfg.data_parallel_size is None: if torch.distributed.is_initialized(): cfg.data_parallel_rank = dist_utils.get_data_parallel_rank() cfg.data_parallel_size = dist_utils.get_data_parallel_world_size() else: cfg.data_parallel_rank = 0 cfg.data_parallel_size = 1 # load the dictionary paths = utils.split_paths(cfg.data) assert len(paths) > 0 self.dictionary = Dictionary.load(os.path.join(paths[0], "dict.txt")) logger.info("dictionary: {} types".format(len(self.dictionary)))
def data_parallel_rank(self): if self.cfg.distributed_training.distributed_world_size == 1: return 0 return distributed_utils.get_data_parallel_rank()