def test_total_parameters(self):
    ident = IdentityLayer()
    emb = torch.nn.Embedding(32, 8)  # 32 * 8 = 256
    emb2 = torch.nn.Embedding(32, 16)  # 32 * 16 = 512
    emb2.weight.requires_grad = False
    assert total_parameters(emb) == 256
    assert total_parameters(ident) == 0
    assert total_parameters(emb2) == 512
    assert total_parameters(torch.nn.ModuleList([ident, emb, emb2])) == 768
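# A minimal sketch of the counting logic total_parameters is assumed to
# implement (hypothetical re-implementation, not the repo's helper): it sums
# element counts over *all* parameters, trainable or frozen, which is why the
# frozen emb2 above still contributes 512 to the total.
def _total_parameters_sketch(model: torch.nn.Module) -> int:
    # numel() counts elements per parameter tensor, regardless of requires_grad
    return sum(p.numel() for p in model.parameters())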
def __init__(self, opt, dict):
    self.add_start_token = opt["add_start_token"]
    super().__init__(*self._get_special_tokens(opt, dict))

    # init the model
    self.encoder = IdentityLayer()
    self.decoder = self._get_decoder(opt, dict)
    self.config = self.decoder.transformer.config
    self.lm_head = torch.nn.Linear(
        self.config.n_embd, self.config.vocab_size, bias=False
    )
    self._tie_weights(self.lm_head, self.decoder.transformer.wte)
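# A minimal sketch of the weight-tying step above (hypothetical; the real
# _tie_weights is defined on this class): the LM head reuses the token
# embedding matrix, so output logits are scored against the same vectors used
# to embed the input. Linear(n_embd, vocab_size) stores a [vocab_size, n_embd]
# weight, the same shape as Embedding(vocab_size, n_embd).weight, so the
# Parameter can be shared directly.
def _tie_weights_sketch(
    lm_head: torch.nn.Linear, wte: torch.nn.Embedding
) -> None:
    # share the underlying Parameter; gradients accumulate into one tensor
    lm_head.weight = wte.weight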
def test_schedule_work_items(self):
    # test that we schedule things correctly
    # pretend we have 8 layers and 4 gpus, and they are unevenly distributed
    model = torch.nn.ModuleList()
    for i in range(8):
        layer = IdentityLayer()
        if i == 0:
            layer._mp_gpu = 'cuda:0'
        elif i in (1, 2, 3):
            layer._mp_gpu = 'cuda:1'
        elif i in (4, 5):
            layer._mp_gpu = 'cuda:2'
        elif i in (6, 7):
            layer._mp_gpu = 'cuda:3'
        model.append(layer)

    # there are 2 chunks, each 16 x 7 in size
    chunks = PipelineHelper.split(torch.randn(32, 7), 16)
    work_items = list(PipelineHelper.schedule_work_items(model, chunks))

    assert len(work_items) == 8
    assert work_items[0].layer_nos == [0] and work_items[0].chunk_idx == 0
    assert work_items[1].layer_nos == [1, 2, 3] and work_items[1].chunk_idx == 0
    assert work_items[2].layer_nos == [0] and work_items[2].chunk_idx == 1
    assert work_items[3].layer_nos == [4, 5] and work_items[3].chunk_idx == 0
    assert work_items[4].layer_nos == [1, 2, 3] and work_items[4].chunk_idx == 1
    assert work_items[5].layer_nos == [6, 7] and work_items[5].chunk_idx == 0
    assert work_items[6].layer_nos == [4, 5] and work_items[6].chunk_idx == 1
    assert work_items[7].layer_nos == [6, 7] and work_items[7].chunk_idx == 1
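# For context, PipelineHelper.split is assumed here to behave like torch.split
# along the batch dimension (a sketch under that assumption, not the repo's
# implementation, which may also handle nested structures): a (32, 7) tensor
# with chunk size 16 yields two (16, 7) chunks. The asserted ordering above
# then interleaves chunks across GPU groups, so while one GPU processes chunk
# 1, the next GPU can already work on chunk 0 (classic pipeline parallelism).
def _split_sketch(tensor: torch.Tensor, chunk_size: int):
    # torch.split returns views of size chunk_size along dim 0
    return list(torch.split(tensor, chunk_size, dim=0))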
def __init__(self, opt, dict):
    self.null_idx, self.start_idx, self.end_idx = self._get_special_tokens(
        opt, dict
    )
    super().__init__(self.null_idx, self.start_idx, self.end_idx)

    # init the model
    self.encoder = IdentityLayer()
    self.decoder = self._get_decoder(opt, dict)
    self.config = self.decoder.transformer.config
    self.lm_head = torch.nn.Linear(
        self.config.n_embd, self.config.vocab_size, bias=False
    )
    self._tie_weights(self.lm_head, self.decoder.transformer.wte)

    # add start token
    self.add_start_token = opt["add_special_tokens"] and opt["add_start_token"]
    # used to reverse concatenation of context and labels
    self.text_lengths = None
def _get_model():
    model = torch.nn.Module()
    model.layers = torch.nn.ModuleList([IdentityLayer() for _ in range(8)])
    return model