def forward(self, data, target, *mems):
    # nn.DataParallel does not allow size(0) tensors to be broadcasted.
    # So, have to initialize size(0) mems inside the model forward.
    # Moreover, have to return new_mems to allow nn.DataParallel to piece
    # them together.
    if not mems:
        mems = self.init_mems()

    tgt_len = target.size(0)
    # print(f'data = {data.shape}')
    hidden, new_mems = self._forward(data, mems=mems)

    # Predicted hidden states; they make use of the earlier (memory) information.
    # hidden has shape (mems + tgt_len) x 4 x 200, i.e. 36 x 4 x 200.
    pred_hid = hidden[-tgt_len:]
    # sample_softmax: randomly sample n words for the softmax; otherwise the
    # softmax runs over the full vocabulary.
    if self.sample_softmax > 0 and self.training:
        # self.tie_weight controls whether the word-embedding parameters are shared;
        # self.out_layer is the linear layer that maps to vocabulary probabilities.
        assert self.tie_weight
        logit = sample_logits(self.word_emb, self.out_layer.bias, target,
                              pred_hid, self.sampler)
        loss = -F.log_softmax(logit, -1)[:, :, 0]
    else:
        # print(f'pred_hid shape = {pred_hid.shape}')
        # pred_hid.view(-1, pred_hid.size(-1)): 144 x 200, target.view(-1): 144
        loss = self.crit(pred_hid.view(-1, pred_hid.size(-1)),
                         target.view(-1))
        # print(f'loss = {loss.shape}')
        loss = loss.view(tgt_len, -1)  # 36 x 4

    if new_mems is None:
        return [loss]
    else:
        return [loss] + new_mems
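A minimal, self-contained sketch of the slicing and loss reshaping described in the comments above, using stand-in tensors with the example sizes tgt_len=36, batch=4, d_model=200. The memory length, vocabulary size, and the plain linear-plus-cross-entropy stand-in for self.crit are illustration-only assumptions, not the real model.

# Sketch only: stand-in tensors, hypothetical mem_len/vocab, plain criterion.
import torch
import torch.nn.functional as F

mem_len, tgt_len, batch, d_model, vocab = 6, 36, 4, 200, 1000

hidden = torch.randn(mem_len + tgt_len, batch, d_model)   # (mems + tgt_len) x 4 x 200
target = torch.randint(vocab, (tgt_len, batch))           # 36 x 4

pred_hid = hidden[-tgt_len:]                              # keep last tgt_len rows: 36 x 4 x 200
flat_hid = pred_hid.view(-1, pred_hid.size(-1))           # 144 x 200

# A plain linear projection + per-token cross entropy stands in for self.crit here.
proj = torch.nn.Linear(d_model, vocab)
logits = proj(flat_hid)                                   # 144 x 1000
loss = F.cross_entropy(logits, target.view(-1), reduction='none')  # 144
loss = loss.view(tgt_len, -1)                             # 36 x 4

print(pred_hid.shape, flat_hid.shape, loss.shape)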
def forward(self, data, target, *mems):
    # nn.DataParallel does not allow size(0) tensors to be broadcasted.
    # So, have to initialize size(0) mems inside the model forward.
    # Moreover, have to return new_mems to allow nn.DataParallel to piece
    # them together.
    if not mems:
        mems = self.init_mems()

    tgt_len = target.size(0)
    hidden, new_mems = self._forward(data, mems=mems)

    pred_hid = hidden[-tgt_len:]
    if self.sample_softmax > 0 and self.training:
        assert self.tie_weight
        logit = sample_logits(self.word_emb, self.out_layer.bias, target,
                              pred_hid, self.sampler)
        loss = -F.log_softmax(logit, -1)[:, :, 0]
    else:
        loss = self.crit(pred_hid.view(-1, pred_hid.size(-1)),
                         target.view(-1))
        loss = loss.view(tgt_len, -1)

    if new_mems is None:
        return [loss]
    else:
        return [loss] + new_mems
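For context, a small runnable sketch of how a caller would carry the returned memories across segments under this [loss] + new_mems convention. ToyMemModel is a made-up stand-in, not the real network, and the unpacking pattern is only what a typical training loop for this interface would look like.

# Sketch only: ToyMemModel is hypothetical; it just mimics the return convention.
import torch
import torch.nn as nn


class ToyMemModel(nn.Module):
    """Stand-in that mimics the [loss] + new_mems return convention."""

    def __init__(self, d_model=8, n_layer=2):
        super().__init__()
        self.d_model, self.n_layer = d_model, n_layer

    def init_mems(self):
        # size(0) mems are created inside forward, as the comments above require
        return [torch.empty(0, 1, self.d_model) for _ in range(self.n_layer)]

    def forward(self, data, target, *mems):
        if not mems:
            mems = self.init_mems()
        loss = data.float().mean() * 0 + 1.0   # dummy loss
        new_mems = [torch.randn(4, 1, self.d_model) for _ in range(self.n_layer)]
        return [loss] + new_mems


model = ToyMemModel()
mems = tuple()  # empty on the first segment
for step in range(3):
    data = torch.randint(100, (4, 1))
    target = torch.randint(100, (4, 1))
    ret = model(data, target, *mems)
    loss, mems = ret[0], ret[1:]   # element 0 is the loss, the rest are the new memories
    print(step, loss.item(), [m.shape for m in mems])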
def forward(self, data, target, mems):
    # nn.DataParallel does not allow size(0) tensors to be broadcasted.
    # So, have to initialize size(0) mems inside the model forward.
    # Moreover, have to return new_mems to allow nn.DataParallel to piece
    # them together.
    if mems[0] == ():
        mems_real = self.init_mems()
        mems_phase = self.init_mems()
    else:
        mems_real, mems_phase = mems

    tgt_len = target.size(0)
    hidden, hidden_phase, new_mems, new_mems_phase = self._forward(
        data, mems=mems_real, mems_phase=mems_phase)

    # Combine the real and phase streams into per-element magnitudes.
    norms = torch.sqrt(torch.mul(hidden, hidden)
                       + torch.mul(hidden_phase, hidden_phase)) / 1.5

    pred_hid = norms[-tgt_len:]
    if self.sample_softmax > 0 and self.training:
        assert self.tie_weight
        logit = sample_logits(self.word_emb, self.out_layer.bias, target,
                              pred_hid, self.sampler)
        loss = -F.log_softmax(logit, -1)[:, :, 0]
    else:
        loss = self.crit(pred_hid.view(-1, pred_hid.size(-1)),
                         target.view(-1))
        loss = loss.view(tgt_len, -1)

    if new_mems is None:
        return [loss]
    else:
        return loss, new_mems, new_mems_phase
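A quick check, with stand-in tensors only, that the norms computed above are the element-wise magnitude of the real/phase pair scaled by the fixed 1/1.5 factor; torch.hypot is used here purely to confirm the equivalence, not as part of the model.

# Sketch only: random stand-in tensors with arbitrary shapes.
import torch

hidden = torch.randn(36, 4, 200)
hidden_phase = torch.randn(36, 4, 200)

norms = torch.sqrt(torch.mul(hidden, hidden)
                   + torch.mul(hidden_phase, hidden_phase)) / 1.5
# Same quantity via the built-in element-wise hypotenuse.
assert torch.allclose(norms, torch.hypot(hidden, hidden_phase) / 1.5)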
def forward(self, data, target, *mems, model_type="training"):
    # nn.DataParallel does not allow size(0) tensors to be broadcasted.
    # So, have to initialize size(0) mems inside the model forward.
    # Moreover, have to return new_mems to allow nn.DataParallel to piece
    # them together.
    if not mems:
        mems = self.init_mems()

    tgt_len = target.size(0)
    hidden, new_mems = self._forward(data, mems=mems)

    pred_hid = hidden[-tgt_len:]
    # print("### self.sample_softmax ###", self.sample_softmax)
    if self.sample_softmax > 0 and model_type == "training":
        # if self.sample_softmax > 0 and self.training:
        assert self.tie_weight
        logit = sample_logits(self.word_emb, self.out_layer.bias, target,
                              pred_hid, self.sampler)
        loss = -F.log_softmax(logit, -1)[:, :, 0]
    elif model_type == "inferrence":
        # Inference path: return the criterion's output together with the new
        # memories and exit early.
        output = self.crit(pred_hid.view(-1, pred_hid.size(-1)), target,
                           False, model_type)
        # outputTest = output.view(tgt_len, 8, -1)
        # loss = loss.view(tgt_len, -1)
        return [output] + new_mems
    else:
        loss, output = self.crit(pred_hid.view(-1, pred_hid.size(-1)),
                                 target.view(-1))
        outputTest = output.view(tgt_len, 1, -1)
        loss = loss.view(tgt_len, -1)

    if new_mems is None:
        return [loss] + [output]
    else:
        return [loss] + [output] + new_mems
def forward(self, data, target, *mems, use_dropout=True, reg_args=None):
    # nn.DataParallel does not allow size(0) tensors to be broadcasted.
    # So, have to initialize size(0) mems inside the model forward.
    # Moreover, have to return new_mems to allow nn.DataParallel to piece
    # them together.
    drop_list = []
    if not mems:
        mems = self.init_mems()

    tgt_len = target.size(0)
    # Decide from reg_args whether dropout masks must be returned and whether
    # sampled per-token losses are needed for the explicit regularizer.
    if reg_args is None:
        ret_dropped = False
        sample_logit = False
    else:
        ret_dropped = (reg_args['exp_reg_type'] != 'none'
                       or reg_args['imp_reg_type'] != 'none')
        sample_logit = reg_args['exp_reg_type'].split('+')[0] == 'jreg_sample_logit'

    if ret_dropped:
        hidden, new_mems, drop_list = self._forward(
            data, mems=mems, use_dropout=use_dropout, ret_dropped=True)
    else:
        hidden, new_mems = self._forward(
            data, mems=mems, use_dropout=use_dropout, ret_dropped=False)

    pred_hid = hidden[-tgt_len:]
    if self.sample_softmax > 0 and self.training:
        if sample_logit:
            raise NotImplementedError
        assert self.tie_weight
        logit = sample_logits(self.word_emb, self.out_layer.bias, target,
                              pred_hid, self.sampler)
        loss = -F.log_softmax(logit, -1)[:, :, 0]
        fake_loss = None
    else:
        loss_ret = self.crit(pred_hid.view(-1, pred_hid.size(-1)),
                             target.view(-1), sample_losses=sample_logit)
        loss = loss_ret if not sample_logit else loss_ret[0]
        loss = loss.view(tgt_len, -1)
        fake_loss = None if not sample_logit else loss_ret[1].view(-1)

    # Append whichever explicit / implicit regularization terms were requested.
    reg_arr = []
    if reg_args is not None:
        exp_reg = compute_exp_reg(loss, fake_loss, pred_hid, drop_list, reg_args)
        imp_reg = compute_imp_reg(loss, pred_hid, drop_list, reg_args)
        if reg_args['exp_reg_type'] != 'none':
            reg_arr.append(exp_reg)
        if reg_args['imp_reg_type'] != 'none':
            reg_arr.append(imp_reg)

    if new_mems is None:
        return [loss] + reg_arr
    else:
        return [loss] + reg_arr + new_mems