def train(i, batch_size, use_cuda, dropout):
    input = batch_loader.next_batch(batch_size, 'train')
    input = [Variable(t.from_numpy(var)) for var in input]
    input = [var.long() for var in input]
    input = [var.cuda() if use_cuda else var for var in input]

    [encoder_word_input, encoder_character_input,
     decoder_word_input, decoder_character_input, target] = input

    logits, _, kld = self(dropout,
                          encoder_word_input, encoder_character_input,
                          decoder_word_input, decoder_character_input,
                          z=None)

    logits = logits.view(-1, self.params.word_vocab_size)
    target = target.view(-1)
    cross_entropy = F.cross_entropy(logits, target)

    loss = 79 * cross_entropy + kld_coef(i) * kld

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    return cross_entropy, kld, kld_coef(i)
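
# The kld_coef(i) used above (and in the variants below) anneals the weight of the KL term
# over training steps, so the decoder is not pushed to ignore z early in training. A minimal
# sketch of such a schedule is shown here; the tanh shape and the constants (3500, 1000) are
# illustrative assumptions, not necessarily the values used in this repository.
import math

def kld_coef_sketch(i):
    # smoothly ramps from ~0 to 1 as the iteration index i grows
    return (math.tanh((i - 3500) / 1000) + 1) / 2
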
def train(i, batch_size, use_cuda, dropout, start_index):
    input = batch_loader.next_batch(batch_size, 'train', start_index)
    input = [Variable(t.from_numpy(var)) for var in input]
    input = [var.long() for var in input]
    input = [var.cuda() if use_cuda else var for var in input]

    # This batch comes from data/train.txt, converted to index tensors and padded.
    # encoder_word_input / encoder_character_input are the original sentence xo reversed,
    # with padding tokens prepended;
    # decoder_word_input / decoder_character_input are xo with a start-of-sentence token
    # prepended and padding appended;
    # target is xo with an end-of-sentence token appended, followed by padding tokens.
    [encoder_word_input, encoder_character_input,
     decoder_word_input, decoder_character_input, target] = input

    ''' =================================== Input for Encoder-2 =================================== '''
    input_2 = batch_loader_2.next_batch(batch_size, 'train', start_index)
    input_2 = [Variable(t.from_numpy(var)) for var in input_2]
    input_2 = [var.long() for var in input_2]
    input_2 = [var.cuda() if use_cuda else var for var in input_2]

    # This batch comes from data/super/train.txt, converted to index tensors and padded.
    # encoder_word_input_2 / encoder_character_input_2 are the paraphrase sentence xp reversed,
    # with padding tokens prepended;
    # decoder_word_input_2 / decoder_character_input_2 are xp with a start-of-sentence token
    # prepended and padding appended;
    # target is xp with an end-of-sentence token appended, followed by padding tokens
    # (it overwrites the target unpacked above).
    [encoder_word_input_2, encoder_character_input_2,
     decoder_word_input_2, decoder_character_input_2, target] = input_2
    ''' ============================================================================================ '''

    # exit()

    # Encoder 1 takes the original sentence xo (reversed), encoder 2 takes the paraphrase xp
    # (reversed), and the decoder input is the paraphrase prefixed with the start token.
    logits, _, kld, _, _ = self(dropout,
                                encoder_word_input, encoder_character_input,
                                encoder_word_input_2, encoder_character_input_2,
                                decoder_word_input_2, decoder_character_input_2,
                                z=None)

    # logits = logits.view(-1, self.params.word_vocab_size)
    logits = logits.view(-1, self.params_2.word_vocab_size)
    target = target.view(-1)
    # logits holds the per-step distribution over the whole vocabulary; target holds the
    # per-step word indices. No one-hot encoding is needed: F.cross_entropy does that internally.
    cross_entropy = F.cross_entropy(logits, target)

    # The factor 79 appears to be an arbitrary scaling chosen by the author.
    loss = 79 * cross_entropy + kld_coef(i) * kld

    optimizer.zero_grad()  # standard pattern: compute the loss, zero the gradients,
    loss.backward()        # backpropagate,
    optimizer.step()       # and update the parameters.

    # returns the cross entropy, the KL divergence, and the KL coefficient (used to anneal the KL term)
    return cross_entropy, kld, kld_coef(i)
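
# The comments above describe how next_batch lays out each sentence: the encoder sees the
# reversed word indices left-padded with a pad token, the decoder input is the sentence
# prefixed with a go token and right-padded, and the target is the sentence followed by an
# end token and padding. A minimal, self-contained sketch of that layout is given below;
# the special-token ids (PAD, GO, END) and the fixed max_len are assumptions for
# illustration, not the repository's actual batch_loader implementation.
import numpy as np

PAD, GO, END = 0, 1, 2  # assumed special-token indices

def layout_example(word_ids, max_len):
    pad = [PAD] * (max_len - len(word_ids))
    encoder_input = np.array(pad + word_ids[::-1])      # reversed, left-padded
    decoder_input = np.array([GO] + word_ids + pad)     # go token first, right-padded
    target        = np.array(word_ids + [END] + pad)    # end token appended, right-padded
    return encoder_input, decoder_input, target
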
def train(i, batch_size, use_cuda, dropout):
    input = batch_loader.next_batch(batch_size, 'train')
    input = [(Variable(t.from_numpy(var)) if var is not None else None) for var in input]
    input = [(var.long() if var is not None else None) for var in input]
    input = [(var.cuda() if var is not None and use_cuda else var) for var in input]

    [encoder_word_input, encoder_character_input, decoder_word_input, _, target] = input

    logits_out, kld, _, _ = self(dropout,
                                 encoder_word_input, encoder_character_input,
                                 decoder_word_input,
                                 z=None, initial_state=None)

    if self.params.decoder_type in ('dilation', 'gru', 'lstm'):
        logits = logits_out.view(-1, self.params.word_vocab_size)
        target = target.view(-1)
        cross_entropy = F.cross_entropy(logits, target)

        # since cross entropy is averaged over seq_len, it is necessary to approximate the new kld
        loss = 79 * cross_entropy + kld_coef(i) * kld

        logits = logits.view(batch_size, -1, self.params.word_vocab_size)
        target = target.view(batch_size, -1)
        ppl = perplexity(logits, target).mean()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        return ppl, kld, None

    elif self.params.decoder_type == 'gru_emb':
        decoder_target = self.embedding(target, None)
        error = t.pow(logits_out - decoder_target, 2).mean()

        '''
        The loss is built from the squared error between the decoder output and the target
        embeddings, averaged over whole batches, plus the KL divergence between q(z|x) and p(z).
        '''
        loss = 400 * error + kld_coef(i) * kld

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        return error, kld, kld_coef(i)
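
# The perplexity(...) helper used in the recurrent/dilated-decoder branch above is not shown
# in this section. A minimal sketch of a per-sentence perplexity computed from
# [batch, seq_len, vocab] logits and [batch, seq_len] targets is given below; the signature
# and the absence of padding masking are assumptions, and the repository's own helper may differ.
import torch
import torch.nn.functional as F

def perplexity_sketch(logits, target):
    # negative log-likelihood of the target word at every time step
    log_probs = F.log_softmax(logits, dim=-1)
    nll = -log_probs.gather(-1, target.unsqueeze(-1)).squeeze(-1)  # [batch, seq_len]
    # perplexity per sentence = exp(mean NLL over the sequence)
    return torch.exp(nll.mean(dim=1))                              # [batch]
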
def train(i, input, use_cuda, dropout):
    input = [Variable(torch.from_numpy(var.astype(np.float64))) for var in input]
    input = [var.long() for var in input]
    input = [var.cuda() if use_cuda else var for var in input]

    [original_encoder_word_input, original_encoder_character_input,
     paraphrse_encoder_word_input, paraphrse_encoder_character_input,
     decoder_word_input, decoder_character_input, target] = input

    logits, _, kld = self(dropout,
                          original_encoder_word_input, original_encoder_character_input,
                          paraphrse_encoder_word_input, paraphrse_encoder_character_input,
                          decoder_word_input, decoder_character_input,
                          z=None, initial_state=None)

    logits = logits.view(-1, self.params.word_vocab_size)
    target = target.view(-1)
    cross_entropy = F.cross_entropy(logits, target)

    loss = 79 * cross_entropy + kld_coef(i) * kld

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    return cross_entropy, kld, kld_coef(i)
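
# All four variants receive kld from the model's forward pass. For a diagonal-Gaussian
# posterior q(z|x) = N(mu, exp(logvar)) against a standard-normal prior, that term is the
# usual closed-form VAE KL divergence; a self-contained sketch is below. How this model
# aggregates it (sum vs. mean over the batch) is an assumption here.
import torch

def gaussian_kld_sketch(mu, logvar):
    # KL( N(mu, sigma^2) || N(0, 1) ), summed over latent dimensions, averaged over the batch
    return (-0.5 * (1 + logvar - mu.pow(2) - logvar.exp()).sum(dim=1)).mean()
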