def translate(model, src_vocab, trg_vocab, corpus_iter, translation_output=None):
    """Translate every batch of ``corpus_iter`` and keep the best beam hypothesis.

    Args:
        model: object exposing ``eval()`` and
            ``beamsearch(src, src_mask, beam_size, normalize=True)``.
        src_vocab: source vocabulary handed to the index-conversion helpers.
        trg_vocab: target vocabulary used to map indices back to tokens.
        corpus_iter: iterable of batches; element 0 of each batch is the raw
            source side.
        translation_output: path of the file to write, one space-joined
            hypothesis per line.  When ``None`` (the default) nothing is
            written; previously this case crashed in ``open(None, 'w')``
            after all the decoding work had already been done.

    Returns:
        The list of best hypotheses (each a sequence of tokens), in corpus
        order.

    Note:
        The model is switched to eval mode and is not switched back.
        Beam size and the CUDA flag are read from the module-level ``opt``.
    """
    global opt
    model.eval()
    hyp_list = []
    for idx, batch in enumerate(corpus_iter, start=1):
        print(idx)  # progress indicator: 1-based batch counter
        batch = list(batch)
        src_raw = batch[0]
        src = batch_str2idx_with_flag(src_raw, src_vocab, unk=UNK, pad=PAD, sos=SOS, eos=EOS)
        src = to_Tensor(src, tensor_type=torch.LongTensor, cuda=opt.cuda)
        src_mask = get_batch_mask(src, src_vocab, PAD)
        with torch.no_grad():
            sentences_output, scores_output = model.beamsearch(
                src, src_mask, opt.beam_size, normalize=True)
        # Only the first returned hypothesis is kept; its score is not needed.
        best_sentence = batch_idx2str([sentences_output[0]], trg_vocab)
        hyp_list.append(best_sentence[0])

    if translation_output is not None:
        with open(translation_output, 'w') as f:
            f.writelines(' '.join(sentence) + '\n' for sentence in hyp_list)
    return hyp_list
def evaluate(opt, model, src_vocab, trg_vocab, corpus_iter, batch_idx, cur_epoch):
    """Score ``model`` on ``corpus_iter`` with corpus-level BLEU.

    For each batch, element 0 is the raw source and the remaining elements
    are references; the first item of every reference entry is collected.
    The best beam-search hypothesis per batch is compared against those
    references using ``corpus_bleu`` with ``SmoothingFunction().method1``.

    Returns:
        ``(bleu, batch_idx, cur_epoch)`` on success.  If anything raises an
        ``Exception`` the error is printed and the function falls through,
        returning ``None``.
    """
    try:
        model.eval()
        print('!!!eval', id(model))
        started_at = time.time()
        hypotheses = []
        references = []
        print('sub: ', os.getpid())
        print('num: ', batch_idx)

        for step, batch in enumerate(corpus_iter, start=1):
            print(step)
            src_raw, trg_raw = batch[0], batch[1:]
            references.append([candidate[0] for candidate in trg_raw])

            src = to_Tensor(
                batch_str2idx_with_flag(src_raw, src_vocab, unk=UNK, pad=PAD, sos=SOS, eos=EOS),
                tensor_type=torch.LongTensor,
                cuda=opt.cuda,
            )
            src_mask = get_batch_mask(src, src_vocab, PAD)

            with torch.no_grad():
                sentences_output, scores_output = model.beamsearch(
                    src, src_mask, opt.beam_size, normalize=True)

            # Keep the first hypothesis returned by the beam search.
            top_tokens = batch_idx2str([sentences_output[0]], trg_vocab)
            hypotheses.append(top_tokens[0])

        bleu = corpus_bleu(references, hypotheses,
                           smoothing_function=SmoothingFunction().method1)
        elapsed = time.time() - started_at
        print('subprocess %d batch_idx %d, time: ' % (os.getpid(), batch_idx), elapsed)
        return bleu, batch_idx, cur_epoch
    except Exception as ex:
        # Best-effort: report the failure instead of propagating it.
        print("subprcess wrong: %s" % ex)
def greedysearch(self, src, src_mask, max_len=None, min_len=None, cuda=False):
    """Greedily decode one token sequence by taking the arg-max at each step.

    The source is embedded, combined with positional encodings and passed
    through every encoder layer once.  The decoder is then re-run on the
    growing prefix for ``max_len`` steps; at step ``k`` the arg-max of
    ``y_prob[0][k]`` is appended to the prefix.

    Args:
        src: source index tensor (``src.size(1)`` is used as the source
            length).
        src_mask: mask whose row sums are used as the source lengths.
        max_len: number of decoding steps; defaults to ``src.size(1) * 3``.
        min_len: defaults to ``src.size(1) / 2``; not consulted by the loop.
        cuda: forwarded to ``to_Tensor`` when building the decoder input.

    Returns:
        A list of token indices starting with ``self.dec_sos`` followed by
        ``max_len`` predicted indices.  No end-of-sentence early exit is
        active.
    """
    if max_len is None:
        max_len = src.size(1) * 3
    if min_len is None:
        min_len = src.size(1) / 2  # kept for parity; unused below

    # ----- encoder -----
    src_lengths = src_mask.sum(1)
    src_embedded = self.encoder.src_emb(src)
    src_positions = position_encoding_init(
        src_embedded.size(1), src_embedded.size(2), cuda=src_embedded.is_cuda)
    memory = src_embedded + src_positions  # token embeddings + positional embeddings
    for encoder_layer in self.encoder.layer_stack:
        memory = encoder_layer(memory, src_lengths)

    # ----- decoder -----
    sentence = [self.dec_sos]
    prefix = to_Tensor([sentence], tensor_type=torch.LongTensor, cuda=cuda)
    for step in range(max_len):
        prefix_embedded = self.decoder.dec_emb(prefix)
        prefix_positions = position_encoding_init(
            prefix_embedded.size(1), prefix_embedded.size(2), cuda=prefix_embedded.is_cuda)
        hidden = prefix_embedded + prefix_positions

        for decoder_layer in self.decoder.layer_stack:
            # Self-attention sublayer: query, key and value are the same view.
            self_heads = hidden.unsqueeze(1).expand(-1, self.h, -1, -1)
            self_attended = decoder_layer.masked_multi_head_attn(
                self_heads, self_heads, self_heads)
            after_self = decoder_layer.layer_norm1(hidden + self_attended)  # residual + norm

            # Encoder-decoder attention sublayer: keys/values from the encoder.
            query_heads = after_self.unsqueeze(1).expand(-1, self.h, -1, -1)
            memory_heads = memory.unsqueeze(1).expand(-1, self.h, -1, -1)
            cross_attended = decoder_layer.multi_head_attn(
                query_heads, memory_heads, memory_heads)
            after_cross = decoder_layer.layer_norm2(after_self + cross_attended)  # residual + norm

            # Position-wise feed-forward sublayer.
            fed_forward = decoder_layer.pos_ffn(after_cross)
            hidden = decoder_layer.layer_norm3(after_cross + fed_forward)  # residual + norm

        y_prob = self.affine(hidden)
        next_token = int(y_prob[0][step].argmax())
        # NOTE: an early exit on self.dec_eos is disabled; decoding always
        # runs for the full max_len steps.
        sentence.append(next_token)
        prefix = to_Tensor([sentence], tensor_type=torch.LongTensor, cuda=cuda)

    return sentence
def train(model, src_vocab, trg_vocab, optim_wrapper, train_iter, vldt_iter, loss_function):
    """Train the Transformer model with gradient accumulation.

    Gradients are accumulated over ``opt.interval`` batches before each
    optimizer step.  The loss averaged over that window is compared against
    the module-level ``min_loss``; a new minimum triggers
    ``save_min_loss_model``.  A checkpoint is saved at the end of every epoch.

    Args:
        model: callable as ``model(src, src_mask, f_trg, f_trg_mask)``; its
            output is transposed on dims 1 and 2 before the loss.
        src_vocab, trg_vocab: vocabularies for the index-conversion helpers.
        optim_wrapper: exposes ``step``, ``zero_grad``,
            ``update_lr_per_step`` and ``update_lr_per_epoch``.
        train_iter: iterable of training batches.
        vldt_iter: validation iterator; only referenced by the disabled
            BLEU hook below.
        loss_function: called as ``loss_function(y_prob.transpose(1, 2),
            f_trg[:, 1:])``.

    Reads ``opt``, ``max_bleu`` and ``dec_pad`` from module scope and updates
    ``min_loss``.
    """
    global opt, min_loss, max_bleu
    subprocess_pool = Pool(2)

    model.train()
    print('start training!!!', id(model))
    for epoch in range(opt.epoch, opt.nepoch):
        cur_epoch = epoch + 1
        total_loss = 0
        print('############### epoch = %d ###############\n' % cur_epoch)
        for batch_idx, batch in enumerate(train_iter, start=1):
            sorted_batch = sort_batch(batch)
            src_raw = sorted_batch[0]
            trg_raw = sorted_batch[1]

            # Convert the source and target sentences to word indices.
            src = batch_str2idx_with_flag(src_raw, src_vocab, unk=UNK, pad=PAD, sos=SOS, eos=EOS)
            f_trg = batch_str2idx_with_flag(trg_raw, trg_vocab, unk=UNK, pad=PAD, sos=SOS, eos=EOS)
            src, f_trg = to_Tensor(src, f_trg, tensor_type=torch.LongTensor, cuda=opt.cuda)
            src_mask = get_batch_mask(src, src_vocab, PAD)
            f_trg_mask = get_batch_mask(f_trg, trg_vocab, PAD)

            # Disabled: reversed target-side batch (not used for now).
            # b_trg = batch_str2idx_with_flag(trg_raw, trg_vocab, unk=UNK, pad=PAD,
            #                                 sos=SOS, eos=EOS, reverse=True)
            # src, f_trg, b_trg = to_Tensor(src, f_trg, b_trg,
            #                               tensor_type=torch.LongTensor, cuda=opt.cuda)
            # b_trg_mask = get_batch_mask(b_trg, trg_vocab, PAD)

            y_prob = model(src, src_mask, f_trg, f_trg_mask)

            # Append one padding column so that f_trg[:, 1:] has as many
            # columns as f_trg had.  new_full() creates the column with the
            # same dtype and device as f_trg; a plain torch.LongTensor always
            # lives on the CPU and torch.cat fails when f_trg is on the GPU.
            pad_column = f_trg.new_full((f_trg.size(0), 1), dec_pad)
            f_trg = torch.cat((f_trg, pad_column), 1)
            loss = loss_function(y_prob.transpose(1, 2), f_trg[:, 1:])
            total_loss = total_loss + float(loss)
            loss.backward()

            if batch_idx % opt.interval == 0:
                total_loss = total_loss / opt.interval
                if total_loss < min_loss:
                    print('& epoch = %d batch_idx = %d min_loss = %f &\n' %
                          (cur_epoch, batch_idx / opt.interval, total_loss))
                    min_loss = total_loss
                    save_min_loss_model(model,
                                        opt.checkpoint_dir,
                                        batch_idx / opt.interval,
                                        cur_epoch,
                                        min_loss,
                                        info='Transformer_min_loss_model')
                else:
                    print('- batch_idx = %d, loss = %f -\n' %
                          (batch_idx / opt.interval, total_loss))

                # Gradient clipping (L2 norm) before the update is disabled:
                # torch.nn.utils.clip_grad_norm_(model.parameters(), opt.max_norm, norm_type=2)
                optim_wrapper.step()
                optim_wrapper.zero_grad()
                total_loss = 0
                optim_wrapper.update_lr_per_step()

                # Disabled: evaluate dev-set BLEU in an extra CPU process.
                # From epoch 4 on, every opt.vldt_freq batches, spawn one run.
                # if cur_epoch >= 4 and (batch_idx * opt.interval) % opt.vldt_freq == 0:
                #     cpu_model = copy.deepcopy(model).cpu()
                #     subprocess_pool.apply_async(
                #         evaluate,
                #         args=(opt, cpu_model, src_vocab, trg_vocab, vldt_iter,
                #               batch_idx, cur_epoch),
                #         callback=my_callback)

            if (batch_idx / opt.interval) % 100 == 0:
                print('- epoch = %d, min_loss = %f -\n' % (cur_epoch, min_loss))

                # Show up to five predictions; clamp to the batch size so a
                # short batch does not raise IndexError.
                n_samples = min(5, y_prob.size(0))
                sentences = [
                    [int(y_prob[i][j].argmax()) for j in range(y_prob.size(1))]
                    for i in range(n_samples)
                ]
                sentences = batch_idx2str(sentences, trg_vocab)
                for i in range(n_samples):
                    print('source:')
                    print(' '.join(src_raw[i]))
                    print('ref:')
                    print(' '.join(trg_raw[i]))
                    print('pred:')
                    print(' '.join(sentences[i]))
                    print('---------------------')

        # Drop gradients left over from an incomplete accumulation window.
        optim_wrapper.zero_grad()
        optim_wrapper.update_lr_per_epoch()
        save_checkpoint_model(model,
                              opt.checkpoint_dir,
                              cur_epoch,
                              info='Transformer_checkpoint_model')

    print('$ min_loss: %f, max_bleu: %f $\n' % (min_loss, max_bleu))
    # Close the process pool and wait for pending dev-set BLEU runs to finish.
    subprocess_pool.close()
    subprocess_pool.join()
def train(model, src_vocab, trg_vocab, optim_wrapper, train_iter, vldt_iter):
    """Run the RNNSearch training loop with gradient accumulation.

    ``model(src, src_mask, f_trg, f_trg_mask)`` returns the loss directly.
    Gradients accumulate over ``opt.interval`` batches; the windowed average
    loss is compared with the module-level ``min_loss`` (saving the model on
    a new minimum), gradients are clipped to ``opt.max_norm`` and the
    optimizer steps.  A checkpoint is written after every epoch.

    Reads ``opt`` and ``max_bleu`` from module scope and updates ``min_loss``.
    ``vldt_iter`` is only referenced by the disabled BLEU hook.
    """
    global opt, min_loss, max_bleu
    subprocess_pool = Pool(2)

    # start training
    model.train()
    print('!!!train', id(model))
    for epoch in range(opt.epoch, opt.nepoch):
        cur_epoch = epoch + 1
        total_loss = 0
        print('############### epoch = %d ###############\n' % cur_epoch)

        for batch_idx, batch in enumerate(train_iter, start=1):
            sorted_batch = sort_batch(batch)
            src_raw, trg_raw = sorted_batch[0], sorted_batch[1]

            # Source and target sentences expressed as word indices.
            src = batch_str2idx_with_flag(
                src_raw, src_vocab, unk=UNK, pad=PAD, sos=SOS, eos=EOS)
            f_trg = batch_str2idx_with_flag(
                trg_raw, trg_vocab, unk=UNK, pad=PAD, sos=SOS, eos=EOS)
            src, f_trg = to_Tensor(src, f_trg, tensor_type=torch.LongTensor, cuda=opt.cuda)
            src_mask = get_batch_mask(src, src_vocab, PAD)
            f_trg_mask = get_batch_mask(f_trg, trg_vocab, PAD)

            # Disabled: reversed target-side batch (not used for now).
            # b_trg = batch_str2idx_with_flag(trg_raw, trg_vocab, unk=UNK, pad=PAD,
            #                                 sos=SOS, eos=EOS, reverse=True)
            # src, f_trg, b_trg = to_Tensor(src, f_trg, b_trg,
            #                               tensor_type=torch.LongTensor, cuda=opt.cuda)
            # b_trg_mask = get_batch_mask(b_trg, trg_vocab, PAD)

            loss = model(src, src_mask, f_trg, f_trg_mask)
            total_loss += float(loss)
            loss.backward()

            # Keep accumulating until a full window of batches has been seen.
            if batch_idx % opt.interval != 0:
                continue

            total_loss /= opt.interval
            if total_loss < min_loss:
                print('& epoch = %d batch_idx = %d min_loss = %f &\n' %
                      (cur_epoch, batch_idx / opt.interval, total_loss))
                min_loss = total_loss
                save_min_loss_model(
                    model,
                    opt.checkpoint_dir,
                    batch_idx / opt.interval,
                    cur_epoch,
                    min_loss,
                    info='RNNSearch_min_loss_model',
                )
            else:
                print('- batch_idx = %d, loss = %f -\n' %
                      (batch_idx / opt.interval, total_loss))

            # Clip gradients (L2 norm) before the parameter update.
            torch.nn.utils.clip_grad_norm_(model.parameters(), opt.max_norm, norm_type=2)
            optim_wrapper.step()
            optim_wrapper.zero_grad()
            total_loss = 0
            optim_wrapper.update_lr_per_step()

            # Disabled: evaluate dev-set BLEU in an extra CPU process.
            # From epoch 4 on, every opt.vldt_freq batches, spawn one run.
            # if cur_epoch >= 4 and (batch_idx * opt.interval) % opt.vldt_freq == 0:
            #     cpu_model = copy.deepcopy(model).cpu()
            #     subprocess_pool.apply_async(
            #         evaluate,
            #         args=(opt, cpu_model, src_vocab, trg_vocab, vldt_iter,
            #               batch_idx, cur_epoch),
            #         callback=my_callback)

        optim_wrapper.zero_grad()
        optim_wrapper.update_lr_per_epoch()
        save_checkpoint_model(
            model, opt.checkpoint_dir, cur_epoch, info='RNNSearch_checkpoint_model')

    print('$ min_loss: %f, max_bleu: %f $\n' % (min_loss, max_bleu))
    # Close the process pool and wait for pending dev-set BLEU runs to finish.
    subprocess_pool.close()
    subprocess_pool.join()