def _joint_batch_tensors(data):
    """Pad one raw mini-batch and convert it into the tensors fed to ``JointModel``.

    ``data`` is unpacked into six fields; only the sentences, position
    features, entity tags and relation tags are used here.

    Returns:
        ``(input_tensor, pos_tensor, target_tensor, relation_target_tensor, mask)``
    """
    sentences, pos_lambda, tags, _words, relation_tags, _relation_names = data
    input_tensor, _ = padding_sequence(sentences, pad_token=0)
    pos_tensor, _ = padding_sequence(pos_lambda, pad_token=0)
    # Entity tags are padded with ``args.entity_tag_size``.
    target_tensor, _ = padding_sequence(tags, pad_token=args.entity_tag_size)

    if torch.cuda.is_available():
        input_tensor = Variable(
            torch.cuda.LongTensor(input_tensor, device=device)).cuda()
        target_tensor = Variable(
            torch.cuda.LongTensor(target_tensor, device=device)).cuda()
        pos_tensor = Variable(
            torch.cuda.FloatTensor(pos_tensor, device=device)).cuda()
        relation_target_tensor = Variable(
            torch.cuda.LongTensor(relation_tags, device=device)).cuda()
    else:
        input_tensor = Variable(torch.LongTensor(input_tensor, device=device))
        target_tensor = Variable(torch.LongTensor(target_tensor, device=device))
        pos_tensor = Variable(torch.Tensor(pos_tensor, device=device))
        relation_target_tensor = Variable(
            torch.LongTensor(relation_tags, device=device))

    # 1 where the position is not padding, 0 where it is.  Written as ``!=``
    # plus an explicit uint8 cast instead of ``1 - (x == pad)``: subtracting a
    # comparison result fails on PyTorch versions whose comparisons return
    # bool tensors, while this form yields the same byte mask everywhere.
    if args.encoder_model == "BiLSTM":
        not_padding = target_tensor != args.entity_tag_size
    else:
        not_padding = input_tensor != 0
    mask = not_padding.to(device=device, dtype=torch.uint8)

    return input_tensor, pos_tensor, target_tensor, relation_target_tensor, mask


def _mean_entity_embeddings(NER_output, target_tensor):
    """Return one mean-pooled row per sentence over positions tagged 1, 2, 4 or 5.

    Returns ``None`` when ``target_tensor`` has no rows.
    """
    # NOTE(review): 1, 2, 4, 5 are presumably the entity tag ids -- confirm
    # against the tag vocabulary.  The membership test must stay on a list:
    # the elements are 0-dim tensors, which match through ``==`` but would not
    # match through the hashing a set uses.
    mask_entity = [
        [1 if tag in [1, 2, 4, 5] else 0 for tag in row]
        for row in target_tensor
    ]
    if torch.cuda.is_available():
        mask_entity = torch.cuda.ByteTensor(mask_entity).to(device)
    else:
        mask_entity = torch.ByteTensor(mask_entity).to(device)

    pooled_rows = [
        torch.mean(NER_output[i][row_mask], 0).view(1, -1)
        for i, row_mask in enumerate(mask_entity)
    ]
    # A single concatenation replaces the per-row ``torch.cat`` accumulation.
    return torch.cat(pooled_rows, 0) if pooled_rows else None


def _append_json(path, payload):
    """Append ``payload`` to ``path`` as one JSON document."""
    # NOTE(review): "a+" appends, so repeated runs leave several JSON documents
    # in the same file -- confirm this is intended.
    with open(path, "a+") as fw:
        json.dump(payload, fw)


def train(datasets, mode):
    """Run one pass of the joint NER/RE model over ``datasets``.

    When ``mode == 'train'`` every batch is followed by a backward pass and an
    optimizer step, the batch loss is appended to ``out_losses`` and the list
    is saved with ``np.save`` at the end.  For any other mode the model is
    only evaluated: predictions, targets and raw outputs are collected, passed
    to ``cal_F_score`` and, depending on ``args.do_train``, ``mode`` and the
    module-level epoch counter ``e``, appended to a JSON file under
    ``args.output_dir``.

    The function relies on module-level names rather than parameters:
    ``args``, ``logger``, ``device``, ``optimizer``, ``criterion``,
    ``JointModel``, ``RL_model``, ``relations``, ``out_losses`` and ``e``.

    NOTE(review): the nesting of the trailing ``else`` branches was
    reconstructed from a source whose indentation had been lost -- confirm the
    JSON-dump conditions against the intended behaviour.

    Args:
        datasets: Indexable collection; ``len(datasets[0]) // args.batchsize``
            gives the number of batches that are processed.
        mode: ``'train'`` to optimise; other values (``'dev'`` and ``'test'``
            are the ones tested for) only evaluate.

    Returns:
        None.
    """
    if args.use_RL:
        mini_batches = get_bags(datasets, relations, args.batchsize)
        noisy_sentences_vec = Variable(
            torch.FloatTensor(1, args.hidden_dim).fill_(0))
        noisy_vec_mean = torch.mean(noisy_sentences_vec, 0, True)
    else:
        mini_batches = get_minibatches(datasets, args.batchsize)

    # Only full batches are processed; the loop stops after ``batchcnt`` of them.
    batchcnt = len(datasets[0]) // args.batchsize
    logger.info("********************%s data*********************" % mode)
    logger.info("number of batches: %s" % batchcnt)

    is_training = mode == 'train'
    NER_correct, NER_total = 0., 0.
    RE_correct, RE_total = 0., 0.
    # Only filled during evaluation.
    NER_target_all2, NER_output_all2 = [], []
    RE_target_all2, RE_output_all2 = [], []
    NER_output_logits, RE_output_logits = [], []

    for b, data in enumerate(mini_batches):
        if b >= batchcnt:
            break

        (input_tensor, pos_tensor, target_tensor, relation_target_tensor,
         mask) = _joint_batch_tensors(data)

        if is_training:
            optimizer.zero_grad()
            (NER_active_logits, NER_active_labels, RE_output_tag,
             NER_output_tag, NER_output, BERT_pooled_output) = JointModel(
                 input_tensor, pos_tensor, target_tensor, args.batchsize, mask)

            if args.use_RL:
                NER_embedding = _mean_entity_embeddings(NER_output,
                                                        target_tensor)
                # The noisy-sentence state is carried over to the next batch.
                _, loss_RL, noisy_sentences_vec, noisy_vec_mean = RL_model(
                    BERT_pooled_output, NER_embedding, JointModel.noysy_model,
                    RE_output_tag, relation_target_tensor,
                    noisy_sentences_vec, noisy_vec_mean)
                loss = loss_RL
                loss_RL.backward()
            else:
                loss_entity = criterion(NER_active_logits, NER_active_labels)
                loss_RE = criterion(RE_output_tag, relation_target_tensor)
                loss = loss_entity + loss_RE
                if args.merge_loss:
                    loss.backward()
                else:
                    # Two separate backward passes over the same graph.
                    loss_entity.backward(retain_graph=True)
                    loss_RE.backward(retain_graph=True)

            optimizer.step()
        else:
            # The extra positional ``True`` is passed only on this path.
            (NER_active_logits, NER_active_labels, RE_output_tag,
             NER_output_tag, _, _) = JointModel(
                 input_tensor, pos_tensor, target_tensor, args.batchsize,
                 mask, True)

        NER_correct += (torch.argmax(NER_active_logits, -1) ==
                        NER_active_labels).sum().item()
        NER_total += len(NER_active_logits)
        RE_correct += (torch.argmax(RE_output_tag, -1) ==
                       relation_target_tensor).sum().item()
        RE_total += len(RE_output_tag)

        if not is_training:
            NER_target_all2.append(target_tensor.cpu().tolist())
            NER_output_all2.append(
                torch.argmax(NER_output_tag, -1).cpu().tolist())
            NER_output_logits.append(NER_output_tag.detach().cpu().tolist())
            RE_output_all2.append(
                torch.argmax(RE_output_tag, -1).cpu().tolist())
            RE_target_all2.append(
                relation_target_tensor.detach().cpu().tolist())
            RE_output_logits.append(RE_output_tag.cpu().tolist())
            if b % args.print_batch == 0:
                logger.info(
                    'seq-seq model: (%d %.2f%%), NER acc: %.4f, RE acc: %.4f' %
                    (b, float(b) / batchcnt * 100, NER_correct / NER_total,
                     RE_correct / RE_total))
        else:
            out_losses.append(loss.item())
            if b % args.print_batch == 0:
                if args.use_RL:
                    # ``loss_entity`` / ``loss_RE`` are never computed on the
                    # RL path, so report the RL loss instead of reading them.
                    logger.info(
                        'seq-seq model: (%d %.2f%%), loss_RL: %.4f, NER acc: %.4f, RE acc: %.4f'
                        % (b, float(b) / batchcnt * 100, loss.item(),
                           NER_correct / NER_total, RE_correct / RE_total))
                else:
                    logger.info(
                        'seq-seq model: (%d %.2f%%), loss_NER: %.4f, loss_RE: %.4f, NER acc: %.4f, RE acc: %.4f'
                        % (b, float(b) / batchcnt * 100, loss_entity.item(),
                           loss_RE.item(), NER_correct / NER_total,
                           RE_correct / RE_total))

    if is_training:
        np.save(args.output_dir + "loss_train", out_losses)
        return

    cal_F_score(RE_output_all2, RE_target_all2, NER_target_all2,
                NER_output_all2, args.batchsize)

    predictions = {
        "RE_predict": RE_output_all2,
        "RE_actual": RE_target_all2,
        "RE_output_logits": RE_output_logits,
        "NER_predict": NER_output_all2,
        "NER_actual": NER_target_all2,
        "NER_output_logits": NER_output_logits
    }
    if args.do_train:
        # While training, dump only for 'test', or for 'dev' when the
        # module-level epoch counter equals ``args.epochRL - 1``.
        if mode == 'test' or (mode == 'dev' and e == args.epochRL - 1):
            _append_json(
                args.output_dir + 'predict_%s_epoch_%s.json' % (mode, e),
                predictions)
    else:
        _append_json(args.output_dir + 'predict_%s.json' % mode, predictions)
print("training epoch ", e) # random.shuffle(train_data) # batchcnt = (len(train_data) - 1) // args.batchsize + 1 # for b in range(batchcnt): # # start = time.time() # datas = train_data[b * args.batchsize: (b + 1) * args.batchsize] mini_batches = get_minibatches(dev_datasets, args.batchsize) batchcnt = len( dev_datasets[0]) // args.batchsize # len(list(mini_batches)) for b, data in enumerate(mini_batches): if b >= batchcnt: break sentences, pos_lambda, tags, sentences_words, relation_tags, relation_names = data input_tensor, input_length = padding_sequence( sentences, pad_token=args.embedding_size) pos_tensor, input_length = padding_sequence(pos_lambda, pad_token=0) target_tensor, target_length = padding_sequence( tags, pad_token=args.entity_tag_size) relation_target_tensor = padding_sequence_recurr(relation_tags) if torch.cuda.is_available(): input_tensor = Variable( torch.cuda.LongTensor(input_tensor, device=device)).cuda() target_tensor = Variable( torch.cuda.LongTensor(target_tensor, device=device)).cuda() pos_tensor = Variable( torch.cuda.FloatTensor(pos_tensor, device=device)).cuda() relation_target_tensor = Variable( torch.cuda.LongTensor(relation_target_tensor,
def trainEpoches(encoder, decoder, criterion, print_every=10, learning_rate=0.001, l2=0.0001):
    """Train ``encoder`` and ``decoder`` for one pass over ``train_datasets``.

    A fresh Adam optimizer is created for each of the two modules, every full
    mini-batch is padded, converted to tensors and handed to ``train``, and
    the per-batch losses are saved with ``np.save("loss", ...)``.  When the
    module-level ``epoch`` is a multiple of 10, both modules are also written
    under ``./model/`` with ``torch.save``.

    Relies on module-level names: ``train_datasets``, ``BATCH``,
    ``EMBEDDING_SIZE``, ``TAG_SIZE``, ``device``, ``train`` and ``epoch``.

    NOTE(review): ``epoch`` is not assigned anywhere in this function (the
    epoch loop is commented out in the original), so it must exist at module
    level when this runs -- confirm.
    NOTE(review): ``train`` is called here with seven positional arguments;
    confirm it resolves to a function with that signature.

    Args:
        encoder: Module whose ``parameters()`` are optimised and which is
            passed on to ``train``.
        decoder: Same role as ``encoder`` for the decoding side.
        criterion: Loss callable forwarded to ``train``.
        print_every: Report the running mean loss whenever the batch index is
            a multiple of this value.
        learning_rate: Adam learning rate for both optimizers.
        l2: Adam ``weight_decay`` for both optimizers.

    Returns:
        None.
    """
    out_losses = []
    encoder_optimizer = optim.Adam(encoder.parameters(),
                                   lr=learning_rate,
                                   weight_decay=l2)
    decoder_optimizer = optim.Adam(decoder.parameters(),
                                   lr=learning_rate,
                                   weight_decay=l2)

    mini_batches = get_minibatches(train_datasets, BATCH)
    # Only full batches are processed.
    batches_size = len(train_datasets[0]) // BATCH
    use_cuda = torch.cuda.is_available()  # invariant across batches

    # Loss accumulated since the last report, and how many batches it covers.
    window_loss = 0
    window_batches = 0

    for i, data in enumerate(mini_batches):
        if i == batches_size:
            break

        sentences, tags = data
        input_tensor, _ = padding_sequence(sentences, pad_token=EMBEDDING_SIZE)
        target_tensor, _ = padding_sequence(tags, pad_token=TAG_SIZE)

        if use_cuda:
            input_tensor = Variable(
                torch.cuda.LongTensor(input_tensor, device=device)).cuda()
            target_tensor = Variable(
                torch.cuda.LongTensor(target_tensor, device=device)).cuda()
        else:
            input_tensor = Variable(
                torch.LongTensor(input_tensor, device=device))
            target_tensor = Variable(
                torch.LongTensor(target_tensor, device=device))

        loss = train(input_tensor, target_tensor, encoder, decoder,
                     encoder_optimizer, decoder_optimizer, criterion)
        out_losses.append(loss)
        window_loss += loss
        window_batches += 1

        if i % print_every == 0:
            # Divide by the number of batches actually accumulated: the first
            # report (i == 0) covers one batch, not ``print_every`` of them.
            print_loss_avg = window_loss / window_batches
            window_loss = 0
            window_batches = 0
            print(' (%d %d%%) %.4f' %
                  (i, float(i) / batches_size * 100, print_loss_avg))

    np.save("loss", out_losses)

    if epoch % 10 == 0:
        model_name = "./model/model_encoder_epoch" + str(epoch) + ".pkl"
        torch.save(encoder, model_name)
        model_name = "./model/model_decoder_epoch" + str(epoch) + ".pkl"
        torch.save(decoder, model_name)
        print("Model has been saved")