encoder_model_file = 'encoder_rev.7.pt'
decoder_model_file = 'decoder_rev.7.pt'
encoder.load_state_dict(torch.load(encoder_model_file))
decoder.load_state_dict(torch.load(decoder_model_file))

'''
# Load pre-trained embedding
model_file = 'bi_gru.100.100.2.pt'
if model_file != '':
    model.load_state_dict(torch.load(model_file))
else:
    model.load_pre_train_emb('cityu_training.char.emb.npy', 'cityu_training.char.dict', vocab)
'''

# summed NLL loss; padding positions are excluded via ignore_index
loss_function = nn.NLLLoss(reduction='sum', ignore_index=de_vocab.item2index['_PAD_'])
en_optimizer = optim.Adam(encoder.parameters(), lr=1e-3, weight_decay=0)
de_optimizer = optim.Adam(decoder.parameters(), lr=1e-3, weight_decay=0)

if use_cuda:
    encoder = encoder.cuda()
    decoder = decoder.cuda()
    ones_matrix = ones_matrix.cuda()
    loss_function = loss_function.cuda()

for epoch in range(20):
    pl.reset()
    encoder.train()
    decoder.train()
    total_loss = torch.Tensor([0])
    total_token = 0
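As an aside (not part of the original script), the choice of reduction='sum' together with ignore_index on the padding token is what makes running counters like total_loss and total_token meaningful: padding contributes nothing to the summed loss, and dividing by the count of real tokens gives a per-token loss that is comparable across batches with different amounts of padding. A minimal standalone sketch, assuming a PAD index of 0 in place of de_vocab.item2index['_PAD_']:

import torch
import torch.nn as nn

PAD_IDX = 0  # assumed padding index for this sketch
loss_fn = nn.NLLLoss(reduction='sum', ignore_index=PAD_IDX)

log_probs = torch.log_softmax(torch.randn(6, 5), dim=-1)  # 6 target positions, vocab of 5
targets = torch.tensor([3, 1, 4, PAD_IDX, PAD_IDX, 2])    # two positions are padding

batch_loss = loss_fn(log_probs, targets)          # padded positions add 0 to the sum
n_tokens = (targets != PAD_IDX).sum().item()      # 4 real tokens
per_token_loss = batch_loss.item() / n_tokens     # comparable across batches
print(per_token_loss)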
def train(article, title, word2idx, target2idx, source_lengths, target_lengths, args,
          val_article=None, val_title=None, val_source_lengths=None, val_target_lengths=None):
    # carve out a validation split (5%) the first time; afterwards reuse the cached pickles
    if not os.path.exists('./temp/x.pkl'):
        size_of_val = int(len(article) * 0.05)
        val_article, val_title, val_source_lengths, val_target_lengths = \
            utils.sampling(article, title, source_lengths, target_lengths, size_of_val)
        utils.save_everything(article, title, source_lengths, target_lengths,
                              val_article, val_title, val_source_lengths, val_target_lengths,
                              word2idx)
    size_of_val = len(val_article)
    batch_size = args.batch
    train_size = len(article)
    val_size = len(val_article)
    max_a = max(source_lengths)
    max_t = max(target_lengths)

    print("source vocab size:", len(word2idx))
    print("target vocab size:", len(target2idx))
    print("max a:{}, max t:{}".format(max_a, max_t))
    print("train_size:", train_size)
    print("val size:", val_size)
    print("batch_size:", batch_size)
    print("-" * 30)

    use_coverage = False
    encoder = Encoder(len(word2idx))
    decoder = Decoder(len(target2idx), 50)
    # resume from a previous checkpoint if one exists
    if os.path.exists('decoder_model'):
        encoder.load_state_dict(torch.load('encoder_model'))
        decoder.load_state_dict(torch.load('decoder_model'))
    optimizer = torch.optim.Adam(list(encoder.parameters()) + list(decoder.parameters()), lr=0.001)
    n_epoch = 5

    print("Making word index and extend vocab")
    #article, article_tar, title, ext_vocab_all, ext_count = indexing_word(article, title, word2idx, target2idx)
    #article = to_tensor(article)
    #article_extend = to_tensor(article_extend)
    #title = to_tensor(title)
    print("preprocess done")

    if args.use_cuda:
        encoder.cuda()
        decoder.cuda()

    print("start training")
    for epoch in range(n_epoch):
        total_loss = 0
        batch_n = int(train_size / batch_size)
        # enable the coverage mechanism after the first epoch
        if epoch > 0:
            use_coverage = True
        for b in range(batch_n):
            # slice out and index the current batch
            batch_x = article[b * batch_size:(b + 1) * batch_size]
            batch_y = title[b * batch_size:(b + 1) * batch_size]
            #batch_x_ext = article_extend[b*batch_size: (b+1)*batch_size]
            batch_x, batch_x_ext, batch_y, extend_vocab, extend_lengths = \
                utils.batch_index(batch_x, batch_y, word2idx, target2idx)
            if args.use_cuda:
                batch_x = batch_x.cuda()
                batch_y = batch_y.cuda()
                batch_x_ext = batch_x_ext.cuda()
            x_lengths = source_lengths[b * batch_size:(b + 1) * batch_size]
            y_lengths = target_lengths[b * batch_size:(b + 1) * batch_size]

            # workaround for lengths: the pack/pad round trip trims batch_x_ext
            # to the longest sequence in this batch
            pack = pack_padded_sequence(batch_x_ext, x_lengths, batch_first=True)
            batch_x_ext_var, _ = pad_packed_sequence(pack, batch_first=True)

            current_loss = train_on_batch(encoder, decoder, optimizer, batch_x, batch_y,
                                          x_lengths, y_lengths, word2idx, target2idx,
                                          batch_x_ext_var, extend_lengths, use_coverage)
            batch_x = batch_x.cpu()
            batch_y = batch_y.cpu()
            batch_x_ext = batch_x_ext.cpu()
            print('epoch:{}/{}, batch:{}/{}, loss:{}'.format(
                epoch + 1, n_epoch, b + 1, batch_n, current_loss))

            # periodically checkpoint and decode one random validation example
            if (b + 1) % args.show_decode == 0:
                torch.save(encoder.state_dict(), 'encoder_model')
                torch.save(decoder.state_dict(), 'decoder_model')
                batch_x_val, batch_x_ext_val, batch_y_val, extend_vocab, extend_lengths = \
                    utils.batch_index(val_article, val_title, word2idx, target2idx)
                for i in range(1):
                    idx = np.random.randint(0, val_size)
                    decode.beam_search(encoder, decoder, batch_x_val[idx].unsqueeze(0),
                                       batch_y_val[idx].unsqueeze(0), word2idx, target2idx,
                                       batch_x_ext_val[idx], extend_lengths[idx], extend_vocab[idx])
                batch_x_val = batch_x_val.cpu()
                batch_y_val = batch_y_val.cpu()
                batch_x_ext_val = batch_x_ext_val.cpu()
            total_loss += current_loss
        print('-' * 30)
        print()
    print("training finished")
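The pack_padded_sequence / pad_packed_sequence round trip flagged as a workaround above only shortens the padded batch: packing keeps just the real tokens, and immediately unpadding yields a tensor whose time dimension is the longest length in that batch rather than the global maximum, so batch_x_ext_var lines up with the encoder outputs. A minimal standalone sketch with toy data (not the project's utils):

import torch
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

# Two sequences padded to a global maximum length of 6 with pad id 0.
batch = torch.tensor([[5, 6, 7, 0, 0, 0],
                      [8, 9, 0, 0, 0, 0]])
lengths = [3, 2]  # true lengths, sorted descending as pack_padded_sequence expects

packed = pack_padded_sequence(batch, lengths, batch_first=True)
trimmed, _ = pad_packed_sequence(packed, batch_first=True)

print(trimmed.shape)  # torch.Size([2, 3]) -- trimmed to the longest sequence in the batch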
l_trn_src = pickle.load(open('data/l_trn_src.pkl', 'rb'))
trn_src_p = pickle.load(open('data/trn_src_p.pkl', 'rb'))
l_trn_tgt = pickle.load(open('data/l_trn_tgt.pkl', 'rb'))
trn_tgt_p = pickle.load(open('data/trn_tgt_p.pkl', 'rb'))

tst_src_t = torch.LongTensor(tst_src_p)
tst_tgt_t = torch.LongTensor(tst_tgt_p)
trn_src_t = torch.LongTensor(trn_src_p)
trn_tgt_t = torch.LongTensor(trn_tgt_p)

enc = Encoder(len(vocab), 100, 100, 2, 'cuda', vocab[pad])
dec = Decoder(len(vocab), 100, 100, 2, 'cuda', vocab[pad], vocab[sos], vocab[eos], vocab[unk])
enc.to('cuda')
dec.to('cuda')

opt_enc = torch.optim.Adam(enc.parameters())
opt_dec = torch.optim.Adam(dec.parameters())

n_batch = len(trn_src_p) // batch_size
for e in range(epochs):
    enc.train()
    dec.train()
    epoch_loss = 0
    for i in range(n_batch):
        opt_enc.zero_grad()
        opt_dec.zero_grad()
        lengths = torch.LongTensor(l_trn_src[batch_size * i:batch_size * (i + 1)])
        out, h_n = enc(trn_src_t[batch_size * i:batch_size * (i + 1)], lengths)
        output = dec.teacher_force(