# Shared imports for the snippets below; LM, config, corpus_to_word,
# word_to_idx, word_to_char, and the project's `eval` helper are assumed to be
# defined elsewhere in the surrounding codebase.
import pickle

import numpy as np
import torch
import torch.nn as nn
from torch.autograd import Variable


def main(test_data_path):
    """Evaluate a saved checkpoint of the character-aware LM on a test corpus."""
    dic = pickle.load(open('vocab.pkl', 'rb'))
    word_vocab = dic['word_vocab']
    char_vocab = dic['char_vocab']
    max_len = dic['max_len']
    batch_size = config.batch_size
    embed_dim = config.embed_dim
    out_channels = config.out_channels
    kernels = config.kernels
    hidden_size = config.hidden_size
    learning_rate = config.learning_rate
    seq_len = config.seq_len

    # build the word-index targets and character-level inputs for the test set
    test_data, _ = corpus_to_word(test_data_path, batch_size)
    test_idx = word_to_idx(test_data, word_vocab)
    test_idx = test_idx.contiguous().view(batch_size, -1)
    test_data = word_to_char(test_data, char_vocab, max_len)
    test_data = torch.from_numpy(test_data)
    test_data = test_data.contiguous().view(batch_size, -1, max_len)

    model = LM(word_vocab, char_vocab, max_len, embed_dim, out_channels,
               kernels, hidden_size)
    if torch.cuda.is_available():
        model.cuda()
    model.load_state_dict(torch.load('model.pkl'))

    criterion = nn.CrossEntropyLoss()
    # note: the optimizer and scheduler below are never stepped on this test path
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate,
                                weight_decay=1e-5)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, 'min', factor=0.5, patience=1, verbose=True)

    # initial (h, c) LSTM state
    hidden_state = (Variable(torch.zeros(2, batch_size, hidden_size).cuda(), volatile=False),
                    Variable(torch.zeros(2, batch_size, hidden_size).cuda(), volatile=False))

    model.eval()
    # `eval` here is the project's evaluation helper (it shadows the Python builtin)
    test_loss = eval(seq_len, test_data, test_idx, model, hidden_state, criterion)
    test_loss = np.exp(test_loss)  # perplexity
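# A minimal sketch of how main() might be invoked; the entry point is not part
# of the original source, and taking the test-corpus path from the first CLI
# argument is an assumption:
if __name__ == '__main__':
    import sys
    main(sys.argv[1])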
# Imports for the encoder-decoder training snippets; `options`, `utils.tensor`,
# and the LM class come from the surrounding project.
import os
import logging
import random
from random import shuffle

import dill
import torch
from torch import cuda
from torch.autograd import Variable


def try_params(n_iterations, params):
    """Train the src/trg LM pair for `n_iterations` epochs with hyperparameters
    `params` and return the final dev loss, in the dict format expected by a
    hyperparameter-search driver."""
    n_iterations = int(n_iterations)
    use_cuda = (len(options.gpuid) >= 1)
    if options.gpuid:
        cuda.set_device(options.gpuid[0])

    src_vocab = dill.load(open('src_vocab.pickle', 'rb'))
    trg_vocab = dill.load(open('trg_vocab.pickle', 'rb'))
    src_dev = dill.load(open('src_dev.pickle', 'rb'))
    trg_dev = dill.load(open('trg_dev.pickle', 'rb'))

    batched_dev_src, batched_dev_src_mask, sort_index = utils.tensor.advanced_batchize(
        src_dev, options.batch_size, src_vocab.stoi["<blank>"])
    batched_dev_trg, batched_dev_trg_mask = utils.tensor.advanced_batchize_no_sort(
        trg_dev, options.batch_size, trg_vocab.stoi["<blank>"], sort_index)

    # each entry of `batches` is (data-source index, batch index); indices 1-5
    # select bilingual, trg-only, src-only, src-mono, and trg-mono batches below
    batches = []
    if options.contain_bilingual:
        print('Load')
        src_train = dill.load(open('src_sents1.pickle', 'rb'))
        print('Load src sents 1')
        trg_train = dill.load(open('trg_sents1.pickle', 'rb'))
        print('Load trg sents 1')
        batched_train_src1, batched_train_src_mask1, sort_index = utils.tensor.advanced_batchize(
            src_train, options.batch_size, src_vocab.stoi["<blank>"])
        batched_train_trg1, batched_train_trg_mask1 = utils.tensor.advanced_batchize_no_sort(
            trg_train, options.batch_size, trg_vocab.stoi["<blank>"], sort_index)
        batches = batches + [(1, i) for i in range(len(batched_train_src1))]
        if options.mono_loss:
            batches = batches + [(4, i) for i in range(len(batched_train_src1))]
            batches = batches + [(5, i) for i in range(len(batched_train_src1))]

    if options.contain_trg:
        print('Load')
        # src_train = dill.load(open('src_sents2.pickle', 'rb'))
        # print('Load src sents 2')
        trg_train = dill.load(open('trg_sents2.pickle', 'rb'))
        print('Load trg sents 2')
        # batched_train_src2, batched_train_src_mask2, sort_index = utils.tensor.advanced_batchize(
        #     src_train, options.batch_size, src_vocab.stoi["<blank>"])
        # NOTE: sort_index here is left over from the previous advanced_batchize
        # call, since the matching src batchize above is commented out
        batched_train_trg2, batched_train_trg_mask2 = utils.tensor.advanced_batchize_no_sort(
            trg_train, options.batch_size, trg_vocab.stoi["<blank>"], sort_index)
        batches = batches + [(2, i) for i in range(len(batched_train_trg2))]

    if options.contain_src:
        print('Load')
        src_train = dill.load(open('src_sents3.pickle', 'rb'))
        print('Load src sents 3')
        # trg_train = dill.load(open('trg_sents3.pickle', 'rb'))
        # print('Load trg sents 3')
        batched_train_src3, batched_train_src_mask3, sort_index = utils.tensor.advanced_batchize(
            src_train, options.batch_size, src_vocab.stoi["<blank>"])
        # batched_train_trg3, batched_train_trg_mask3 = utils.tensor.advanced_batchize_no_sort(
        #     trg_train, options.batch_size, trg_vocab.stoi["<blank>"], sort_index)
        batches = batches + [(3, i) for i in range(len(batched_train_src3))]

    src_vocab_size = len(src_vocab)
    trg_vocab_size = len(trg_vocab)

    if os.path.isfile(options.load_file_src) and os.path.isfile(options.load_file_trg):
        src_lm = torch.load(open(options.load_file_src, 'rb'))
        trg_lm = torch.load(open(options.load_file_trg, 'rb'))
    else:
        src_lm = LM(src_vocab_size, src_vocab.stoi['<s>'], src_vocab.stoi['</s>'],
                    params['embedding_size'], params['hidden_size'],
                    params['dropout'], use_cuda)
        trg_lm = LM(trg_vocab_size, trg_vocab.stoi['<s>'], trg_vocab.stoi['</s>'],
                    params['embedding_size'], params['hidden_size'],
                    params['dropout'], use_cuda)

    if use_cuda:
        src_lm.cuda()
        trg_lm.cuda()
    else:
        src_lm.cpu()
        trg_lm.cpu()

    criterion = torch.nn.NLLLoss()
    # getattr avoids eval() on a user-supplied optimizer name
    optimizer_cls = getattr(torch.optim, options.optimizer)
    optimizer_src = optimizer_cls(src_lm.parameters(), params['learning_rate'])
    optimizer_trg = optimizer_cls(trg_lm.parameters(), params['learning_rate'])

    # main training loop (pre-0.4 PyTorch API: Variable, volatile, loss.data[0])
    # last_dev_avg_loss = float("inf")
    for epoch_i in range(n_iterations):
        print(epoch_i)
        logging.info("At {0}-th epoch.".format(epoch_i))
        shuffle(batches)
        src_lm.train()
        trg_lm.train()
        for i, (index, batch_i) in enumerate(batches):
            train_src_batch = None
            train_src_mask = None
            train_trg_batch = None
            train_trg_mask = None
            if index == 1:
                train_src_batch = Variable(batched_train_src1[batch_i])
                train_src_mask = Variable(batched_train_src_mask1[batch_i])
                train_trg_batch = Variable(batched_train_trg1[batch_i])
                train_trg_mask = Variable(batched_train_trg_mask1[batch_i])
                if use_cuda:
                    train_src_batch = train_src_batch.cuda()
                    train_trg_batch = train_trg_batch.cuda()
                    train_src_mask = train_src_mask.cuda()
                    train_trg_mask = train_trg_mask.cuda()
            elif index == 2:
                train_trg_batch = Variable(batched_train_trg2[batch_i])
                train_trg_mask = Variable(batched_train_trg_mask2[batch_i])
                if use_cuda:
                    train_trg_batch = train_trg_batch.cuda()
                    train_trg_mask = train_trg_mask.cuda()
            elif index == 3:
                train_src_batch = Variable(batched_train_src3[batch_i])
                train_src_mask = Variable(batched_train_src_mask3[batch_i])
                if use_cuda:
                    train_src_batch = train_src_batch.cuda()
                    train_src_mask = train_src_mask.cuda()
            elif index == 4:
                train_src_batch = Variable(batched_train_src1[batch_i])
                train_src_mask = Variable(batched_train_src_mask1[batch_i])
                if use_cuda:
                    train_src_batch = train_src_batch.cuda()
                    train_src_mask = train_src_mask.cuda()
            elif index == 5:
                train_trg_batch = Variable(batched_train_trg1[batch_i])
                train_trg_mask = Variable(batched_train_trg_mask1[batch_i])
                if use_cuda:
                    train_trg_batch = train_trg_batch.cuda()
                    train_trg_mask = train_trg_mask.cuda()
            else:
                raise ValueError()

            total_loss = 0  # unused
            if index == 1:
                # bilingual batch: encode src, decode trg
                optimizer_trg.zero_grad()
                optimizer_src.zero_grad()
                h_src, c_src = src_lm(sent=train_src_batch)
                use_teacher_forcing = random.random() < params['teacher_forcing_ratio']
                sys_out_batch = trg_lm(h=h_src, c=c_src, encode=False,
                                       tgt_sent=train_trg_batch,
                                       teacher_forcing=use_teacher_forcing)
                # drop padded positions from outputs and targets before the loss
                train_trg_mask_tmp = train_trg_mask.view(-1)
                train_trg_batch_tmp = train_trg_batch.view(-1)
                train_trg_batch_tmp = train_trg_batch_tmp.masked_select(train_trg_mask_tmp)
                train_trg_mask_tmp = train_trg_mask_tmp.unsqueeze(1).expand(
                    len(train_trg_mask_tmp), trg_vocab_size)
                sys_out_batch = sys_out_batch.view(-1, trg_vocab_size)
                sys_out_batch = sys_out_batch.masked_select(train_trg_mask_tmp).view(
                    -1, trg_vocab_size)
                loss = criterion(sys_out_batch, train_trg_batch_tmp)
                loss.backward()
                optimizer_src.step()
                optimizer_trg.step()
                if i % 100 == 0:
                    logging.debug("loss at batch {0}: {1}".format(i, loss.data[0]))
            elif options.mono_loss and train_src_batch is not None:
                # monolingual src batch: reconstruct the source from its own encoding
                optimizer_trg.zero_grad()
                optimizer_src.zero_grad()
                h_src, c_src = src_lm(sent=train_src_batch)
                use_teacher_forcing = random.random() < params['teacher_forcing_ratio']
                sys_out_batch = src_lm(h=h_src, c=c_src, encode=False,
                                       tgt_sent=train_src_batch,
                                       teacher_forcing=use_teacher_forcing)
                train_src_mask_tmp = train_src_mask.view(-1)
                train_src_batch_tmp = train_src_batch.view(-1)
                train_src_batch_tmp = train_src_batch_tmp.masked_select(train_src_mask_tmp)
                train_src_mask_tmp = train_src_mask_tmp.unsqueeze(1).expand(
                    len(train_src_mask_tmp), src_vocab_size)
                sys_out_batch = sys_out_batch.view(-1, src_vocab_size)
                sys_out_batch = sys_out_batch.masked_select(train_src_mask_tmp).view(
                    -1, src_vocab_size)
                loss = criterion(sys_out_batch, train_src_batch_tmp)
                # linear ramp-up of the monolingual loss weight (zero at epoch 0)
                loss *= params['mono_loss_multi'] * (1.0 / 10 * epoch_i)
                loss.backward()
                optimizer_src.step()
                optimizer_trg.step()
                if i % 100 == 0:
                    logging.debug("loss at batch {0}: {1}".format(i, loss.data[0]))
            elif train_trg_batch is not None and options.mono_loss:
                # monolingual trg batch: reconstruct the target from its own encoding
                optimizer_trg.zero_grad()
                optimizer_src.zero_grad()
                h_trg, c_trg = trg_lm(sent=train_trg_batch)
                use_teacher_forcing = random.random() < params['teacher_forcing_ratio']
                sys_out_batch = trg_lm(h=h_trg, c=c_trg, encode=False,
                                       tgt_sent=train_trg_batch,
                                       teacher_forcing=use_teacher_forcing)
                train_trg_mask_tmp = train_trg_mask.view(-1)
                train_trg_batch_tmp = train_trg_batch.view(-1)
                train_trg_batch_tmp = train_trg_batch_tmp.masked_select(train_trg_mask_tmp)
                train_trg_mask_tmp = train_trg_mask_tmp.unsqueeze(1).expand(
                    len(train_trg_mask_tmp), trg_vocab_size)
                sys_out_batch = sys_out_batch.view(-1, trg_vocab_size)
                sys_out_batch = sys_out_batch.masked_select(train_trg_mask_tmp).view(
                    -1, trg_vocab_size)
                loss = criterion(sys_out_batch, train_trg_batch_tmp)
                loss *= params['mono_loss_multi'] * (1.0 / 10 * epoch_i)
                loss.backward()
                optimizer_src.step()
                optimizer_trg.step()
                if i % 100 == 0:
                    logging.debug("loss at batch {0}: {1}".format(i, loss.data[0]))

        # validation -- a crude estimate because there may be padding at the end
        dev_loss = 0.0
        src_lm.eval()
        trg_lm.eval()
        for batch_i in range(len(batched_dev_src)):
            dev_src_batch = Variable(batched_dev_src[batch_i], volatile=True)
            dev_trg_batch = Variable(batched_dev_trg[batch_i], volatile=True)
            dev_src_mask = Variable(batched_dev_src_mask[batch_i], volatile=True)
            dev_trg_mask = Variable(batched_dev_trg_mask[batch_i], volatile=True)
            if use_cuda:
                dev_src_batch = dev_src_batch.cuda()
                dev_trg_batch = dev_trg_batch.cuda()
                dev_src_mask = dev_src_mask.cuda()
                dev_trg_mask = dev_trg_mask.cuda()
            h_src, c_src = src_lm(sent=dev_src_batch)
            sys_out_batch = trg_lm(h=h_src, c=c_src, encode=False,
                                   tgt_sent=dev_trg_batch)
            dev_trg_mask = dev_trg_mask.view(-1)
            dev_trg_batch = dev_trg_batch.view(-1)
            dev_trg_batch = dev_trg_batch.masked_select(dev_trg_mask)
            dev_trg_mask = dev_trg_mask.unsqueeze(1).expand(
                len(dev_trg_mask), trg_vocab_size)
            sys_out_batch = sys_out_batch.view(-1, trg_vocab_size)
            sys_out_batch = sys_out_batch.masked_select(dev_trg_mask).view(
                -1, trg_vocab_size)
            loss = criterion(sys_out_batch, dev_trg_batch)
            logging.debug("dev loss at batch {0}: {1}".format(batch_i, loss.data[0]))
            dev_loss += loss

        dev_avg_loss = dev_loss / len(batched_dev_src)
        logging.info("Average loss value per instance is {0} at the end of epoch {1}".format(
            dev_avg_loss.data[0], epoch_i))

        # if (last_dev_avg_loss - dev_avg_loss).data[0] < options.estop:
        #     logging.info("Early stopping triggered with threshold {0} "
        #                  "(previous dev loss: {1}, current: {2})".format(
        #                      epoch_i, last_dev_avg_loss.data[0], dev_avg_loss.data[0]))
        #     break
        # torch.save(src_lm, open(options.model_file_src
        #     + ".nll_{0:.2f}.epoch_{1}".format(dev_avg_loss.data[0], epoch_i), 'wb'),
        #     pickle_module=dill)
        # torch.save(trg_lm, open(options.model_file_trg
        #     + ".nll_{0:.2f}.epoch_{1}".format(dev_avg_loss.data[0], epoch_i), 'wb'),
        #     pickle_module=dill)
        # last_dev_avg_loss = dev_avg_loss

    return {'loss': dev_avg_loss.data[0]}
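# Hedged sketch of a search driver for try_params(). The dict keys are exactly
# the ones the function reads; the sampling ranges, trial count, and epoch
# budget are illustrative assumptions, not values from the source:
def random_search(n_trials=10, n_iterations=3):
    best = None
    for _ in range(n_trials):
        params = {
            'embedding_size': random.choice([128, 256, 512]),
            'hidden_size': random.choice([256, 512, 1024]),
            'dropout': random.uniform(0.1, 0.5),
            'learning_rate': 10 ** random.uniform(-4, -2),
            'teacher_forcing_ratio': random.uniform(0.5, 1.0),
            'mono_loss_multi': random.uniform(0.1, 1.0),
        }
        result = try_params(n_iterations, params)
        if best is None or result['loss'] < best[0]:
            best = (result['loss'], params)
    return best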
def train_(self):
    """Train the character-aware LM with truncated BPTT; halve the learning rate
    when validation perplexity plateaus and checkpoint the best model."""
    cur_best = 10000
    model = LM(self.unique_words, self.char_vocab, self.max_len, self.embed_dim,
               self.channels, self.kernels, self.hidden_size)
    if torch.cuda.is_available():
        model.cuda()
    learning_rate = self.learning_rate
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    for epoch in range(self.epochs):
        model.train(True)
        # fresh (h, c) LSTM state each epoch; a comprehension is used because
        # the original `[tensor] * 2` aliased one tensor twice
        hidden_state = [torch.zeros(2, self.batch_size, self.hidden_size).cuda()
                        for _ in range(2)]
        for i in range(0, self.train.size(1) - self.seq_len, self.seq_len):
            model.zero_grad()
            inputs = self.train[:, i:i + self.seq_len, :].cuda()   # e.g. (20, 35, 21)
            targets = self.train_idx[:, (i + 1):(i + 1) + self.seq_len].cuda()  # e.g. (20, 35)
            # detach so gradients do not flow across truncation boundaries
            hidden_state = [state.detach() for state in hidden_state]
            output, hidden_state = model(inputs, hidden_state)
            loss = criterion(output, targets.view(-1))
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 5)  # gradient clipping
            optimizer.step()

            step = (i + 1) // self.seq_len
            if step % 100 == 0:
                print('Epoch %d/%d, Batch x Seq_Len %d/%d, Loss: %.3f, Perplexity: %5.2f'
                      % (epoch, self.epochs, step, self.num_batches // self.seq_len,
                         loss.item(), np.exp(loss.item())))

        model.eval()
        val_loss = self._validate(self.seq_len, self.valid, self.valid_idx,
                                  model, hidden_state, criterion)
        val_perplex = np.exp(val_loss)
        # halve the learning rate (down to a floor) when the improvement over
        # the best perplexity so far is less than 1
        if cur_best - val_perplex < 1:
            if learning_rate > 0.03:
                learning_rate = learning_rate * 0.5
                print("Adjusted learning_rate : %.5f" % learning_rate)
                optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
        if val_perplex < cur_best:
            print("The current best val loss: ", val_loss)
            cur_best = val_perplex
            torch.save(model.state_dict(), 'model.pkl')
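# Why the detach() above matters: without it, backward() in window t would try
# to propagate through every earlier window's (already freed) graph. The same
# truncated-BPTT pattern on a throwaway nn.LSTM; all shapes here are
# illustrative, not taken from the model above:
lstm = nn.LSTM(input_size=8, hidden_size=16, num_layers=2, batch_first=True)
h = torch.zeros(2, 4, 16)   # (num_layers, batch, hidden)
c = torch.zeros(2, 4, 16)
for _ in range(3):                    # three truncation windows
    h, c = h.detach(), c.detach()     # cut the graph at the window boundary
    x = torch.randn(4, 5, 8)          # (batch, seq_len, input_size)
    out, (h, c) = lstm(x, (h, c))
    out.sum().backward()              # gradients stay within this window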
# Fragment of an earlier version of the training loop (pre-0.4 PyTorch API:
# clip_grad_norm, loss.data[0]); the enclosing function is not in the source.
            loss = criterion(output, targets.view(-1))
            loss.backward()
            nn.utils.clip_grad_norm(model.parameters(), 5)
            optimizer.step()

            step = (i + 1) // seq_len
            if step % 100 == 0:
                print('Epoch [%d/%d], Step[%d/%d], Loss: %.3f, Perplexity: %5.2f'
                      % (epoch + 1, num_epochs, step, num_batches // seq_len,
                         loss.data[0], np.exp(loss.data[0])))

        model.eval()
        # validate
        val_loss = validate(seq_len, val_data, val_label, model, hidden_state)
        val_loss = np.exp(val_loss)  # perplexity
        # halve the learning rate (down to a floor) when improvement is small
        if pivot - val_loss < 0.8:
            if learning_rate > 0.03:
                learning_rate = learning_rate * 0.5
                print(learning_rate)
                optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
        pivot = val_loss
        if val_loss < best_ppl:
            # truncated in the source; by analogy with train_() above, the usual
            # continuation records the new best and saves a checkpoint:
            best_ppl = val_loss
            torch.save(model.state_dict(), 'model.pkl')
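# The manual halving above (and in train_) reimplements what
# torch.optim.lr_scheduler.ReduceLROnPlateau, already used in the test-time
# function at the top, provides out of the box. A sketch assuming `optimizer`
# and a per-epoch `val_loss` as in the loop above:
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, 'min', factor=0.5, patience=1)
# then, once per epoch after validation:
scheduler.step(val_loss)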
def main(options):
    """Train the src->trg encoder-decoder LM pair on the concatenated bilingual
    data and checkpoint both models after every epoch."""
    use_cuda = (len(options.gpuid) >= 1)
    if options.gpuid:
        cuda.set_device(options.gpuid[0])

    src_vocab = dill.load(open('src_vocab.pickle', 'rb'))
    trg_vocab = dill.load(open('trg_vocab.pickle', 'rb'))
    src_dev = dill.load(open('src_dev.pickle', 'rb'))
    trg_dev = dill.load(open('trg_dev.pickle', 'rb'))

    batched_dev_src, batched_dev_src_mask, sort_index = utils.tensor.advanced_batchize(
        src_dev, options.batch_size, src_vocab.stoi["<blank>"])
    batched_dev_trg, batched_dev_trg_mask = utils.tensor.advanced_batchize_no_sort(
        trg_dev, options.batch_size, trg_vocab.stoi["<blank>"], sort_index)

    # batches = []
    # if options.contain_bilingual:
    print('Load')
    src_train = dill.load(open('src_sents1.pickle', 'rb'))
    print('Load src sents 1')
    trg_train = dill.load(open('trg_sents1.pickle', 'rb'))
    print('Load trg sents 1')
    src_train = src_train + dill.load(open('src_sents2.pickle', 'rb'))
    print('Load src sents 2')
    trg_train = trg_train + dill.load(open('trg_sents2.pickle', 'rb'))
    print('Load trg sents 2')
    src_train = src_train + dill.load(open('src_sents3.pickle', 'rb'))
    print('Load src sents 3')
    trg_train = trg_train + dill.load(open('trg_sents3.pickle', 'rb'))
    print('Load trg sents 3')

    batched_train_src, batched_train_src_mask, sort_index = utils.tensor.advanced_batchize(
        src_train, options.batch_size, src_vocab.stoi["<blank>"])
    batched_train_trg, batched_train_trg_mask = utils.tensor.advanced_batchize_no_sort(
        trg_train, options.batch_size, trg_vocab.stoi["<blank>"], sort_index)

    src_vocab_size = len(src_vocab)
    trg_vocab_size = len(trg_vocab)

    if os.path.isfile(options.load_file_src) and os.path.isfile(options.load_file_trg):
        src_lm = torch.load(open(options.load_file_src, 'rb'))
        trg_lm = torch.load(open(options.load_file_trg, 'rb'))
    else:
        src_lm = LM(src_vocab_size, src_vocab.stoi['<s>'], src_vocab.stoi['</s>'],
                    options.embedding_size, options.hidden_size, options.dropout,
                    use_cuda)
        trg_lm = LM(trg_vocab_size, trg_vocab.stoi['<s>'], trg_vocab.stoi['</s>'],
                    options.embedding_size, options.hidden_size, options.dropout,
                    use_cuda)

    if use_cuda:
        src_lm.cuda()
        trg_lm.cuda()
    else:
        src_lm.cpu()
        trg_lm.cpu()

    criterion = torch.nn.NLLLoss()
    # getattr avoids eval() on a user-supplied optimizer name
    optimizer_cls = getattr(torch.optim, options.optimizer)
    optimizer_src = optimizer_cls(src_lm.parameters(), options.learning_rate)
    optimizer_trg = optimizer_cls(trg_lm.parameters(), options.learning_rate)

    # main training loop (pre-0.4 PyTorch API: Variable, volatile, loss.data[0])
    # last_dev_avg_loss = float("inf")
    for epoch_i in range(options.epochs):
        print(epoch_i)
        logging.info("At {0}-th epoch.".format(epoch_i))
        src_lm.train()
        trg_lm.train()
        for i, batch_i in enumerate(range(len(batched_train_src))):
            optimizer_trg.zero_grad()
            optimizer_src.zero_grad()
            train_src_batch = Variable(batched_train_src[batch_i])
            train_src_mask = Variable(batched_train_src_mask[batch_i])
            train_trg_batch = Variable(batched_train_trg[batch_i])
            train_trg_mask = Variable(batched_train_trg_mask[batch_i])
            if use_cuda:
                train_src_batch = train_src_batch.cuda()
                train_trg_batch = train_trg_batch.cuda()
                train_src_mask = train_src_mask.cuda()
                train_trg_mask = train_trg_mask.cuda()

            # encode the source sentence, then decode the target from (h, c)
            h_src, c_src = src_lm(sent=train_src_batch)
            use_teacher_forcing = random.random() < options.teacher_forcing_ratio
            sys_out_batch = trg_lm(h=h_src, c=c_src, encode=False,
                                   tgt_sent=train_trg_batch,
                                   teacher_forcing=use_teacher_forcing)

            # drop padded positions from outputs and targets before the loss
            train_trg_mask_tmp = train_trg_mask.view(-1)
            train_trg_batch_tmp = train_trg_batch.view(-1)
            train_trg_batch_tmp = train_trg_batch_tmp.masked_select(train_trg_mask_tmp)
            train_trg_mask_tmp = train_trg_mask_tmp.unsqueeze(1).expand(
                len(train_trg_mask_tmp), trg_vocab_size)
            sys_out_batch = sys_out_batch.view(-1, trg_vocab_size)
            sys_out_batch = sys_out_batch.masked_select(train_trg_mask_tmp).view(
                -1, trg_vocab_size)
            loss = criterion(sys_out_batch, train_trg_batch_tmp)
            loss.backward()
            optimizer_src.step()
            optimizer_trg.step()
            if i % 100 == 0:
                logging.debug("loss at batch {0}: {1}".format(i, loss.data[0]))

        # validation -- a crude estimate because there may be padding at the end
        dev_loss = 0.0
        src_lm.eval()
        trg_lm.eval()
        for batch_i in range(len(batched_dev_src)):
            dev_src_batch = Variable(batched_dev_src[batch_i], volatile=True)
            dev_trg_batch = Variable(batched_dev_trg[batch_i], volatile=True)
            dev_src_mask = Variable(batched_dev_src_mask[batch_i], volatile=True)
            dev_trg_mask = Variable(batched_dev_trg_mask[batch_i], volatile=True)
            if use_cuda:
                dev_src_batch = dev_src_batch.cuda()
                dev_trg_batch = dev_trg_batch.cuda()
                dev_src_mask = dev_src_mask.cuda()
                dev_trg_mask = dev_trg_mask.cuda()

            h_src, c_src = src_lm(sent=dev_src_batch)
            sys_out_batch = trg_lm(h=h_src, c=c_src, encode=False,
                                   tgt_sent=dev_trg_batch)
            dev_trg_mask = dev_trg_mask.view(-1)
            dev_trg_batch = dev_trg_batch.view(-1)
            dev_trg_batch = dev_trg_batch.masked_select(dev_trg_mask)
            dev_trg_mask = dev_trg_mask.unsqueeze(1).expand(
                len(dev_trg_mask), trg_vocab_size)
            sys_out_batch = sys_out_batch.view(-1, trg_vocab_size)
            sys_out_batch = sys_out_batch.masked_select(dev_trg_mask).view(
                -1, trg_vocab_size)
            loss = criterion(sys_out_batch, dev_trg_batch)
            logging.debug("dev loss at batch {0}: {1}".format(batch_i, loss.data[0]))
            dev_loss += loss

        dev_avg_loss = dev_loss / len(batched_dev_src)
        logging.info("Average loss value per instance is {0} at the end of epoch {1}".format(
            dev_avg_loss.data[0], epoch_i))

        # if (last_dev_avg_loss - dev_avg_loss).data[0] < options.estop:
        #     logging.info("Early stopping triggered with threshold {0} "
        #                  "(previous dev loss: {1}, current: {2})".format(
        #                      epoch_i, last_dev_avg_loss.data[0], dev_avg_loss.data[0]))
        #     break

        torch.save(src_lm,
                   open(options.model_file_src + ".nll_{0:.2f}.epoch_{1}".format(
                       dev_avg_loss.data[0], epoch_i), 'wb'),
                   pickle_module=dill)
        torch.save(trg_lm,
                   open(options.model_file_trg + ".nll_{0:.2f}.epoch_{1}".format(
                       dev_avg_loss.data[0], epoch_i), 'wb'),
                   pickle_module=dill)
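# The masking idiom shared by both encoder-decoder training loops above, shown
# in isolation on toy tensors (all shapes illustrative): padded positions are
# removed from both the log-probability rows and the target vector before
# NLLLoss, so padding never contributes to the loss.
vocab_size = 7
logp = torch.log_softmax(torch.randn(4, 3, vocab_size), dim=-1)  # (seq_len, batch, vocab)
targets = torch.randint(vocab_size, (4, 3))                      # (seq_len, batch)
mask = torch.rand(4, 3) > 0.3                                    # True = real token, False = pad

flat_mask = mask.view(-1)
flat_targets = targets.view(-1).masked_select(flat_mask)
flat_logp = logp.view(-1, vocab_size).masked_select(
    flat_mask.unsqueeze(1).expand(-1, vocab_size)).view(-1, vocab_size)
loss = torch.nn.NLLLoss()(flat_logp, flat_targets)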