import sys
from typing import List

import torch
from tqdm import tqdm


def beam_search(model: NMT, test_data_src: List[List[str]], beam_size: int,
                max_decoding_time_step: int) -> List[List[Hypothesis]]:
    """ Run beam search to construct hypotheses for a list of src-language sentences.
    @param model (NMT): NMT Model
    @param test_data_src (List[List[str]]): List of sentences (words) in source language, from test set.
    @param beam_size (int): beam_size (# of hypotheses to hold for a translation at every step)
    @param max_decoding_time_step (int): maximum sentence length that Beam search can produce
    @returns hypotheses (List[List[Hypothesis]]): List of Hypothesis translations for every source sentence.
    """
    was_training = model.training
    model.eval()

    hypotheses = []
    with torch.no_grad():
        for src_sent in tqdm(test_data_src, desc='Decoding', file=sys.stdout):
            example_hyps = model.beam_search(src_sent, beam_size=beam_size,
                                             max_decoding_time_step=max_decoding_time_step)
            hypotheses.append(example_hyps)

    if was_training:
        model.train(was_training)

    return hypotheses
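For context, the `Hypothesis` type is assumed to be the usual namedtuple of a token list plus a log-probability score; a minimal sketch of that definition and of how the returned hypotheses might be consumed (the `write_top_hypotheses` helper is hypothetical):

from collections import namedtuple

# Assumed layout: `value` holds the target-side words, `score` the hypothesis log-probability.
Hypothesis = namedtuple('Hypothesis', ['value', 'score'])

def write_top_hypotheses(hypotheses, out_path):
    # Hypothetical helper: write the best-scoring hypothesis per source sentence, one per line.
    with open(out_path, 'w', encoding='utf-8') as f:
        for hyps in hypotheses:
            best = max(hyps, key=lambda h: h.score)
            f.write(' '.join(best.value) + '\n')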
def sample(args):
    train_data_src = read_corpus(args.src_file, source='src')
    train_data_tgt = read_corpus(args.tgt_file, source='tgt')
    train_data = list(zip(train_data_src, train_data_tgt))

    # load model params
    print('load model from [%s]' % args.model_bin, file=sys.stderr)
    params = torch.load(args.model_bin, map_location=lambda storage, loc: storage)
    vocab = params['vocab']
    opt = params['args']
    state_dict = params['state_dict']

    # build model
    model = NMT(opt, vocab)
    model.load_state_dict(state_dict)
    model.eval()
    if torch.cuda.is_available():
        model = model.cuda()

    # sampling
    print('begin sampling')

    train_iter = cum_samples = 0
    for src_sents, tgt_sents in data_iter(train_data, batch_size=1):
        train_iter += 1
        samples = model.sample(src_sents, sample_size=5, to_word=True)
        cum_samples += sum(len(sample) for sample in samples)

        for i, tgt_sent in enumerate(tgt_sents):
            print('*' * 80)
            print('target: ' + ' '.join(tgt_sent))
            tgt_samples = samples[i]
            print('samples:')
            for sid, sample in enumerate(tgt_samples, 1):
                # strip the <s> ... </s> wrapper tokens before printing
                print('[%d] %s' % (sid, ' '.join(sample[1:-1])))
            print('*' * 80)
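`data_iter` is not defined in this excerpt; the sampling loop only relies on it yielding `(src_sents, tgt_sents)` batches, so a plausible minimal sketch (the shuffling behavior is an assumption) is:

import random

def data_iter(data, batch_size, shuffle=True):
    # Assumed behavior: yield (src_sents, tgt_sents) lists of size batch_size from (src, tgt) pairs.
    data = list(data)
    if shuffle:
        random.shuffle(data)
    for i in range(0, len(data), batch_size):
        batch = data[i:i + batch_size]
        yield [src for src, _ in batch], [tgt for _, tgt in batch]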
def main(options):
    use_cuda = (len(options.gpuid) >= 1)
    if options.gpuid:
        cuda.set_device(options.gpuid[0])

    _, src_dev, _, src_vocab = torch.load(open(options.data_file + "." + options.src_lang, 'rb'))
    _, trg_dev, _, trg_vocab = torch.load(open(options.data_file + "." + options.trg_lang, 'rb'))

    batched_dev_src, batched_dev_src_mask, sort_index = utils.tensor.advanced_batchize(
        src_dev, options.batch_size, src_vocab.stoi["<blank>"])
    batched_dev_trg, batched_dev_trg_mask = utils.tensor.advanced_batchize_no_sort(
        trg_dev, options.batch_size, trg_vocab.stoi["<blank>"], sort_index)
    trg_vocab_size = len(trg_vocab)

    original_model = torch.load(open(options.original_model_file, 'rb'))
    nmt = NMT(original_model)  # TODO: add more arguments as necessary
    nmt.eval()

    if use_cuda:
        nmt.cuda()
    else:
        nmt.cpu()

    criterion = torch.nn.NLLLoss()

    total_loss = 0
    num_batches = 0
    for i, batch_i in enumerate(utils.rand.srange(len(batched_dev_src))):
        print("{0} / {1}".format(i, len(batched_dev_src)))
        dev_src_batch = Variable(batched_dev_src[batch_i], volatile=True)  # (src_seq_len, batch_size)
        dev_trg_batch = Variable(batched_dev_trg[batch_i], volatile=True)  # (trg_seq_len, batch_size)
        dev_src_mask = Variable(batched_dev_src_mask[batch_i], volatile=True)
        dev_trg_mask = Variable(batched_dev_trg_mask[batch_i], volatile=True)
        if use_cuda:
            dev_src_batch = dev_src_batch.cuda()
            dev_trg_batch = dev_trg_batch.cuda()
            dev_src_mask = dev_src_mask.cuda()
            dev_trg_mask = dev_trg_mask.cuda()
        num_batches += 1

        sys_out_batch = nmt(dev_src_batch, dev_trg_batch)  # (trg_seq_len, batch_size, trg_vocab_size)

        # drop padded positions before computing the loss
        dev_trg_mask = dev_trg_mask.view(-1)
        dev_trg_batch = dev_trg_batch.view(-1)
        dev_trg_batch = dev_trg_batch.masked_select(dev_trg_mask)
        dev_trg_mask = dev_trg_mask.unsqueeze(1).expand(len(dev_trg_mask), trg_vocab_size)
        sys_out_batch = sys_out_batch.view(-1, trg_vocab_size)
        sys_out_batch = sys_out_batch.masked_select(dev_trg_mask).view(-1, trg_vocab_size)

        loss = criterion(sys_out_batch, dev_trg_batch)
        total_loss += loss

    print(total_loss, num_batches)
    print(total_loss / num_batches)
    print(torch.exp(total_loss / num_batches))  # perplexity estimate
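Since `criterion` averages the NLL within each batch, `total_loss / num_batches` is a mean per-batch loss and its exponential a rough corpus perplexity; a tiny self-contained illustration of that relationship (the numbers are made up):

import torch

nll = torch.tensor([2.1, 1.8, 2.4])  # per-batch mean NLL values (made-up numbers)
avg_nll = nll.mean()
perplexity = torch.exp(avg_nll)      # exp(mean NLL) is the usual perplexity estimate
print(avg_nll.item(), perplexity.item())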
def main(options):
    _, _, src_test, src_vocab = torch.load(open(options.data_file + "." + options.src_lang, 'rb'))
    _, _, trg_test, trg_vocab = torch.load(open(options.data_file + "." + options.trg_lang, 'rb'))

    # the saved file already contains the full model, so there is no need to build a fresh NMT first
    nmt = torch.load(open(options.modelname, 'rb'))
    nmt.eval()

    if torch.cuda.is_available():
        nmt.cuda()
    else:
        nmt.cpu()

    # open in text mode with an explicit encoding so we can write str directly
    with open('data/output_tanay.txt', 'w', encoding='utf-8') as f_write:
        for i in range(len(src_test)):
            src = to_var(torch.unsqueeze(src_test[i], 1), volatile=True)
            trg = to_var(torch.unsqueeze(trg_test[i], 1), volatile=True)
            results = nmt(src, trg)

            s = ""
            for ix in results:
                idx = np.argmax(ix.data.cpu().numpy())
                if idx == 2:  # <s>: skip the start-of-sentence token
                    continue
                if idx == 3:  # </s>: stop decoding this sentence
                    break
                s += trg_vocab.itos[idx] + " "
            s += '\n'
            f_write.write(s)
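`to_var` is assumed to be the standard pre-0.4 PyTorch helper that wraps a tensor in a `Variable`, moving it to the GPU when one is available; a sketch matching how it is called above:

import torch
from torch.autograd import Variable

def to_var(x, volatile=False):
    # Assumed helper (PyTorch <= 0.3 style): wrap in a Variable, on GPU if available.
    if torch.cuda.is_available():
        x = x.cuda()
    return Variable(x, volatile=volatile)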
if avg_loss < best_loss:
    best_loss = avg_loss
    count = 0
else:
    count += 1
    if count == 5:
        # decay the learning rate after 5 epochs without improvement
        lr = optimizer.param_groups[0]['lr'] * lr_decay
        print("Learning rate decayed to %f" % lr)
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr
        if lr < 1e-6:
            break
        count = 0

# validation
with torch.no_grad():
    model.eval()
    sum_loss = 0
    batch_num = math.ceil(dev_len / batch_size)
    for step in range(batch_num):
        inputs = batch_iter(dev_x, step, batch_size)
        labels = batch_iter(dev_y, step, batch_size)
        masks = batch_iter(dev_mask, step, batch_size)
        inputs = torch.LongTensor(inputs).to(device)
        labels = torch.LongTensor(labels).to(device)
        masks = torch.ByteTensor(masks).to(device)

        outputs = model(inputs, labels)
        loss = Loss_fn(outputs, labels, masks)
        sum_loss += loss.item()
        if step % 100 == 0:
            # periodic progress report (the original snippet is truncated here)
            print('validation step %d / %d, running avg loss %.4f' % (step, batch_num, sum_loss / (step + 1)))
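`batch_iter` is likewise not shown; the validation loop only needs it to return the `step`-th slice of the data, so a plausible minimal version is:

def batch_iter(data, step, batch_size):
    # Assumed helper: return the step-th batch; padding/shaping is done upstream.
    return data[step * batch_size:(step + 1) * batch_size]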
def main(options):
    use_cuda = (len(options.gpuid) >= 1)
    if options.gpuid:
        cuda.set_device(options.gpuid[0])

    src_train, src_dev, src_test, src_vocab = torch.load(open(options.data_file + "." + options.src_lang, 'rb'))
    trg_train, trg_dev, trg_test, trg_vocab = torch.load(open(options.data_file + "." + options.trg_lang, 'rb'))

    batched_train_src, batched_train_src_mask, sort_index = utils.tensor.advanced_batchize(
        src_train, options.batch_size, src_vocab.stoi["<blank>"])
    batched_train_trg, batched_train_trg_mask = utils.tensor.advanced_batchize_no_sort(
        trg_train, options.batch_size, trg_vocab.stoi["<blank>"], sort_index)
    batched_dev_src, batched_dev_src_mask, sort_index = utils.tensor.advanced_batchize(
        src_dev, options.batch_size, src_vocab.stoi["<blank>"])
    batched_dev_trg, batched_dev_trg_mask = utils.tensor.advanced_batchize_no_sort(
        trg_dev, options.batch_size, trg_vocab.stoi["<blank>"], sort_index)

    trg_vocab_size = len(trg_vocab)
    src_vocab_size = len(src_vocab)
    word_emb_size = 300
    hidden_size = 1024

    nmt = NMT(src_vocab_size, trg_vocab_size, word_emb_size, hidden_size,
              src_vocab, trg_vocab, attn_model="general", use_cuda=True)
    if use_cuda:
        nmt.cuda()
        if options.distributed:
            nmt = torch.nn.DataParallel(nmt)
    else:
        nmt.cpu()

    criterion = torch.nn.NLLLoss()

    # configure optimization
    lr = options.learning_rate
    optimizer = eval("torch.optim." + options.optimizer)(nmt.parameters(), lr)

    # main training loop
    last_dev_avg_loss = float("inf")
    for epoch_i in range(options.epochs):
        logging.info("At {0}-th epoch.".format(epoch_i))

        # set training mode
        nmt.train()
        # srange generates a lazy sequence of shuffled range indices
        for i, batch_i in enumerate(utils.rand.srange(len(batched_train_src))):
            train_src_batch = Variable(batched_train_src[batch_i])  # (src_seq_len, batch_size)
            train_trg_batch = Variable(batched_train_trg[batch_i])  # (trg_seq_len, batch_size)
            train_src_mask = Variable(batched_train_src_mask[batch_i])
            train_trg_mask = Variable(batched_train_trg_mask[batch_i])
            if use_cuda:
                train_src_batch = train_src_batch.cuda()
                train_trg_batch = train_trg_batch.cuda()
                train_src_mask = train_src_mask.cuda()
                train_trg_mask = train_trg_mask.cuda()

            sys_out_batch = nmt(train_src_batch, train_trg_batch, True)
            del train_src_batch

            # drop padded positions before computing the loss
            train_trg_mask = train_trg_mask.view(-1)
            train_trg_batch = train_trg_batch.view(-1)
            train_trg_batch = train_trg_batch.masked_select(train_trg_mask)
            train_trg_mask = train_trg_mask.unsqueeze(1).expand(len(train_trg_mask), trg_vocab_size)
            sys_out_batch = sys_out_batch.view(-1, trg_vocab_size)
            sys_out_batch = sys_out_batch.masked_select(train_trg_mask).view(-1, trg_vocab_size)
            loss = criterion(sys_out_batch, train_trg_batch)
            logging.debug("loss at batch {0}: {1}".format(i, loss.data[0]))

            optimizer.zero_grad()
            loss.backward()
            # gradient clipping
            torch.nn.utils.clip_grad_norm(nmt.parameters(), 5.0)
            optimizer.step()

        # validation -- this is a crude estimation because there might be some padding at the end
        dev_loss = 0.0
        # set evaluation mode
        nmt.eval()
        for batch_i in range(len(batched_dev_src)):
            dev_src_batch = Variable(batched_dev_src[batch_i], volatile=True)
            dev_trg_batch = Variable(batched_dev_trg[batch_i], volatile=True)
            dev_src_mask = Variable(batched_dev_src_mask[batch_i], volatile=True)
            dev_trg_mask = Variable(batched_dev_trg_mask[batch_i], volatile=True)
            if use_cuda:
                dev_src_batch = dev_src_batch.cuda()
                dev_trg_batch = dev_trg_batch.cuda()
                dev_src_mask = dev_src_mask.cuda()
                dev_trg_mask = dev_trg_mask.cuda()

            sys_out_batch = nmt(dev_src_batch, dev_trg_batch, False)
            # same masking as in training: drop padded positions
            dev_trg_mask = dev_trg_mask.view(-1)
            dev_trg_batch = dev_trg_batch.view(-1)
            dev_trg_batch = dev_trg_batch.masked_select(dev_trg_mask)
            dev_trg_mask = dev_trg_mask.unsqueeze(1).expand(len(dev_trg_mask), trg_vocab_size)
            sys_out_batch = sys_out_batch.view(-1, trg_vocab_size)
            sys_out_batch = sys_out_batch.masked_select(dev_trg_mask).view(-1, trg_vocab_size)
            loss = criterion(sys_out_batch, dev_trg_batch)
            logging.debug("dev loss at batch {0}: {1}".format(batch_i, loss.data[0]))
            dev_loss += loss

        dev_avg_loss = dev_loss / len(batched_dev_src)
        logging.info("Average loss value per instance is {0} at the end of epoch {1}".format(dev_avg_loss.data[0], epoch_i))

        # if (last_dev_avg_loss - dev_avg_loss).data[0] < options.estop:
        #     logging.info("Early stopping triggered with threshold {0} (previous dev loss: {1}, current: {2})".format(options.estop, last_dev_avg_loss.data[0], dev_avg_loss.data[0]))
        #     break

        torch.save(nmt,
                   open(options.model_file + ".nll_{0:.2f}.epoch_{1}".format(dev_avg_loss.data[0], epoch_i), 'wb'),
                   pickle_module=dill)
        last_dev_avg_loss = dev_avg_loss
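The view/masked_select/expand sequence used in both the training and validation loops simply drops padded positions before `NLLLoss`; a small standalone demonstration of the same pattern on toy tensors (written against the modern tensor API rather than `Variable`, purely for illustration):

import torch

vocab_size = 5
logp = torch.log_softmax(torch.randn(4, 2, vocab_size), dim=-1)  # (trg_seq_len, batch, vocab)
targets = torch.tensor([[1, 2], [3, 4], [2, 0], [0, 0]])         # 0 = <blank> padding
mask = targets.ne(0)                                             # True at real tokens

flat_mask = mask.view(-1)
flat_targets = targets.view(-1).masked_select(flat_mask)         # keep non-pad target ids
flat_logp = logp.view(-1, vocab_size)
flat_logp = flat_logp.masked_select(flat_mask.unsqueeze(1).expand(-1, vocab_size)).view(-1, vocab_size)

loss = torch.nn.NLLLoss()(flat_logp, flat_targets)               # averaged over real tokens only
print(loss.item())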