Example #1
def beam_search(model: NMT, test_data_src: List[List[str]], beam_size: int,
                max_decoding_time_step: int) -> List[List[Hypothesis]]:
    """ Run beam search to construct hypotheses for a list of src-language sentences.
    @param model (NMT): NMT Model
    @param test_data_src (List[List[str]]): List of sentences (words) in source language, from test set.
    @param beam_size (int): beam_size (# of hypotheses to hold for a translation at every step)
    @param max_decoding_time_step (int): maximum sentence length that Beam search can produce
    @returns hypotheses (List[List[Hypothesis]]): List of Hypothesis translations for every source sentence.
    """
    was_training = model.training
    model.eval()

    hypotheses = []
    with torch.no_grad():
        for src_sent in tqdm(test_data_src, desc='Decoding', file=sys.stdout):
            example_hyps = model.beam_search(
                src_sent,
                beam_size=beam_size,
                max_decoding_time_step=max_decoding_time_step)

            hypotheses.append(example_hyps)

    if was_training:
        model.train()

    return hypotheses
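A minimal usage sketch for this helper (hypothetical paths; NMT.load and the Hypothesis fields .value/.score are assumptions drawn from the docstring, not shown in the example):

model = NMT.load('model.bin')  # hypothetical checkpoint loader
test_src = read_corpus('test.src.txt', source='src')  # hypothetical test file
hyps = beam_search(model, test_src, beam_size=5, max_decoding_time_step=70)
for src_sent, sent_hyps in zip(test_src, hyps):
    best = sent_hyps[0]  # hypotheses are typically returned best-first
    print(' '.join(src_sent), '->', ' '.join(best.value))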
Example #2
def sample(args):
    train_data_src = read_corpus(args.src_file, source='src')
    train_data_tgt = read_corpus(args.tgt_file, source='tgt')
    train_data = zip(train_data_src, train_data_tgt)

    # load model params
    print('load model from [%s]' % args.model_bin, file=sys.stderr)
    params = torch.load(args.model_bin,
                        map_location=lambda storage, loc: storage)
    vocab = params['vocab']
    opt = params['args']
    state_dict = params['state_dict']

    # build model
    model = NMT(opt, vocab)
    model.load_state_dict(state_dict)
    model.eval()
    model = model.cuda()

    # sampling
    print('begin sampling')
    train_iter = cum_samples = 0
    for src_sents, tgt_sents in data_iter(train_data, batch_size=1):
        train_iter += 1
        samples = model.sample(src_sents, sample_size=5, to_word=True)
        cum_samples += sum(len(sample) for sample in samples)

        for i, tgt_sent in enumerate(tgt_sents):
            print('*' * 80)
            print('target:' + ' '.join(tgt_sent))
            tgt_samples = samples[i]
            print('samples:')
            for sid, sample in enumerate(tgt_samples, 1):
                print('[%d] %s' % (sid, ' '.join(sample[1:-1])))
            print('*' * 80)
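A minimal argparse sketch for driving sample(); the flag names simply mirror the attributes the function reads (args.src_file, args.tgt_file, args.model_bin), and the wiring itself is an assumption:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--src_file', required=True, help='source-side corpus')
parser.add_argument('--tgt_file', required=True, help='target-side corpus')
parser.add_argument('--model_bin', required=True, help='serialized model parameters')
sample(parser.parse_args())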
Example #3
def main(options):

  use_cuda = (len(options.gpuid) >= 1)
  if options.gpuid:
    cuda.set_device(options.gpuid[0])

  _, src_dev, _, src_vocab = torch.load(open(options.data_file + "." + options.src_lang, 'rb'))
  _, trg_dev, _, trg_vocab = torch.load(open(options.data_file + "." + options.trg_lang, 'rb'))

  batched_dev_src, batched_dev_src_mask, sort_index = utils.tensor.advanced_batchize(src_dev, options.batch_size, src_vocab.stoi["<blank>"])
  batched_dev_trg, batched_dev_trg_mask = utils.tensor.advanced_batchize_no_sort(trg_dev, options.batch_size, trg_vocab.stoi["<blank>"], sort_index)

  trg_vocab_size = len(trg_vocab)
  print(trg_vocab.itos[4])  # sanity check: inspect one vocabulary entry

  original_model = torch.load(open(options.original_model_file, 'rb'))
  nmt = NMT(original_model) # TODO: add more arguments as necessary 
  nmt.eval()
  if use_cuda:
    nmt.cuda()
  else:
    nmt.cpu()

  criterion = torch.nn.NLLLoss()
  # optimizer = eval("torch.optim." + options.optimizer)(nmt.parameters(), options.learning_rate)

  total_loss = 0
  num_sents = 0
  for i, batch_i in enumerate(utils.rand.srange(len(batched_dev_src))):
    print("{0}/ {1}".format(i, len(batched_dev_src)))
    dev_src_batch = Variable(batched_dev_src[batch_i])  # of size (src_seq_len, batch_size)
    dev_trg_batch = Variable(batched_dev_trg[batch_i])  # of size (trg_seq_len, batch_size)
    dev_src_mask = Variable(batched_dev_src_mask[batch_i])
    dev_trg_mask = Variable(batched_dev_trg_mask[batch_i])
    if use_cuda:
      dev_src_batch = dev_src_batch.cuda()
      dev_trg_batch = dev_trg_batch.cuda()
      dev_src_mask = dev_src_mask.cuda()
      dev_trg_mask = dev_trg_mask.cuda()
    num_sents += 1  # counts batches, not individual sentences

    sys_out_batch = nmt(dev_src_batch, dev_trg_batch)  # (trg_seq_len, batch_size, trg_vocab_size) # TODO: add more arguments as necessary 
    dev_trg_mask = dev_trg_mask.view(-1)
    dev_trg_batch = dev_trg_batch.view(-1)
    dev_trg_batch = dev_trg_batch.masked_select(dev_trg_mask)
    dev_trg_mask = dev_trg_mask.unsqueeze(1).expand(len(dev_trg_mask), trg_vocab_size)
    sys_out_batch = sys_out_batch.view(-1, trg_vocab_size)
    sys_out_batch = sys_out_batch.masked_select(dev_trg_mask).view(-1, trg_vocab_size)
    loss = criterion(sys_out_batch, dev_trg_batch)
    # _, max = torch.max(sys_out_batch,dim=1)
    # print(sys_out_batch[dev_trg_batch])
    # print(max, dev_trg_batch)
    total_loss += loss.data  # detach so the autograd graph is not accumulated
    # break
    print(total_loss, num_sents)
    print(total_loss/num_sents)
    print(torch.exp(total_loss/num_sents))
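Note that this loop averages per-batch mean losses, so torch.exp(total_loss / num_sents) only approximates corpus perplexity when batches contain different numbers of tokens. A sketch of the exact per-token variant (an assumption, using the old size_average flag to match this example's API era):

criterion_sum = torch.nn.NLLLoss(size_average=False)  # sum NLL rather than mean
total_nll, total_tokens = 0.0, 0
# inside the batch loop, after the masking shown above:
#   total_nll += criterion_sum(sys_out_batch, dev_trg_batch).data[0]
#   total_tokens += dev_trg_batch.size(0)
# perplexity = math.exp(total_nll / total_tokens)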
Example #4
def main(options):

    _, _, src_test, src_vocab = torch.load(
        open(options.data_file + "." + options.src_lang, 'rb'))
    _, _, trg_test, trg_vocab = torch.load(
        open(options.data_file + "." + options.trg_lang, 'rb'))

    src_vocab_size = len(src_vocab)
    trg_vocab_size = len(trg_vocab)

    # the serialized file stores the whole model object, so no constructor call is needed
    nmt = torch.load(open(options.modelname, 'rb'))
    nmt.eval()

    if torch.cuda.is_available():
        nmt.cuda()
    else:
        nmt.cpu()

    with open('data/output_tanay.txt', 'w', encoding='utf-8') as f_write:
        for i in range(len(src_test)):
            src = to_var(torch.unsqueeze(src_test[i], 1), volatile=True)
            trg = to_var(torch.unsqueeze(trg_test[i], 1), volatile=True)

            results = nmt(src, trg)
            s = ""
            for ix in results:
                idx = np.argmax(ix.data.cpu().numpy())

                if idx == 2:  # if <s>, don't write it
                    continue
                if idx == 3:  # if </s>, end the loop
                    break
                s += trg_vocab.itos[idx] + " "

            s += '\n'
            f_write.write(s)
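Hard-coding the ids 2 and 3 is fragile; since the vocab objects here expose a torchtext-style stoi mapping (used for "<blank>" in the other examples), the special ids can be looked up instead. A small sketch (the token strings are assumed to match the comments above):

bos_id = trg_vocab.stoi['<s>']   # start-of-sentence (id 2 above)
eos_id = trg_vocab.stoi['</s>']  # end-of-sentence (id 3 above)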
Example #5
        if avg_loss < best_loss:
            best_loss = avg_loss
            count = 0
        else:
            count += 1
        if count == 5:
            lr = optimizer.param_groups[0]['lr'] * lr_decay
            print("{Learning rate decay to %f}" % (lr))
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
            if lr < 1e-6: break
            count = 0
        # validation on the dev set

        with torch.no_grad():
            model.eval()
            sum_loss = 0
            batch_num = math.ceil(dev_len / batch_size)
            for step in range(batch_num):
                inputs = batch_iter(dev_x, step, batch_size)
                labels = batch_iter(dev_y, step, batch_size)
                masks = batch_iter(dev_mask, step, batch_size)
                inputs = torch.LongTensor(inputs).to(device)
                labels = torch.LongTensor(labels).to(device)
                masks = torch.ByteTensor(masks).to(device)

                outputs = model(inputs, labels)
                loss = Loss_fn(outputs, labels, masks)
                sum_loss += loss.item()  # accumulate a float; no_grad is already active

                if step % 100 == 0:
                    # the snippet is truncated here; a plausible progress print (assumption)
                    print('dev step %d, avg loss %.4f' % (step, sum_loss / (step + 1)))
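The decay-on-plateau logic at the top of this snippet (multiplicative decay after 5 non-improving epochs, floor at 1e-6) is roughly what PyTorch's built-in scheduler provides; a sketch assuming the same optimizer and lr_decay factor:

scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='min', factor=lr_decay, patience=5, min_lr=1e-6)
# after each epoch's validation pass:
#   scheduler.step(avg_loss)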
Example #6
def main(options):

  use_cuda = (len(options.gpuid) >= 1)
  if options.gpuid:
    cuda.set_device(options.gpuid[0])

  src_train, src_dev, src_test, src_vocab = torch.load(open(options.data_file + "." + options.src_lang, 'rb'))
  trg_train, trg_dev, trg_test, trg_vocab = torch.load(open(options.data_file + "." + options.trg_lang, 'rb'))

  batched_train_src, batched_train_src_mask, sort_index = utils.tensor.advanced_batchize(src_train, options.batch_size, src_vocab.stoi["<blank>"])
  batched_train_trg, batched_train_trg_mask = utils.tensor.advanced_batchize_no_sort(trg_train, options.batch_size, trg_vocab.stoi["<blank>"], sort_index)
  batched_dev_src, batched_dev_src_mask, sort_index = utils.tensor.advanced_batchize(src_dev, options.batch_size, src_vocab.stoi["<blank>"])
  batched_dev_trg, batched_dev_trg_mask = utils.tensor.advanced_batchize_no_sort(trg_dev, options.batch_size, trg_vocab.stoi["<blank>"], sort_index)

  trg_vocab_size = len(trg_vocab)
  src_vocab_size = len(src_vocab)
  word_emb_size = 300
  hidden_size = 1024

  nmt = NMT(src_vocab_size, trg_vocab_size, word_emb_size, hidden_size,
            src_vocab, trg_vocab, attn_model = "general", use_cuda = True)

  if use_cuda:
    nmt.cuda()
    if options.distributed:
      nmt = torch.nn.DataParallel(nmt)
  else:
    nmt.cpu()

  criterion = torch.nn.NLLLoss()

  # Configure optimization
  lr = options.learning_rate
  optimizer = getattr(torch.optim, options.optimizer)(nmt.parameters(), lr)

  
  # main training loop
  last_dev_avg_loss = float("inf")
  for epoch_i in range(options.epochs):
    logging.info("At {0}-th epoch.".format(epoch_i))

    # Set training mode
    nmt.train()

    # srange generates a lazy sequence of shuffled range
    for i, batch_i in enumerate(utils.rand.srange(len(batched_train_src))):
      train_src_batch = Variable(batched_train_src[batch_i])  # of size (src_seq_len, batch_size)
      train_trg_batch = Variable(batched_train_trg[batch_i])  # of size (trg_seq_len, batch_size)
      train_src_mask = Variable(batched_train_src_mask[batch_i])
      train_trg_mask = Variable(batched_train_trg_mask[batch_i])
      if use_cuda:
        train_src_batch = train_src_batch.cuda()
        train_trg_batch = train_trg_batch.cuda()
        train_src_mask = train_src_mask.cuda()
        train_trg_mask = train_trg_mask.cuda()

      sys_out_batch = nmt(train_src_batch, train_trg_batch, True)

      del train_src_batch

      train_trg_mask = train_trg_mask.view(-1)
      train_trg_batch = train_trg_batch.view(-1)
      train_trg_batch = train_trg_batch.masked_select(train_trg_mask)
      train_trg_mask = train_trg_mask.unsqueeze(1).expand(len(train_trg_mask), trg_vocab_size)
      sys_out_batch = sys_out_batch.view(-1, trg_vocab_size)
      sys_out_batch = sys_out_batch.masked_select(train_trg_mask).view(-1, trg_vocab_size)
      loss = criterion(sys_out_batch, train_trg_batch)
      logging.debug("loss at batch {0}: {1}".format(i, loss.data[0]))
      
      optimizer.zero_grad()
      loss.backward()
      # gradient clipping
      torch.nn.utils.clip_grad_norm(nmt.parameters(), 5.0)
      optimizer.step()

    # validation -- this is a crude estimation because there might be some padding at the end
    dev_loss = 0.0

    # Set validation mode
    nmt.eval()

    for batch_i in range(len(batched_dev_src)):
      dev_src_batch = Variable(batched_dev_src[batch_i], volatile=True)
      dev_trg_batch = Variable(batched_dev_trg[batch_i], volatile=True)
      dev_src_mask = Variable(batched_dev_src_mask[batch_i], volatile=True)
      dev_trg_mask = Variable(batched_dev_trg_mask[batch_i], volatile=True)
      if use_cuda:
        dev_src_batch = dev_src_batch.cuda()
        dev_trg_batch = dev_trg_batch.cuda()
        dev_src_mask = dev_src_mask.cuda()
        dev_trg_mask = dev_trg_mask.cuda()

      sys_out_batch = nmt(dev_src_batch, dev_trg_batch, False)

      dev_trg_mask = dev_trg_mask.view(-1)
      dev_trg_batch = dev_trg_batch.view(-1)
      dev_trg_batch = dev_trg_batch.masked_select(dev_trg_mask)
      dev_trg_mask = dev_trg_mask.unsqueeze(1).expand(len(dev_trg_mask), trg_vocab_size)
      sys_out_batch = sys_out_batch.view(-1, trg_vocab_size)
      sys_out_batch = sys_out_batch.masked_select(dev_trg_mask).view(-1, trg_vocab_size)
      loss = criterion(sys_out_batch, dev_trg_batch)
      logging.debug("dev loss at batch {0}: {1}".format(batch_i, loss.data[0]))
      dev_loss += loss
    dev_avg_loss = dev_loss / len(batched_dev_src)
    logging.info("Average loss value per instance is {0} at the end of epoch {1}".format(dev_avg_loss.data[0], epoch_i))

    # if (last_dev_avg_loss - dev_avg_loss).data[0] < options.estop:
    #   logging.info("Early stopping triggered with threshold {0} (previous dev loss: {1}, current: {2})".format(epoch_i, last_dev_avg_loss.data[0], dev_avg_loss.data[0]))
    #   break
    torch.save(nmt, open(options.model_file + ".nll_{0:.2f}.epoch_{1}".format(dev_avg_loss.data[0], epoch_i), 'wb'), pickle_module=dill)
    last_dev_avg_loss = dev_avg_loss
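The commented-out early-stopping check above formats epoch_i where the threshold belongs in the log message; if re-enabled at the same point inside the epoch loop, a corrected sketch would be:

if (last_dev_avg_loss - dev_avg_loss).data[0] < options.estop:
    logging.info("Early stopping triggered with threshold {0} (previous dev loss: {1}, current: {2})".format(
        options.estop, last_dev_avg_loss.data[0], dev_avg_loss.data[0]))
    break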