Example 1
def beam_search(model: NMT, test_data_src: List[List[str]], beam_size: int,
                max_decoding_time_step: int) -> List[List[Hypothesis]]:
    """ Run beam search to construct hypotheses for a list of src-language sentences.
    @param model (NMT): NMT Model
    @param test_data_src (List[List[str]]): List of sentences (words) in source language, from test set.
    @param beam_size (int): beam_size (# of hypotheses to hold for a translation at every step)
    @param max_decoding_time_step (int): maximum sentence length that Beam search can produce
    @returns hypotheses (List[List[Hypothesis]]): List of Hypothesis translations for every source sentence.
    """
    was_training = model.training
    model.eval()

    hypotheses = []
    with torch.no_grad():
        for src_sent in tqdm(test_data_src, desc='Decoding', file=sys.stdout):
            example_hyps = model.beam_search(
                src_sent,
                beam_size=beam_size,
                max_decoding_time_step=max_decoding_time_step)

            hypotheses.append(example_hyps)

    if was_training: model.train(was_training)

    return hypotheses
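The `Hypothesis` type is not defined in this snippet; a minimal sketch, assuming it is the usual (value, score) pair whose `value` field holds the translated words (consistent with `top_hyp.value` in the later examples):

from collections import namedtuple

# Hedged sketch: a Hypothesis is assumed to pair the decoded target-language
# words with the log-probability of the whole translation.
Hypothesis = namedtuple('Hypothesis', ['value', 'score'])

# Hypothetical call, assuming `model` is a trained NMT instance:
# hyps = beam_search(model, test_data_src, beam_size=5, max_decoding_time_step=70)
# best = hyps[0][0]  # highest-scoring Hypothesis for the first source sentence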
Example 2
    def __init__(self, _hparams):
        self.hparams = _hparams
        set_seed(_hparams.fixed_seed)
        self.train_loader = get_dataloader(_hparams.train_src_path, _hparams.train_dst_path,
                                           _hparams.batch_size, _hparams.num_workers)
        self.src_vocab, self.dst_vocab = load_vocab(_hparams.train_src_pkl, _hparams.train_dst_pkl)
        self.device = torch.device(_hparams.device)
        self.model = NMT(_hparams.embed_size, _hparams.hidden_size,
                         self.src_vocab, self.dst_vocab, self.device,
                         _hparams.dropout_rate).to(self.device)
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=_hparams.lr)
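`load_vocab` is external to this snippet; a minimal sketch, assuming the `*_pkl` paths point at pickled vocabulary objects:

import pickle

def load_vocab(src_pkl_path, dst_pkl_path):
    # Hypothetical loader matching the constructor call above: both
    # vocabularies are assumed to be plain pickled objects on disk.
    with open(src_pkl_path, 'rb') as f:
        src_vocab = pickle.load(f)
    with open(dst_pkl_path, 'rb') as f:
        dst_vocab = pickle.load(f)
    return src_vocab, dst_vocab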
Example 3
def sample(args):
    train_data_src = read_corpus(args.src_file, source='src')
    train_data_tgt = read_corpus(args.tgt_file, source='tgt')
    train_data = zip(train_data_src, train_data_tgt)

    # load model params
    print('load model from [%s]' % args.model_bin, file=sys.stderr)
    params = torch.load(args.model_bin,
                        map_location=lambda storage, loc: storage)
    vocab = params['vocab']
    opt = params['args']
    state_dict = params['state_dict']

    # build model
    model = NMT(opt, vocab)
    model.load_state_dict(state_dict)
    model.eval()
    model = model.cuda()

    # sampling
    print('begin sampling')
    train_iter = cum_samples = 0
    for src_sents, tgt_sents in data_iter(train_data, batch_size=1):
        train_iter += 1
        samples = model.sample(src_sents, sample_size=5, to_word=True)
        cum_samples += sum(len(sample) for sample in samples)

        for i, tgt_sent in enumerate(tgt_sents):
            print('*' * 80)
            print('target:' + ' '.join(tgt_sent))
            tgt_samples = samples[i]
            print('samples:')
            for sid, sample in enumerate(tgt_samples, 1):
                print('[%d] %s' % (sid, ' '.join(sample[1:-1])))
            print('*' * 80)
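`data_iter` is not shown; a minimal sketch, assuming it yields (src_sents, tgt_sents) batches from the zipped (source, target) pairs:

import random

def data_iter(data, batch_size=32, shuffle=True):
    # Hypothetical batching helper consistent with the loop above: groups
    # (src, tgt) sentence pairs and yields one list per side.
    data = list(data)
    if shuffle:
        random.shuffle(data)
    for i in range(0, len(data), batch_size):
        batch = data[i:i + batch_size]
        yield [src for src, _ in batch], [tgt for _, tgt in batch]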
Example 4
def decode(args: Dict[str, str]):
    """ Performs decoding on a test set, and save the best-scoring decoding results.
    If the target gold-standard sentences are given, the function also computes
    corpus-level BLEU score.
    @param args (Dict): args from cmd line
    """

    print("load test source sentences from [{}]".format(
        args['TEST_SOURCE_FILE']),
          file=sys.stderr)
    test_data_src = read_corpus(args['TEST_SOURCE_FILE'], source='src')
    if args['TEST_TARGET_FILE']:
        print("load test target sentences from [{}]".format(
            args['TEST_TARGET_FILE']),
              file=sys.stderr)
        test_data_tgt = read_corpus(args['TEST_TARGET_FILE'], source='tgt')
    else:
        test_data_tgt = None

    print("load model from {}".format(args['MODEL_PATH']), file=sys.stderr)
    model = NMT.load(args['MODEL_PATH'])

    if args['--cuda']:
        model = model.to(torch.device("cuda:0"))

    beam_size = int(args['--beam-size'])
    max_decoding_time_step = int(args['--max-decoding-time-step'])
    output_file = args['OUTPUT_FILE']

    decode_with_params(model, test_data_src, test_data_tgt, beam_size,
                       max_decoding_time_step, output_file)
Example 5
def decode(args: Dict[str, str]):
    """ Performs decoding on a test set, and save the best-scoring decoding results.
    If the target gold-standard sentences are given, the function also computes
    corpus-level BLEU score.
    @param args (Dict): args from cmd line
    """

    print("load test source sentences from [{}]".format(args['TEST_SOURCE_FILE']), file=sys.stderr)
    test_data_src = read_corpus(args['TEST_SOURCE_FILE'], source='src')
    if args['TEST_TARGET_FILE']:
        print("load test target sentences from [{}]".format(args['TEST_TARGET_FILE']), file=sys.stderr)
        test_data_tgt = read_corpus(args['TEST_TARGET_FILE'], source='tgt')

    print("load model from {}".format(args['MODEL_PATH']), file=sys.stderr)
    model = NMT.load(args['MODEL_PATH'])

    if args['--cuda']:
        model = model.to(torch.device("cuda:0"))

    hypotheses = beam_search(model, test_data_src,
                             beam_size=int(args['--beam-size']),
                             max_decoding_time_step=int(args['--max-decoding-time-step']))

    if args['TEST_TARGET_FILE']:
        top_hypotheses = [hyps[0] for hyps in hypotheses]
        bleu_score = compute_corpus_level_bleu_score(test_data_tgt, top_hypotheses)
        print('Corpus BLEU: {}'.format(bleu_score * 100), file=sys.stderr)

    with open(args['OUTPUT_FILE'], 'w') as f:
        for src_sent, hyps in zip(test_data_src, hypotheses):
            top_hyp = hyps[0]
            hyp_sent = ' '.join(top_hyp.value)
            f.write(hyp_sent + '\n')
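`compute_corpus_level_bleu_score` is not part of this snippet; a minimal sketch built on NLTK's corpus_bleu, assuming references are token lists possibly wrapped in `<s>`/`</s>` markers and hypotheses are Hypothesis tuples as above:

from nltk.translate.bleu_score import corpus_bleu

def compute_corpus_level_bleu_score(references, hypotheses):
    # Hedged sketch: strip sentence-boundary markers if present, then score
    # each hypothesis against its single reference.
    if references[0][0] == '<s>':
        references = [ref[1:-1] for ref in references]
    return corpus_bleu([[ref] for ref in references],
                       [hyp.value for hyp in hypotheses])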
Example 6
def main(options):

  use_cuda = (len(options.gpuid) >= 1)
  if options.gpuid:
    cuda.set_device(options.gpuid[0])

  _, src_dev, _, src_vocab = torch.load(open(options.data_file + "." + options.src_lang, 'rb'))
  _, trg_dev, _, trg_vocab = torch.load(open(options.data_file + "." + options.trg_lang, 'rb'))

  batched_dev_src, batched_dev_src_mask, sort_index = utils.tensor.advanced_batchize(src_dev, options.batch_size, src_vocab.stoi["<blank>"])
  batched_dev_trg, batched_dev_trg_mask = utils.tensor.advanced_batchize_no_sort(trg_dev, options.batch_size, trg_vocab.stoi["<blank>"], sort_index)

  trg_vocab_size = len(trg_vocab)
  print(trg_vocab.itos[4])

  original_model = torch.load(open(options.original_model_file, 'rb'))
  nmt = NMT(original_model) # TODO: add more arguments as necessary 
  nmt.eval()
  if use_cuda:
    nmt.cuda()
  else:
    nmt.cpu()

  criterion = torch.nn.NLLLoss()
  # optimizer = eval("torch.optim." + options.optimizer)(nmt.parameters(), options.learning_rate)

  total_loss = 0
  num_sents = 0
  for i, batch_i in enumerate(utils.rand.srange(len(batched_dev_src))):
    print("{0}/ {1}".format(i, len(batched_dev_src)))
    dev_src_batch = Variable(batched_dev_src[batch_i])  # of size (src_seq_len, batch_size)
    dev_trg_batch = Variable(batched_dev_trg[batch_i])  # of size (trg_seq_len, batch_size)
    dev_src_mask = Variable(batched_dev_src_mask[batch_i])
    dev_trg_mask = Variable(batched_dev_trg_mask[batch_i])
    if use_cuda:
      dev_src_batch = dev_src_batch.cuda()
      dev_trg_batch = dev_trg_batch.cuda()
      dev_src_mask = dev_src_mask.cuda()
      dev_trg_mask = dev_trg_mask.cuda()
    num_sents += 1

    sys_out_batch = nmt(dev_src_batch, dev_trg_batch)  # (trg_seq_len, batch_size, trg_vocab_size) # TODO: add more arguments as necessary 
    dev_trg_mask = dev_trg_mask.view(-1)
    dev_trg_batch = dev_trg_batch.view(-1)
    dev_trg_batch = dev_trg_batch.masked_select(dev_trg_mask)
    dev_trg_mask = dev_trg_mask.unsqueeze(1).expand(len(dev_trg_mask), trg_vocab_size)
    sys_out_batch = sys_out_batch.view(-1, trg_vocab_size)
    sys_out_batch = sys_out_batch.masked_select(dev_trg_mask).view(-1, trg_vocab_size)
    loss = criterion(sys_out_batch, dev_trg_batch)
    # _, max = torch.max(sys_out_batch,dim=1)
    # print(sys_out_batch[dev_trg_batch])
    # print(max, dev_trg_batch)
    total_loss += loss
    # break
    print(total_loss, num_sents)
    print(total_loss/num_sents)
    print(torch.exp(total_loss/num_sents))
Example 7
    def test(self):
        print('*' * 20, 'start test', '*' * 20)
        self.model = NMT.load(self.hparams.model_save_path, self.device)
        sources, references, hypotheses = self.beam_search()
        bleu_score = compute_corpus_level_bleu_score(references, hypotheses)
        print('Corpus BLEU: {}'.format(bleu_score * 100))

        with open(self.hparams.test_res_path, 'w') as f:
            for src_sent, hypo in zip(sources, hypotheses):
                src_sent = ' '.join(src_sent)
                hypo_sent = ' '.join(hypo.value)
                f.write(src_sent + '\n' + hypo_sent + '\n\n')
        print('save test result to {}'.format(self.hparams.test_res_path))
        print('*' * 20, 'end test', '*' * 20)
Example 8
def create_model(sess,
                 args,
                 src_vocab_size,
                 tgt_vocab_size,
                 src_vocab_rev,
                 tgt_vocab_rev,
                 mode=constants.TRAIN,
                 reuse=None,
                 load_pretrained_model=False,
                 direction="",
                 model_save_dir=None):
    sess.run(tf.tables_initializer())

    with tf.variable_scope(constants.NMT_VAR_SCOPE + direction, reuse=reuse):
        with tf.variable_scope("src"):
            src_emb = tf.get_variable("embedding",
                                      shape=[src_vocab_size, args.emb_dim])
        with tf.variable_scope("dst"):
            tgt_emb = tf.get_variable("embedding",
                                      shape=[tgt_vocab_size, args.emb_dim])

        model = NMT(mode, args.__dict__, src_vocab_size, tgt_vocab_size,
                    src_emb, tgt_emb, src_vocab_rev, tgt_vocab_rev, direction)

    if load_pretrained_model:
        if model_save_dir is None:
            model_save_dir = args.nmt_model_save_dir
            if direction not in model_save_dir:
                if direction[::-1] in model_save_dir:
                    model_save_dir = re.sub(direction[::-1], direction,
                                            model_save_dir)
                else:
                    model_save_dir = os.path.join(model_save_dir, direction)
        print(model_save_dir)
        try:
            print("Loading nmt model from", model_save_dir)
            model.saver.restore(sess, model_save_dir)
        except Exception as e:
            print("Error! Loading nmt model from", model_save_dir)
            print("Again! Loading nmt model from",
                  tf.train.latest_checkpoint(model_save_dir))
            model.saver.restore(sess,
                                tf.train.latest_checkpoint(model_save_dir))
    else:
        if reuse is None:
            print("Creating model with new parameters.")
            sess.run(tf.global_variables_initializer())
        else:
            print("Reuse parameters.")
    return model
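A hedged usage sketch for create_model (TF1-style session API; `args`, the vocabulary sizes, and the reverse-lookup tables are assumed to come from the surrounding training script):

# Hypothetical call site for the factory above:
with tf.Session() as sess:
    train_model = create_model(sess, args,
                               src_vocab_size, tgt_vocab_size,
                               src_vocab_rev, tgt_vocab_rev,
                               mode=constants.TRAIN)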
Example 9
def beam(args):
    # load model params
    print('load model from [%s]' % args.model_bin, file=sys.stderr)
    params = torch.load(args.model_bin,
                        map_location=lambda storage, loc: storage)
    vocab = params['vocab']
    opt = params['args']
    state_dict = params['state_dict']

    # build model
    model = NMT(opt, vocab)
    model.load_state_dict(state_dict)
    model.train()
    # model.eval()
    model = model.cuda()

    # loss function
    loss_fn = torch.nn.NLLLoss()

    # sampling
    print('begin beam searching')
    src_sent = ['we', 'have', 'told', 'that', '.']
    hyps = model.beam(src_sent)

    print('src_sent:', ' '.join(src_sent))
    for ids, hyp, dist in hyps:
        print('tgt_sent:', ' '.join(hyp))
        print('tgt_ids :', end=' ')
        for id in ids:
            print(id, end=', ')
        print()
        print('out_dist:', dist)

        var_ids = torch.autograd.Variable(torch.LongTensor(ids[1:]),
                                          requires_grad=False)
        loss = loss_fn(dist, var_ids)
        print('NLL loss =', loss)

    loss.backward()
Example 10
def main(options):

    _, _, src_test, src_vocab = torch.load(
        open(options.data_file + "." + options.src_lang, 'rb'))
    _, _, trg_test, trg_vocab = torch.load(
        open(options.data_file + "." + options.trg_lang, 'rb'))

    src_vocab_size = len(src_vocab)
    trg_vocab_size = len(trg_vocab)

    # The checkpoint stores the whole model object, so it is loaded directly
    # instead of constructing a fresh NMT instance first.
    nmt = torch.load(open(options.modelname, 'rb'))
    nmt.eval()

    if torch.cuda.is_available():
        nmt.cuda()
    else:
        nmt.cpu()

    with open('data/output_tanay.txt', 'w', encoding='utf-8') as f_write:
        for i in range(len(src_test)):
            src = to_var(torch.unsqueeze(src_test[i], 1), volatile=True)
            trg = to_var(torch.unsqueeze(trg_test[i], 1), volatile=True)

            results = nmt(src, trg)
            s = ""
            for ix in results:
                idx = np.argmax(ix.data.cpu().numpy())

                if idx == 2:  # if <s>, don't write it
                    continue
                if idx == 3:  # if </s>, end the loop
                    break
                s += trg_vocab.itos[idx] + " "

            s += '\n'
            f_write.write(s)
Example 11
def main(options):

    src_train, src_dev, src_test, src_vocab = torch.load(
        open(options.data_file + "." + options.src_lang, 'rb'))
    trg_train, trg_dev, trg_test, trg_vocab = torch.load(
        open(options.data_file + "." + options.trg_lang, 'rb'))

    batched_train_src, batched_train_src_mask, sort_index = utils.tensor.advanced_batchize(
        src_train, options.batch_size, src_vocab.stoi["<blank>"])
    batched_train_trg, batched_train_trg_mask = utils.tensor.advanced_batchize_no_sort(
        trg_train, options.batch_size, trg_vocab.stoi["<blank>"], sort_index)
    batched_dev_src, batched_dev_src_mask, sort_index = utils.tensor.advanced_batchize(
        src_dev, options.batch_size, src_vocab.stoi["<blank>"])
    batched_dev_trg, batched_dev_trg_mask = utils.tensor.advanced_batchize_no_sort(
        trg_dev, options.batch_size, trg_vocab.stoi["<blank>"], sort_index)

    src_vocab_size = len(src_vocab)
    trg_vocab_size = len(trg_vocab)

    nmt = NMT(src_vocab_size,
              trg_vocab_size)  # TODO: add more arguments as necessary
    if torch.cuda.is_available():
        nmt.cuda()
    else:
        nmt.cpu()

    criterion = torch.nn.NLLLoss()
    optimizer = eval("torch.optim." + options.optimizer)(nmt.parameters(),
                                                         options.learning_rate)

    # main training loop
    last_dev_avg_loss = float("inf")
    for epoch_i in range(options.epochs):
        logging.info("At {0}-th epoch.".format(epoch_i))
        # srange generates a lazy sequence of shuffled range

        for i, batch_i in enumerate(utils.rand.srange(len(batched_train_src))):
            train_src_batch = to_var(batched_train_src[batch_i]
                                     )  # of size (src_seq_len, batch_size)
            train_trg_batch = to_var(batched_train_trg[batch_i]
                                     )  # of size (trg_seq_len, batch_size)
            train_src_mask = to_var(batched_train_src_mask[batch_i])
            train_trg_mask = to_var(batched_train_trg_mask[batch_i])

            sys_out_batch = nmt(
                train_src_batch, train_trg_batch, training=True
            )  # (trg_seq_len, batch_size, trg_vocab_size) # TODO: add more arguments as necessary
            train_trg_mask = train_trg_mask.view(-1)
            train_trg_batch = train_trg_batch.view(-1)
            train_trg_batch = train_trg_batch.masked_select(train_trg_mask)
            train_trg_mask = train_trg_mask.unsqueeze(1).expand(
                len(train_trg_mask), trg_vocab_size)
            sys_out_batch = sys_out_batch.view(-1, trg_vocab_size)
            sys_out_batch = sys_out_batch.masked_select(train_trg_mask).view(
                -1, trg_vocab_size)
            loss = criterion(sys_out_batch, train_trg_batch)
            # logging.debug("loss at batch {0}: {1}".format(i, loss.data[0]))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # validation -- this is a crude estimation because there might be some padding at the end
        dev_loss = 0.0

        for batch_i in range(len(batched_dev_src)):
            dev_src_batch = to_var(batched_dev_src[batch_i], volatile=True)
            dev_trg_batch = to_var(batched_dev_trg[batch_i], volatile=True)
            dev_src_mask = to_var(batched_dev_src_mask[batch_i], volatile=True)
            dev_trg_mask = to_var(batched_dev_trg_mask[batch_i], volatile=True)

            sys_out_batch = nmt(
                dev_src_batch, dev_trg_batch
            )  # (trg_seq_len, batch_size, trg_vocab_size) # TODO: add more arguments as necessary
            dev_trg_mask = dev_trg_mask.view(-1)
            dev_trg_batch = dev_trg_batch.view(-1)
            dev_trg_batch = dev_trg_batch.masked_select(dev_trg_mask)
            dev_trg_mask = dev_trg_mask.unsqueeze(1).expand(
                len(dev_trg_mask), trg_vocab_size)
            sys_out_batch = sys_out_batch.view(-1, trg_vocab_size)
            sys_out_batch = sys_out_batch.masked_select(dev_trg_mask).view(
                -1, trg_vocab_size)
            loss = criterion(sys_out_batch, dev_trg_batch)
            # logging.debug("dev loss at batch {0}: {1}".format(batch_i, loss.data[0]))
            dev_loss += loss
        dev_avg_loss = dev_loss / len(batched_dev_src)
        logging.info(
            "Average loss value per instance is {0} at the end of epoch {1}".
            format(dev_avg_loss.data[0], epoch_i))

        # if (last_dev_avg_loss - dev_avg_loss).data[0] < options.estop:
        # logging.info("Early stopping triggered with threshold {0} (previous dev loss: {1}, current: {2})".format(epoch_i, last_dev_avg_loss.data[0], dev_avg_loss.data[0]))
        # break
        torch.save(
            nmt,
            open(
                options.model_file +
                ".nll_{0:.2f}.epoch_{1}".format(dev_avg_loss.data[0], epoch_i),
                'wb'),
            pickle_module=dill)
        last_dev_avg_loss = dev_avg_loss
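`to_var` is not shown; a minimal sketch of the usual pre-0.4 PyTorch helper, matching the `volatile=True` calls above:

import torch
from torch.autograd import Variable

def to_var(x, volatile=False):
    # Hypothetical helper: wraps a tensor in a Variable (pre-0.4 PyTorch
    # idiom) and moves it to the GPU when one is available.
    if torch.cuda.is_available():
        x = x.cuda()
    return Variable(x, volatile=volatile)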
Example 12
def main(options):

  use_cuda = (len(options.gpuid) >= 1)
  if options.gpuid:
    cuda.set_device(options.gpuid[0])

  src_train, src_dev, src_test, src_vocab = torch.load(open(options.data_file + "." + options.src_lang, 'rb'))
  trg_train, trg_dev, trg_test, trg_vocab = torch.load(open(options.data_file + "." + options.trg_lang, 'rb'))

  batched_train_src, batched_train_src_mask, sort_index = utils.tensor.advanced_batchize(src_train, options.batch_size, src_vocab.stoi["<blank>"])
  batched_train_trg, batched_train_trg_mask = utils.tensor.advanced_batchize_no_sort(trg_train, options.batch_size, trg_vocab.stoi["<blank>"], sort_index)
  batched_dev_src, batched_dev_src_mask, sort_index = utils.tensor.advanced_batchize(src_dev, options.batch_size, src_vocab.stoi["<blank>"])
  batched_dev_trg, batched_dev_trg_mask = utils.tensor.advanced_batchize_no_sort(trg_dev, options.batch_size, trg_vocab.stoi["<blank>"], sort_index)

  trg_vocab_size = len(trg_vocab)
  src_vocab_size = len(src_vocab)
  word_emb_size = 300
  hidden_size = 1024

  nmt = NMT(src_vocab_size, trg_vocab_size, word_emb_size, hidden_size,
            src_vocab, trg_vocab, attn_model = "general", use_cuda = True)

  if use_cuda:
    nmt.cuda()
    if options.distributed:
      nmt = torch.nn.DataParallel(nmt)
  else:
    nmt.cpu()

  criterion = torch.nn.NLLLoss()

  # Configure optimization
  lr = options.learning_rate
  optimizer = eval("torch.optim." + options.optimizer)(nmt.parameters(), lr)

  
  # main training loop
  last_dev_avg_loss = float("inf")
  for epoch_i in range(options.epochs):
    logging.info("At {0}-th epoch.".format(epoch_i))

    # Set training mode
    nmt.train()

    # srange generates a lazy sequence of shuffled range
    for i, batch_i in enumerate(utils.rand.srange(len(batched_train_src))):
      train_src_batch = Variable(batched_train_src[batch_i])  # of size (src_seq_len, batch_size)
      train_trg_batch = Variable(batched_train_trg[batch_i])  # of size (trg_seq_len, batch_size)
      train_src_mask = Variable(batched_train_src_mask[batch_i])
      train_trg_mask = Variable(batched_train_trg_mask[batch_i])
      if use_cuda:
        train_src_batch = train_src_batch.cuda()
        train_trg_batch = train_trg_batch.cuda()
        train_src_mask = train_src_mask.cuda()
        train_trg_mask = train_trg_mask.cuda()

      sys_out_batch = nmt(train_src_batch, train_trg_batch, True)

      del train_src_batch

      train_trg_mask = train_trg_mask.view(-1)
      train_trg_batch = train_trg_batch.view(-1)
      train_trg_batch = train_trg_batch.masked_select(train_trg_mask)
      train_trg_mask = train_trg_mask.unsqueeze(1).expand(len(train_trg_mask), trg_vocab_size)
      sys_out_batch = sys_out_batch.view(-1, trg_vocab_size)
      sys_out_batch = sys_out_batch.masked_select(train_trg_mask).view(-1, trg_vocab_size)
      loss = criterion(sys_out_batch, train_trg_batch)
      logging.debug("loss at batch {0}: {1}".format(i, loss.data[0]))
      
      optimizer.zero_grad()
      loss.backward()
      # # gradient clipping
      torch.nn.utils.clip_grad_norm(nmt.parameters(), 5.0)
      optimizer.step()

    # validation -- this is a crude estimation because there might be some padding at the end
    dev_loss = 0.0

    # Set validation mode
    nmt.eval()

    for batch_i in range(len(batched_dev_src)):
      dev_src_batch = Variable(batched_dev_src[batch_i], volatile=True)
      dev_trg_batch = Variable(batched_dev_trg[batch_i], volatile=True)
      dev_src_mask = Variable(batched_dev_src_mask[batch_i], volatile=True)
      dev_trg_mask = Variable(batched_dev_trg_mask[batch_i], volatile=True)
      if use_cuda:
        dev_src_batch = dev_src_batch.cuda()
        dev_trg_batch = dev_trg_batch.cuda()
        dev_src_mask = dev_src_mask.cuda()
        dev_trg_mask = dev_trg_mask.cuda()

      sys_out_batch = nmt(dev_src_batch, dev_trg_batch, False)

      dev_trg_mask = dev_trg_mask.view(-1)
      dev_trg_batch = dev_trg_batch.view(-1)
      dev_trg_batch = dev_trg_batch.masked_select(dev_trg_mask)
      dev_trg_mask = dev_trg_mask.unsqueeze(1).expand(len(dev_trg_mask), trg_vocab_size)
      sys_out_batch = sys_out_batch.view(-1, trg_vocab_size)
      sys_out_batch = sys_out_batch.masked_select(dev_trg_mask).view(-1, trg_vocab_size)
      loss = criterion(sys_out_batch, dev_trg_batch)
      logging.debug("dev loss at batch {0}: {1}".format(batch_i, loss.data[0]))
      dev_loss += loss
    dev_avg_loss = dev_loss / len(batched_dev_src)
    logging.info("Average loss value per instance is {0} at the end of epoch {1}".format(dev_avg_loss.data[0], epoch_i))

    # if (last_dev_avg_loss - dev_avg_loss).data[0] < options.estop:
    #   logging.info("Early stopping triggered with threshold {0} (previous dev loss: {1}, current: {2})".format(epoch_i, last_dev_avg_loss.data[0], dev_avg_loss.data[0]))
    #   break
    torch.save(nmt, open(options.model_file + ".nll_{0:.2f}.epoch_{1}".format(dev_avg_loss.data[0], epoch_i), 'wb'), pickle_module=dill)
    last_dev_avg_loss = dev_avg_loss
Example 13
def init_model(vocab_sizes, use_cuda, model_file_name):
    nmt = NMT(vocab_sizes, use_cuda)

    model.load_state(model_file_name, nmt)
    return nmt
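`model.load_state` is external to this snippet; a minimal sketch of what it presumably does, assuming the checkpoint holds a plain state dict:

import torch

def load_state(model_file_name, nmt):
    # Hypothetical helper matching the call above: restores saved weights
    # into an existing NMT instance, mapping storages to the CPU.
    state_dict = torch.load(model_file_name,
                            map_location=lambda storage, loc: storage)
    nmt.load_state_dict(state_dict)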
Example 14
def main(options):

    original_model = torch.load(open(options.original_model_file, 'rb'))

    nmt = NMT(original_model)

    use_cuda = (len(options.gpuid) >= 1)
    if options.gpuid:
        cuda.set_device(options.gpuid[0])

    src_train, src_dev, src_test, src_vocab = torch.load(
        open(options.data_file + "." + options.src_lang, 'rb'))
    trg_train, trg_dev, trg_test, trg_vocab = torch.load(
        open(options.data_file + "." + options.trg_lang, 'rb'))

    batched_test_src, batched_test_src_mask, _ = utils.tensor.advanced_batchize(
        src_test, 1, src_vocab.stoi["<pad>"])
    batched_test_trg, batched_test_trg_mask, _ = utils.tensor.advanced_batchize(
        trg_test, 1, trg_vocab.stoi["<pad>"])

    batched_train_src, batched_train_src_mask, _ = utils.tensor.advanced_batchize(
        src_train, 1, src_vocab.stoi["<pad>"])
    batched_train_trg, batched_train_trg_mask, _ = utils.tensor.advanced_batchize(
        trg_train, 1, trg_vocab.stoi["<pad>"])
    batched_dev_src, batched_dev_src_mask, _ = utils.tensor.advanced_batchize(
        src_dev, options.batch_size, src_vocab.stoi["<pad>"])
    batched_dev_trg, batched_dev_trg_mask, _ = utils.tensor.advanced_batchize(
        trg_dev, options.batch_size, trg_vocab.stoi["<pad>"])

    trg_vocab_size = len(trg_vocab)
    src_vocab_size = len(src_vocab)

    # for i, batch_i in enumerate(utils.rand.srange(len(batched_train_src))):
    #     train_src_batch = Variable(batched_train_src[batch_i])  # of size (src_seq_len, batch_size)
    #     train_trg_batch = Variable(batched_train_trg[batch_i])  # of size (src_seq_len, batch_size)
    #     train_src_mask = Variable(batched_train_src_mask[batch_i])
    #     train_trg_mask = Variable(batched_train_trg_mask[batch_i])
    #     if use_cuda:
    #       train_src_batch = train_src_batch.cuda()
    #       train_trg_batch = train_trg_batch.cuda()
    #       train_src_mask = train_src_mask.cuda()
    #       train_trg_mask = train_trg_mask.cuda()

    #     sys_out_batch = nmt(train_src_batch, train_trg_batch.size()[0])
    #     # print(sys_out_batch.size())
    #     _, sys_out_batch = torch.max(sys_out_batch, dim=2)
    #     sys_out_batch = sys_out_batch.view(-1)
    #     sent = []
    #     # print(sys_out_batch)
    #     for w in sys_out_batch:
    #       # print(w)
    #       sent.append(trg_vocab.itos[w.data[0]])
    #     print(sent)

    # # Initialize encoder with weights parameters from original model
    # encoder = nn.LSTM(300, 512, bidirectional=True)

    # encoder.weight_ih_l0 = nn.Parameter(original_model['encoder.rnn.weight_ih_l0'])
    # encoder.weight_hh_l0 = nn.Parameter(original_model['encoder.rnn.weight_hh_l0'])
    # encoder.bias_ih_l0 = nn.Parameter(original_model['encoder.rnn.bias_ih_l0'])
    # encoder.bias_hh_l0 = nn.Parameter(original_model['encoder.rnn.bias_hh_l0'])

    # encoder.weight_ih_l0_reverse = nn.Parameter(original_model['encoder.rnn.weight_ih_l0_reverse'])
    # encoder.weight_hh_l0_reverse = nn.Parameter(original_model['encoder.rnn.weight_hh_l0_reverse'])
    # encoder.bias_ih_l0_reverse = nn.Parameter(original_model['encoder.rnn.bias_ih_l0_reverse'])
    # encoder.bias_hh_l0_reverse = nn.Parameter(original_model['encoder.rnn.bias_hh_l0_reverse'])

    # # Initialize decoder with weights parameters from original model
    # decoder = nn.LSTM(1324, 1024)

    # decoder.weight_ih_l0 = nn.Parameter(original_model['decoder.rnn.layers.0.weight_ih'])
    # decoder.weight_hh_l0 = nn.Parameter(original_model['decoder.rnn.layers.0.weight_hh'])
    # decoder.bias_ih_l0 = nn.Parameter(original_model['decoder.rnn.layers.0.bias_ih'])
    # decoder.bias_hh_l0 = nn.Parameter(original_model['decoder.rnn.layers.0.bias_hh'])

    # if use_cuda > 0:
    #   encoder.cuda()
    #   decoder.cuda()
    # else:
    #   encoder.cpu()
    #   decoder.cpu()

    # # Initialize embeddings
    # encoder_embedding = nn.Embedding(36616, 300)
    # decoder_embedding = nn.Embedding(23262, 300)
    # encoder_embedding.weight = nn.Parameter(original_model['encoder.embeddings.emb_luts.0.weight'])
    # decoder_embedding.weight = nn.Parameter(original_model['decoder.embeddings.emb_luts.0.weight'])

    # # Initialize Ws
    # wi = nn.Linear(1024,1024, bias=False)
    # wi.weight = nn.Parameter(original_model['decoder.attn.linear_in.weight'])

    # wo = nn.Linear(2048, 1024, bias=False)
    # wo.weight = nn.Parameter(original_model['decoder.attn.linear_out.weight'])

    # generator = nn.Linear(1024, 23262)
    # generator.weight = nn.Parameter(original_model['0.weight'])
    # generator.bias = nn.Parameter(original_model['0.bias'])

    criterion = torch.nn.NLLLoss()
    # encoder_optimizer = eval("torch.optim." + options.optimizer)(encoder.parameters(), options.learning_rate)
    # decoder_optimizer = eval("torch.optim." + options.optimizer)(decoder.parameters(), options.learning_rate)

    # soft_max = nn.Softmax()

    optimizer = eval("torch.optim." + options.optimizer)(nmt.parameters(),
                                                         options.learning_rate)

    # main training loop
    last_dev_avg_loss = float("inf")
    for epoch_i in range(options.epochs):
        logging.info("At {0}-th epoch.".format(epoch_i))

        h_t_1 = Variable(torch.ones(1024))

        # srange generates a lazy sequence of shuffled range
        for i, batch_i in enumerate(utils.rand.srange(len(batched_train_src))):
            train_src_batch = Variable(batched_train_src[batch_i]
                                       )  # of size (src_seq_len, batch_size)
            train_trg_batch = Variable(batched_train_trg[batch_i]
                                       )  # of size (trg_seq_len, batch_size)
            train_src_mask = Variable(batched_train_src_mask[batch_i])
            train_trg_mask = Variable(batched_train_trg_mask[batch_i])
            if use_cuda:
                train_src_batch = train_src_batch.cuda()
                train_trg_batch = train_trg_batch.cuda()
                train_src_mask = train_src_mask.cuda()
                train_trg_mask = train_trg_mask.cuda()

            # encoder_input = encoder_embedding(train_trg_batch)
            # sys_out_batch, (encoder_hidden_states, _) = encoder(encoder_input)  # (trg_seq_len, batch_size, trg_vocab_size) # TODO: add more arguments as necessary

            # h = Variable(torch.FloatTensor(sys_out_batch.size()[1], 1024).fill_(1./1024))
            # c = Variable(torch.FloatTensor(sys_out_batch.size()[1], 1024).fill_(0))

            # softmax = torch.nn.Softmax()
            # tanh = torch.nn.Tanh()
            # # _,w = torch.max(softmax(generator(h)), dim=1)
            # w = softmax(generator(h))

            # result = Variable(torch.FloatTensor(sys_out_batch.size()[0], sys_out_batch.size()[1], 23262))
            # for i in range(sys_out_batch.size()[0]):
            #   wht1 = wi(h).view(1, -1, 1024).expand_as(sys_out_batch)

            #   score = softmax(torch.sum(sys_out_batch * wht1, dim=2)).view(sys_out_batch.size()[0],sys_out_batch.size()[1],1)

            #   st = torch.sum(score * sys_out_batch, dim=0)
            #   ct = tanh(wo(torch.cat([st, h], dim=1)))

            #   _, w = torch.max(w, dim=1)
            #   input = torch.cat([decoder_embedding(w), ct], dim=1)
            #   input = input.view(1, input.size()[0], input.size()[1])

            #   _,(b,c) = decoder(input, (h,c))
            #   h = b[0]
            #   c = c[0]

            #   w = softmax(generator(h))
            #   result[i] = w
            # # result.append(w)
            # sys_out_batch = result
            sys_out_batch = nmt(train_src_batch, train_trg_batch.size()[0])
            # s_vector = []
            # for hs in sys_out_batch:
            #   score = hs.matmul(wi).matmul(h_t_1)
            #   score = score.unsqueeze(0)
            #   a_h_s = soft_max(score)
            #   # print a_h_s, hs.squeeze(0)
            #   s_vector.append(a_h_s.squeeze(0).dot(hs.squeeze(0)))
            # s_tilda = sum(s_vector)
            # c_t = nn.Tanh(wo.matmul(torch.cat(s_tilda, h_t_1)))

            # sys.exit()
            # train_trg_mask = train_trg_mask.view(-1)
            # train_trg_batch = train_trg_batch.view(-1)
            # train_trg_batch = train_trg_batch.masked_select(train_trg_mask)
            # train_trg_mask = train_trg_mask.unsqueeze(1).expand(len(train_trg_mask), trg_vocab_size)
            # sys_out_batch = sys_out_batch.view(-1, trg_vocab_size)
            # sys_out_batch = sys_out_batch.masked_select(train_trg_mask).view(-1, trg_vocab_size)
            print(train_trg_mask.size())
            train_trg_mask = train_trg_mask.view(-1)
            train_trg_batch = train_trg_batch.view(-1)
            train_trg_batch = train_trg_batch.masked_select(train_trg_mask)
            train_trg_mask = train_trg_mask.unsqueeze(1).expand(
                len(train_trg_mask), trg_vocab_size - 1)
            # print(trainin.size())
            # print(train_trg_batch[:,:-1].size())
            sys_out_batch = sys_out_batch.view(-1, trg_vocab_size - 1)
            print(trg_vocab_size)
            print(train_trg_mask.size())
            sys_out_batch = sys_out_batch.masked_select(train_trg_mask).view(
                -1, trg_vocab_size - 1)
            print(sys_out_batch.size())

            loss = criterion(sys_out_batch, train_trg_batch)
            logging.debug("loss at batch {0}: {1}".format(i, loss.data[0]))

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # validation -- this is a crude estimation because there might be some padding at the end
        dev_loss = 0.0
        for batch_i in range(len(batched_dev_src)):
            dev_src_batch = Variable(batched_dev_src[batch_i], volatile=True)
            dev_trg_batch = Variable(batched_dev_trg[batch_i], volatile=True)
            dev_src_mask = Variable(batched_dev_src_mask[batch_i],
                                    volatile=True)
            dev_trg_mask = Variable(batched_dev_trg_mask[batch_i],
                                    volatile=True)
            if use_cuda:
                dev_src_batch = dev_src_batch.cuda()
                dev_trg_batch = dev_trg_batch.cuda()
                dev_src_mask = dev_src_mask.cuda()
                dev_trg_mask = dev_trg_mask.cuda()

            # encoder_input = encoder_embedding(dev_trg_batch)
            # sys_out_batch = encoder(encoder_input)  # (trg_seq_len, batch_size, trg_vocab_size) # TODO: add more arguments as necessary
            sys_out_batch = nmt(dev_src_batch, dev_trg_batch.size()[0])

            dev_trg_mask = dev_trg_mask.view(-1)
            dev_trg_batch = dev_trg_batch.view(-1)
            dev_trg_batch = dev_trg_batch.masked_select(dev_trg_mask)
            dev_trg_mask = dev_trg_mask.unsqueeze(1).expand(
                len(dev_trg_mask), trg_vocab_size - 1)

            sys_out_batch = sys_out_batch.view(-1, trg_vocab_size - 1)
            sys_out_batch = sys_out_batch.masked_select(dev_trg_mask).view(
                -1, trg_vocab_size - 1)

            loss = criterion(sys_out_batch, dev_trg_batch)
            logging.debug("dev loss at batch {0}: {1}".format(
                batch_i, loss.data[0]))
            dev_loss += loss

        dev_avg_loss = dev_loss / len(batched_dev_src)
        logging.info(
            "Average loss value per instance is {0} at the end of epoch {1}".
            format(dev_avg_loss.data[0], epoch_i))

        if (last_dev_avg_loss - dev_avg_loss).data[0] < options.estop:
            logging.info(
                "Early stopping triggered with threshold {0} (previous dev loss: {1}, current: {2})"
                .format(epoch_i, last_dev_avg_loss.data[0],
                        dev_avg_loss.data[0]))
            break

        torch.save(
            nmt.state_dict(),
            open(
                options.model_file +
                ".nll_{0:.2f}.epoch_{1}".format(dev_avg_loss.data[0], epoch_i),
                'wb'),
            pickle_module=dill)
        last_dev_avg_loss = dev_avg_loss
Example 15
def main(options):

    use_cuda = (len(options.gpuid) >= 1)
    # if options.gpuid:
    #   cuda.set_device(options.gpuid[0])

    src_train, src_dev, src_test, src_vocab = torch.load(
        open(options.data_file + "." + options.src_lang, 'rb'))
    trg_train, trg_dev, trg_test, trg_vocab = torch.load(
        open(options.data_file + "." + options.trg_lang, 'rb'))

    batched_train_src, batched_train_src_mask, sort_index = utils.tensor.advanced_batchize(
        src_train, options.batch_size, src_vocab.stoi["<blank>"])
    batched_train_trg, batched_train_trg_mask = utils.tensor.advanced_batchize_no_sort(
        trg_train, options.batch_size, trg_vocab.stoi["<blank>"], sort_index)
    batched_dev_src, batched_dev_src_mask, sort_index = utils.tensor.advanced_batchize(
        src_dev, options.batch_size, src_vocab.stoi["<blank>"])
    batched_dev_trg, batched_dev_trg_mask = utils.tensor.advanced_batchize_no_sort(
        trg_dev, options.batch_size, trg_vocab.stoi["<blank>"], sort_index)

    print "preprocessing batched data..."
    processed_src = list()
    processed_trg = list()
    processed_src_mask = list()
    processed_trg_mask = list()
    for batch_i in range(len(batched_train_src)):
        if batched_train_src[batch_i].size(
                0) <= 35 and batched_train_trg[batch_i].size(0) <= 35:
            processed_src.append(batched_train_src[batch_i])
            processed_trg.append(batched_train_trg[batch_i])
            processed_src_mask.append(batched_train_src_mask[batch_i])
            processed_trg_mask.append(batched_train_trg_mask[batch_i])

    batched_train_src = processed_src
    batched_train_trg = processed_trg
    batched_train_src_mask = processed_src_mask
    batched_train_trg_mask = processed_trg_mask

    processed_src = list()
    processed_trg = list()
    processed_src_mask = list()
    processed_trg_mask = list()
    for batch_i in range(len(batched_dev_src)):
        if batched_dev_src[batch_i].size(
                0) <= 35 and batched_dev_trg[batch_i].size(0) <= 35:
            processed_src.append(batched_dev_src[batch_i])
            processed_trg.append(batched_dev_trg[batch_i])
            processed_src_mask.append(batched_dev_src_mask[batch_i])
            processed_trg_mask.append(batched_dev_trg_mask[batch_i])

    batched_dev_src = processed_src
    batched_dev_trg = processed_trg
    batched_dev_src_mask = processed_src_mask
    batched_dev_trg_mask = processed_trg_mask

    del processed_src, processed_trg, processed_trg_mask, processed_src_mask

    trg_vocab_size = len(trg_vocab)
    src_vocab_size = len(src_vocab)
    word_emb_size = 50
    hidden_size = 1024

    nmt = NMT(src_vocab_size,
              trg_vocab_size,
              word_emb_size,
              hidden_size,
              src_vocab,
              trg_vocab,
              attn_model="general",
              use_cuda=True)
    discriminator = Discriminator(src_vocab_size,
                                  trg_vocab_size,
                                  word_emb_size,
                                  src_vocab,
                                  trg_vocab,
                                  use_cuda=True)

    if use_cuda > 0:
        #nmt = torch.nn.DataParallel(nmt,device_ids=options.gpuid).cuda()
        nmt.cuda()
        #discriminator = torch.nn.DataParallel(discriminator,device_ids=options.gpuid).cuda()
        discriminator.cuda()
    else:
        nmt.cpu()
        discriminator.cpu()

    criterion_g = torch.nn.NLLLoss().cuda()
    criterion = torch.nn.CrossEntropyLoss().cuda()

    # Configure optimization
    optimizer_g = eval("torch.optim." + options.optimizer)(
        nmt.parameters(), options.learning_rate)
    optimizer_d = eval("torch.optim." + options.optimizer)(
        discriminator.parameters(), options.learning_rate)

    # main training loop
    f1 = open("train_loss", "a")
    f2 = open("dev_loss", "a")
    last_dev_avg_loss = float("inf")
    for epoch_i in range(options.epochs):
        logging.info("At {0}-th epoch.".format(epoch_i))
        # srange generates a lazy sequence of shuffled range

        train_loss_g = 0.0
        train_loss_d = 0.0
        train_loss_g_nll = 0.0
        train_loss_g_ce = 0.0
        train_loss_nll_batch_num = 0
        train_loss_ce_batch_num = 0
        for i, batch_i in enumerate(utils.rand.srange(len(batched_train_src))):
            if i == 1500:
                break
            # if i==5:
            #   break
            train_src_batch = Variable(batched_train_src[batch_i]
                                       )  # of size (src_seq_len, batch_size)
            train_trg_batch = Variable(batched_train_trg[batch_i]
                                       )  # of size (trg_seq_len, batch_size)
            train_src_mask = Variable(batched_train_src_mask[batch_i])
            train_trg_mask = Variable(batched_train_trg_mask[batch_i])
            if use_cuda:
                train_src_batch = train_src_batch.cuda()
                train_trg_batch = train_trg_batch.cuda()
                train_src_mask = train_src_mask.cuda()
                train_trg_mask = train_trg_mask.cuda()

            # train discriminator
            sys_out_batch = nmt(train_src_batch, train_trg_batch,
                                True).detach()
            _, predict_batch = sys_out_batch.topk(1)
            del _
            predict_batch = predict_batch.squeeze(2)
            real_dis_label_out = discriminator(train_src_batch,
                                               train_trg_batch, True)
            fake_dis_label_out = discriminator(train_src_batch, predict_batch,
                                               True)
            optimizer_d.zero_grad()
            loss_d_real = criterion(
                real_dis_label_out,
                Variable(
                    torch.ones(options.batch_size *
                               len(options.gpuid)).long()).cuda())
            loss_d_real.backward()
            loss_d_fake = criterion(
                fake_dis_label_out,
                Variable(
                    torch.zeros(options.batch_size *
                                len(options.gpuid)).long()).cuda())
            #loss_d_fake.backward(retain_graph=True)
            loss_d_fake.backward()
            loss_d = loss_d_fake.data[0] + loss_d_real.data[0]
            del loss_d_fake, loss_d_real
            logging.debug("D loss at batch {0}: {1}".format(i, loss_d))
            f1.write("D train loss at batch {0}: {1}\n".format(i, loss_d))
            optimizer_d.step()

            if use_cuda > 0:
                sys_out_batch = sys_out_batch.cuda()
                train_trg_batch = train_trg_batch.cuda()
            else:
                sys_out_batch = sys_out_batch.cpu()
                train_trg_batch = train_trg_batch.cpu()

            # train nmt
            sys_out_batch = nmt(train_src_batch, train_trg_batch, True)
            _, predict_batch = sys_out_batch.topk(1)
            predict_batch = predict_batch.squeeze(2)
            fake_dis_label_out = discriminator(train_src_batch, predict_batch,
                                               True)
            if random.random() > 0.5:
                train_trg_mask = train_trg_mask.view(-1)
                train_trg_batch = train_trg_batch.view(-1)
                train_trg_batch = train_trg_batch.masked_select(train_trg_mask)
                train_trg_mask = train_trg_mask.unsqueeze(1).expand(
                    len(train_trg_mask), trg_vocab_size)
                sys_out_batch = sys_out_batch.view(-1, trg_vocab_size)
                sys_out_batch = sys_out_batch.masked_select(
                    train_trg_mask).view(-1, trg_vocab_size)
                loss_g = criterion_g(sys_out_batch, train_trg_batch)
                train_loss_g_nll += loss_g
                train_loss_nll_batch_num += 1
                f1.write("G train NLL loss at batch {0}: {1}\n".format(
                    i, loss_g.data[0]))
            else:
                loss_g = criterion(
                    fake_dis_label_out,
                    Variable(
                        torch.ones(options.batch_size *
                                   len(options.gpuid)).long()).cuda())
                train_loss_g_ce += loss_g
                train_loss_ce_batch_num += 1
                f1.write("G train CE loss at batch {0}: {1}\n".format(
                    i, loss_g.data[0]))

            logging.debug("G loss at batch {0}: {1}".format(i, loss_g.data[0]))

            optimizer_g.zero_grad()
            loss_g.backward()

            # # gradient clipping
            torch.nn.utils.clip_grad_norm(nmt.parameters(), 5.0)
            optimizer_g.step()

            train_loss_d += loss_d
        train_avg_loss_g_nll = train_loss_g_nll / train_loss_nll_batch_num
        train_avg_loss_g_ce = train_loss_g_ce / train_loss_ce_batch_num
        train_avg_loss_d = train_loss_d / len(train_src_batch)
        logging.info(
            "G TRAIN Average NLL loss value per instance is {0} at the end of epoch {1}"
            .format(train_avg_loss_g_nll, epoch_i))
        logging.info(
            "G TRAIN Average CE loss value per instance is {0} at the end of epoch {1}"
            .format(train_avg_loss_g_ce, epoch_i))
        logging.info(
            "D TRAIN Average loss value per instance is {0} at the end of epoch {1}"
            .format(train_avg_loss_d, epoch_i))

        # validation -- this is a crude estimation because there might be some padding at the end
        # dev_loss_g_nll = 0.0
        # dev_loss_g_ce = 0.0
        # dev_loss_d = 0.0

        # for batch_i in range(len(batched_dev_src)):
        #   dev_src_batch = Variable(batched_dev_src[batch_i], volatile=True)
        #   dev_trg_batch = Variable(batched_dev_trg[batch_i], volatile=True)
        #   dev_src_mask = Variable(batched_dev_src_mask[batch_i], volatile=True)
        #   dev_trg_mask = Variable(batched_dev_trg_mask[batch_i], volatile=True)
        #   if use_cuda:
        #     dev_src_batch = dev_src_batch.cuda()
        #     dev_trg_batch = dev_trg_batch.cuda()
        #     dev_src_mask = dev_src_mask.cuda()
        #     dev_trg_mask = dev_trg_mask.cuda()

        #   sys_out_batch = nmt(dev_src_batch, dev_trg_batch, False).detach()
        #   _,predict_batch = sys_out_batch.topk(1)
        #   predict_batch = predict_batch.squeeze(2)
        #   real_dis_label_out = discriminator(dev_src_batch, dev_trg_batch, True).detach()
        #   fake_dis_label_out = discriminator(dev_src_batch, predict_batch, True).detach()

        #   if use_cuda > 0:
        #     sys_out_batch = sys_out_batch.cuda()
        #     dev_trg_batch = dev_trg_batch.cuda()
        #   else:
        #     sys_out_batch = sys_out_batch.cpu()
        #     dev_trg_batch = dev_trg_batch.cpu()

        #   dev_trg_mask = dev_trg_mask.view(-1)
        #   dev_trg_batch = dev_trg_batch.view(-1)
        #   dev_trg_batch = dev_trg_batch.masked_select(dev_trg_mask)
        #   dev_trg_mask = dev_trg_mask.unsqueeze(1).expand(len(dev_trg_mask), trg_vocab_size)
        #   sys_out_batch = sys_out_batch.view(-1, trg_vocab_size)
        #   sys_out_batch = sys_out_batch.masked_select(dev_trg_mask).view(-1, trg_vocab_size)
        #   loss_g_nll = criterion_g(sys_out_batch, dev_trg_batch)
        #   loss_g_ce = criterion(fake_dis_label_out, Variable(torch.ones(options.batch_size*len(options.gpuid)).long(),volatile=True).cuda())
        #   loss_d = criterion(real_dis_label_out, Variable(torch.ones(options.batch_size*len(options.gpuid)).long(),volatile=True).cuda()) + criterion(fake_dis_label_out, Variable(torch.zeros(options.batch_size*len(options.gpuid)).long(),volatile=True).cuda())
        #   logging.debug("G dev NLL loss at batch {0}: {1}".format(batch_i, loss_g_nll.data[0]))
        #   logging.debug("G dev CE loss at batch {0}: {1}".format(batch_i, loss_g_ce.data[0]))
        #   f2.write("G dev NLL loss at batch {0}: {1}\n".format(batch_i, loss_g_nll.data[0]))
        #   f2.write("G dev CE loss at batch {0}: {1}\n".format(batch_i, loss_g_ce.data[0]))
        #   logging.debug("D dev loss at batch {0}: {1}".format(batch_i, loss_d.data[0]))
        #   f2.write("D dev loss at batch {0}: {1}\n".format(batch_i, loss_d.data[0]))
        #   dev_loss_g_nll += loss_g_nll
        #   dev_loss_g_ce += loss_g_ce
        #   dev_loss_d += loss_d
        # dev_avg_loss_g_nll = dev_loss_g_nll / len(batched_dev_src)
        # dev_avg_loss_g_ce = dev_loss_g_ce / len(batched_dev_src)
        # dev_avg_loss_d = dev_loss_d / len(batched_dev_src)
        # logging.info("G DEV Average NLL loss value per instance is {0} at the end of epoch {1}".format(dev_avg_loss_g_nll.cpu().data[0], epoch_i))
        # logging.info("G DEV Average CE loss value per instance is {0} at the end of epoch {1}".format(dev_avg_loss_g_ce.cpu().data[0], epoch_i))
        # logging.info("D DEV Average loss value per instance is {0} at the end of epoch {1}".format(dev_avg_loss_d.data[0], epoch_i))
        # # if (last_dev_avg_loss - dev_avg_loss).data[0] < options.estop:
        # #   logging.info("Early stopping triggered with threshold {0} (previous dev loss: {1}, current: {2})".format(epoch_i, last_dev_avg_loss.data[0], dev_avg_loss.data[0]))
        # #   break
    torch.save(nmt,
               open(
                   "nmt.nll_{0:.2f}.epoch_{1}".format(
                       train_avg_loss_g_nll.cpu().data[0], epoch_i), 'wb'),
               pickle_module=dill)
    torch.save(discriminator,
               open(
                   "discriminator.nll_{0:.2f}.epoch_{1}".format(
                       train_avg_loss_d.data[0], epoch_i), 'wb'),
               pickle_module=dill)
    f1.close()
    f2.close()
Example 16
def train(args: Dict):
    train_data_src = read_corpus(args['--train-src'], source='src')
    train_data_tgt = read_corpus(args['--train-tgt'], source='tgt')

    dev_data_src = read_corpus(args['--dev-src'], source='src')
    dev_data_tgt = read_corpus(args['--dev-tgt'], source='tgt')

    # [(src_0, tgt_0), (src_1, tgt_1), ..., ]
    train_data = list(zip(train_data_src, train_data_tgt))
    dev_data = list(zip(dev_data_src, dev_data_tgt))

    train_batch_size = int(args['--batch-size'])
    clip_grad = float(args['--clip-grad'])
    valid_niter = int(args['--valid-niter'])
    log_every = int(args['--log-every'])
    model_save_path = args['--save-to']

    # vocab = Vocab.load(args['--vocab'])
    vocab = Vocab.build(train_data_src, train_data_tgt,
                        int(args['--vocab-size']), 1)

    model = NMT(embed_size=int(args['--embed-size']),
                hidden_size=int(args['--hidden-size']),
                dropout_rate=float(args['--dropout']),
                vocab=vocab)
    model.train()
    print(model)

    uniform_init = float(args['--uniform-init'])
    if np.abs(uniform_init) > 0.:
        print('uniformly initialize parameters [-%f, +%f]' %
              (uniform_init, uniform_init),
              file=sys.stderr)
        for p in model.parameters():
            p.data.uniform_(-uniform_init, uniform_init)

    # vocab_mask = torch.ones(len(vocab.tgt))
    # vocab_mask[vocab.tgt['<pad>']] = 0

    device = torch.device("cuda:0" if args['--cuda'] else "cpu")
    print('use device: %s' % device, file=sys.stderr)

    model = model.to(device)
    model.save(model_save_path)

    optimizer = torch.optim.Adam(model.parameters(), lr=float(args['--lr']))

    num_trial = 0
    train_iter = patience = cum_loss = report_loss = cum_tgt_words = report_tgt_words = 0
    cum_examples = report_examples = epoch = valid_num = 0
    hist_valid_scores = []
    train_time = begin_time = time.time()

    print('begin Maximum Likelihood training')

    while True:
        epoch += 1

        for src_sents, tgt_sents in batch_iter(train_data,
                                               batch_size=train_batch_size,
                                               shuffle=True):
            train_iter += 1

            optimizer.zero_grad()

            batch_size = len(src_sents)

            #################### forward pass and compute loss #########################
            # example_losses = -model(src_sents, tgt_sents) # (batch_size,)
            example_losses = model(src_sents, tgt_sents)  # [batch_size,]
            batch_loss = example_losses.sum()
            loss = batch_loss / batch_size

            #################### backward pass to compute gradients ####################
            loss.backward()

            # clip gradient
            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                       clip_grad)

            #################### update model parameters ###############################
            optimizer.step()

            #################### do some statistics ####################################
            batch_losses_val = batch_loss.item()
            report_loss += batch_losses_val
            cum_loss += batch_losses_val

            tgt_words_num_to_predict = sum(
                len(s[1:]) for s in tgt_sents)  # omitting leading `<s>`
            report_tgt_words += tgt_words_num_to_predict
            cum_tgt_words += tgt_words_num_to_predict
            report_examples += batch_size
            cum_examples += batch_size

            #################### print log #############################################
            if train_iter % log_every == 0:
                print(
                    'epoch %d, iter %d, avg. loss %.2f, avg. ppl %.2f '
                    'cum. examples %d, speed %.2f words/sec, time elapsed %.2f sec'
                    % (
                        epoch,
                        train_iter,
                        report_loss / report_examples,
                        #  math.exp(report_loss / report_tgt_words),
                        (report_loss / report_tgt_words),
                        cum_examples,
                        report_tgt_words / (time.time() - train_time),
                        time.time() - begin_time),
                    file=sys.stderr)

                train_time = time.time()
                report_loss = report_tgt_words = report_examples = 0.

            ##################### perform validation ##################################
            if train_iter % valid_niter == 0:
                print(
                    'epoch %d, iter %d, cum. loss %.2f, cum. ppl %.2f cum. examples %d'
                    % (epoch, train_iter, cum_loss / cum_examples,
                       np.exp(cum_loss / cum_tgt_words), cum_examples),
                    file=sys.stderr)

                cum_loss = cum_examples = cum_tgt_words = 0.
                valid_num += 1

                print('begin validation ...', file=sys.stderr)

                # compute dev. ppl and bleu
                dev_ppl = evaluate_ppl(
                    model, dev_data,
                    batch_size=128)  # dev batch size can be a bit larger
                valid_metric = -dev_ppl

                print('validation: iter %d, dev. ppl %f' %
                      (train_iter, dev_ppl),
                      file=sys.stderr)

                is_better = len(hist_valid_scores
                                ) == 0 or valid_metric > max(hist_valid_scores)
                hist_valid_scores.append(valid_metric)

                # hypotheses = beam_search(model, dev_data_src,
                #                          beam_size=4,
                #                          max_decoding_time_step=10)

                if is_better:
                    patience = 0
                    print('save currently the best model to [%s]' %
                          model_save_path,
                          file=sys.stderr)
                    model.save(model_save_path)

                    # also save the optimizers' state
                    torch.save(optimizer.state_dict(),
                               model_save_path + '.optim')
                elif patience < int(args['--patience']):
                    patience += 1
                    print('hit patience %d' % patience, file=sys.stderr)

                    if patience == int(args['--patience']):
                        num_trial += 1
                        print('hit #%d trial' % num_trial, file=sys.stderr)
                        if num_trial == int(args['--max-num-trial']):
                            print('early stop!', file=sys.stderr)
                            exit(0)

                        # decay lr, and restore from previously best checkpoint
                        lr = optimizer.param_groups[0]['lr'] * float(
                            args['--lr-decay'])
                        print(
                            'load previously best model and decay learning rate to %f'
                            % lr,
                            file=sys.stderr)

                        # load model
                        params = torch.load(
                            model_save_path,
                            map_location=lambda storage, loc: storage)
                        model.load_state_dict(params['state_dict'])
                        model = model.to(device)

                        print('restore parameters of the optimizers',
                              file=sys.stderr)
                        optimizer.load_state_dict(
                            torch.load(model_save_path + '.optim'))

                        # set new lr
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr

                        # reset patience
                        patience = 0

                if epoch == int(args['--max-epoch']):
                    print('reached maximum number of epochs!', file=sys.stderr)
                    exit(0)
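
A note on the log line above: perplexity is the exponential of the average per-word cross-entropy, which is exactly what the `math.exp(report_loss / report_tgt_words)` argument computes. The same relationship as a minimal standalone helper (the names here are illustrative, not from the source):

import math

def perplexity(cum_loss: float, cum_tgt_words: int) -> float:
    """Corpus perplexity from a summed cross-entropy loss (in nats)
    and the number of predicted target words."""
    return math.exp(cum_loss / cum_tgt_words)

# e.g. a summed loss of 460.5 nats over 100 predicted words gives ppl ~= 100
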
Esempio n. 17
0
def train(args: Dict):
    """ Train the NMT Model.
    @param args (Dict): args from cmd line
    """
    do_bleu = '--ignore-test-bleu' not in args or not args['--ignore-test-bleu']
    train_data_src = read_corpus(args['--train-src'],
                                 source='src',
                                 dev_mode=dev_mode)
    train_data_tgt = read_corpus(args['--train-tgt'],
                                 source='tgt',
                                 dev_mode=dev_mode)

    dev_data_src = read_corpus(args['--dev-src'],
                               source='src',
                               dev_mode=dev_mode)
    dev_data_tgt = read_corpus(args['--dev-tgt'],
                               source='tgt',
                               dev_mode=dev_mode)

    if do_bleu:
        test_data_src = read_corpus(args['--test-src'],
                                    source='src',
                                    dev_mode=dev_mode)
        test_data_tgt = read_corpus(args['--test-tgt'],
                                    source='tgt',
                                    dev_mode=dev_mode)

    train_data = list(zip(train_data_src, train_data_tgt))
    dev_data = list(zip(dev_data_src, dev_data_tgt))

    max_tokens_in_sentence = int(args['--max-decoding-time-step'])
    train_data = clean_data(train_data, max_tokens_in_sentence)
    dev_data = clean_data(dev_data, max_tokens_in_sentence)

    train_batch_size = int(args['--batch-size'])
    dev_batch_size = 128
    clip_grad = float(args['--clip-grad'])
    valid_niter = int(args['--valid-niter'])
    bleu_niter = int(args['--bleu-niter'])
    log_every = int(args['--log-every'])
    model_save_path = args['--save-to']

    vocab = Vocab.load(args['--vocab'], args['--word_freq'])

    model = NMT(embed_size=int(args['--embed-size']),
                hidden_size=int(args['--hidden-size']),
                dropout_rate=float(args['--dropout']),
                vocab=vocab)
    writer = SummaryWriter()

    # model = TransformerNMT(vocab, num_hidden_layers=3)

    model.train()

    uniform_init = float(args['--uniform-init'])
    if np.abs(uniform_init) > 0.:
        print('uniformly initialize parameters [-%f, +%f]' %
              (uniform_init, uniform_init),
              file=sys.stderr)
        for p in model.parameters():
            if p.dim() > 1:
                torch.nn.init.xavier_uniform_(p)
            else:
                p.data.uniform_(-uniform_init, uniform_init)

    vocab_mask = torch.ones(len(vocab.tgt))
    vocab_mask[vocab.tgt['<pad>']] = 0

    device = torch.device("cuda:0" if args['--cuda'] else "cpu")
    print('use device: %s' % device, file=sys.stderr)

    model = model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=float(args['--lr']))

    num_trial = 0
    train_iter = patience = cum_loss = report_loss = cum_tgt_words = report_tgt_words = 0
    cum_examples = report_examples = epoch = valid_num = 0
    hist_valid_scores = []
    train_time = begin_time = time.time()

    print("Sorting dataset based on difficulty...")
    dataset = (train_data, dev_data)
    ordered_dataset = load_order(args['--order-name'], dataset, vocab)
    # TODO: order = balance_order(order, dataset)
    (train_data, dev_data) = ordered_dataset

    visualize_scoring_examples = False
    if visualize_scoring_examples:
        visualize_scoring(ordered_dataset, vocab)

    n_iters = math.ceil(len(train_data) / train_batch_size)
    print("n_iters per epoch is {}: ({} / {})".format(n_iters, len(train_data),
                                                      train_batch_size))
    max_epoch = int(args['--max-epoch'])
    max_iters = max_epoch * n_iters

    print('begin Maximum Likelihood training')
    print('Using order function: {}'.format(args['--order-name']))
    print('Using pacing function: {}'.format(args['--pacing-name']))
    while True:
        epoch += 1
        for _ in range(n_iters):
            # Get pacing data according to train_iter
            current_train_data, current_dev_data = pacing_data(
                train_data,
                dev_data,
                time=train_iter,
                warmup_iters=int(args["--warmup-iters"]),
                method=args['--pacing-name'],
                tb=writer)

            # Uniformly sample batches from the paced dataset
            src_sents, tgt_sents = get_pacing_batch(
                current_train_data, batch_size=train_batch_size, shuffle=True)

            train_iter += 1

            optimizer.zero_grad()

            batch_size = len(src_sents)

            example_losses = -model(src_sents, tgt_sents)  # (batch_size,)
            batch_loss = example_losses.sum()
            loss = batch_loss / batch_size

            loss.backward()
            # clip gradient
            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                       clip_grad)

            optimizer.step()

            batch_losses_val: float = batch_loss.item()
            report_loss += batch_losses_val
            cum_loss += batch_losses_val

            tgt_words_num_to_predict = sum(
                len(s[1:]) for s in tgt_sents)  # omitting leading `<s>`
            report_tgt_words += tgt_words_num_to_predict
            cum_tgt_words += tgt_words_num_to_predict
            report_examples += batch_size
            cum_examples += batch_size

            if train_iter % log_every == 0:
                print(
                    'epoch %d, iter %d, avg. loss %.2f, avg. ppl %.2f '
                    'cum. examples %d, speed %.2f words/sec, time elapsed %.2f sec'
                    % (epoch, train_iter, report_loss / report_examples,
                       math.exp(report_loss / report_tgt_words), cum_examples,
                       report_tgt_words /
                       (time.time() - train_time), time.time() - begin_time),
                    file=sys.stderr)
                writer.add_scalar('Loss/train', report_loss / report_examples,
                                  train_iter)
                writer.add_scalar('ppl/train',
                                  math.exp(report_loss / report_tgt_words),
                                  train_iter)
                train_time = time.time()
                report_loss = report_tgt_words = report_examples = 0.

            # evaluate BLEU
            if train_iter % bleu_niter == 0 and do_bleu:
                bleu = decode_with_params(
                    model, test_data_src, test_data_tgt,
                    int(args['--beam-size']),
                    int(args['--max-decoding-time-step']))
                writer.add_scalar('bleu/test', bleu, train_iter)

            # perform validation
            if train_iter % valid_niter == 0:
                print(
                    'epoch %d, iter %d, cum. loss %.2f, cum. ppl %.2f cum. examples %d'
                    % (epoch, train_iter, cum_loss / cum_examples,
                       np.exp(cum_loss / cum_tgt_words), cum_examples),
                    file=sys.stderr)

                cum_loss = cum_examples = cum_tgt_words = 0.
                valid_num += 1

                print('begin validation ...', file=sys.stderr)

                # compute dev. ppl and bleu
                # dev batch size can be a bit larger
                dev_ppl = evaluate_ppl(model,
                                       current_dev_data,
                                       batch_size=dev_batch_size)
                valid_metric = -dev_ppl
                writer.add_scalar('ppl/valid', dev_ppl, train_iter)

                print('validation: iter %d, dev. ppl %f' %
                      (train_iter, dev_ppl),
                      file=sys.stderr)

                is_better = len(hist_valid_scores
                                ) == 0 or valid_metric > max(hist_valid_scores)
                hist_valid_scores.append(valid_metric)

                if is_better:
                    patience = 0
                    print('save the current best model to [%s]' %
                          model_save_path,
                          file=sys.stderr)
                    model.save(model_save_path)

                    # also save the optimizers' state
                    torch.save(optimizer.state_dict(),
                               model_save_path + '.optim')
                elif patience < int(args['--patience']):
                    patience += 1
                    print('hit patience %d' % patience, file=sys.stderr)

                    if patience == int(args['--patience']):
                        num_trial += 1
                        print('hit #%d trial' % num_trial, file=sys.stderr)
                        if num_trial == int(args['--max-num-trial']):
                            print('early stop!', file=sys.stderr)
                            exit(0)

                        # decay lr, and restore from previously best checkpoint
                        lr = optimizer.param_groups[0]['lr'] * \
                            float(args['--lr-decay'])
                        print(
                            'load previously best model and decay learning rate to %f'
                            % lr,
                            file=sys.stderr)

                        # load model
                        params = torch.load(
                            model_save_path,
                            map_location=lambda storage, loc: storage)
                        model.load_state_dict(params['state_dict'])
                        model = model.to(device)

                        print('restore parameters of the optimizers',
                              file=sys.stderr)
                        optimizer.load_state_dict(
                            torch.load(model_save_path + '.optim'))

                        # set new lr
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr

                        # reset patience
                        patience = 0

                if epoch >= int(args['--max-epoch']):
                    print('reached maximum number of epochs!', file=sys.stderr)
                    exit(0)
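
The validation branch above implements the usual patience scheme: checkpoint on a new best dev score; after `--patience` consecutive stalls, decay the learning rate, reload the best checkpoint, and count a trial; stop once `--max-num-trial` trials are used up. A condensed, self-contained sketch of that bookkeeping (the class and its names are illustrative, not part of the source):

class PatienceScheduler:
    """Track dev scores; signal reload/decay after `patience` stalls
    and early stop after `max_trials` decays."""

    def __init__(self, patience: int, max_trials: int, lr_decay: float):
        self.patience, self.max_trials, self.lr_decay = patience, max_trials, lr_decay
        self.stalls = self.trials = 0
        self.best = float('-inf')

    def step(self, score: float) -> str:
        """Return 'save', 'wait', 'decay', or 'stop' for this validation round."""
        if score > self.best:
            self.best, self.stalls = score, 0
            return 'save'                      # new best: checkpoint model + optimizer
        self.stalls += 1
        if self.stalls < self.patience:
            return 'wait'
        self.stalls = 0
        self.trials += 1
        return 'stop' if self.trials >= self.max_trials else 'decay'

On 'decay' the caller reloads the best checkpoint and multiplies the learning rate by `lr_decay`, exactly as the loop above does.
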
Esempio n. 18
0
def decode(args):
    option, values = load_model(args.model)
    #option, values = load_average_model(args.model)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    svocabs, tvocabs = option["vocabulary"]
    svocab, isvocab = svocabs
    tvocab, itvocab = tvocabs

    unk_sym = option["unk"]
    eos_sym = option["eos"]

    source_word2vec, target_word2vec = option["word2vecs"]

    count = 0

    doption = {
        "maxlen": args.maxlen,
        "minlen": args.minlen,
        "beamsize": args.beamsize,
        "normalize": args.normalize
    }

    # create graph
    model = NMT(option["num_layers"], option["num_heads"],
                option["attention_dropout"], option["residual_dropout"],
                option["relu_dropout"],
                option["embedding"], option["hidden"], option["filter"],
                len(isvocab), len(itvocab), source_word2vec, target_word2vec)

    model.option = option

    input_file = open(args.corpus, 'r')
    output_file = open(args.translation, 'w')

    with tf.Session(config=config):
        tf.global_variables_initializer().run()
        set_variables(tf.trainable_variables(), values)

        line = input_file.readline()
        while line:
            data = [line]
            seq, _, seq_len = convert_data(data, svocab, unk_sym, eos_sym)
            t1 = time.time()
            tlist = beamsearch(model, seq, seq_len, **doption)
            t2 = time.time()

            if len(tlist) == 0:
                output_file.write("\n")  # keep output aligned with input lines
                score = -10000.0
            else:
                best, score = tlist[0]
                output_file.write(" ".join(best[:-1]))
                output_file.write("\n")

            count = count + 1
            sys.stderr.write(str(count) + " ")
            sys.stderr.write(str(score) + " " + str(t2 - t1) + "\n")
            line = input_file.readline()
    output_file.close()
    input_file.close()
Esempio n. 19
0
def train(args):
    option = default_option()

    # predefined model names
    pathname, basename = os.path.split(args.model)
    modelname = get_filename(basename)
    autoname = os.path.join(pathname, modelname + ".autosave.pkl")
    bestname = os.path.join(pathname, modelname + ".best.pkl")

    # load models
    if os.path.exists(args.model):
        opt, params = load_model(args.model)
        override(option, opt)
        init = False
    else:
        init = True
        params = None

    override(option, args_to_dict(args))
    print_option(option)

    # load references
    if option["references"]:
        references = load_references(option["references"])
    else:
        references = None

    # input corpus
    batch = option["batch"]
    sortk = option["sort"] or 1
    shuffle = option["seed"] if option["shuffle"] else None
    reader = TextReader(option["corpus"], shuffle)
    processor = [data_length, data_length]
    stream = TextIterator(reader, [batch, batch * sortk], processor,
                          option["limit"], option["sort"])

    if shuffle and option["indices"] is not None:
        reader.set_indices(option["indices"])

    if args.reset:
        option["count"] = [0, 0]
        option["epoch"] = 0
        option["cost"] = 0.0

    skip_stream(reader, option["count"][1])

    # beamsearch option
    search_opt = {
        "beamsize": option["beamsize"],
        "normalize": option["normalize"],
        "maxlen": option["maxlen"],
        "minlen": option["minlen"]
    }

    # misc
    svocabs, tvocabs = option["vocabulary"]
    svocab, isvocab = svocabs
    tvocab, itvocab = tvocabs
    unk = option["unk"]
    eos = option["eos"]

    source_word2vec, target_word2vec = option["word2vecs"]

    scale = option["scale"]

    # set seed
    np.random.seed(option["seed"])
    tf.set_random_seed(option["seed"])

    initializer = tf.random_uniform_initializer(-scale, scale)
    model = NMT(option["num_layers"],
                option["num_heads"],
                option["attention_dropout"],
                option["residual_dropout"],
                option["relu_dropout"],
                option["embedding"],
                option["hidden"],
                option["filter"],
                len(isvocab),
                len(itvocab),
                source_word2vec,
                target_word2vec,
                initializer=initializer)

    model.option = option

    # create optimizer
    optim = Optimizer(model,
                      algorithm=option["optimizer"],
                      norm=True,
                      constraint=("norm", option["norm"]))

    # create session
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    with tf.Session(config=config):
        tf.global_variables_initializer().run()

        print "parameters:", count_parameters(tf.trainable_variables())

        if not init:
            set_variables(tf.trainable_variables(), params)

        def lr_decay_fn(*args, **kwargs):
            global_step = kwargs["global_step"]
            step = kwargs["step"]
            epoch = kwargs["epoch"]
            option["alpha"] = option["alpha"] * option["decay"]
            msg = "G/E/S: %d/%d/%d  alpha: %f"
            print(msg % (global_step, epoch, step, option["alpha"]))

        def train_step_fn(data, **variables):
            alpha = option["alpha"]
            global_step = variables["global_step"]
            step = variables["step"]
            epoch = variables["epoch"]

            xdata, _, xlen = convert_data(data[0], svocab, unk, eos)
            ydata, _, ylen = convert_data(data[1], tvocab, unk, eos)

            t1 = time.time()
            cost, norm = optim.optimize(xdata, xlen, ydata, ylen)
            alpha = (1 / float(option["embedding"])**0.5) * min(
                1 / float(global_step)**0.5,
                global_step / float(option["warmup"])**1.5)
            optim.update(alpha=alpha)
            t2 = time.time()
            #cost = cost * len(ylen) / sum(ylen)
            msg = "G/E/S: %d/%d/%d cost: %f norm: %f time: %f"
            print(msg % (global_step, epoch, step, cost, norm, t2 - t1))

            return cost / math.log(2)

        def sample_fn(*args, **kwargs):
            data = args[0]
            batch = len(data[0])
            ind = np.random.randint(0, batch)
            sdata = data[0][ind]
            tdata = data[1][ind]
            xdata, _, xlen = convert_data(data[0], svocab, unk, eos)
            xdata = xdata[ind:ind + 1, :]
            xlen = xlen[ind:ind + 1]
            hls = beamsearch(model, xdata, xlen, **search_opt)
            best, score = hls[0]
            print("> " + sdata)
            print("> " + tdata)
            print("> " + " ".join(best[:-1]))

        def cost_summary(*args, **kwargs):
            cost = kwargs["local_cost"]
            global_cost = kwargs["global_cost"]
            step = kwargs["local_step"]
            global_step = kwargs["global_step"]

            ac, gac = cost / step, global_cost / global_step

            print("averaged cost: %f/%f" % (ac, gac))

        def stop_fn(*args, **kwargs):
            if option["maxepoch"] < kwargs["epoch"]:
                raise StopIteration

        def save_fn(*args, **kwargs):
            save_model(model, autoname, reader, option, **kwargs)

        def validate_fn(*args, **kwargs):
            if option["validation"] and references:
                validate_model(model, option["validation"], references,
                               search_opt, bestname, reader, option, **kwargs)

        # global/epoch
        lr_decay_hook = ops.train_loop.hook(option["stop"], 1, lr_decay_fn)
        # local
        save_hook = ops.train_loop.hook(0, option["freq"], save_fn)
        e_save_hook = ops.train_loop.hook(0, 2, save_fn)
        # local
        sample_hook = ops.train_loop.hook(0, option["sfreq"], sample_fn)
        # global/local/epoch
        validate_hook = ops.train_loop.hook(0, option["vfreq"], validate_fn)
        e_validate_hook = ops.train_loop.hook(0, 1, validate_fn)
        # epoch
        cost_summary_hook = ops.train_loop.hook(0, 1, cost_summary)
        # global/epoch
        stop_hook = ops.train_loop.hook(0, 1, stop_fn)

        global_level_hooks = []
        local_level_hooks = [save_hook, sample_hook, validate_hook]
        epoch_level_hooks = [
            lr_decay_hook, cost_summary_hook, e_save_hook, e_validate_hook,
            stop_hook
        ]

        ops.train_loop.train_loop(stream, train_step_fn, option,
                                  global_level_hooks, local_level_hooks,
                                  epoch_level_hooks)

    stream.close()
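
`train_step_fn` above overrides the configured learning rate with the Transformer ("Noam") schedule, alpha = d_model^-0.5 * min(step^-0.5, step * warmup^-1.5): linear warmup for `warmup` steps, then inverse-square-root decay. A standalone sketch of the same formula:

def noam_lr(step: int, d_model: int, warmup: int) -> float:
    """Transformer learning-rate schedule (Vaswani et al., 2017)."""
    step = max(step, 1)  # avoid division by zero at step 0
    return d_model ** -0.5 * min(step ** -0.5, step * warmup ** -1.5)

# the peak is reached at step == warmup, e.g. noam_lr(4000, 512, 4000) ~= 7e-4
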
Esempio n. 20
0
                        en_val,
                        de_val,
                        epoch=epoch,
                        file_id=file_id,
                        file_nums=num_splits)
            file_id += 1


#model=main()
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--mode', default='train', help="train or infer?")
    args = parser.parse_args()
    X_word2idx, X_idx2word, Y_word2idx, Y_idx2word = load_idx()
    en_val, de_val = load_val()
    model = NMT(X_word2idx=X_word2idx,
                X_idx2word=X_idx2word,
                Y_word2idx=Y_word2idx,
                Y_idx2word=Y_idx2word)
    if args.mode == "train":  #python train_skip.py --mode train
        print("start training the model...")
        train()
    elif args.mode == "test":  #python train_skip.py --mode test
        print("start inferring...")
        en_val = load_val_large()
        with open("translation_result", 'w') as f:
            for en in en_val:
                output = model.infer(en)
                f.write(output + "\n")
Esempio n. 21
0
class Trainer:
    """
    训练类,使用训练集训练模型

    Args:
        _hparams (NameSpace): 人为设定的超参数,默认值见config.py,也可以在命令行指定。
    """

    def __init__(self, _hparams):
        self.hparams = _hparams
        set_seed(_hparams.fixed_seed)
        self.train_loader = get_dataloader(_hparams.train_src_path, _hparams.train_dst_path,
                                           _hparams.batch_size, _hparams.num_workers)
        self.src_vocab, self.dst_vocab = load_vocab(_hparams.train_src_pkl, _hparams.train_dst_pkl)
        self.device = torch.device(_hparams.device)
        self.model = NMT(_hparams.embed_size, _hparams.hidden_size,
                         self.src_vocab, self.dst_vocab, self.device,
                         _hparams.dropout_rate).to(self.device)
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=_hparams.lr)

    def train(self):
        print('*' * 20, 'train', '*' * 20)
        hist_valid_scores = []
        patience = 0
        num_trial = 0

        for epoch in range(int(self.hparams.max_epochs)):
            self.model.train()

            epoch_loss_val = 0
            epoch_steps = len(self.train_loader)
            for step, data_pairs in tqdm(enumerate(self.train_loader)):
                sents = [(dp.src, dp.dst) for dp in data_pairs]
                src_sents, tgt_sents = zip(*sents)

                self.optimizer.zero_grad()

                batch_size = len(src_sents)
                example_losses = -self.model(src_sents, tgt_sents)
                batch_loss = example_losses.sum()
                train_loss = batch_loss / batch_size
                epoch_loss_val += train_loss.item()
                train_loss.backward()
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.hparams.clip_gradient)
                self.optimizer.step()

            epoch_loss_val /= epoch_steps
            print('epoch: {}, epoch_loss_val: {}'.format(epoch, epoch_loss_val))

            # perform validation
            if epoch % self.hparams.valid_niter == 0:
                print('*' * 20, 'validate', '*' * 20)
                dev_ppl = evaluate_ppl(self.model, self.hparams.val_src_path, self.hparams.val_dst_path,
                                       self.hparams.batch_val_size, self.hparams.num_workers)
                valid_metric = -dev_ppl

                is_better = len(hist_valid_scores) == 0 or valid_metric > max(hist_valid_scores)
                hist_valid_scores.append(valid_metric)

                if is_better:
                    patience = 0
                    print('save the current best model to {}'.format(self.hparams.model_save_path))
                    self.model.save(self.hparams.model_save_path)
                    torch.save(self.optimizer.state_dict(), self.hparams.optimizer_save_path)
                elif patience < self.hparams.patience:
                    patience += 1
                    print('hit patience %d' % patience)

                    if patience == self.hparams.patience:
                        num_trial += 1
                        print('hit #{} trial'.format(num_trial))
                        if num_trial == self.hparams.max_num_trial:
                            print('early stop!')
                            exit(0)

                        # Compatibility shim: Adam adapts its step sizes on its own,
                        # while other optimizers need the manual lr decay below.
                        if not isinstance(self.optimizer, torch.optim.Adam):
                            # decay lr, and restore from previously best checkpoint
                            lr = self.optimizer.param_groups[0]['lr'] * self.hparams.lr_decay
                            print('load previously best model and decay learning rate to %f' % lr)

                            params = torch.load(self.hparams.model_save_path, map_location=lambda storage, loc: storage)
                            self.model.load_state_dict(params['state_dict'])
                            self.model = self.model.to(self.device)

                            print('restore parameters of the optimizers')
                            self.optimizer.load_state_dict(torch.load(self.hparams.optimizer_save_path))

                            # set new lr
                            for param_group in self.optimizer.param_groups:
                                param_group['lr'] = lr

                        # reset patience
                        patience = 0
                print('*' * 20, 'end validate', '*' * 20)
        print('*' * 20, 'end train', '*' * 20)
Esempio n. 22
0
def init_model(vocab_sizes, use_cuda):
    nmt = NMT(vocab_sizes, use_cuda)

    model.load_state("intermediate_ds_new", nmt)  # TODO Remove this
    #model.load_state(os.path.join('data', 'model.param'), nmt, generator)  # TODO Uncomment this
    return nmt
Esempio n. 23
0
    else:
        fname = './utils/data100000.pkl'
    train_x, train_y, dev_x, dev_y, test_x, test_y, \
    train_dict, test_dict, w2v_train, w2v_test = Load_data(fname)
    num2word = reverse(test_dict)
    dim_in = len(train_dict)
    dim_out = len(test_dict)
    print('dataset load done.')
    # load dataset

    if use_w2v:
        w2v_train = pickle.load(open('./utils/w2vtrain.pkl', 'rb'))
        w2v_test = pickle.load(open('./utils/w2vtest.pkl', 'rb'))
        model = NMT(dim_in,
                    dim_out,
                    w2v_train=w2v_train,
                    w2v_test=w2v_test,
                    ues_attention=attention,
                    ratio=teach_force).to(device)
    else:
        model = NMT(dim_in,
                    dim_out,
                    ues_attention=attention,
                    ratio=teach_force).to(device)

    optimizer = Adam(model.parameters(), lr=LR, weight_decay=1e-4)
    weight = Calc_P(train_y, dim_out)
    Loss_fn = MyLoss(weight).to(device)
    timer = Timer(epoch_size)
    init_output_log(save_dir)

    print('model load done.')
Esempio n. 24
0
def main(options):

    use_cuda = (len(options.gpuid) >= 1)
    if options.gpuid:
        cuda.set_device(options.gpuid[0])

    src_train, src_dev, src_test, src_vocab = torch.load(
        open(options.data_file + "." + options.src_lang, 'rb'))
    trg_train, trg_dev, trg_test, trg_vocab = torch.load(
        open(options.data_file + "." + options.trg_lang, 'rb'))
    """src_train = get_lm_input(src_train)
  src_dev = get_lm_input(src_dev)
  src_test = get_lm_input(src_test)

  trg_train = get_lm_output(trg_train)
  trg_dev = get_lm_output(trg_dev)
  trg_test = get_lm_output(trg_test)"""

    batched_train_src, batched_train_src_mask, sort_index = tensor.advanced_batchize(
        src_train, options.batch_size, src_vocab.stoi["<blank>"])
    batched_train_trg, batched_train_trg_mask = tensor.advanced_batchize_no_sort(
        trg_train, options.batch_size, trg_vocab.stoi["<blank>"], sort_index)
    batched_dev_src, batched_dev_src_mask, sort_index = tensor.advanced_batchize(
        src_dev, options.batch_size, src_vocab.stoi["<blank>"])
    batched_dev_trg, batched_dev_trg_mask = tensor.advanced_batchize_no_sort(
        trg_dev, options.batch_size, trg_vocab.stoi["<blank>"], sort_index)

    (src_vocab_size, trg_vocab_size) = len(src_vocab), len(trg_vocab)

    nmt = NMT((src_vocab_size, trg_vocab_size),
              use_cuda)  # TODO: add more arguments as necessary
    #nmt = init_model((src_vocab_size, trg_vocab_size), use_cuda)
    if use_cuda:
        nmt.cuda()
    else:
        nmt.cpu()

    criterion = torch.nn.NLLLoss()
    optimizer = getattr(torch.optim, options.optimizer)(nmt.parameters(),
                                                        options.learning_rate)

    # main training loop
    last_dev_avg_loss = float("inf")
    for epoch_i in range(options.epochs):
        logging.info("At {0}-th epoch.".format(epoch_i))
        # srange generates a lazy sequence of shuffled range
        for i, batch_i in enumerate(rand.srange(len(batched_train_src))):
            #if random.random() > 0.5:
            if False:  #i > 1500:  # TODO REMOVE THIS !!!!!!!!!!!!!!!!!!!!
                #model.save("intermediate_ds_new", nmt);
                break
            if i % 200 == 0:
                model.save("intermediate_ds", nmt)

            train_src_batch = Variable(batched_train_src[batch_i]
                                       )  # of size (src_seq_len, batch_size)
            train_trg_batch = Variable(batched_train_trg[batch_i]
                                       )  # of size (src_seq_len, batch_size)
            train_src_mask = Variable(batched_train_src_mask[batch_i])
            train_trg_mask = Variable(batched_train_trg_mask[batch_i])
            if use_cuda:
                train_src_batch = train_src_batch.cuda()
                train_trg_batch = train_trg_batch.cuda()
                train_src_mask = train_src_mask.cuda()
                train_trg_mask = train_trg_mask.cuda()

            sys_out_batch, translated_sentence_wd_index = nmt(
                train_src_batch, train_trg_batch
            )  # (trg_seq_len, batch_size, trg_vocab_size) # TODO: add more arguments as necessary
            train_trg_mask = train_trg_mask.view(-1)
            train_trg_batch = train_trg_batch.view(-1)
            train_trg_batch = train_trg_batch.masked_select(train_trg_mask)
            train_trg_mask = train_trg_mask.unsqueeze(1).expand(
                len(train_trg_mask), trg_vocab_size)
            sys_out_batch = sys_out_batch.view(-1, trg_vocab_size)
            sys_out_batch = sys_out_batch.masked_select(train_trg_mask).view(
                -1, trg_vocab_size)
            loss = criterion(sys_out_batch, train_trg_batch)
            logging.debug("loss at batch {0}: {1}".format(i, loss.data[0]))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # validation -- this is a crude esitmation because there might be some paddings at the end
        dev_loss = 0.0
        for batch_i in range(len(batched_dev_src)):
            #if random.random() > 0.5:
            if False:  #i > 1500:  # TODO REMOVE THIS !!!!!!!!!!!!!!!!!!!!
                #model.save("intermediate_ds_new", nmt);
                break
            dev_src_batch = Variable(batched_dev_src[batch_i], volatile=True)
            dev_trg_batch = Variable(batched_dev_trg[batch_i], volatile=True)
            dev_src_mask = Variable(batched_dev_src_mask[batch_i],
                                    volatile=True)
            dev_trg_mask = Variable(batched_dev_trg_mask[batch_i],
                                    volatile=True)
            if use_cuda:
                dev_src_batch = dev_src_batch.cuda()
                dev_trg_batch = dev_trg_batch.cuda()
                dev_src_mask = dev_src_mask.cuda()
                dev_trg_mask = dev_trg_mask.cuda()

            sys_out_batch, translated_sentence_wd_index = nmt(
                dev_src_batch
            )  # (trg_seq_len, batch_size, trg_vocab_size) # TODO: add more arguments as necessary
            actual_trg_max_len = dev_trg_mask.data.shape[0]
            dev_trg_mask = dev_trg_mask.view(-1)
            dev_trg_batch = dev_trg_batch.view(-1)
            dev_trg_batch = dev_trg_batch.masked_select(dev_trg_mask)
            dev_trg_mask = dev_trg_mask.unsqueeze(1).expand(
                len(dev_trg_mask), trg_vocab_size)

            # TODO Remove this!!!!!!
            predicted_trg_max_len = sys_out_batch.data.shape[0]
            if actual_trg_max_len > predicted_trg_max_len:
                sys_out_batch = torch.cat(
                    (sys_out_batch,
                     torch.ones((actual_trg_max_len - predicted_trg_max_len,
                                 options.batch_size, trg_vocab_size))))
            else:
                sys_out_batch = sys_out_batch[0:actual_trg_max_len]
            # TODO Remove this ^^^ !!!!!!

            sys_out_batch = sys_out_batch.view(-1, trg_vocab_size)
            sys_out_batch = sys_out_batch.masked_select(dev_trg_mask).view(
                -1, trg_vocab_size)
            loss = criterion(sys_out_batch, dev_trg_batch)
            logging.debug("dev loss at batch {0}: {1}".format(
                batch_i, loss.data[0]))
            dev_loss += loss
        #if True: break
        dev_avg_loss = dev_loss / len(batched_dev_src)
        logging.info(
            "Average loss value per instance is {0} at the end of epoch {1}".
            format(dev_avg_loss.data[0], epoch_i))

        if (last_dev_avg_loss - dev_avg_loss).data[0] < options.estop:
            logging.info(
                "Early stopping triggered at epoch {0} with threshold {1} "
                "(previous dev loss: {2}, current: {3})".format(
                    epoch_i, options.estop, last_dev_avg_loss.data[0],
                    dev_avg_loss.data[0]))
            break
        torch.save(
            nmt,
            open(
                options.model_file +
                ".nll_{0:.2f}.epoch_{1}".format(dev_avg_loss.data[0], epoch_i),
                'wb'),
            pickle_module=dill)
        last_dev_avg_loss = dev_avg_loss

    import datetime
    current_dt_tm = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
    model.save("%s_%s" % (options.model_file, current_dt_tm), nmt)
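
The training and dev loops above mask out padding by expanding the target mask and calling `masked_select` on both the outputs and the labels before applying `NLLLoss`. In current PyTorch the same effect is usually obtained with `ignore_index`, which skips padded positions with no reshaping gymnastics; a minimal sketch (shapes are assumptions matching the comments above):

import torch
import torch.nn.functional as F

def masked_nll(log_probs: torch.Tensor, targets: torch.Tensor, pad_id: int) -> torch.Tensor:
    """NLL averaged over non-padding positions only.

    log_probs: (seq_len, batch, vocab) log-probabilities
    targets:   (seq_len, batch) token ids, padded with pad_id
    """
    return F.nll_loss(log_probs.reshape(-1, log_probs.size(-1)),
                      targets.reshape(-1),
                      ignore_index=pad_id)
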
Esempio n. 25
0
def train(mode, checkpoint_path):
    # Data
    data_train = IWSLT15EnViDataSet(en_path="../data/train-en-vi/train.en",
                                    vi_path="../data/train-en-vi/train.vi")
    data_loader = DataLoader(data_train,
                             batch_size=BATCH_SIZE,
                             shuffle=False,
                             drop_last=False)
    if mode == EN2VI:
        src_vocab_size, tgt_vocab_size = data_train.en_vocab_size, data_train.vi_vocab_size
    else:
        src_vocab_size, tgt_vocab_size = data_train.vi_vocab_size, data_train.en_vocab_size
    print("Loading data done!")

    # Model & Optimizer
    model = NMT(mode=mode,
                src_vocab_size=src_vocab_size,
                tgt_vocab_size=tgt_vocab_size)
    model.to(device)

    criterion = MaskedPaddingCrossEntropyLoss().to(device)
    optimizer = Adam(model.parameters())

    prev_epoch = 0
    if checkpoint_path.exists():  # Resume training
        model, optimizer, prev_epoch = load_checkpoint(model, optimizer,
                                                       checkpoint_path)
        print(f"Resume training from {prev_epoch} epochs!")
    else:
        model.apply(xavier_init_weights)
        print("Training from start!")

    model.train()
    for epoch in range(N_EPOCHS - prev_epoch):
        print(f"\nEpoch: {epoch+prev_epoch+1}")

        for b, (en_tokens, en_valid_len, vi_tokens,
                vi_valid_len) in enumerate(data_loader):
            en_tokens, vi_tokens = en_tokens.to(device), vi_tokens.to(device)
            en_valid_len, vi_valid_len = en_valid_len.to(
                device), vi_valid_len.to(device)

            en_padding_masks = mask_padding(en_tokens, en_valid_len, device)
            vi_padding_masks = mask_padding(vi_tokens, vi_valid_len, device)

            if mode == EN2VI:
                src, tgt = en_tokens, vi_tokens
                tgt_valid_len = vi_valid_len
                src_masks, tgt_masks = en_padding_masks, vi_padding_masks
            else:
                src, tgt = vi_tokens, en_tokens
                tgt_valid_len = en_valid_len
                src_masks, tgt_masks = vi_padding_masks, en_padding_masks

            optimizer.zero_grad()

            # Encoder's forward pass:
            encoder_state = model.encoder(src, src_masks)
            # Decoder's forward pass
            decoder_X = torch.tensor([[DEFAULT_SOS_INDEX] * tgt.shape[0]],
                                     device=device).reshape(-1, 1)
            decoder_state = encoder_state

            loss = torch.tensor(0, device=device, dtype=torch.float)
            for i in range(1, tgt.shape[1]):
                decoder_state, logit_pred = model.decoder(
                    decoder_X, decoder_state)
                loss += criterion(pred=logit_pred[:, 0, :],
                                  label=tgt[:, i],
                                  device=device).sum()
                # Teacher forcing
                decoder_X = tgt[:, i].reshape(-1, 1)

            loss.backward()
            clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

            if b % 50 == 0:
                seq_loss = loss / (MAX_LENGTH - 1)
                print(f"\tBatch {b}; Loss: {seq_loss:.2f}; "
                      f"Mean Token Loss: {seq_loss/tgt_valid_len.sum():.4f}")

            ## Free up GPU memory
            del src, tgt, en_valid_len, vi_valid_len, decoder_state, logit_pred, loss
            torch.cuda.empty_cache()

        save_checkpoint(mode, src_vocab_size, tgt_vocab_size, model, optimizer,
                        data_train.tokenizer_en, data_train.tokenizer_vi,
                        prev_epoch + epoch + 1, checkpoint_path)

        for en in ens:  # 'ens' is assumed defined at module level (sample English sentences)
            vi = translate_en2vi(en_sentence=en,
                                 length=MAX_LENGTH,
                                 model=model,
                                 tokenizer_en=data_train.tokenizer_en,
                                 tokenizer_vi=data_train.tokenizer_vi,
                                 device=device)
            print("en:", en, "=> vi:", vi)
Esempio n. 26
0
def main(options):

    original_model = torch.load(open(options.original_model_file, 'rb'))

    nmt = NMT(original_model)

    use_cuda = (len(options.gpuid) >= 1)
    if options.gpuid:
        cuda.set_device(options.gpuid[0])

    src_train, src_dev, src_test, src_vocab = torch.load(
        open(options.data_file + "." + options.src_lang, 'rb'))
    trg_train, trg_dev, trg_test, trg_vocab = torch.load(
        open(options.data_file + "." + options.trg_lang, 'rb'))

    batched_test_src, batched_test_src_mask, _ = utils.tensor.advanced_batchize(
        src_test, 24, src_vocab.stoi["<pad>"])
    batched_test_trg, batched_test_trg_mask, _ = utils.tensor.advanced_batchize(
        trg_test, 24, trg_vocab.stoi["<pad>"])

    # Needed by the evaluation loop below; NLLLoss mirrors the training script above.
    trg_vocab_size = len(trg_vocab)
    criterion = torch.nn.NLLLoss()

    total_loss = 0
    total_sent = 0

    # nmt(Variable(torch.from_numpy(np.array([[46, 68], [470, 72], [30, 4]]))),Variable(torch.from_numpy(np.array([[1],[1]]))))
    # sys.exit(0)
    # print(torch.min(src_test))
    # print(torch.max(trg_test))
    for i, batch_i in enumerate(utils.rand.srange(len(batched_test_src))):
        print(i)
        test_src_batch = Variable(
            batched_test_src[batch_i],
            volatile=True)  # of size (src_seq_len, batch_size)
        test_trg_batch = Variable(
            batched_test_trg[batch_i],
            volatile=True)  # of size (src_seq_len, batch_size)
        test_src_mask = Variable(batched_test_src_mask[batch_i], volatile=True)
        test_trg_mask = Variable(batched_test_trg_mask[batch_i], volatile=True)

        total_sent += test_src_batch.size()[1]  # batch dimension: sentences, not tokens

        if use_cuda:
            test_src_batch = test_src_batch.cuda()
            test_trg_batch = test_trg_batch.cuda()
            test_src_mask = test_src_mask.cuda()
            test_trg_mask = test_trg_mask.cuda()
        # print(torch.min(test_src_batch))
        # print(torch.max(test_src_batch))
        # print(test_src_batch)

        sys_out_batch = nmt(test_src_batch, test_trg_batch.size()[0])
        test_trg_mask = test_trg_mask.view(-1)
        test_trg_batch = test_trg_batch.view(-1)
        test_trg_batch = test_trg_batch.masked_select(test_trg_mask)
        test_trg_mask = test_trg_mask.unsqueeze(1).expand(
            len(test_trg_mask), trg_vocab_size - 1)

        sys_out_batch = sys_out_batch.view(-1, trg_vocab_size - 1)
        sys_out_batch = sys_out_batch.masked_select(test_trg_mask).view(
            -1, trg_vocab_size - 1)

        loss = criterion(sys_out_batch, test_trg_batch)
        logging.debug("loss at batch {0}: {1}".format(i, loss.data[0]))

        total_loss += loss
        # break
        # _, sys_out_batch = torch.max(sys_out_batch, dim=2)
        # sys_out_batch = sys_out_batch.view(-1)
        # sent = []
        # # print(sys_out_batch)
        # for w in sys_out_batch:
        #   # print(w)
        #   sent.append(trg_vocab.itos[w.data[0]])
        # # print(sent)
        # # print(sent.join())
        # print(' '.join(sent).encode('utf-8').strip())
    print(total_loss, total_sent)
    print(total_loss / total_sent)
    print(torch.exp(total_loss / total_sent))
    sys.exit(0)
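
One caveat on the final report: `total_sent` counts sentences, so `torch.exp(total_loss / total_sent)` exponentiates a per-sentence average rather than the per-token perplexity used by the other examples. Assuming `total_loss` is a summed (not mean-reduced) NLL tensor, normalizing by the number of non-padding target tokens (e.g. accumulating `total_tokens += test_trg_batch.numel()` after the mask is applied) gives the conventional figure:

import torch

def per_token_ppl(total_loss: torch.Tensor, total_tokens: int) -> torch.Tensor:
    """Per-token perplexity from a summed NLL and a non-padding token count."""
    return torch.exp(total_loss / max(total_tokens, 1))
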