Example #1
    def setUp(self):
        # Seed the Random Number Generators
        seed = 1234
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)
        np.random.seed(seed * 13 // 7)

        # Load training data & vocabulary
        train_data_src = submission.read_corpus(
            './sanity_check_en_es_data/train_sanity_check.es', 'src')
        train_data_tgt = submission.read_corpus(
            './sanity_check_en_es_data/train_sanity_check.en', 'tgt')
        train_data = list(zip(train_data_src, train_data_tgt))

        for src_sents, tgt_sents in submission.batch_iter(
                train_data, batch_size=BATCH_SIZE, shuffle=True):
            self.src_sents = src_sents
            self.tgt_sents = tgt_sents
            break
        self.vocab = Vocab.load(
            './sanity_check_en_es_data/vocab_sanity_check.json')

        # Create NMT Model
        self.model = submission.NMT(embed_size=EMBED_SIZE,
                                    hidden_size=HIDDEN_SIZE,
                                    dropout_rate=DROPOUT_RATE,
                                    vocab=self.vocab)
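
Note: these sanity-check snippets rely on module-level constants (BATCH_SIZE, LARGE_BATCH_SIZE, EMBED_SIZE, HIDDEN_SIZE, DROPOUT_RATE) that the test harness defines elsewhere. A minimal sketch with placeholder values, only so the snippets can run standalone; the real harness may use different numbers:

# Placeholder values (assumed); the actual test harness defines these.
BATCH_SIZE = 5
LARGE_BATCH_SIZE = 32
EMBED_SIZE = 10
HIDDEN_SIZE = 10
DROPOUT_RATE = 0.0
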
Example #2
def decode(args: Dict[str, str]):
    """ Performs decoding on a test set, and save the best-scoring decoding results.
    If the target gold-standard sentences are given, the function also computes
    corpus-level BLEU score.
    @param args (Dict): args from cmd line
    """

    print("load test source sentences from [{}]".format(args['TEST_SOURCE_FILE']), file=sys.stderr)
    test_data_src = read_corpus(args['TEST_SOURCE_FILE'], source='src')
    if args['TEST_TARGET_FILE']:
        print("load test target sentences from [{}]".format(args['TEST_TARGET_FILE']), file=sys.stderr)
        test_data_tgt = read_corpus(args['TEST_TARGET_FILE'], source='tgt')

    print("load model from {}".format(args['MODEL_PATH']), file=sys.stderr)
    model = NMT.load(args['MODEL_PATH'], no_char_decoder=args['--no-char-decoder'])

    if args['--cuda']:
        model = model.to(torch.device("cuda:0"))

    hypotheses = beam_search(model, test_data_src,
                             beam_size=int(args['--beam-size']),
                             max_decoding_time_step=int(args['--max-decoding-time-step']))

    if args['TEST_TARGET_FILE']:
        top_hypotheses = [hyps[0] for hyps in hypotheses]
        bleu_score = compute_corpus_level_bleu_score(test_data_tgt, top_hypotheses)
        print('Corpus BLEU: {}'.format(bleu_score * 100), file=sys.stderr)

    with open(args['OUTPUT_FILE'], 'w') as f:
        for src_sent, hyps in zip(test_data_src, hypotheses):
            top_hyp = hyps[0]
            hyp_sent = ' '.join(top_hyp.value)
            f.write(hyp_sent + '\n')
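
For context, decode() expects a docopt-style args dict. A hypothetical invocation; the file paths and option values here are illustrative, not taken from the original script:

# Hypothetical args; the keys mirror those read by decode() above.
args = {
    'TEST_SOURCE_FILE': './en_es_data/test.es',
    'TEST_TARGET_FILE': './en_es_data/test.en',
    'MODEL_PATH': 'model.bin',
    'OUTPUT_FILE': 'outputs/test_outputs.txt',
    '--cuda': False,
    '--beam-size': '5',
    '--max-decoding-time-step': '70',
    '--no-char-decoder': False,
}
decode(args)
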
Example #3
    def test_0(self):
        """1d-0-basic:  Sanity check for Encode.  Compares student output to that of model with dummy data."""
        # Seed the Random Number Generators
        seed = 1234
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)
        np.random.seed(seed * 13 // 7)

        # Load training data & vocabulary
        train_data_src = submission.read_corpus(
            './sanity_check_en_es_data/train_sanity_check.es', 'src')
        train_data_tgt = submission.read_corpus(
            './sanity_check_en_es_data/train_sanity_check.en', 'tgt')
        train_data = list(zip(train_data_src, train_data_tgt))

        for src_sents, tgt_sents in submission.batch_iter(
                train_data, batch_size=BATCH_SIZE, shuffle=True):
            break  # keep only the first batch
        vocab = Vocab.load('./sanity_check_en_es_data/vocab_sanity_check.json')

        # Create NMT Model
        model = submission.NMT(embed_size=EMBED_SIZE,
                               hidden_size=HIDDEN_SIZE,
                               dropout_rate=DROPOUT_RATE,
                               vocab=vocab)
        # Configure for Testing
        reinitialize_layers(model)
        source_lengths = [len(s) for s in src_sents]
        source_padded = model.vocab.src.to_input_tensor(src_sents,
                                                        device=model.device)

        # Load Outputs
        enc_hiddens_target = torch.load(
            './sanity_check_en_es_data/enc_hiddens.pkl')
        dec_init_state_target = torch.load(
            './sanity_check_en_es_data/dec_init_state.pkl')

        # Test
        with torch.no_grad():
            enc_hiddens_pred, dec_init_state_pred = model.encode(
                source_padded, source_lengths)
        self.assertTrue(
            np.allclose(enc_hiddens_target.numpy(), enc_hiddens_pred.numpy()),
            "enc_hiddens is incorrect: it should be:\n {} but is:\n{}".format(
                enc_hiddens_target, enc_hiddens_pred))
        print("enc_hiddens Sanity Checks Passed!")
        self.assertTrue(
            np.allclose(dec_init_state_target[0].numpy(),
                        dec_init_state_pred[0].numpy()),
            "dec_init_state[0] is incorrect: it should be:\n {} but is:\n{}".format(
                dec_init_state_target[0], dec_init_state_pred[0]))
        print("dec_init_state[0] Sanity Checks Passed!")
        self.assertTrue(
            np.allclose(dec_init_state_target[1].numpy(),
                        dec_init_state_pred[1].numpy()),
            "dec_init_state[1] is incorrect: it should be:\n {} but is:\n{}".format(
                dec_init_state_target[1], dec_init_state_pred[1]))
        print("dec_init_state[1] Sanity Checks Passed!")
Example #4
def vocab(args: Dict):
    print('read in source sentences: %s' % args['--train-src'])
    print('read in target sentences: %s' % args['--train-tgt'])

    src_sents = read_corpus(args['--train-src'], source='src')
    tgt_sents = read_corpus(args['--train-tgt'], source='tgt')

    vocab = Vocab.build(src_sents, tgt_sents, int(args['--size']), int(args['--freq-cutoff']))
    print('generated vocabulary, source %d words, target %d words' % (len(vocab.src), len(vocab.tgt)))

    vocab.save(args['VOCAB_FILE'])
    print('vocabulary saved to %s' % args['VOCAB_FILE'])
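
vocab() reads docopt-parsed command-line arguments. A hypothetical args dict showing the expected keys (paths and defaults are illustrative):

# Hypothetical args; in the real script these come from docopt(__doc__).
args = {
    '--train-src': './en_es_data/train.es',
    '--train-tgt': './en_es_data/train.en',
    '--size': '50000',
    '--freq-cutoff': '2',
    'VOCAB_FILE': 'vocab.json',
}
vocab(args)
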
Example #5
def setup():
    # Load training data & vocabulary
    train_data_src = submission.read_corpus(
        './sanity_check_en_es_data/train_sanity_check.es', 'src')
    train_data_tgt = submission.read_corpus(
        './sanity_check_en_es_data/train_sanity_check.en', 'tgt')
    train_data = list(zip(train_data_src, train_data_tgt))

    for src_sents, tgt_sents in submission.batch_iter(
            train_data, batch_size=LARGE_BATCH_SIZE, shuffle=True):
        break  # keep only the first batch
    vocab = Vocab.load('./sanity_check_en_es_data/vocab_sanity_check.json')
    return src_sents, tgt_sents, vocab
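
A quick usage sketch for setup(): it returns the first (shuffled) batch plus the vocabulary, so a test can consume it directly. LARGE_BATCH_SIZE is assumed to be defined alongside the other test constants:

src_sents, tgt_sents, vocab = setup()
assert len(src_sents) == len(tgt_sents) <= LARGE_BATCH_SIZE
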
Example #6
def bleu(args: Dict[str, str]):
    """ computes belu score
    @param args (Dict): args for file path details
    """

    test_data_out = submission.read_corpus(args['TEST_OUTPUT_FILE'],
                                           source='tgt')
    test_data_gold = submission.read_corpus(args['TEST_GOLD_FILE'],
                                            source='tgt')
    min_len = min(len(test_data_out), len(test_data_gold))

    bleu_score = corpus_bleu([[ref] for ref in test_data_gold[:min_len]],
                             test_data_out[:min_len])
    print('Corpus BLEU: {}'.format(bleu_score * 100), file=sys.stderr)

    return bleu_score * 100
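
corpus_bleu here is presumably nltk.translate.bleu_score.corpus_bleu, which takes a list of reference lists and a list of hypotheses, each tokenized. A minimal self-contained check of that calling convention:

from nltk.translate.bleu_score import corpus_bleu

references = [[['the', 'cat', 'sat', 'on', 'the', 'mat']]]  # one reference per sentence
hypotheses = [['the', 'cat', 'sat', 'on', 'the', 'mat']]
print(corpus_bleu(references, hypotheses))  # 1.0 for an exact match
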
Example #7
def decode(args: Dict[str, str]):
    """ Performs decoding on the autograder test set
    Make sure to run this code before submitting the code to the autograder
    @param args (Dict): args from cmd line
    """

    test_data_src = read_corpus(args['SOURCE_FILE'], source='src')
    model = NMT.load(args['MODEL_PATH'])

    if args['CUDA']:
        model = model.to(torch.device("cuda:0"))

    hypotheses = beam_search(model,
                             test_data_src,
                             beam_size=int(args['BEAM_SIZE']),
                             max_decoding_time_step=int(
                                 args['MAX_DECODING_TIME_STEP']))

    with open(args['OUTPUT_FILE'], 'w') as f:
        for src_sent, hyps in zip(test_data_src, hypotheses):
            top_hyp = hyps[0]
            hyp_sent = ' '.join(top_hyp.value)
            f.write(hyp_sent + '\n')
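
The .value attribute accessed on each hypothesis suggests beam_search() returns namedtuple-like records, sorted best-first per source sentence. A sketch of the assumed shape:

from collections import namedtuple

# Assumed structure; it matches the top_hyp.value access in decode() above.
Hypothesis = namedtuple('Hypothesis', ['value', 'score'])

hyp = Hypothesis(value=['hello', 'world'], score=-0.42)
print(' '.join(hyp.value))  # "hello world"
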
Example #8
def train(args: Dict):
    """ Train the NMT Model.
    @param args (Dict): args from cmd line
    """
    train_data_src = read_corpus(args['--train-src'], source='src')
    train_data_tgt = read_corpus(args['--train-tgt'], source='tgt')

    dev_data_src = read_corpus(args['--dev-src'], source='src')
    dev_data_tgt = read_corpus(args['--dev-tgt'], source='tgt')

    train_data = list(zip(train_data_src, train_data_tgt))
    dev_data = list(zip(dev_data_src, dev_data_tgt))

    train_batch_size = int(args['--batch-size'])

    clip_grad = float(args['--clip-grad'])
    valid_niter = int(args['--valid-niter'])
    log_every = int(args['--log-every'])
    model_save_path = args['--save-to']

    vocab = Vocab.load(args['--vocab'])

    model = NMT(embed_size=int(args['--embed-size']),
                hidden_size=int(args['--hidden-size']),
                dropout_rate=float(args['--dropout']),
                vocab=vocab, no_char_decoder=args['--no-char-decoder'])
    model.train()

    uniform_init = float(args['--uniform-init'])
    if np.abs(uniform_init) > 0.:
        print('uniformly initialize parameters [-%f, +%f]' % (uniform_init, uniform_init), file=sys.stderr)
        for p in model.parameters():
            p.data.uniform_(-uniform_init, uniform_init)

    vocab_mask = torch.ones(len(vocab.tgt))
    vocab_mask[vocab.tgt['<pad>']] = 0

    device = torch.device("cuda:0" if args['--cuda'] else "cpu")
    print('use device: %s' % device, file=sys.stderr)

    model = model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=float(args['--lr']))

    num_trial = 0
    train_iter = patience = cum_loss = report_loss = cum_tgt_words = report_tgt_words = 0
    cum_examples = report_examples = epoch = valid_num = 0
    hist_valid_scores = []
    train_time = begin_time = time.time()
    print('begin Maximum Likelihood training')

    while True:
        epoch += 1

        for src_sents, tgt_sents in batch_iter(train_data, batch_size=train_batch_size, shuffle=True):
            train_iter += 1

            optimizer.zero_grad()

            batch_size = len(src_sents)

            example_losses = -model(src_sents, tgt_sents)  # (batch_size,)
            batch_loss = example_losses.sum()
            loss = batch_loss / batch_size

            loss.backward()

            # clip gradient
            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad)

            optimizer.step()

            batch_losses_val = batch_loss.item()
            report_loss += batch_losses_val
            cum_loss += batch_losses_val

            tgt_words_num_to_predict = sum(len(s[1:]) for s in tgt_sents)  # omitting leading `<s>`
            report_tgt_words += tgt_words_num_to_predict
            cum_tgt_words += tgt_words_num_to_predict
            report_examples += batch_size
            cum_examples += batch_size

            if train_iter % log_every == 0:
                print('epoch %d, iter %d, avg. loss %.2f, avg. ppl %.2f '
                      'cum. examples %d, speed %.2f words/sec, time elapsed %.2f sec'
                      % (epoch, train_iter,
                         report_loss / report_examples,
                         math.exp(report_loss / report_tgt_words),
                         cum_examples,
                         report_tgt_words / (time.time() - train_time),
                         time.time() - begin_time),
                      file=sys.stderr)

                train_time = time.time()
                report_loss = report_tgt_words = report_examples = 0.

            # perform validation
            if train_iter % valid_niter == 0:
                print('epoch %d, iter %d, cum. loss %.2f, cum. ppl %.2f cum. examples %d'
                      % (epoch, train_iter,
                         cum_loss / cum_examples,
                         np.exp(cum_loss / cum_tgt_words),
                         cum_examples),
                      file=sys.stderr)

                cum_loss = cum_examples = cum_tgt_words = 0.
                valid_num += 1

                print('begin validation ...', file=sys.stderr)

                # compute dev. ppl and bleu
                dev_ppl = evaluate_ppl(model, dev_data, batch_size=128)  # dev batch size can be a bit larger
                valid_metric = -dev_ppl

                print('validation: iter %d, dev. ppl %f' % (train_iter, dev_ppl), file=sys.stderr)

                is_better = len(hist_valid_scores) == 0 or valid_metric > max(hist_valid_scores)
                hist_valid_scores.append(valid_metric)

                if is_better:
                    patience = 0
                    print('save the current best model to [%s]' % model_save_path, file=sys.stderr)
                    model.save(model_save_path)

                    # also save the optimizers' state
                    torch.save(optimizer.state_dict(), model_save_path + '.optim')
                elif patience < int(args['--patience']):
                    patience += 1
                    print('hit patience %d' % patience, file=sys.stderr)

                    if patience == int(args['--patience']):
                        num_trial += 1
                        print('hit #%d trial' % num_trial, file=sys.stderr)
                        if num_trial == int(args['--max-num-trial']):
                            print('early stop!', file=sys.stderr)
                            exit(0)

                        # decay lr, and restore from previously best checkpoint
                        lr = optimizer.param_groups[0]['lr'] * float(args['--lr-decay'])
                        print('load previously best model and decay learning rate to %f' % lr, file=sys.stderr)

                        # load model
                        params = torch.load(model_save_path, map_location=lambda storage, loc: storage)
                        model.load_state_dict(params['state_dict'])
                        model = model.to(device)

                        print('restore parameters of the optimizers', file=sys.stderr)
                        optimizer.load_state_dict(torch.load(model_save_path + '.optim'))

                        # set new lr
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr

                        # reset patience
                        patience = 0

            if epoch == int(args['--max-epoch']):
                print('reached maximum number of epochs!', file=sys.stderr)
                exit(0)
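
evaluate_ppl() is called during validation but not shown. A sketch consistent with how train() uses the model: sum the negative log-likelihood over the dev set and exponentiate the per-word average (assumed; the actual helper may differ in details):

def evaluate_ppl(model, dev_data, batch_size=32):
    was_training = model.training
    model.eval()
    cum_loss, cum_tgt_words = 0., 0.
    with torch.no_grad():
        for src_sents, tgt_sents in batch_iter(dev_data, batch_size=batch_size):
            loss = -model(src_sents, tgt_sents).sum()  # model returns log-likelihoods
            cum_loss += loss.item()
            cum_tgt_words += sum(len(s[1:]) for s in tgt_sents)  # omit leading <s>
    if was_training:
        model.train()
    return np.exp(cum_loss / cum_tgt_words)
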
Example #9
    @staticmethod
    def load(file_path: str):
        """ Load a Vocab from a JSON dump at file_path. """
        entry = json.load(open(file_path, 'r'))
        src_word2id = entry['src_word2id']
        tgt_word2id = entry['tgt_word2id']

        return Vocab(VocabEntry(src_word2id), VocabEntry(tgt_word2id))

    def __repr__(self):
        """ Representation of Vocab to be used
        when printing the object.
        """
        return 'Vocab(source %d words, target %d words)' % (
            len(self.src), len(self.tgt))


if __name__ == '__main__':
    args = docopt(__doc__)

    print('read in source sentences: %s' % args['--train-src'])
    print('read in target sentences: %s' % args['--train-tgt'])

    src_sents = read_corpus(args['--train-src'], source='src')
    tgt_sents = read_corpus(args['--train-tgt'], source='tgt')

    vocab = Vocab.build(src_sents, tgt_sents, int(args['--size']),
                        int(args['--freq-cutoff']))
    print('generated vocabulary, source %d words, target %d words' %
          (len(vocab.src), len(vocab.tgt)))

    vocab.save(args['VOCAB_FILE'])
    print('vocabulary saved to %s' % args['VOCAB_FILE'])
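
The load() snippet above implies the vocabulary JSON holds two word-to-id maps. A sketch of a file that Vocab.load() could read back (tokens and ids are illustrative):

import json

vocab_json = {
    'src_word2id': {'<pad>': 0, '<s>': 1, '</s>': 2, '<unk>': 3, 'hola': 4},
    'tgt_word2id': {'<pad>': 0, '<s>': 1, '</s>': 2, '<unk>': 3, 'hello': 4},
}
with open('vocab_sanity.json', 'w') as f:
    json.dump(vocab_json, f, indent=2)
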
Example #10
    def test_0(self):
        """1f-0-basic:  Sanity check for Step.  Compares student output to that of model with dummy data."""
        # Seed the Random Number Generators
        seed = 1234
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)
        np.random.seed(seed * 13 // 7)

        # Load training data & vocabulary
        train_data_src = submission.read_corpus(
            './sanity_check_en_es_data/train_sanity_check.es', 'src')
        train_data_tgt = submission.read_corpus(
            './sanity_check_en_es_data/train_sanity_check.en', 'tgt')
        train_data = list(zip(train_data_src, train_data_tgt))

        for src_sents, tgt_sents in submission.batch_iter(
                train_data, batch_size=BATCH_SIZE, shuffle=True):
            self.src_sents = src_sents
            self.tgt_sents = tgt_sents
            break
        self.vocab = Vocab.load(
            './sanity_check_en_es_data/vocab_sanity_check.json')

        # Create NMT Model
        self.model = submission.NMT(embed_size=EMBED_SIZE,
                                    hidden_size=HIDDEN_SIZE,
                                    dropout_rate=DROPOUT_RATE,
                                    vocab=self.vocab)

        reinitialize_layers(self.model)
        # Inputs
        Ybar_t = torch.load('./sanity_check_en_es_data/Ybar_t.pkl')
        dec_init_state = torch.load(
            './sanity_check_en_es_data/dec_init_state.pkl')
        enc_hiddens = torch.load('./sanity_check_en_es_data/enc_hiddens.pkl')
        enc_masks = torch.load('./sanity_check_en_es_data/enc_masks.pkl')
        enc_hiddens_proj = torch.load(
            './sanity_check_en_es_data/enc_hiddens_proj.pkl')

        # Output
        dec_state_target = torch.load(
            './sanity_check_en_es_data/dec_state.pkl')
        o_t_target = torch.load('./sanity_check_en_es_data/o_t.pkl')
        e_t_target = torch.load('./sanity_check_en_es_data/e_t.pkl')

        # Run Tests
        with torch.no_grad():
            dec_state_pred, o_t_pred, e_t_pred = self.model.step(
                Ybar_t, dec_init_state, enc_hiddens, enc_hiddens_proj,
                enc_masks)
        self.assertTrue(
            np.allclose(dec_state_target[0].numpy(),
                        dec_state_pred[0].numpy()),
            "decoder_state[0] should be:\n {} but is:\n{}".format(
                dec_state_target[0], dec_state_pred[0]))
        print("dec_state[0] Sanity Checks Passed!")
        self.assertTrue(
            np.allclose(dec_state_target[1].numpy(),
                        dec_state_pred[1].numpy()),
            "decoder_state[1] should be:\n {} but is:\n{}".format(
                dec_state_target[1], dec_state_pred[1]))
        print("dec_state[1] Sanity Checks Passed!")
        self.assertTrue(
            np.allclose(o_t_target.numpy(), o_t_pred.numpy()),
            "combined_output should be:\n {} but is:\n{}".format(
                o_t_target, o_t_pred))
        print("combined_output  Sanity Checks Passed!")
        self.assertTrue(
            np.allclose(e_t_target.numpy(), e_t_pred.numpy()),
            "e_t should be:\n {} but is:\n{}".format(e_t_target, e_t_pred))
        print("e_t Sanity Checks Passed!")