Example #1
0
def sample_ngram_adapt(args):
    """Write length-adaptive n-gram-perturbed samples of each target sentence.

    For every (src, tgt) pair, emits ``args.sample_size`` candidate targets:
    the reference itself, plus samples obtained by replacing one randomly
    chosen n-gram with fresh words from the target vocabulary.  The n-gram
    length n is drawn from [1, max_n] with the probability mass biased
    towards longer n for longer sentences.  Candidates are ranked by
    sentence-level BLEU against the reference and written to ``args.output``.
    """
    src_sents = read_corpus(args.src, 'src')
    tgt_sents = read_corpus(args.tgt, 'src')  # do not read in <s> and </s>

    vocab = torch.load(args.vocab)
    tgt_vocab = vocab.tgt

    max_len = max(len(tgt_sent) for tgt_sent in tgt_sents) + 1

    # context manager guarantees the output file is closed even on error
    with open(args.output, 'w') as f_out:
        for src_sent, tgt_sent in zip(src_sents, tgt_sents):
            src_sent = ' '.join(src_sent)

            tgt_len = len(tgt_sent)
            tgt_samples = []

            # the reference itself is always the first sample
            tgt_samples.append(tgt_sent)

            for sid in range(args.sample_size - 1):
                # n-gram length capped at 4 (and at tgt_len - 1); bias_n is
                # the favoured length, growing with relative sentence length
                max_n = min(tgt_len - 1, 4)
                bias_n = int(max_n * tgt_len / max_len) + 1
                assert 1 <= bias_n <= 4, 'bias_n={}, not in [1,4], max_n={}, tgt_len={}, max_len={}'.format(bias_n, max_n, tgt_len, max_len)

                # uniform base probability, with the remainder piled on bias_n
                p = [1.0 / (max_n + 5)] * max_n
                p[bias_n - 1] = 1 - p[0] * (max_n - 1)
                assert abs(sum(p) - 1) < 1e-10, 'sum(p) != 1'

                n = np.random.choice(np.arange(1, int(max_n + 1)), p=p)  # we do not replace the last token: it must be a period!
                assert n < tgt_len, 'n={}, tgt_len={}'.format(n, tgt_len)

                idx = np.random.randint(tgt_len - n)
                ngram = tgt_sent[idx: idx + n]
                new_ngram = get_new_ngram(ngram, n, tgt_vocab)

                sampled_tgt_sent = list(tgt_sent)
                sampled_tgt_sent[idx: idx + n] = new_ngram

                tgt_samples.append(sampled_tgt_sent)

            # compute bleu scores and rank the samples by bleu scores
            bleu_scores = [sentence_bleu([tgt_sent], tgt_sample)
                           for tgt_sample in tgt_samples]

            tgt_ranks = sorted(range(len(tgt_samples)), key=lambda i: bleu_scores[i], reverse=True)
            # convert list of tokens into a string
            tgt_samples = [' '.join(tgt_sample) for tgt_sample in tgt_samples]

            print('*' * 50, file=f_out)
            print('source: ' + src_sent, file=f_out)
            print('%d samples' % len(tgt_samples), file=f_out)
            for i in tgt_ranks:
                print('%s ||| %f' % (tgt_samples[i], bleu_scores[i]), file=f_out)
            print('*' * 50, file=f_out)
def sample(args):
    """Load a trained NMT checkpoint and print 5 samples per target sentence.

    Reads parallel data from ``args.src_file`` / ``args.tgt_file``, restores
    the model from ``args.model_bin`` and, batch by batch (batch size 1),
    prints each reference target followed by its sampled translations.
    """
    train_data_src = read_corpus(args.src_file, source='src')
    train_data_tgt = read_corpus(args.tgt_file, source='tgt')
    train_data = zip(train_data_src, train_data_tgt)

    # load model params
    print('load model from [%s]' % args.model_bin, file=sys.stderr)
    # map_location forces CPU tensors on load; the model is moved to GPU below
    params = torch.load(args.model_bin,
                        map_location=lambda storage, loc: storage)
    vocab = params['vocab']
    opt = params['args']
    state_dict = params['state_dict']

    # build model
    model = NMT(opt, vocab)
    model.load_state_dict(state_dict)
    model.eval()
    model = model.cuda()  # NOTE(review): assumes a CUDA device is available

    # sampling
    print('begin sampling')
    train_iter = cum_samples = 0
    for src_sents, tgt_sents in data_iter(train_data, batch_size=1):
        train_iter += 1
        samples = model.sample(src_sents, sample_size=5, to_word=True)
        cum_samples += sum(len(sample) for sample in samples)

        for i, tgt_sent in enumerate(tgt_sents):
            print('*' * 80)
            print('target:' + ' '.join(tgt_sent))
            tgt_samples = samples[i]
            print('samples:')
            for sid, sample in enumerate(tgt_samples, 1):
                # strip the leading <s> / trailing </s> tokens before printing
                print('[%d] %s' % (sid, ' '.join(sample[1:-1])))
            print('*' * 80)
Example #3
0
 def test_data_generator(self, batch_size=64):
     """Return a batch iterator over the IWSLT de-en test split."""
     self.dev_src = "data/nmt_iwslt/test.de-en.de"
     self.dev_tgt = "data/nmt_iwslt/test.de-en.en"
     src_side = read_corpus(self.dev_src, source='src')
     tgt_side = read_corpus(self.dev_tgt, source='tgt')
     pairs = zip(src_side, tgt_side)
     return data_iter(pairs, batch_size=batch_size)
Example #4
0
 def train_data_generator(self, batch_size=64):
     """Return a batch iterator over the IWSLT de-en training split."""
     self.train_src = "data/nmt_iwslt/train.de-en.de.wmixerprep"
     self.train_tgt = "data/nmt_iwslt/train.de-en.en.wmixerprep"
     src_side = read_corpus(self.train_src, source='src')
     tgt_side = read_corpus(self.train_tgt, source='tgt')
     pairs = zip(src_side, tgt_side)
     return data_iter(pairs, batch_size=batch_size)
Example #5
0
def test(args):
    """Decode the test set and report corpus BLEU, word- and sentence-level accuracy.

    Loads either a trained checkpoint (``args.load_model``) or a fresh model
    built from ``args.vocab``, decodes ``args.test_src`` and scores the top
    hypotheses against ``args.test_tgt``.  Optionally writes 1-best results
    (and, with ``args.save_nbest``, full n-best lists) to disk.
    """
    test_data_src = read_corpus(args.test_src, source='src')
    test_data_tgt = read_corpus(args.test_tgt, source='tgt')
    test_data = list(zip(test_data_src, test_data_tgt))

    if args.load_model:
        print('load model from [%s]' % args.load_model)
        # map_location forces CPU tensors; moved to GPU below if requested
        params = torch.load(args.load_model,
                            map_location=lambda storage, loc: storage)
        vocab = params['vocab']
        saved_args = params['args']
        state_dict = params['state_dict']

        model = NMT(saved_args, vocab)
        model.load_state_dict(state_dict)
    else:
        vocab = torch.load(args.vocab)
        model = NMT(args, vocab)

    model.eval()

    if args.cuda:
        # model = nn.DataParallel(model).cuda()
        model = model.cuda()

    hypotheses = decode(model, test_data)
    top_hypotheses = [hyps[0] for hyps in hypotheses]

    # build the reference list once (it was rebuilt for every metric before)
    references = [tgt for src, tgt in test_data]
    bleu_score = get_bleu(references, top_hypotheses)
    word_acc = get_acc(references, top_hypotheses, 'word_acc')
    sent_acc = get_acc(references, top_hypotheses, 'sent_acc')
    print('Corpus Level BLEU: %f, word level acc: %f, sentence level acc: %f' %
          (bleu_score, word_acc, sent_acc),
          file=sys.stderr)

    if args.save_to_file:
        print('save decoding results to %s' % args.save_to_file)
        with open(args.save_to_file, 'w') as f:
            for hyps in hypotheses:
                # hyp tokens include <s>/</s> markers; strip them on output
                f.write(' '.join(hyps[0][1:-1]) + '\n')

        if args.save_nbest:
            nbest_file = args.save_to_file + '.nbest'
            print('save nbest decoding results to %s' % nbest_file)
            with open(nbest_file, 'w') as f:
                for src_sent, tgt_sent, hyps in zip(test_data_src,
                                                    test_data_tgt, hypotheses):
                    print('Source: %s' % ' '.join(src_sent), file=f)
                    print('Target: %s' % ' '.join(tgt_sent), file=f)
                    print('Hypotheses:', file=f)
                    for i, hyp in enumerate(hyps, 1):
                        print('[%d] %s' % (i, ' '.join(hyp)), file=f)
                    print('*' * 30, file=f)
Example #6
0
def sample_ngram(args):
    """Write n-gram-perturbed samples of each target sentence, ranked by BLEU.

    For every (src, tgt) pair, emits ``args.sample_size`` candidates: the
    reference itself plus samples obtained by replacing one random n-gram
    (n drawn uniformly from [1, min(tgt_len, 5))) with new words from the
    target vocabulary.  Candidates are ranked by sentence-level BLEU against
    the reference and written to ``args.output``.
    """
    src_sents = read_corpus(args.src, 'src')
    tgt_sents = read_corpus(args.tgt, 'src')  # do not read in <s> and </s>

    vocab = torch.load(args.vocab)
    tgt_vocab = vocab.tgt

    # context manager guarantees the output file is closed even on error
    with open(args.output, 'w') as f_out:
        for src_sent, tgt_sent in zip(src_sents, tgt_sents):
            src_sent = ' '.join(src_sent)

            tgt_len = len(tgt_sent)
            tgt_samples = []

            # the reference itself is always the first sample
            tgt_samples.append(tgt_sent)

            # fixed: `xrange` is Python 2 only and raises NameError on
            # Python 3 (which the rest of this file targets)
            for sid in range(args.sample_size - 1):
                n = np.random.randint(1, min(
                    tgt_len,
                    5))  # we do not replace the last token: it must be a period!

                idx = np.random.randint(tgt_len - n)
                ngram = tgt_sent[idx:idx + n]
                new_ngram = get_new_ngram(ngram, n, tgt_vocab)

                sampled_tgt_sent = list(tgt_sent)
                sampled_tgt_sent[idx:idx + n] = new_ngram

                tgt_samples.append(sampled_tgt_sent)

            # compute bleu scores and rank the samples by bleu scores
            bleu_scores = [sentence_bleu([tgt_sent], tgt_sample)
                           for tgt_sample in tgt_samples]

            tgt_ranks = sorted(range(len(tgt_samples)),
                               key=lambda i: bleu_scores[i],
                               reverse=True)
            # convert list of tokens into a string
            tgt_samples = [' '.join(tgt_sample) for tgt_sample in tgt_samples]

            print('*' * 50, file=f_out)
            print('source: ' + src_sent, file=f_out)
            print('%d samples' % len(tgt_samples), file=f_out)
            for i in tgt_ranks:
                print('%s ||| %f' % (tgt_samples[i], bleu_scores[i]), file=f_out)
            print('*' * 50, file=f_out)
Example #7
0
def sample(args):
    """Sample translations from a (possibly pre-trained) NMT model and print them.

    Reads parallel data from ``args.train_src`` / ``args.train_tgt``, builds
    the model (from checkpoint ``args.load_model`` when given, otherwise from
    ``args.vocab``), then prints ``args.sample_size`` samples per target
    sentence, reporting sampling speed every 10 batches.
    """
    train_data_src = read_corpus(args.train_src, source='src')
    train_data_tgt = read_corpus(args.train_tgt, source='tgt')
    train_data = zip(train_data_src, train_data_tgt)

    if args.load_model:
        print('load model from [%s]' % args.load_model)
        # map_location forces CPU tensors; moved to GPU below if requested
        params = torch.load(args.load_model,
                            map_location=lambda storage, loc: storage)
        vocab = params['vocab']
        opt = params['args']
        state_dict = params['state_dict']

        model = NMT(opt, vocab)
        model.load_state_dict(state_dict)
    else:
        vocab = torch.load(args.vocab)
        model = NMT(args, vocab)

    model.eval()

    if args.cuda:
        # model = nn.DataParallel(model).cuda()
        model = model.cuda()

    print('begin sampling')

    # report throughput every `check_every` batches
    check_every = 10
    train_iter = cum_samples = 0
    train_time = time.time()
    for src_sents, tgt_sents in data_iter(train_data,
                                          batch_size=args.batch_size):
        train_iter += 1
        samples = model.sample(src_sents,
                               sample_size=args.sample_size,
                               to_word=True)
        cum_samples += sum(len(sample) for sample in samples)

        if train_iter % check_every == 0:
            elapsed = time.time() - train_time
            print('sampling speed: %d/s' % (cum_samples / elapsed))
            # reset the window so speed reflects only recent batches
            cum_samples = 0
            train_time = time.time()

        for i, tgt_sent in enumerate(tgt_sents):
            print('*' * 80)
            print('target:' + ' '.join(tgt_sent))
            tgt_samples = samples[i]
            print('samples:')
            for sid, sample in enumerate(tgt_samples, 1):
                # strip the leading <s> / trailing </s> tokens before printing
                print('[%d] %s' % (sid, ' '.join(sample[1:-1])))
            print('*' * 80)
Example #8
0
def main():
    """Entry point: train or evaluate a document classifier per ``args.task``.

    NOTE(review): relies on module-level globals not visible in this chunk
    (``args``, ``fprefix``, ``util``, ``model``, ``logging``) — confirm they
    are defined at file scope.  If ``args.task`` is neither "train" nor
    "test", ``vocab_size`` (and the corpora) are never bound and the model
    build below would raise NameError.
    """
    ###############################################################################
    # Load data
    ###############################################################################
    d = util.Dictionary()
    if args.task == "train":
        logging.info("Reading train...")
        trncorpus = util.read_corpus(args.ftrn, d, True)
        d.freeze()  # no new word types allowed
        vocab_size = d.size()
        # save dict
        d.save_dict(fprefix + ".dict")
        logging.info("Reading dev...")
        devcorpus = util.read_corpus(args.fdev, d, False)

    elif args.task == "test":
        logging.info("Reading test...")
        d.load_dict(args.fdct)
        d.freeze()
        vocab_size = d.size()
        # load test corpus
        tstcorpus = util.read_corpus(args.ftst, d, False)

    ###############################################################################
    # Build the model
    ###############################################################################
    if args.task == "train":
        model_fname = fprefix + ".model"
        pretrained_model = None
        if args.fmod:
            # load pre-trained model
            pretrained_model = model.load_model(args.fmod, vocab_size,
                                                args.nclass, args.inputdim,
                                                args.hiddendim, args.nlayer,
                                                args.droprate)
            logging.info("Successfully loaded pretrained model.")

        trained = model.train(trncorpus, devcorpus, vocab_size, args.nclass,
                              args.inputdim, args.hiddendim, args.nlayer,
                              args.trainer, args.lr, args.droprate, args.niter,
                              args.logfreq, args.verbose, model_fname,
                              pretrained_model)
        dev_accuracy = model.evaluate(trained, devcorpus.docs)
        logging.info("Final Accuracy on dev: %s", dev_accuracy)
        model.save_model(trained, model_fname)

    else:
        # any task other than "train" falls through to test evaluation
        trained_model = model.load_model(args.fmod, vocab_size, args.nclass,
                                         args.inputdim, args.hiddendim,
                                         args.nlayer, args.droprate)
        tst_accuracy = model.evaluate(trained_model, tstcorpus.docs)
        logging.info("Final Accuracy on test: %s", tst_accuracy)
Example #9
0
def sample_from_hamming_distance_payoff_distribution(args):
    """Sample perturbed targets by edit distance drawn from a payoff distribution.

    For each target sentence, draws ``args.sample_size`` edit distances e
    from the precomputed Hamming-distance payoff distribution, then replaces
    e randomly chosen word positions with random vocabulary words.  BLEU
    against the reference is recorded for each sample.

    NOTE(review): ``f_out`` is opened here but never written or closed in
    the visible span — this snippet appears truncated; confirm the writing/
    cleanup code exists in the full source.
    """
    src_sents = read_corpus(args.src, 'src')
    tgt_sents = read_corpus(args.tgt, 'src')  # do not read in <s> and </s>
    f_out = open(args.output, 'w')

    vocab = torch.load(args.vocab)
    tgt_vocab = vocab.tgt

    # payoff_prob[l] is the distribution over edit distances for length l
    payoff_prob, Z_qs = generate_hamming_distance_payoff_distribution(
        max(len(sent) for sent in tgt_sents),
        vocab_size=len(vocab.tgt),
        tau=args.temp)

    for src_sent, tgt_sent in zip(src_sents, tgt_sents):
        tgt_samples = []  # make sure the ground truth y* is in the samples
        # assumes each sentence carries <s>, </s> and a final period —
        # TODO confirm against read_corpus
        tgt_sent_len = len(
            tgt_sent) - 3  # remove <s> and </s> and ending period .
        tgt_ref_tokens = tgt_sent[1:-1]
        bleu_scores = []

        # sample an edit distances
        e_samples = np.random.choice(range(tgt_sent_len + 1),
                                     p=payoff_prob[tgt_sent_len],
                                     size=args.sample_size,
                                     replace=True)

        for i, e in enumerate(e_samples):
            if e > 0:
                # sample a new tgt_sent $y$
                # positions 1..tgt_sent_len skip <s> and protect </s>/period
                old_word_pos = np.random.choice(range(1, tgt_sent_len + 1),
                                                size=e,
                                                replace=False)
                # word ids >= 3 skip special tokens — presumably pad/<s>/</s>;
                # verify against the vocab class
                new_words = [
                    vocab.tgt.id2word[wid]
                    for wid in np.random.randint(3, len(vocab.tgt), size=e)
                ]
                new_tgt_sent = list(tgt_sent)
                for pos, word in zip(old_word_pos, new_words):
                    new_tgt_sent[pos] = word

                bleu_score = sentence_bleu([tgt_ref_tokens],
                                           new_tgt_sent[1:-1])
                bleu_scores.append(bleu_score)
            else:
                # e == 0: keep the reference itself, perfect BLEU
                new_tgt_sent = list(tgt_sent)
                bleu_scores.append(1.)

            # print('y: %s' % ' '.join(new_tgt_sent))
            tgt_samples.append(new_tgt_sent)
Example #10
0
def load_data(path, split, suffix, skip_gap, feature_path, is_test=False):
    """Load a QE dataset split as (src, hyp, align, tags, baseline, hyp_orig) tuples.

    ``suffix`` is a space-separated list of file suffixes in the order:
    src mt src-mt.alignments tags pe ref src_tags features.  When
    ``is_test`` is true, dummy all-"OK" (1) tags are generated instead of
    reading a tags file.  Returns None when ``path`` is None.
    """
    # fixed: guard before logging, so we no longer print "load data from None"
    if path is None:
        return None
    print('load data from {}'.format(path))
    slist = suffix.split(' ')   # src mt src-mt.alignments tags pe ref src_tags features
    src_sents = read_corpus(path + '/%s.%s' % (split, slist[0]))
    hyp_sents_orig = read_corpus(path + '/%s.%s' % (split, slist[1]), lowercase=False)
    hyp_sents = [[w.lower() for w in hyp] for hyp in hyp_sents_orig]
    align_sents = read_alignment_matrix(path + '/%s.%s' % (split, slist[2]), src_sents, hyp_sents)

    if is_test:
        # no gold tags at test time: pretend every token is correct
        tag_sents = [[1] * len(hyp) for hyp in hyp_sents]
    else:
        tag_sents = read_tags(path + '/%s.%s' % (split, slist[3]), skip_gap)
    baseline_sents = read_baseline_features(feature_path + '/%s.%s' %(split, slist[7]))
    #baseline_sents = [[0 for h in hyp] for hyp in hyp_sents]
    return list(zip(src_sents, hyp_sents, align_sents, tag_sents, baseline_sents, hyp_sents_orig))
Example #11
0
def train_raml(args):
    """Train the NMT model with Reward-Augmented Maximum Likelihood (RAML).

    Targets are replaced by samples drawn either from a pre-sampled file
    (``raml_sample_mode == 'pre_sample'``) or on the fly from a Hamming-
    distance payoff distribution (optionally with importance-sampling
    weights from BLEU).  Training loops forever, logging every
    ``args.log_every`` iterations and validating every ``args.valid_niter``,
    with early stopping on ``args.patience``.

    Fixes vs. the original:
      * ``dev_hyps`` was unbound (NameError) during validation whenever
        ``args.valid_metric`` was not one of bleu/word_acc/sent_acc; it is
        now initialised to None.
      * the ``dev_data`` logging condition was inverted (the key is never
        pre-set, so dev data was never stored); it now stores the dev set
        once.
    """
    tau = args.temp

    train_data_src = read_corpus(args.train_src, source='src')
    train_data_tgt = read_corpus(args.train_tgt, source='tgt')
    train_data = list(zip(train_data_src, train_data_tgt))

    dev_data_src = read_corpus(args.dev_src, source='src')
    dev_data_tgt = read_corpus(args.dev_tgt, source='tgt')
    dev_data = list(zip(dev_data_src, dev_data_tgt))

    dev_data = dev_data[:args.dev_limit]

    vocab, model, optimizer, nll_loss, cross_entropy_loss = init_training(args)

    if args.raml_sample_mode == 'pre_sample':
        # dict of (src, [tgt: (sent, prob)])
        print('read in raml training data...', file=sys.stderr, end='')
        begin_time = time.time()
        raml_samples = read_raml_train_data(args.raml_sample_file, temp=tau)
        print('done[%d s].' % (time.time() - begin_time))
    elif args.raml_sample_mode.startswith('hamming_distance'):
        print('sample from hamming distance payoff distribution')
        payoff_prob, Z_qs = generate_hamming_distance_payoff_distribution(
            max(len(sent) for sent in train_data_tgt),
            vocab_size=len(vocab.tgt) - 3,
            tau=tau)

    # bookkeeping counters: report_* reset every log window, cum_* every validation
    train_iter = patience = cum_loss = report_loss = cum_tgt_words = report_tgt_words = 0
    report_weighted_loss = cum_weighted_loss = 0
    cum_examples = cum_batches = report_examples = epoch = valid_num = best_model_iter = 0
    hist_valid_scores = []
    train_time = begin_time = time.time()
    _info = f"""
        begin RAML training
        ・学習:{len(train_data)}ペア, {args.train_log_file}
        ・テスト:{len(dev_data)}ペア, {args.valid_niter}iter毎 {args.validation_log_file}
        ・バッチサイズ:{args.batch_size}
        ・1epoch = {len(train_data)}ペア = {int(len(train_data)/args.batch_size)}iter
        """
    print(_info)

    log_data = {'args': args}

    if args.notify_slack:
        slack.post(f"""
        {_info}
        {args}
        """)

    # smoothing function for BLEU
    sm_func = None
    if args.smooth_bleu:
        sm_func = SmoothingFunction().method3

    with open(args.train_log_file,
              "w") as train_output, open(args.validation_log_file,
                                         "w") as validation_output:

        while True:
            epoch += 1
            for src_sents, tgt_sents in data_iter(train_data,
                                                  batch_size=args.batch_size):
                train_iter += 1

                raml_src_sents = []
                raml_tgt_sents = []
                raml_tgt_weights = []

                if args.raml_sample_mode == 'pre_sample':
                    for src_sent in src_sents:
                        sent = ' '.join(src_sent)
                        tgt_samples_all = raml_samples[sent]
                        # print(f'src_sent: "{sent}", target_samples_all: {len(list(tgt_samples_all))}')
                        if args.sample_size >= len(list(tgt_samples_all)):
                            tgt_samples = tgt_samples_all
                        else:
                            # sample without replacement, always keeping entry 0
                            tgt_samples_id = np.random.choice(
                                range(1, len(list(tgt_samples_all))),
                                size=args.sample_size - 1,
                                replace=False)
                            tgt_samples = [tgt_samples_all[0]] + [
                                tgt_samples_all[i] for i in tgt_samples_id
                            ]  # make sure the ground truth y* is in the samples

                        raml_src_sents.extend([src_sent] *
                                              len(list(tgt_samples)))
                        raml_tgt_sents.extend(
                            [['<s>'] + sent.split(' ') + ['</s>']
                             for sent, weight in tgt_samples])
                        raml_tgt_weights.extend(
                            [weight for sent, weight in tgt_samples])
                elif args.raml_sample_mode in [
                        'hamming_distance', 'hamming_distance_impt_sample'
                ]:
                    for src_sent, tgt_sent in zip(src_sents, tgt_sents):
                        tgt_samples = [
                        ]  # make sure the ground truth y* is in the samples
                        tgt_sent_len = len(
                            tgt_sent
                        ) - 3  # remove <s> and </s> and ending period .
                        tgt_ref_tokens = tgt_sent[1:-1]
                        bleu_scores = []
                        # print('y*: %s' % ' '.join(tgt_sent))
                        # sample an edit distances
                        e_samples = np.random.choice(
                            range(tgt_sent_len + 1),
                            p=payoff_prob[tgt_sent_len],
                            size=args.sample_size,
                            replace=True)

                        # make sure the ground truth y* is in the samples
                        if args.raml_bias_groundtruth and (not 0 in e_samples):
                            e_samples[0] = 0

                        for i, e in enumerate(e_samples):
                            if e > 0:
                                # sample a new tgt_sent $y$
                                old_word_pos = np.random.choice(range(
                                    1, tgt_sent_len + 1),
                                                                size=e,
                                                                replace=False)
                                new_words = [
                                    vocab.tgt.id2word[wid]
                                    for wid in np.random.randint(
                                        3, len(vocab.tgt), size=e)
                                ]
                                new_tgt_sent = list(tgt_sent)
                                for pos, word in zip(old_word_pos, new_words):
                                    new_tgt_sent[pos] = word
                            else:
                                new_tgt_sent = list(tgt_sent)

                            # if enable importance sampling, compute bleu score
                            if args.raml_sample_mode == 'hamming_distance_impt_sample':
                                if e > 0:
                                    # remove <s> and </s>
                                    bleu_score = sentence_bleu(
                                        [tgt_ref_tokens],
                                        new_tgt_sent[1:-1],
                                        smoothing_function=sm_func)
                                    bleu_scores.append(bleu_score)
                                else:
                                    bleu_scores.append(1.)

                            # print('y: %s' % ' '.join(new_tgt_sent))
                            tgt_samples.append(new_tgt_sent)

                        # if enable importance sampling, compute importance weight
                        if args.raml_sample_mode == 'hamming_distance_impt_sample':
                            tgt_sample_weights = [
                                math.exp(bleu_score / tau) / math.exp(-e / tau)
                                for e, bleu_score in zip(
                                    e_samples, bleu_scores)
                            ]
                            normalizer = sum(tgt_sample_weights)
                            tgt_sample_weights = [
                                w / normalizer for w in tgt_sample_weights
                            ]
                        else:
                            tgt_sample_weights = [1.] * args.sample_size

                        raml_src_sents.extend([src_sent] * len(tgt_samples))
                        raml_tgt_sents.extend(tgt_samples)
                        raml_tgt_weights.extend(tgt_sample_weights)

                        if args.debug:
                            print('*' * 30)
                            print('Target: %s' % ' '.join(tgt_sent))
                            # NOTE(review): bleu_scores is empty outside
                            # impt_sample mode, so this zip prints nothing then
                            for tgt_sample, e, bleu_score, weight in zip(
                                    tgt_samples, e_samples, bleu_scores,
                                    tgt_sample_weights):
                                print(
                                    'Sample: %s ||| e: %d ||| bleu: %f ||| weight: %f'
                                    % (' '.join(tgt_sample), e, bleu_score,
                                       weight))
                            print()
                            break

                src_sents_var = to_input_variable(raml_src_sents,
                                                  vocab.src,
                                                  cuda=args.cuda)
                tgt_sents_var = to_input_variable(raml_tgt_sents,
                                                  vocab.tgt,
                                                  cuda=args.cuda)
                weights_var = Variable(torch.FloatTensor(raml_tgt_weights),
                                       requires_grad=False)
                if args.cuda:
                    weights_var = weights_var.cuda()

                batch_size = len(
                    raml_src_sents
                )  # batch_size = args.batch_size * args.sample_size
                src_sents_len = [len(s) for s in raml_src_sents]
                pred_tgt_word_num = sum(len(
                    s[1:]) for s in raml_tgt_sents)  # omitting leading `<s>`
                optimizer.zero_grad()

                # (tgt_sent_len, batch_size, tgt_vocab_size)
                scores = model(src_sents_var, src_sents_len,
                               tgt_sents_var[:-1])
                # (tgt_sent_len * batch_size, tgt_vocab_size)
                log_scores = F.log_softmax(scores.view(-1, scores.size(2)))
                # remove leading <s> in tgt sent, which is not used as the target
                flattened_tgt_sents = tgt_sents_var[1:].view(-1)

                # batch_size * tgt_sent_len
                tgt_log_scores = torch.gather(
                    log_scores, 1, flattened_tgt_sents.unsqueeze(1)).squeeze(1)
                # mask out padding positions (id 0) from the loss
                unweighted_loss = -tgt_log_scores * (
                    1. - torch.eq(flattened_tgt_sents, 0).float())
                weighted_loss = unweighted_loss * weights_var.repeat(
                    scores.size(0))
                weighted_loss = weighted_loss.sum()
                weighted_loss_val = weighted_loss.item()
                nll_loss_val = unweighted_loss.sum().item()
                # weighted_log_scores = log_scores * weights.view(-1, scores.size(2))
                # weighted_loss = nll_loss(weighted_log_scores, flattened_tgt_sents)

                loss = weighted_loss / batch_size
                # nll_loss_val = nll_loss(log_scores, flattened_tgt_sents).item()

                loss.backward()
                # clip gradient
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), args.clip_grad)
                optimizer.step()

                report_weighted_loss += weighted_loss_val
                cum_weighted_loss += weighted_loss_val
                report_loss += nll_loss_val
                cum_loss += nll_loss_val
                report_tgt_words += pred_tgt_word_num
                cum_tgt_words += pred_tgt_word_num
                report_examples += batch_size
                cum_examples += batch_size
                cum_batches += batch_size

                if train_iter % args.log_every == 0 or train_iter % args.notify_slack_every == 0:
                    _log = 'epoch %d, iter %d, avg. loss %.2f, avg. ppl %.2f cum. examples %d, speed %.2f words/sec, time elapsed %.2f sec' % (
                        epoch, train_iter,
                        report_weighted_loss / report_examples,
                        np.exp(report_loss / report_tgt_words), cum_examples,
                        report_tgt_words /
                        (time.time() - train_time), time.time() - begin_time)
                    print(_log)
                    print(_log, file=train_output)

                    _list_dict_update(
                        log_data, {
                            'epoch': epoch,
                            'train_iter': train_iter,
                            'loss': report_weighted_loss / report_examples,
                            'ppl': np.exp(report_loss / report_tgt_words),
                            'examples': cum_examples,
                            'speed': report_tgt_words /
                            (time.time() - train_time),
                            'elapsed': time.time() - begin_time
                        }, 'train')

                    train_time = time.time()
                    report_loss = report_weighted_loss = report_tgt_words = report_examples = 0.
                    if train_iter % args.notify_slack_every == 0 and args.notify_slack:
                        print('post slack')
                        slack.post(_log)

                # perform validation
                if train_iter % args.valid_niter == 0:
                    print('epoch %d, iter %d, cum. loss %.2f, '
                          'cum. ppl %.2f cum. examples %d' %
                          (epoch, train_iter, cum_weighted_loss / cum_batches,
                           np.exp(cum_loss / cum_tgt_words), cum_examples),
                          file=sys.stderr)

                    cum_loss = cum_weighted_loss = cum_batches = cum_tgt_words = 0.
                    valid_num += 1

                    print('begin validation ...')
                    model.eval()

                    # compute dev. ppl and bleu

                    dev_loss = evaluate_loss(model, dev_data,
                                             cross_entropy_loss)
                    dev_ppl = np.exp(dev_loss)

                    # fixed: initialise so the log update below never sees an
                    # unbound name when only perplexity is used for validation
                    dev_hyps = None
                    if args.valid_metric in ['bleu', 'word_acc', 'sent_acc']:
                        dev_hyps = decode(model,
                                          dev_data,
                                          f=validation_output,
                                          verbose=False)
                        dev_hyps = [hyps[0] for hyps in dev_hyps]
                        if args.valid_metric == 'bleu':
                            valid_metric = get_bleu(
                                [tgt for src, tgt in dev_data], dev_hyps)
                        else:
                            valid_metric = get_acc(
                                [tgt for src, tgt in dev_data],
                                dev_hyps,
                                acc_type=args.valid_metric)
                        _log = 'validation: iter %d, dev. ppl %f, dev. %s %f' % (
                            train_iter, dev_ppl, args.valid_metric,
                            valid_metric)
                        print(_log)
                        print(_log, file=validation_output)
                        if args.notify_slack:
                            slack.post(_log)

                    else:
                        valid_metric = -dev_ppl
                        print('validation: iter %d, dev. ppl %f' %
                              (train_iter, dev_ppl),
                              file=sys.stderr)

                    # fixed: condition was inverted — store the dev set once
                    if 'dev_data' not in log_data:
                        log_data['dev_data'] = dev_data

                    _list_dict_update(log_data, {
                        'epoch': epoch,
                        'train_iter': train_iter,
                        'loss': dev_loss,
                        'ppl': dev_ppl,
                        args.valid_metric: valid_metric,
                        'hyps': dev_hyps,
                    },
                                      'validation',
                                      is_save=True)

                    model.train()

                    is_better = len(
                        hist_valid_scores
                    ) == 0 or valid_metric > max(hist_valid_scores)
                    is_better_than_last = len(
                        hist_valid_scores
                    ) == 0 or valid_metric > hist_valid_scores[-1]
                    hist_valid_scores.append(valid_metric)

                    if valid_num > args.save_model_after:
                        model_file = args.save_to + '.iter%d.bin' % train_iter
                        print('save model to [%s]' % model_file)
                        model.save(model_file)

                    if (not is_better_than_last) and args.lr_decay:
                        lr = optimizer.param_groups[0]['lr'] * args.lr_decay
                        print('decay learning rate to %f' % lr)
                        optimizer.param_groups[0]['lr'] = lr

                    if is_better:
                        patience = 0
                        best_model_iter = train_iter

                        if valid_num > args.save_model_after:
                            print('save currently the best model ..')
                            # point the stable symlink at the best checkpoint
                            model_file_abs_path = os.path.abspath(model_file)
                            symlin_file_abs_path = os.path.abspath(
                                args.save_to + '.bin')
                            os.system(
                                'ln -sf %s %s' %
                                (model_file_abs_path, symlin_file_abs_path))
                    else:
                        patience += 1
                        print('hit patience %d' % patience)
                        if patience == args.patience:
                            _log = f"""
                            {'hit patience %d' % patience}
                            early stop!
                            {'the best model is from iteration [%d]' % best_model_iter}
                            """
                            print(_log)
                            if args.notify_slack:
                                slack.post(_log)
                            exit(0)

                if args.debug:
                    print(f'debug epoch:{epoch} exit')
                    model_file = args.save_to + '.bin'
                    print('save model to [%s]' % model_file)
                    model.save(model_file)
                    exit(0)
Example #12
0
def train(args):
    """Run maximum-likelihood (cross-entropy) training of the NMT model.

    Iterates over mini-batches of (src, tgt) sentence pairs indefinitely,
    logging progress to ``args.train_log_file`` and validating every
    ``args.valid_niter`` iterations (results go to
    ``args.validation_log_file``).  Checkpoints are written once
    ``valid_num > args.save_model_after``, with the best model tracked via a
    ``args.save_to + '.bin'`` symlink.  Terminates with ``exit(0)`` when
    ``args.patience`` consecutive validations fail to beat the best score.
    """
    train_data_src = read_corpus(args.train_src, source='src')
    train_data_tgt = read_corpus(args.train_tgt, source='tgt')

    dev_data_src = read_corpus(args.dev_src, source='src')
    dev_data_tgt = read_corpus(args.dev_tgt, source='tgt')

    train_data = list(zip(train_data_src, train_data_tgt))
    dev_data = list(zip(dev_data_src, dev_data_tgt))

    # cap the dev set size so validation stays cheap
    dev_data = dev_data[:args.dev_limit]

    vocab, model, optimizer, nll_loss, cross_entropy_loss = init_training(args)

    # report_* counters reset every `log_every` iters; cum_* reset at each validation
    train_iter = patience = cum_loss = report_loss = cum_tgt_words = report_tgt_words = 0
    cum_examples = cum_batches = report_examples = epoch = valid_num = best_model_iter = 0
    hist_valid_scores = []
    train_time = begin_time = time.time()

    # training-setup summary (runtime string, intentionally left as-is);
    # echoed to stdout and optionally posted to Slack
    _info = f"""
        begin Maximum Likelihood training
        ・学習:{len(train_data)}ペア, {args.train_log_file}
        ・テスト:{len(dev_data)}ペア, {args.valid_niter}iter毎 {args.validation_log_file}
        ・バッチサイズ:{args.batch_size}
        ・1epoch = {len(train_data)}ペア = {int(len(train_data)/args.batch_size)}iter
        """
    print(_info)

    if args.notify_slack:
        slack.post(f"""
        {_info}
        {args}
        """)

    with open(args.train_log_file,
              "w") as train_output, open(args.validation_log_file,
                                         "w") as validation_output:

        while True:
            epoch += 1
            for src_sents, tgt_sents in data_iter(train_data,
                                                  batch_size=args.batch_size):
                train_iter += 1

                src_sents_var = to_input_variable(src_sents,
                                                  vocab.src,
                                                  cuda=args.cuda)
                tgt_sents_var = to_input_variable(tgt_sents,
                                                  vocab.tgt,
                                                  cuda=args.cuda)

                batch_size = len(src_sents)
                src_sents_len = [len(s) for s in src_sents]
                pred_tgt_word_num = sum(
                    len(s[1:]) for s in tgt_sents)  # omitting leading `<s>`

                optimizer.zero_grad()

                # (tgt_sent_len, batch_size, tgt_vocab_size)
                scores = model(src_sents_var, src_sents_len,
                               tgt_sents_var[:-1])

                # target for position t is token t+1, hence the [1:] shift
                word_loss = cross_entropy_loss(scores.view(-1, scores.size(2)),
                                               tgt_sents_var[1:].view(-1))
                loss = word_loss / batch_size
                word_loss_val = word_loss.item()
                loss_val = loss.item()

                loss.backward()
                # clip gradient
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), args.clip_grad)
                optimizer.step()

                report_loss += word_loss_val
                cum_loss += word_loss_val
                report_tgt_words += pred_tgt_word_num
                cum_tgt_words += pred_tgt_word_num
                report_examples += batch_size
                cum_examples += batch_size
                # NOTE(review): despite the name, this accumulates *examples*,
                # not batch counts; "cum. loss" below is therefore per-example
                cum_batches += batch_size

                if train_iter % args.log_every == 0:
                    _log = 'epoch %d, iter %d, avg. loss %.2f, avg. ppl %.2f ' \
                           'cum. examples %d, speed %.2f words/sec, time elapsed %.2f sec' % (epoch, train_iter,
                                                                                              report_loss / report_examples,
                                                                                              np.exp(
                                                                                                  report_loss / report_tgt_words),
                                                                                              cum_examples,
                                                                                              report_tgt_words / (
                                                                                                      time.time() - train_time),
                                                                                              time.time() - begin_time)
                    print(_log)
                    print(_log, file=train_output)

                    train_time = time.time()
                    report_loss = report_tgt_words = report_examples = 0.

                # perform validation
                if train_iter % args.valid_niter == 0:
                    print(
                        'epoch %d, iter %d, cum. loss %.2f, cum. ppl %.2f cum. examples %d'
                        % (epoch, train_iter, cum_loss / cum_batches,
                           np.exp(cum_loss / cum_tgt_words), cum_examples),
                        file=sys.stderr)

                    cum_loss = cum_batches = cum_tgt_words = 0.
                    valid_num += 1

                    print('begin validation ...', file=sys.stderr)
                    model.eval()

                    # compute dev. ppl and bleu

                    dev_loss = evaluate_loss(model, dev_data,
                                             cross_entropy_loss)
                    dev_ppl = np.exp(dev_loss)

                    if args.valid_metric in ['bleu', 'word_acc', 'sent_acc']:
                        dev_hyps = decode(model, dev_data)
                        # keep only the 1-best hypothesis per sentence
                        dev_hyps = [hyps[0] for hyps in dev_hyps]
                        if args.valid_metric == 'bleu':
                            valid_metric = get_bleu(
                                [tgt for src, tgt in dev_data], dev_hyps)
                        else:
                            valid_metric = get_acc(
                                [tgt for src, tgt in dev_data],
                                dev_hyps,
                                acc_type=args.valid_metric)
                        _log = 'validation: iter %d, dev. ppl %f, dev. %s %f' % (
                            train_iter, dev_ppl, args.valid_metric,
                            valid_metric)
                        print(_log, file=sys.stderr)
                        print(_log, file=validation_output)
                        if args.notify_slack:
                            slack.post(_log)

                    else:
                        # negated so that "higher is better" holds for all metrics
                        valid_metric = -dev_ppl
                        print('validation: iter %d, dev. ppl %f' %
                              (train_iter, dev_ppl),
                              file=sys.stderr)

                    model.train()

                    # is_better: beats the best score so far (drives patience);
                    # is_better_than_last: beats only the previous validation (drives lr decay)
                    is_better = len(
                        hist_valid_scores
                    ) == 0 or valid_metric > max(hist_valid_scores)
                    is_better_than_last = len(
                        hist_valid_scores
                    ) == 0 or valid_metric > hist_valid_scores[-1]
                    hist_valid_scores.append(valid_metric)

                    if valid_num > args.save_model_after:
                        model_file = args.save_to + '.iter%d.bin' % train_iter
                        print('save model to [%s]' % model_file,
                              file=sys.stderr)
                        model.save(model_file)

                    if (not is_better_than_last) and args.lr_decay:
                        lr = optimizer.param_groups[0]['lr'] * args.lr_decay
                        print('decay learning rate to %f' % lr,
                              file=sys.stderr)
                        optimizer.param_groups[0]['lr'] = lr

                    if is_better:
                        patience = 0
                        best_model_iter = train_iter

                        if valid_num > args.save_model_after:
                            # `model_file` is defined here: the identical
                            # valid_num guard above already assigned it
                            print('save currently the best model ..',
                                  file=sys.stderr)
                            model_file_abs_path = os.path.abspath(model_file)
                            symlin_file_abs_path = os.path.abspath(
                                args.save_to + '.bin')
                            os.system(
                                'ln -sf %s %s' %
                                (model_file_abs_path, symlin_file_abs_path))
                    else:
                        patience += 1
                        print('hit patience %d' % patience, file=sys.stderr)
                        if patience == args.patience:
                            print('early stop!', file=sys.stderr)
                            print('the best model is from iteration [%d]' %
                                  best_model_iter,
                                  file=sys.stderr)
                            exit(0)
Example #13
0
def compute_lm_prob(args):
    """
    Given source-target sentence pairs, compute ppl and log-likelihood.

    For every (src, tgt) pair read from ``args.test_src`` / ``args.test_tgt``,
    score the target sentence with the model and write one line

        src ||| tgt ||| avg_word_log_prob

    to ``args.save_to_file``.
    """
    test_data_src = read_corpus(args.test_src, source='src')
    test_data_tgt = read_corpus(args.test_tgt, source='tgt')
    test_data = zip(test_data_src, test_data_tgt)

    if args.load_model:
        print('load model from [%s]' % args.load_model)
        # map_location keeps CPU loading possible for GPU-saved checkpoints
        params = torch.load(args.load_model,
                            map_location=lambda storage, loc: storage)
        vocab = params['vocab']
        saved_args = params['args']
        state_dict = params['state_dict']

        model = NMT(saved_args, vocab)
        model.load_state_dict(state_dict)
    else:
        # no checkpoint given: score with a freshly initialized model
        vocab = torch.load(args.vocab)
        model = NMT(args, vocab)

    model.eval()

    if args.cuda:
        model = model.cuda()

    # `with` guarantees the output file is closed even if scoring raises
    with open(args.save_to_file, 'w') as f:
        for src_sent, tgt_sent in test_data:
            src_sents = [src_sent]
            tgt_sents = [tgt_sent]

            batch_size = len(src_sents)
            src_sents_len = [len(s) for s in src_sents]
            pred_tgt_word_nums = [len(s[1:])
                                  for s in tgt_sents]  # omitting leading `<s>`

            # (sent_len, batch_size)
            src_sents_var = to_input_variable(src_sents,
                                              model.vocab.src,
                                              cuda=args.cuda,
                                              is_test=True)
            tgt_sents_var = to_input_variable(tgt_sents,
                                              model.vocab.tgt,
                                              cuda=args.cuda,
                                              is_test=True)

            # (tgt_sent_len, batch_size, tgt_vocab_size)
            scores = model(src_sents_var, src_sents_len, tgt_sents_var[:-1])
            # (tgt_sent_len * batch_size, tgt_vocab_size); explicit dim= —
            # implicit-dim log_softmax is deprecated and ambiguous
            log_scores = F.log_softmax(scores.view(-1, scores.size(2)), dim=-1)
            # remove leading <s> in tgt sent, which is not used as the target
            # (batch_size * tgt_sent_len)
            flattened_tgt_sents = tgt_sents_var[1:].view(-1)
            # (batch_size * tgt_sent_len): pick each gold token's log-prob
            tgt_log_scores = torch.gather(
                log_scores, 1, flattened_tgt_sents.unsqueeze(1)).squeeze(1)
            # 0-index is the <pad> symbol: zero out padded positions
            tgt_log_scores = tgt_log_scores * (
                1. - torch.eq(flattened_tgt_sents, 0).float())
            # (tgt_sent_len, batch_size)
            tgt_log_scores = tgt_log_scores.view(-1, batch_size)
            # (batch_size): sentence log-likelihood
            tgt_sent_scores = tgt_log_scores.sum(dim=0).squeeze()
            # normalize by target length (excluding the leading <s>)
            tgt_sent_word_scores = [
                tgt_sent_scores[i].item() / pred_tgt_word_nums[i]
                for i in range(batch_size)
            ]

            for src_sent, tgt_sent, score in zip(src_sents, tgt_sents,
                                                 tgt_sent_word_scores):
                f.write('%s ||| %s ||| %f\n' %
                        (' '.join(src_sent), ' '.join(tgt_sent), score))
Example #14
0
        '--train_src',
        type=str,
        help='path to the source side of the training sentences')
    parser.add_argument(
        '--train_trg',
        type=str,
        help='path to the target side of the training sentences')
    parser.add_argument('--output',
                        default='vocab.bin',
                        type=str,
                        help='output vocabulary file')
    parser.add_argument('--share_vocab', action='store_true', default=False)

    args = parser.parse_args()

    print('read in parallel sentences: %s' % args.train_bitext)
    src_sents = read_corpus(args.train_src, source='src')
    trg_sents = read_corpus(args.train_trg, source='src')

    vocab = Vocab(src_sents,
                  trg_sents,
                  args.src_vocab_size,
                  args.trg_vocab_size,
                  remove_singleton=not args.include_singleton,
                  share_vocab=args.share_vocab)
    print('generated vocabulary, source %d words, target %d words' %
          (len(vocab.src), len(vocab.trg)))

    torch.save(vocab, args.output)
    print('vocabulary saved to %s' % args.output)
Example #15
0
                        required=True,
                        help='file of source sentences')
    parser.add_argument('--train_tgt',
                        type=str,
                        required=True,
                        help='file of target sentences')

    parser.add_argument('--output',
                        default='vocab.bin',
                        type=str,
                        help='output vocabulary file')

    args = parser.parse_args()

    print('read in source sentences: %s' % args.train_src)
    print('read in target sentences: %s' % args.train_tgt)

    src_sents = read_corpus(args.train_src, source='src')
    tgt_sents = read_corpus(args.train_tgt, source='tgt')

    vocab = Vocab(src_sents,
                  tgt_sents,
                  args.src_vocab_size,
                  args.tgt_vocab_size,
                  remove_singleton=not args.include_singleton)
    print('generated vocabulary, source %d words, target %d words' %
          (len(vocab.src), len(vocab.tgt)))

    torch.save(vocab, args.output)
    print('vocabulary saved to %s' % args.output)
Example #16
0
def train_raml(args):
    """Reward-augmented maximum likelihood (RAML) training loop.

    For each source sentence, a set of pre-sampled target sentences together
    with exponentiated-reward weights is read from ``args.raml_sample_file``;
    the per-token NLL of every sample is scaled by its weight.  Validation,
    checkpointing, lr decay and early stopping mirror the plain ML ``train``
    loop; terminates via ``exit(0)`` on early stop.
    """
    vocab = torch.load(args.vocab)

    train_data_src = read_corpus(args.train_src, source='src')
    train_data_tgt = read_corpus(args.train_tgt, source='tgt')
    # materialize as a list: a bare zip() iterator is exhausted after the
    # first epoch, so every later epoch would silently yield zero batches
    train_data = list(zip(train_data_src, train_data_tgt))

    dev_data_src = read_corpus(args.dev_src, source='src')
    dev_data_tgt = read_corpus(args.dev_tgt, source='tgt')
    # same issue for the dev set: it is re-iterated on every validation
    dev_data = list(zip(dev_data_src, dev_data_tgt))

    # dict of (src, [tgt: (sent, prob)])
    print('read in raml training data...', file=sys.stderr, end='')
    begin_time = time.time()
    raml_samples = read_raml_train_data(args.raml_sample_file, temp=args.temp)
    print('done[%d s].' % (time.time() - begin_time))

    vocab, model, optimizer, nll_loss, cross_entropy_loss = init_training(args)

    # report_* counters reset every `log_every` iters; cum_* reset at each validation
    train_iter = patience = cum_loss = report_loss = cum_tgt_words = report_tgt_words = 0
    report_weighted_loss = cum_weighted_loss = 0
    cum_examples = cum_batches = report_examples = epoch = valid_num = best_model_iter = 0
    hist_valid_scores = []
    train_time = begin_time = time.time()
    print('begin RAML training')

    while True:
        epoch += 1
        for src_sents, tgt_sents in data_iter(train_data, batch_size=args.batch_size):
            train_iter += 1

            raml_src_sents = []
            raml_tgt_sents = []
            raml_tgt_weights = []
            for src_sent in src_sents:
                tgt_samples_all = raml_samples[' '.join(src_sent)]

                if args.sample_size >= len(tgt_samples_all):
                    tgt_samples = tgt_samples_all
                else:
                    # sample without replacement from indices >= 1, always
                    # keeping index 0: make sure the ground truth y* is in the samples
                    tgt_samples_id = np.random.choice(range(1, len(tgt_samples_all)), size=args.sample_size - 1, replace=False)
                    tgt_samples = [tgt_samples_all[0]] + [tgt_samples_all[i] for i in tgt_samples_id]

                raml_src_sents.extend([src_sent] * len(tgt_samples))
                raml_tgt_sents.extend([['<s>'] + sent.split(' ') + ['</s>'] for sent, weight in tgt_samples])
                raml_tgt_weights.extend([weight for sent, weight in tgt_samples])

            src_sents_var = to_input_variable(raml_src_sents, vocab.src, cuda=args.cuda)
            tgt_sents_var = to_input_variable(raml_tgt_sents, vocab.tgt, cuda=args.cuda)
            weights_var = Variable(torch.FloatTensor(raml_tgt_weights), requires_grad=False)
            if args.cuda:
                weights_var = weights_var.cuda()

            batch_size = len(raml_src_sents)  # batch_size = args.batch_size * args.sample_size
            src_sents_len = [len(s) for s in raml_src_sents]
            pred_tgt_word_num = sum(len(s[1:]) for s in raml_tgt_sents)  # omitting leading `<s>`
            optimizer.zero_grad()

            # (tgt_sent_len, batch_size, tgt_vocab_size)
            scores = model(src_sents_var, src_sents_len, tgt_sents_var[:-1])
            # explicit dim=: implicit-dim log_softmax is deprecated/ambiguous
            log_scores = F.log_softmax(scores.view(-1, scores.size(2)), dim=-1)
            flattened_tgt_sents = tgt_sents_var[1:].view(-1)

            # batch_size * tgt_sent_len: gold-token log-probs
            tgt_log_scores = torch.gather(log_scores, 1, flattened_tgt_sents.unsqueeze(1)).squeeze(1)
            # mask out <pad> (index 0) positions
            unweighted_loss = -tgt_log_scores * (1. - torch.eq(flattened_tgt_sents, 0).float())
            # flatten order is (tgt_sent_len, batch_size), so tiling the
            # per-sample weights tgt_sent_len times lines them up correctly
            weighted_loss = unweighted_loss * weights_var.repeat(scores.size(0))
            weighted_loss = weighted_loss.sum()
            # .item() replaces the long-removed `.data[0]` 0-dim indexing
            # (consistent with the ML `train` loop in this file)
            weighted_loss_val = weighted_loss.item()
            nll_loss_val = unweighted_loss.sum().item()

            loss = weighted_loss / batch_size

            loss.backward()
            # clip gradient (in-place variant; `clip_grad_norm` is deprecated)
            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip_grad)
            optimizer.step()

            report_weighted_loss += weighted_loss_val
            cum_weighted_loss += weighted_loss_val
            report_loss += nll_loss_val
            cum_loss += nll_loss_val
            report_tgt_words += pred_tgt_word_num
            cum_tgt_words += pred_tgt_word_num
            report_examples += batch_size
            cum_examples += batch_size
            cum_batches += batch_size

            if train_iter % args.log_every == 0:
                print('epoch %d, iter %d, avg. loss %.2f, '
                      'avg. ppl %.2f cum. examples %d, '
                      'speed %.2f words/sec, time elapsed %.2f sec' % (epoch, train_iter,
                                                                       report_weighted_loss / report_examples,
                                                                       np.exp(report_loss / report_tgt_words),
                                                                       cum_examples,
                                                                       report_tgt_words / (time.time() - train_time),
                                                                       time.time() - begin_time),
                      file=sys.stderr)

                train_time = time.time()
                report_loss = report_weighted_loss = report_tgt_words = report_examples = 0.

            # perform validation
            if train_iter % args.valid_niter == 0:
                print('epoch %d, iter %d, cum. loss %.2f, '
                      'cum. ppl %.2f cum. examples %d' % (epoch, train_iter,
                                                          cum_weighted_loss / cum_batches,
                                                          np.exp(cum_loss / cum_tgt_words),
                                                          cum_examples),
                      file=sys.stderr)

                cum_loss = cum_weighted_loss = cum_batches = cum_tgt_words = 0.
                valid_num += 1

                print('begin validation ...', file=sys.stderr)
                model.eval()

                # compute dev. ppl and bleu
                dev_loss = evaluate_loss(model, dev_data, cross_entropy_loss)
                dev_ppl = np.exp(dev_loss)

                if args.valid_metric in ['bleu', 'word_acc', 'sent_acc']:
                    dev_hyps = decode(model, dev_data)
                    # keep only the 1-best hypothesis per sentence
                    dev_hyps = [hyps[0] for hyps in dev_hyps]
                    if args.valid_metric == 'bleu':
                        valid_metric = get_bleu([tgt for src, tgt in dev_data], dev_hyps)
                    else:
                        valid_metric = get_acc([tgt for src, tgt in dev_data], dev_hyps, acc_type=args.valid_metric)
                    print('validation: iter %d, dev. ppl %f, dev. %s %f' % (
                    train_iter, dev_ppl, args.valid_metric, valid_metric),
                          file=sys.stderr)
                else:
                    # negated so that "higher is better" holds for all metrics
                    valid_metric = -dev_ppl
                    print('validation: iter %d, dev. ppl %f' % (train_iter, dev_ppl),
                          file=sys.stderr)

                model.train()

                # is_better drives patience; is_better_than_last drives lr decay
                is_better = len(hist_valid_scores) == 0 or valid_metric > max(hist_valid_scores)
                is_better_than_last = len(hist_valid_scores) == 0 or valid_metric > hist_valid_scores[-1]
                hist_valid_scores.append(valid_metric)

                if valid_num > args.save_model_after:
                    model_file = args.save_to + '.iter%d.bin' % train_iter
                    print('save model to [%s]' % model_file, file=sys.stderr)
                    model.save(model_file)

                if (not is_better_than_last) and args.lr_decay:
                    lr = optimizer.param_groups[0]['lr'] * args.lr_decay
                    print('decay learning rate to %f' % lr, file=sys.stderr)
                    optimizer.param_groups[0]['lr'] = lr

                if is_better:
                    patience = 0
                    best_model_iter = train_iter

                    if valid_num > args.save_model_after:
                        # `model_file` is defined: the identical guard above assigned it
                        print('save currently the best model ..', file=sys.stderr)
                        model_file_abs_path = os.path.abspath(model_file)
                        symlin_file_abs_path = os.path.abspath(args.save_to + '.bin')
                        os.system('ln -sf %s %s' % (model_file_abs_path, symlin_file_abs_path))
                else:
                    patience += 1
                    print('hit patience %d' % patience, file=sys.stderr)
                    if patience == args.patience:
                        print('early stop!', file=sys.stderr)
                        print('the best model is from iteration [%d]' % best_model_iter, file=sys.stderr)
                        exit(0)
Example #17
0
  eval_tgt = tgt[int(len(tgt) * train_ratio):int(len(tgt) * (train_ratio + eval_ratio))]
  test_tgt = tgt[int(len(tgt) * (train_ratio + eval_ratio)):]
  write_corpus(os.path.join("data", "train_" + src_file), train_src)
  write_corpus(os.path.join("data", "eval_" + src_file), eval_src)
  write_corpus(os.path.join("data", "test_" + src_file), test_src)
  write_corpus(os.path.join("data", "train_" + tgt_file), train_tgt)
  write_corpus(os.path.join("data", "eval_" + tgt_file), eval_tgt)
  write_corpus(os.path.join("data", "test_" + tgt_file), test_tgt)
  return


if __name__ == "__main__":
  # Entry point: read a parallel corpus and split it into train/eval/test
  # files under data/ via make_data().
  parser = argparse.ArgumentParser()
  parser.add_argument('--train_src', type=str, default='data/news-commentary-v11.de-en.en',
                      help='file of source sentences')
  parser.add_argument('--train_tgt', type=str, default='data/news-commentary-v11.de-en.de',
                      help='file of target sentences')
  # NOTE(review): despite the "vocabulary size" help text, these values are
  # used below to truncate the *number of sentences* read from each corpus —
  # confirm the intended semantics.
  parser.add_argument('--src_size', default=20000, type=int, help='source vocabulary size')
  parser.add_argument('--tgt_size', default=20000, type=int, help='target vocabulary size')
  
  args = parser.parse_args()
  
  print('read in source sentences: %s' % args.train_src)
  print('read in target sentences: %s' % args.train_tgt)
  
  src_sents = read_corpus(args.train_src, source='src', generate=True)[:args.src_size]
  tgt_sents = read_corpus(args.train_tgt, source='tgt', generate=True)[:args.tgt_size]
  
  # the split requires a parallel corpus: equal src/tgt sentence counts
  # (can fail if --src_size and --tgt_size truncate to different lengths)
  assert len(src_sents) == len(tgt_sents)
  make_data(src_sents, tgt_sents, args.train_src.split("/")[-1], args.train_tgt.split("/")[-1])
Example #18
0
def validate_output_file(hypo_f, refer_f):
    """Score a hypothesis file against a reference file with corpus BLEU."""
    hypotheses = read_corpus(hypo_f, "src")
    references = read_corpus(refer_f, "tar")
    return compute_bleu_for_sentences(references, hypotheses)
Example #19
0
def train(args):
    """Maximum-likelihood training loop with optional checkpoint resumption.

    Like the plain ML ``train`` loop, but can resume the iteration counter
    from a checkpoint filename (``args.load_model`` containing ``iterN``),
    only decays the learning rate after epoch 10, and additionally stops
    once the learning rate has decayed to (near) zero.  Terminates via
    ``exit(0)`` on early stop.
    """
    train_data_src = read_corpus(args.train_src, source='src')
    train_data_tgt = read_corpus(args.train_tgt, source='tgt')

    dev_data_src = read_corpus(args.dev_src, source='src')
    dev_data_tgt = read_corpus(args.dev_tgt, source='tgt')

    train_data = list(zip(train_data_src, train_data_tgt))
    dev_data = list(zip(dev_data_src, dev_data_tgt))

    vocab, model, optimizer, nll_loss, cross_entropy_loss = init_training(args)

    # report_* counters reset every `log_every` iters; cum_* reset at each validation
    train_iter = patience = cum_loss = report_loss = cum_tgt_words = report_tgt_words = 0
    cum_examples = cum_batches = report_examples = epoch = valid_num = best_model_iter = 0

    if args.load_model:
        import re
        # recover the iteration count from a checkpoint name like 'model.iter1200.bin'
        # (raw string avoids the invalid-escape warning for \d)
        train_iter = int(re.search(r'(?<=iter)\d+', args.load_model).group(0))
        print('start from train_iter = %d' % train_iter)

        valid_num = train_iter // args.valid_niter

    hist_valid_scores = []
    train_time = begin_time = time.time()
    print('begin Maximum Likelihood training')

    while True:
        epoch += 1
        print('start of epoch {:d}'.format(epoch))

        for src_sents, tgt_sents in data_iter(train_data,
                                              batch_size=args.batch_size):
            train_iter += 1

            src_sents_var = to_input_variable(src_sents,
                                              vocab.src,
                                              cuda=args.cuda)
            tgt_sents_var = to_input_variable(tgt_sents,
                                              vocab.tgt,
                                              cuda=args.cuda)

            batch_size = len(src_sents)
            src_sents_len = [len(s) for s in src_sents]
            pred_tgt_word_num = sum(
                len(s[1:]) for s in tgt_sents)  # omitting leading `<s>`

            optimizer.zero_grad()

            # (tgt_sent_len, batch_size, tgt_vocab_size); this model variant
            # returns a second value that is unused here
            scores, _ = model(src_sents_var, src_sents_len, tgt_sents_var[:-1])
            # target for position t is token t+1, hence the [1:] shift
            word_loss = cross_entropy_loss(scores.view(-1, scores.size(2)),
                                           tgt_sents_var[1:].view(-1))
            loss = word_loss / batch_size
            # .item() replaces the long-removed `.data[0]` 0-dim indexing
            # (consistent with the other `train` loop in this file)
            word_loss_val = word_loss.item()

            loss.backward()
            # clip gradient (in-place variant; `clip_grad_norm` is deprecated)
            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                       args.clip_grad)
            optimizer.step()

            report_loss += word_loss_val
            cum_loss += word_loss_val
            report_tgt_words += pred_tgt_word_num
            cum_tgt_words += pred_tgt_word_num
            report_examples += batch_size
            cum_examples += batch_size
            cum_batches += batch_size

            if train_iter % args.log_every == 0:
                print('epoch %d, iter %d, avg. loss %.2f, avg. ppl %.2f ' \
                      'cum. examples %d, speed %.2f words/sec, time elapsed %.2f sec' % (epoch, train_iter,
                                                                                         report_loss / report_examples,
                                                                                         np.exp(report_loss / report_tgt_words),
                                                                                         cum_examples,
                                                                                         report_tgt_words / (time.time() - train_time),
                                                                                         time.time() - begin_time), file=sys.stderr)

                train_time = time.time()
                report_loss = report_tgt_words = report_examples = 0.

            # perform validation
            if train_iter % args.valid_niter == 0:
                print(
                    'epoch %d, iter %d, cum. loss %.2f, cum. ppl %.2f cum. examples %d'
                    % (epoch, train_iter, cum_loss / cum_batches,
                       np.exp(cum_loss / cum_tgt_words), cum_examples),
                    file=sys.stderr)

                cum_loss = cum_batches = cum_tgt_words = 0.
                valid_num += 1

                print('begin validation ...', file=sys.stderr)
                model.eval()

                # compute dev. ppl and bleu
                dev_loss = evaluate_loss(model, dev_data, cross_entropy_loss)
                dev_ppl = np.exp(dev_loss)

                if args.valid_metric in ['bleu', 'word_acc', 'sent_acc']:
                    dev_hyps = decode(model, dev_data)
                    # keep only the 1-best hypothesis per sentence
                    dev_hyps = [hyps[0] for hyps in dev_hyps]
                    print(dev_hyps[:3])
                    if args.valid_metric == 'bleu':
                        valid_metric = get_bleu([tgt for src, tgt in dev_data],
                                                dev_hyps, 'valid')
                    else:
                        valid_metric = get_acc([tgt for src, tgt in dev_data],
                                               dev_hyps,
                                               acc_type=args.valid_metric)
                    print(
                        'validation: iter %d, dev. ppl %f, dev. %s %f' %
                        (train_iter, dev_ppl, args.valid_metric, valid_metric),
                        file=sys.stderr)
                else:
                    # negated so that "higher is better" holds for all metrics
                    valid_metric = -dev_ppl
                    print('validation: iter %d, dev. ppl %f' %
                          (train_iter, dev_ppl),
                          file=sys.stderr)

                model.train()

                # is_better drives patience; is_better_than_last drives lr decay
                is_better = len(hist_valid_scores
                                ) == 0 or valid_metric > max(hist_valid_scores)
                is_better_than_last = len(
                    hist_valid_scores
                ) == 0 or valid_metric > hist_valid_scores[-1]
                hist_valid_scores.append(valid_metric)

                # hold the lr fixed for the first 10 epochs, then decay on
                # any validation that fails to beat the previous one
                if (not is_better_than_last) and args.lr_decay and epoch > 10:
                    lr = optimizer.param_groups[0]['lr'] * args.lr_decay
                    print('decay learning rate to %f' % lr, file=sys.stderr)
                    optimizer.param_groups[0]['lr'] = lr

                if is_better:
                    patience = 0
                    best_model_iter = train_iter

                    if valid_num > args.save_model_after:
                        model_file = args.save_to + '.iter%d.bin' % train_iter
                        print('save model to [%s]' % model_file,
                              file=sys.stderr)
                        model.save(model_file)
                else:
                    patience += 1
                    print('hit patience %d' % patience, file=sys.stderr)
                    if patience == args.patience:
                        print('early stop!', file=sys.stderr)
                        print('the best model is from iteration [%d]' %
                              best_model_iter,
                              file=sys.stderr)
                        exit(0)
                # additional stopping rule: lr has decayed to (near) zero
                if abs(optimizer.param_groups[0]['lr'] - 0.0) <= 1e-5:
                    print('stop! because lr is too small', file=sys.stderr)
                    print('the best model is from iteration [%d]' %
                          best_model_iter,
                          file=sys.stderr)
                    exit(0)
Example #20
0
def sample_ngram(args):
    """Write noisy variants of each target sentence, ranked by a reward.

    For every (src, tgt) pair, keeps the reference itself plus up to
    ``args.sample_size - 1`` samples obtained by replacing a random n-gram
    (n in [1, args.max_ngram_size], never the final token) with a new one
    drawn via ``get_new_ngram``.  Each sample is scored with the reward
    selected by ``args.reward`` ('bleu', 'rouge', or negative distortion
    rate) and the ranked samples are written to ``args.output``.
    """
    src_sents = read_corpus(args.src, 'src')
    tgt_sents = read_corpus(args.tgt, 'src')  # do not read in <s> and </s>

    vocab = torch.load(args.vocab)
    tgt_vocab = vocab.tgt

    sm_func = SmoothingFunction().method3 if args.smooth_bleu else None
    # Hoisted out of the per-sample loop: one scorer for the whole run
    # instead of constructing a fresh Rouge() for every sample.
    rouge_scorer = Rouge() if args.reward == 'rouge' else None

    with open(args.output, 'w') as f_out:
        for src_sent, tgt_sent in zip(src_sents, tgt_sents):
            src_sent = ' '.join(src_sent)

            tgt_len = len(tgt_sent)
            tgt_samples = [tgt_sent]          # sample 0 is the reference itself
            tgt_samples_distort_rates = [0]   # how many unigrams are replaced

            # Guard: with a single-token sentence there is no replaceable
            # position (the last token must stay), and randint(1, 1) would
            # raise ValueError — keep only the reference in that case.
            if tgt_len > 1:
                for _ in range(args.sample_size - 1):
                    # we do not replace the last token: it must be a period!
                    n = np.random.randint(1, min(tgt_len, args.max_ngram_size + 1))

                    idx = np.random.randint(tgt_len - n)
                    ngram = tgt_sent[idx: idx + n]
                    new_ngram = get_new_ngram(ngram, n, tgt_vocab)

                    sampled_tgt_sent = list(tgt_sent)
                    sampled_tgt_sent[idx: idx + n] = new_ngram

                    tgt_samples.append(sampled_tgt_sent)
                    tgt_samples_distort_rates.append(n)

            # compute bleu scores or edit distances and rank the samples
            rewards = []
            for tgt_sample, distort_rate in zip(tgt_samples, tgt_samples_distort_rates):
                if args.reward == 'bleu':
                    reward = sentence_bleu([tgt_sent], tgt_sample,
                                           smoothing_function=sm_func)
                elif args.reward == 'rouge':
                    # BUG FIX: ' '.join(tokens) is already str in Python 3;
                    # the original called .decode('utf-8') on it, which raises
                    # AttributeError (str has no decode method).
                    scores = rouge_scorer.get_scores(hyps=[' '.join(tgt_sample)],
                                                     refs=[' '.join(tgt_sent)],
                                                     avg=True)
                    reward = sum(value['f'] for value in scores.values())
                else:
                    # fall back to negative distortion: fewer replaced
                    # unigrams => higher reward
                    reward = -distort_rate

                rewards.append(reward)

            tgt_ranks = sorted(range(len(tgt_samples)),
                               key=lambda i: rewards[i], reverse=True)
            # convert list of tokens into a string
            tgt_samples = [' '.join(tgt_sample) for tgt_sample in tgt_samples]

            print('*' * 50, file=f_out)
            print('source: ' + src_sent, file=f_out)
            print('%d samples' % len(tgt_samples), file=f_out)
            for i in tgt_ranks:
                print('%s ||| %f' % (tgt_samples[i], rewards[i]), file=f_out)
            print('*' * 50, file=f_out)
Example #21
0
    parser.add_argument('--train_tgt',
                        type=str,
                        required=True,
                        help='file of target sentences')

    parser.add_argument('--output',
                        default='vocab.bin',
                        type=str,
                        help='output vocabulary file')

    args = parser.parse_args()

    print('read in source sentences: %s' % args.train_src)
    print('read in target sentences: %s' % args.train_tgt)

    src_sents = read_corpus(args.train_src, source='src')[:args.src_size]
    tgt_sents = read_corpus(args.train_tgt, source='tgt')[:args.tgt_size]

    if len(src_sents) != len(tgt_sents):
        src_sents = src_sents[:min(len(src_sents), len(tgt_sents))]
        tgt_sents = tgt_sents[:min(len(src_sents), len(tgt_sents))]
    vocab = Vocab(src_sents,
                  tgt_sents,
                  args.src_vocab_size,
                  args.tgt_vocab_size,
                  remove_singleton=not args.include_singleton)
    print('generated vocabulary, source %d words, target %d words' %
          (len(vocab.src), len(vocab.tgt)))

    torch.save(vocab, args.output)
    print('vocabulary saved to %s' % args.output)
Example #22
0
        help='path to the training feature of the training sentences')
    parser.add_argument('--output',
                        default='vocab.bin',
                        type=str,
                        help='output vocabulary file')
    parser.add_argument('--share_vocab', action='store_true', default=False)
    parser.add_argument('--lowercase', action='store_true', default=False)

    args = parser.parse_args()

    print('read in parallel sentences: %s' % args.train_bitext)
    if args.train_bitext:
        src_sents, trg_sents = read_bitext(args.train_bitext)
    else:
        src_sents = read_corpus(args.train_src,
                                source='src',
                                lowercase=args.lowercase)
        trg_sents = read_corpus(args.train_trg,
                                source='src',
                                lowercase=args.lowercase)

    vocab = Vocab(src_sents,
                  trg_sents,
                  args.src_vocab_size,
                  args.trg_vocab_size,
                  remove_singleton=not args.include_singleton,
                  share_vocab=args.share_vocab,
                  pos_file=args.train_feature)
    print('generated vocabulary, source %d words, target %d words' %
          (len(vocab.src), len(vocab.trg)))