# Example #1 (0 votes)
def evaluate_autoencode_model(
    model, src, src_test,
    config, src_valid=None,
    verbose=True, metric='bleu'
):
    """Greedily decode the test set through the autoencoder and score it with BLEU.

    Each minibatch is decoded token-by-token (argmax, no beam search) up to
    ``config['data']['max_src_length']`` steps; predictions are compared
    against the *input* sentences (autoencoding: the input is the gold target).

    Args:
        model: project-local seq2seq model exposing ``model(src, trg)`` and
            ``model.decode(logits)`` — exact contract not visible here.
        src: dict providing the ``word2id`` and ``id2word`` vocabulary maps.
        src_test: dict whose ``data`` entry is the list of test sentences.
        config: nested config dict; only ``config['data']['batch_size']`` and
            ``config['data']['max_src_length']`` are read.
        src_valid: unused in this function; kept for interface compatibility.
        verbose: if True, print every predicted/gold sentence pair.
        metric: unused in this function — BLEU is always computed.
            # NOTE(review): consider honoring or removing this parameter.

    Returns:
        The score from ``get_bleu(preds, ground_truths)`` (project-local).
    """
    preds = []
    ground_truths = []
    # Iterate over the test data in strides of batch_size; j is the batch's
    # starting index into src_test['data'].
    for j in xrange(0, len(src_test['data']), config['data']['batch_size']):

        print 'Decoding batch : %d out of %d ' % (j, len(src_test['data']))
        # NOTE(review): unpacked into 3 values here, but the same helper is
        # unpacked into 4 values elsewhere in this file — confirm the
        # helper's return arity for this call pattern.
        input_lines_src, lens_src, mask_src = get_autoencode_minibatch(
            src_test['data'], src['word2id'], j, config['data']['batch_size'],
            config['data']['max_src_length'], add_start=True, add_end=True
        )

        # Seed the decoder input with a single <s> token per sentence;
        # shape (batch, 1), grown by one column per decode step below.
        input_lines_trg = Variable(torch.LongTensor(
            [
                [src['word2id']['<s>']]
                for i in xrange(input_lines_src.size(0))
            ]
        ))

        # Greedy autoregressive decoding: each step re-runs the full forward
        # pass on the prefix decoded so far (O(L^2) in sequence length) and
        # appends the argmax of the last position.
        for i in xrange(config['data']['max_src_length']):

            decoder_logit = model(input_lines_src, input_lines_trg)
            word_probs = model.decode(decoder_logit)
            # argmax over the vocabulary axis; keep only the newest position.
            decoder_argmax = word_probs.data.cpu().numpy().argmax(axis=-1)
            next_preds = Variable(
                torch.from_numpy(decoder_argmax[:, -1])
            )

            # Append the new token column to the running decoder input.
            input_lines_trg = torch.cat(
                (input_lines_trg, next_preds.unsqueeze(1)),
                1
            )

        # From here on input_lines_trg is reused for post-processing:
        # first as a numpy id matrix, then as lists of word strings.
        input_lines_trg = input_lines_trg.data.cpu().numpy()

        input_lines_trg = [
            [src['id2word'][x] for x in line]
            for line in input_lines_trg
        ]

        # Autoencoding task: the gold target is the (id-mapped) input itself.
        output_lines_trg_gold = input_lines_src.data.cpu().numpy()
        output_lines_trg_gold = [
            [src['id2word'][x] for x in line]
            for line in output_lines_trg_gold
        ]

        for sentence_pred, sentence_real in zip(
            input_lines_trg,
            output_lines_trg_gold,
        ):
            # Truncate the prediction at the first </s> (inclusive); if no
            # </s> was generated, keep the whole sequence.
            if '</s>' in sentence_pred:
                index = sentence_pred.index('</s>')
            else:
                index = len(sentence_pred)
            preds.append(sentence_pred[:index + 1])

            if verbose:
                print ' '.join(sentence_pred[:index + 1])

            # Same truncation for the gold sentence (index is reused).
            if '</s>' in sentence_real:
                index = sentence_real.index('</s>')
            else:
                index = len(sentence_real)
            if verbose:
                print ' '.join(sentence_real[:index + 1])
            if verbose:
                print '--------------------------------------'
            ground_truths.append(sentence_real[:index + 1])

    return get_bleu(preds, ground_truths)
# Example #2 (0 votes)
    src,
    src_test,
    config,
    verbose=False,
    metric='bleu',
)
logging.info('Epoch : %d : BLEU : %.5f ' % (0, bleu))

for i in xrange(1000):
    losses = []
    for j in xrange(0, len(src['data']), batch_size):

        input_lines_src, output_lines_src, lens_src, mask_src = get_autoencode_minibatch(
            src['data'],
            src['word2id'],
            j,
            batch_size,
            max_length,
            add_start=True,
            add_end=True)

        decoder_logit = model(input_lines_src)
        optimizer.zero_grad()

        loss = loss_criterion(
            decoder_logit.contiguous().view(-1, src_vocab_size),
            output_lines_src.view(-1))
        losses.append(loss.data[0])
        loss.backward()
        optimizer.step()

        if j % config['management']['monitor_loss'] == 0: