Beispiel #1
0
def score_trads(preds, trg_loader, eval_kwargs):
    """Score precomputed translations against the references of a split.

    The references are read batch by batch from ``trg_loader``, decoded to
    sentences and paired with the predictions at the same running position.

    Args:
        preds: sliceable sequence of decoded hypothesis sentences.
            Assumed to be in the order the loader yields the references
            -- TODO confirm against the caller.
        trg_loader: target-side loader; must provide ``reset_iterator``,
            ``get_trg_batch``, ``get_vocab``, ``eos`` and ``bos``.
        eval_kwargs: options dict. Reads ``split`` (default ``'val'``),
            ``batch_size`` (default 80) and ``verbose`` (default 0).

    Returns:
        dict with the single key ``'Bleu'`` mapped to the first value
        returned by ``corpus_bleu(preds, ground_truths)``.
    """
    split = eval_kwargs.get('split', 'val')
    batch_size = eval_kwargs.get('batch_size', 80)
    verbose = eval_kwargs.get('verbose', 0)
    ground_truths = []
    trg_loader.reset_iterator(split)
    n = 0
    while True:
        # Fetch the next batch of references.
        data_trg = trg_loader.get_trg_batch(split, range(batch_size),
                                            batch_size)
        n += batch_size
        # Turn the reference token ids into sentences.
        sent_gold = decode_sequence(trg_loader.get_vocab(),
                                    data_trg['out_labels'],
                                    eos=trg_loader.eos,
                                    bos=trg_loader.bos)
        # Without an explicit verbosity, print only when n is a multiple
        # of 1000.
        verb = verbose if verbose else not (n % 1000)
        # Pair this batch with the predictions at the same position.
        # Zipping against the start of ``preds`` on every batch would match
        # later references with the first predictions and let the reference
        # list outgrow ``preds``.
        offset = len(ground_truths)
        batch_preds = preds[offset:offset + batch_size]
        for (l, gl) in zip(batch_preds, sent_gold):
            ground_truths.append(gl)
            if verb:
                lg.print_sampled("", gl, l)
        ix1 = data_trg['bounds']['it_max']
        if data_trg['bounds']['wrapped']:
            break
        if n >= ix1:
            print('Evaluated the required samples (%s)' % n)
            break
    bleu_moses, _ = corpus_bleu(preds, ground_truths)
    scores = {'Bleu': bleu_moses}
    return scores
Beispiel #2
0
def track_model(job_name, model, src_loader, trg_loader, eval_kwargs):
    """Decode a split sentence by sentence and collect tracking outputs.

    For models whose ``version`` is not ``'seq2seq'`` the per-sentence
    outputs of ``model.track`` (alphas, aligns, activ_aligns, activs,
    embed_activs) are accumulated in lists. For ``'seq2seq'`` models only
    the predictions are produced and those lists stay empty.

    Args:
        job_name: name of the logger used for the completion message.
        model: network exposing ``eval``, ``version`` and either
            ``encoder``/``map``/``decoder.sample`` (seq2seq) or ``track``.
        src_loader: source-side loader (``ref``, ``iterators``,
            ``get_src_batch``, ``get_vocab``, ``eos``, ``bos``).
        trg_loader: target-side loader (``iterators``, ``get_trg_batch``,
            ``get_vocab``, ``eos``, ``bos``, ``pad``, ``unk``).
        eval_kwargs: options dict. Reads ``batch_size`` (default 1, must
            be 1), ``split`` (default ``'val'``), ``verbose`` (default 0),
            ``max_samples`` (default -1, meaning use the loader's
            ``it_max``) and ``offset`` (default 0). It is updated in place
            with the ``BOS``, ``EOS``, ``PAD`` and ``UNK`` entries taken
            from ``trg_loader``.

    Returns:
        dict with keys ``source``, ``preds``, ``alpha``, ``align``,
        ``activ_align``, ``activ``, ``embed_activ``, ``channels_cst`` (the
        last value returned by ``model.track``, or ``None`` for seq2seq
        models) and ``bleu``.

    Raises:
        AssertionError: if the requested batch size is not 1.
    """
    source = []
    preds = []
    ground_truths = []
    batched_alphas = []
    batched_aligns = []
    batched_activ_aligns = []
    batched_activs = []
    batched_embed_activs = []
    # Only set by model.track; stays None for seq2seq models so the
    # returned dict can always be built.
    C = None
    batch_size = eval_kwargs.get('batch_size', 1)
    assert batch_size == 1, "Batch size must be 1"
    split = eval_kwargs.get('split', 'val')
    verbose = eval_kwargs.get('verbose', 0)
    max_samples = eval_kwargs.get('max_samples', -1)
    eval_kwargs['BOS'] = trg_loader.bos
    eval_kwargs['EOS'] = trg_loader.eos
    eval_kwargs['PAD'] = trg_loader.pad
    eval_kwargs['UNK'] = trg_loader.unk
    print('src_loader ref:', src_loader.ref)
    # NOTE(review): this flag is only reported; every decode_sequence call
    # below passes remove_bpe=False -- confirm that this is intended.
    remove_bpe = 'BPE' in src_loader.ref
    print('Removing bpe:', remove_bpe)
    logger = logging.getLogger(job_name)
    # Make sure to be in evaluation mode
    model.eval()
    # Start from a caller-chosen position instead of resetting the iterators.
    offset = eval_kwargs.get('offset', 0)
    print('Starting from ', offset)
    src_loader.iterators[split] = offset
    trg_loader.iterators[split] = offset
    n = 0
    while True:
        # get batch
        data_src, order = src_loader.get_src_batch(split, batch_size)
        data_trg = trg_loader.get_trg_batch(split, order, batch_size)
        n += batch_size
        if model.version == 'seq2seq':
            # Keep the encoder output in its own name: reusing ``source``
            # would clobber the list of decoded source sentences.
            encoded = model.encoder(data_src)
            encoded = model.map(encoded)
            batch_preds, _ = model.decoder.sample(encoded, eval_kwargs)
        else:
            # track returns seq, alphas, aligns, activ_aligns, activs,
            # embed_activs, clean_cstr
            (batch_preds, alphas, aligns, activ_aligns,
             activs, embed_activs, C) = model.track(data_src, eval_kwargs)
            batched_alphas.append(alphas)
            batched_aligns.append(aligns)
            batched_activ_aligns.append(activ_aligns)
            batched_activs.append(activs)
            batched_embed_activs.append(embed_activs)

        if isinstance(batch_preds, list):
            # A list holds unpadded predictions: decode them one at a time.
            sent_preds = [decode_sequence(trg_loader.get_vocab(),
                                          np.array(pred).reshape(1, -1),
                                          eos=trg_loader.eos,
                                          bos=trg_loader.bos,
                                          remove_bpe=False)[0]
                          for pred in batch_preds]
        else:
            # Otherwise decode the whole batch in one call.
            sent_preds = decode_sequence(trg_loader.get_vocab(), batch_preds,
                                         eos=trg_loader.eos,
                                         bos=trg_loader.bos,
                                         remove_bpe=False)
        # Decode the source and the reference sentences the same way.
        sent_source = decode_sequence(src_loader.get_vocab(),
                                      data_src['labels'].data.cpu().numpy(),
                                      eos=src_loader.eos,
                                      bos=src_loader.bos,
                                      remove_bpe=False)
        source.append(sent_source)
        sent_gold = decode_sequence(trg_loader.get_vocab(),
                                    data_trg['out_labels'].data.cpu().numpy(),
                                    eos=trg_loader.eos,
                                    bos=trg_loader.bos,
                                    remove_bpe=False)
        # Without an explicit verbosity, print only when n is a multiple
        # of 300.
        verb = verbose if verbose else not (n % 300)
        for (sl, l, gl) in zip(sent_source, sent_preds, sent_gold):
            preds.append(l)
            ground_truths.append(gl)
            if verb:
                lg.print_sampled(sl, gl, l)
        if max_samples == -1:
            ix1 = data_src['bounds']['it_max']
        else:
            ix1 = max_samples

        if data_src['bounds']['wrapped']:
            break
        if n >= ix1:
            # Logger.warn is a deprecated alias of Logger.warning.
            logger.warning('Evaluated the required samples (%s)', n)
            break
    print('Sampled %d sentences' % len(preds))
    bleu_moses, _ = corpus_bleu(preds, ground_truths)

    return {'source': source,
            'preds': preds,
            'alpha': batched_alphas,
            'align': batched_aligns,
            'activ_align': batched_activ_aligns,
            'activ': batched_activs,
            'embed_activ': batched_embed_activs,
            'channels_cst': C,
            "bleu": bleu_moses,
            }
Beispiel #3
0
def sample_model(job_name, model, src_loader, trg_loader, eval_kwargs):
    """Translate a split with the model and score the output with BLEU.

    Args:
        job_name: name of the logger used for the final timing message.
        model: network exposing ``eval``, ``version`` and either
            ``encoder``/``map``/``decoder.sample`` (``'seq2seq'``) or
            ``sample``.
        src_loader: source-side loader (``reset_iterator``,
            ``get_src_batch``, ``get_vocab``, ``eos``, ``bos``).
        trg_loader: target-side loader (``reset_iterator``,
            ``get_trg_batch``, ``get_vocab``, ``eos``, ``bos``, ``pad``,
            ``unk``).
        eval_kwargs: options dict. Reads ``batch_size`` (default 1),
            ``split`` (default ``'val'``), ``verbose`` (default 0),
            ``remove_bpe`` (default True), ``lenpen_mode`` (default
            ``'wu'``) and the mandatory ``lenpen``. It is updated in place
            with the ``BOS``, ``EOS``, ``PAD`` and ``UNK`` entries taken
            from ``trg_loader``.

    Returns:
        tuple ``(preds, bleu)``: the decoded predictions and the first
        value returned by ``corpus_bleu``.

    Raises:
        KeyError: if ``eval_kwargs`` has no ``'lenpen'`` entry.
    """
    preds = []
    ground_truths = []
    batch_size = eval_kwargs.get('batch_size', 1)
    split = eval_kwargs.get('split', 'val')
    verbose = eval_kwargs.get('verbose', 0)
    eval_kwargs['BOS'] = trg_loader.bos
    eval_kwargs['EOS'] = trg_loader.eos
    eval_kwargs['PAD'] = trg_loader.pad
    eval_kwargs['UNK'] = trg_loader.unk
    remove_bpe = eval_kwargs.get('remove_bpe', True)
    logger = logging.getLogger(job_name)
    # Make sure to be in evaluation mode
    model.eval()
    src_loader.reset_iterator(split)
    trg_loader.reset_iterator(split)
    n = 0
    start = time.time()
    lenpen_mode = eval_kwargs.get('lenpen_mode', 'wu')
    scorer = GNMTGlobalScorer(eval_kwargs['lenpen'], 0, 'none', lenpen_mode)

    while True:
        # get batch
        data_src, order = src_loader.get_src_batch(split, batch_size)
        data_trg = trg_loader.get_trg_batch(split, order, batch_size)
        n += batch_size
        if model.version == 'seq2seq':
            source = model.encoder(data_src)
            source = model.map(source)
            batch_preds, _ = model.decoder.sample(source, scorer, eval_kwargs)
        else:
            batch_preds, _ = model.sample(data_src, scorer, eval_kwargs)

        torch.cuda.empty_cache()  # FIXME choose an optimal freq
        if isinstance(batch_preds, list):
            # A list holds unpadded predictions: decode them one at a time.
            sent_preds = [decode_sequence(trg_loader.get_vocab(),
                                          np.array(pred).reshape(1, -1),
                                          eos=trg_loader.eos,
                                          bos=trg_loader.bos,
                                          remove_bpe=remove_bpe)[0]
                          for pred in batch_preds]
        else:
            # Otherwise decode the whole batch in one call.
            sent_preds = decode_sequence(trg_loader.get_vocab(), batch_preds,
                                         eos=trg_loader.eos,
                                         bos=trg_loader.bos,
                                         remove_bpe=remove_bpe)
        # Decode the source and the reference sentences the same way.
        sent_source = decode_sequence(src_loader.get_vocab(),
                                      data_src['labels'],
                                      eos=src_loader.eos,
                                      bos=src_loader.bos,
                                      remove_bpe=remove_bpe)
        sent_gold = decode_sequence(trg_loader.get_vocab(),
                                    data_trg['out_labels'],
                                    eos=trg_loader.eos,
                                    bos=trg_loader.bos,
                                    remove_bpe=remove_bpe)
        # Without an explicit verbosity, print only when n is a multiple
        # of 1000.
        verb = verbose if verbose else not (n % 1000)
        for (sl, l, gl) in zip(sent_source, sent_preds, sent_gold):
            preds.append(l)
            ground_truths.append(gl)
            if verb:
                lg.print_sampled(sl, gl, l)
        ix1 = data_src['bounds']['it_max']
        if data_src['bounds']['wrapped']:
            break
        if n >= ix1:
            break
        # Drop the per-batch references before fetching the next batch.
        del sent_source, sent_preds, sent_gold, batch_preds
    # Logger.warn is a deprecated alias of Logger.warning.
    logger.warning('Sampled %d sentences in %.2f s', len(preds),
                   time.time() - start)
    bleu_moses, _ = corpus_bleu(preds, ground_truths)
    return preds, bleu_moses
Beispiel #4
0
def evaluate_model(job_name, trainer, src_loader, trg_loader, eval_kwargs):
    """Compute the average losses and BLEU of ``trainer.model`` on a split.

    Args:
        job_name: name of the logger used for the final timing message.
        trainer: object exposing ``model`` and ``criterion``. The criterion
            is called to obtain a ``losses`` mapping with ``'final'`` and
            ``'ml'`` entries.
        src_loader: source-side loader (``reset_iterator``,
            ``get_src_batch``, ``get_vocab``, ``eos``, ``bos``).
        trg_loader: target-side loader (``reset_iterator``,
            ``get_trg_batch``, ``get_vocab``, ``eos``, ``bos``, ``pad``,
            ``unk``).
        eval_kwargs: options dict. Reads ``batch_size`` (default 1),
            ``max_samples`` (default -1, meaning use the loader's
            ``it_max``), ``split`` (default ``'val'``) and ``verbose``
            (default 0). It is updated in place with the ``BOS``, ``EOS``,
            ``PAD`` and ``UNK`` entries taken from ``trg_loader``.

    Returns:
        tuple ``(preds, ml_loss, loss, bleu)``: the decoded predictions,
        the ``'ml'`` and ``'final'`` losses averaged over the evaluated
        batches, and the first value returned by ``corpus_bleu``.
    """
    preds = []
    ground_truths = []
    batch_size = eval_kwargs.get('batch_size', 1)
    max_samples = eval_kwargs.get('max_samples', -1)
    split = eval_kwargs.get('split', 'val')
    verbose = eval_kwargs.get('verbose', 0)
    eval_kwargs['BOS'] = trg_loader.bos
    eval_kwargs['EOS'] = trg_loader.eos
    eval_kwargs['PAD'] = trg_loader.pad
    eval_kwargs['UNK'] = trg_loader.unk
    logger = logging.getLogger(job_name)

    # Make sure to be in evaluation mode
    model = trainer.model
    crit = trainer.criterion
    model.eval()
    src_loader.reset_iterator(split)
    trg_loader.reset_iterator(split)
    n = 0
    loss_sum = 0
    ml_loss_sum = 0
    loss_evals = 0
    start = time.time()
    while True:
        # get batch
        data_src, order = src_loader.get_src_batch(split, batch_size)
        data_trg = trg_loader.get_trg_batch(split, order, batch_size)
        n += batch_size
        if model.version == 'seq2seq':
            source = model.encoder(data_src)
            source = model.map(source)
            if crit.version == "seq":
                losses, _ = crit(model, source, data_trg)
            else:  # ML & Token-level
                # init and forward decoder combined
                decoder_logit = model.decoder(source, data_trg)
                losses, _ = crit(decoder_logit, data_trg['out_labels'])
            batch_preds, _ = model.sample(source, eval_kwargs)
        else:
            losses, _ = crit(model(data_src, data_trg),
                             data_trg['out_labels'])
            batch_preds, _ = model.sample(data_src, eval_kwargs)

        loss_sum += losses['final'].data.item()
        ml_loss_sum += losses['ml'].data.item()
        loss_evals += 1
        if isinstance(batch_preds, list):
            # A list holds unpadded predictions: decode them one at a time.
            sent_preds = [decode_sequence(trg_loader.get_vocab(),
                                          np.array(pred).reshape(1, -1),
                                          eos=trg_loader.eos,
                                          bos=trg_loader.bos)[0]
                          for pred in batch_preds]
        else:
            # Otherwise decode the whole batch in one call.
            sent_preds = decode_sequence(trg_loader.get_vocab(), batch_preds,
                                         eos=trg_loader.eos,
                                         bos=trg_loader.bos)
        # Decode the source and the reference sentences the same way.
        sent_source = decode_sequence(src_loader.get_vocab(),
                                      data_src['labels'],
                                      eos=src_loader.eos,
                                      bos=src_loader.bos)
        sent_gold = decode_sequence(trg_loader.get_vocab(),
                                    data_trg['out_labels'],
                                    eos=trg_loader.eos,
                                    bos=trg_loader.bos)
        # Without an explicit verbosity, print only when n is a multiple
        # of 1000.
        verb = verbose if verbose else not (n % 1000)
        for (sl, l, gl) in zip(sent_source, sent_preds, sent_gold):
            preds.append(l)
            ground_truths.append(gl)
            if verb:
                lg.print_sampled(sl, gl, l)
        if max_samples == -1:
            ix1 = data_src['bounds']['it_max']
        else:
            ix1 = max_samples
        if data_src['bounds']['wrapped']:
            break
        if n >= ix1:
            break
    # Logger.warn is a deprecated alias of Logger.warning.
    logger.warning('Evaluated %d samples in %.2f s', len(preds),
                   time.time() - start)
    bleu_moses, _ = corpus_bleu(preds, ground_truths)
    return preds, ml_loss_sum / loss_evals, loss_sum / loss_evals, bleu_moses
Beispiel #5
0
def evaluate_split(job_name, trainer, loader, eval_kwargs):
    """Compute the average losses and BLEU of ``trainer.model`` on a loader.

    Batches are pulled with ``loader.get_batch()`` until the source batch
    reports ``bounds['wrapped']`` or the sample budget is reached.

    Args:
        job_name: name of the logger used for the final timing message.
        trainer: object exposing ``model`` and ``criterion``. The criterion
            is called to obtain a ``losses`` mapping with ``'final'`` and
            ``'ml'`` entries.
        loader: paired loader exposing ``src``, ``trg`` and ``get_batch``.
            Each batch is a mapping with ``'src'``, ``'trg'`` and
            ``'ntokens'``.
        eval_kwargs: options dict. Reads ``max_samples`` (default -1,
            meaning use the batch's ``bounds['it_max']``) and ``verbose``
            (default 0). It is also forwarded to ``model.sample``.

    Returns:
        tuple ``(preds, ml_loss, loss, bleu)``: the decoded predictions,
        the ``'ml'`` and ``'final'`` losses averaged over the evaluated
        batches, and the first value returned by ``corpus_bleu``.
    """
    preds = []
    ground_truths = []
    max_samples = eval_kwargs.get('max_samples', -1)
    verbose = eval_kwargs.get('verbose', 0)
    logger = logging.getLogger(job_name)

    src_loader = loader.src
    trg_loader = loader.trg
    # Make sure to be in evaluation mode
    model = trainer.model
    crit = trainer.criterion
    model.eval()
    n = 0
    loss_sum = 0
    ml_loss_sum = 0
    loss_evals = 0
    start = time.time()
    while True:
        # get batch
        sample = loader.get_batch()
        data_src = sample["src"]
        data_trg = sample["trg"]
        ntokens = sample['ntokens']
        del sample
        print('Eval ntokens:', ntokens, "batch:", data_src['labels'].size(0))
        # The sample counter advances by the first dimension of the labels.
        n += data_src['labels'].size(0)
        if model.version == 'seq2seq':
            source = model.encoder(data_src)
            source = model.map(source)
            if crit.version == "seq":
                losses, _ = crit(model, source, data_trg)
            else:  # ML & Token-level
                # init and forward decoder combined
                decoder_logit = model.decoder(source, data_trg)
                losses, _ = crit(decoder_logit, data_trg['out_labels'])
            batch_preds, _ = model.sample(source, eval_kwargs)
        else:
            losses, _ = crit(model(data_src, data_trg),
                             data_trg['out_labels'])
            batch_preds, _ = model.sample(data_src, eval_kwargs)

        loss_sum += losses['final'].data.item()
        ml_loss_sum += losses['ml'].data.item()
        loss_evals += 1
        torch.cuda.empty_cache()  # FIXME choose an optimal freq
        if isinstance(batch_preds, list):
            # A list holds unpadded predictions: decode them one at a time.
            sent_preds = [
                decode_sequence(trg_loader.get_vocab(),
                                np.array(pred).reshape(1, -1),
                                eos=trg_loader.eos,
                                bos=trg_loader.bos)[0] for pred in batch_preds
            ]
        else:
            # Otherwise decode the whole batch in one call.
            sent_preds = decode_sequence(trg_loader.get_vocab(),
                                         batch_preds,
                                         eos=trg_loader.eos,
                                         bos=trg_loader.bos)
        # Decode the source and the reference sentences the same way.
        sent_source = decode_sequence(src_loader.get_vocab(),
                                      data_src['labels'],
                                      eos=src_loader.eos,
                                      bos=src_loader.bos)
        sent_gold = decode_sequence(trg_loader.get_vocab(),
                                    data_trg['out_labels'],
                                    eos=trg_loader.eos,
                                    bos=trg_loader.bos)
        # Without an explicit verbosity, print only when n is a multiple
        # of 1000.
        verb = verbose if verbose else not (n % 1000)
        for (sl, l, gl) in zip(sent_source, sent_preds, sent_gold):
            preds.append(l)
            ground_truths.append(gl)
            if verb:
                lg.print_sampled(sl, gl, l)
        if max_samples == -1:
            ix1 = data_src['bounds']['it_max']
        else:
            ix1 = max_samples
        if data_src['bounds']['wrapped']:
            break
        if n >= ix1:
            break
    # Logger.warn is a deprecated alias of Logger.warning.
    logger.warning('Evaluated %d samples in %.2f s', len(preds),
                   time.time() - start)
    bleu_moses, _ = corpus_bleu(preds, ground_truths)
    return preds, ml_loss_sum / loss_evals, loss_sum / loss_evals, bleu_moses
Beispiel #6
0
def evaluate_loader(job_name, trainer, loader, src_dict, trg_dict,
                    eval_kwargs):
    """Compute the average losses and BLEU of ``trainer.model`` on an iterable.

    Each item of ``loader`` is read as a mapping with ``'net_input'``
    (``src_tokens``, ``src_lengths``, ``prev_output_tokens``) and
    ``'target'`` tensors, which are moved to the GPU with ``.cuda()``.

    Args:
        job_name: name of the logger used for the final timing message.
        trainer: object exposing ``model`` and ``criterion``. The criterion
            is called to obtain a ``losses`` mapping with ``'final'`` and
            ``'ml'`` entries.
        loader: iterable of batches in the layout described above.
        src_dict: source dictionary passed to ``decode_sequence``; must
            provide ``eos()``.
        trg_dict: target dictionary passed to ``decode_sequence``; must
            provide ``eos()``.
        eval_kwargs: options dict. Reads ``max_samples`` (default
            ``math.inf``) and ``verbose`` (default 0). It is also forwarded
            to ``model.sample``.

    Returns:
        tuple ``(preds, ml_loss, loss, bleu)``: the decoded predictions,
        the ``'ml'`` and ``'final'`` losses averaged over the evaluated
        batches, and the first value returned by ``corpus_bleu``.
    """
    preds = []
    ground_truths = []
    max_samples = eval_kwargs.get('max_samples', math.inf)
    verbose = eval_kwargs.get('verbose', 0)
    logger = logging.getLogger(job_name)

    # Make sure to be in evaluation mode
    model = trainer.model
    crit = trainer.criterion
    model.eval()
    n = 0
    loss_sum = 0
    ml_loss_sum = 0
    loss_evals = 0
    start = time.time()
    for sample in loader:
        # Repack the batch into the dict layout the model expects.
        data_src = {
            "labels": sample['net_input']['src_tokens'].cuda(),
            "lengths": sample['net_input']['src_lengths'].cuda()
        }
        data_trg = {
            "labels": sample['net_input']['prev_output_tokens'].cuda(),
            "out_labels": sample['target'].cuda(),
            # TODO: modify the loader to return target lengths as well;
            # the source lengths are used as a stand-in here.
            "lengths": sample['net_input']['src_lengths'].cuda()
        }

        del sample
        # The sample counter advances by the first dimension of the labels.
        n += data_src['labels'].size(0)
        if model.version == 'seq2seq':
            source = model.encoder(data_src)
            source = model.map(source)
            if crit.version == "seq":
                losses, _ = crit(model, source, data_trg)
            else:  # ML & Token-level
                # init and forward decoder combined
                decoder_logit = model.decoder(source, data_trg)
                losses, _ = crit(decoder_logit, data_trg['out_labels'])
            batch_preds, _ = model.sample(source, eval_kwargs)
        else:
            losses, _ = crit(model(data_src, data_trg),
                             data_trg['out_labels'])
            batch_preds, _ = model.sample(data_src, eval_kwargs)

        loss_sum += losses['final'].data.item()
        ml_loss_sum += losses['ml'].data.item()
        loss_evals += 1
        torch.cuda.empty_cache()  # FIXME choose an optimal freq
        # NOTE(review): the dictionaries' eos() index is passed as both the
        # eos and the bos marker in every decode call below -- presumably
        # the dictionaries define no separate bos symbol; confirm.
        if isinstance(batch_preds, list):
            # A list holds unpadded predictions: decode them one at a time.
            sent_preds = [
                decode_sequence(trg_dict,
                                np.array(pred).reshape(1, -1),
                                eos=trg_dict.eos(),
                                bos=trg_dict.eos())[0] for pred in batch_preds
            ]
        else:
            # Otherwise decode the whole batch in one call.
            sent_preds = decode_sequence(trg_dict,
                                         batch_preds,
                                         eos=trg_dict.eos(),
                                         bos=trg_dict.eos())
        # Decode the source and the reference sentences the same way.
        sent_source = decode_sequence(src_dict,
                                      data_src['labels'],
                                      eos=src_dict.eos(),
                                      bos=src_dict.eos())
        sent_gold = decode_sequence(trg_dict,
                                    data_trg['out_labels'],
                                    eos=trg_dict.eos(),
                                    bos=trg_dict.eos())
        # Without an explicit verbosity, print only when n is a multiple
        # of 1000.
        verb = verbose if verbose else not (n % 1000)
        for (sl, l, gl) in zip(sent_source, sent_preds, sent_gold):
            preds.append(l)
            ground_truths.append(gl)
            if verb:
                lg.print_sampled(sl, gl, l)
        # Stop as soon as the budget is reached; a strict ``>`` would run
        # one extra batch whenever n lands exactly on max_samples.
        if n >= max_samples:
            break
    # Logger.warn is a deprecated alias of Logger.warning.
    logger.warning('Evaluated %d samples in %.2f s', len(preds),
                   time.time() - start)
    bleu_moses, _ = corpus_bleu(preds, ground_truths)
    return preds, ml_loss_sum / loss_evals, loss_sum / loss_evals, bleu_moses