Example #1
def main():
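    # Score a system-output file (or stdin) against a reference file with fairseq's corpus BLEU scorer.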
    parser = get_parser()
    args = parser.parse_args()
    print(args)

    assert args.sys == '-' or os.path.exists(args.sys), \
        "System output file {} does not exist".format(args.sys)
    assert os.path.exists(args.ref), \
        "Reference file {} does not exist".format(args.ref)

    dict = dictionary.Dictionary()

    def readlines(fd):
        for line in fd.readlines():
            if args.ignore_case:
                yield line.lower()
            else:
                yield line

    def score(fdsys):
        with open(args.ref) as fdref:
            scorer = bleu.Scorer(dict.pad(), dict.eos(), dict.unk())
            for sys_tok, ref_tok in zip(readlines(fdsys), readlines(fdref)):
                sys_tok = tokenizer.Tokenizer.tokenize(sys_tok, dict)
                ref_tok = tokenizer.Tokenizer.tokenize(ref_tok, dict)
                scorer.add(ref_tok, sys_tok)
            print(scorer.result_string(args.order))

    if args.sys == '-':
        score(sys.stdin)
    else:
        with open(args.sys, 'r') as f:
            score(f)
Example #2
def main():
    parser = argparse.ArgumentParser(
        description='Command-line script for BLEU scoring.')
    parser.add_argument('-s', '--sys', default='-', help='system output')
    parser.add_argument('-r', '--ref', required=True, help='references')
    parser.add_argument('-o',
                        '--order',
                        default=4,
                        metavar='N',
                        type=int,
                        help='consider ngrams up to this order')
    parser.add_argument('--ignore-case',
                        action='store_true',
                        help='case-insensitive scoring')
    parser.add_argument(
        '--tokenizer_name',
        metavar='N',
        default='default',
        choices=['default', 'nltk', 'sacremoses'],
        help="Which tokenizer to use. Choices are default, nltk, and sacremoses. "
        "default tokenizes by splitting on whitespace; nltk uses nltk's word_tokenize, "
        "which better accounts for punctuation. For example, \"Hello, how's your day today?\" "
        "is tokenized as ['Hello,', \"how's\", 'your', 'day', 'today?'] by the default tokenizer, "
        "but as ['Hello', ',', 'how', \"'s\", 'your', 'day', 'today', '?'] by nltk. "
        "The sacremoses tokenizer comes from https://github.com/alvations/sacremoses.")
    args = parser.parse_args()
    print(args)

    assert args.sys == '-' or os.path.exists(args.sys), \
        "System output file {} does not exist".format(args.sys)
    assert os.path.exists(args.ref), \
        "Reference file {} does not exist".format(args.ref)

    dict = dictionary.Dictionary()
    tokenizer_tool = tokenizer.build_tokenizer(args)

    def readlines(fd):
        for line in fd.readlines():
            if args.ignore_case:
                yield line.lower()
            else:
                yield line

    def score(fdsys):
        with open(args.ref) as fdref:
            scorer = bleu.Scorer(dict.pad(), dict.eos(), dict.unk())
            for sys_tok, ref_tok in zip(readlines(fdsys), readlines(fdref)):
                sys_tok = tokenizer_tool.tokenize(sys_tok, dict)
                ref_tok = tokenizer_tool.tokenize(ref_tok, dict)
                scorer.add(ref_tok, sys_tok)
            print(scorer.result_string(args.order))

    if args.sys == '-':
        score(sys.stdin)
    else:
        with open(args.sys, 'r') as f:
            score(f)
Example #3
    def build_dictionary(filenames):
        if args.singleSeq:
            d = dictionary.Dictionary()
        else:
            d = dictionaryWCS.DictionaryWCS()
        for filename in filenames:
            tokenizer.Tokenizer.add_file_to_dictionary(filename, d, tokenize_line, args.L)

        return d
Example #4
def main():
    parser = get_parser()
    args = parser.parse_args()
    print(args)

    assert args.sys == "-" or os.path.exists(
        args.sys), "System output file {} does not exist".format(args.sys)
    assert os.path.exists(args.ref), "Reference file {} does not exist".format(
        args.ref)

    dict = dictionary.Dictionary()

    def readlines(fd):
        for line in fd.readlines():
            if args.ignore_case:
                yield line.lower()
            else:
                yield line

    if args.sacrebleu:
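        # Corpus-level BLEU computed by the sacrebleu package.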
        import sacrebleu

        def score(fdsys):
            with open(args.ref) as fdref:
                print(sacrebleu.corpus_bleu(fdsys, [fdref]))

    elif args.sentence_bleu:
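        # Per-sentence BLEU: the scorer is reset for every hypothesis/reference pair.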

        def score(fdsys):
            with open(args.ref) as fdref:
                scorer = bleu.Scorer(dict.pad(), dict.eos(), dict.unk())
                for i, (sys_tok, ref_tok) in enumerate(
                        zip(readlines(fdsys), readlines(fdref))):
                    scorer.reset(one_init=True)
                    sys_tok = dict.encode_line(sys_tok)
                    ref_tok = dict.encode_line(ref_tok)
                    scorer.add(ref_tok, sys_tok)
                    print(i, scorer.result_string(args.order))

    else:
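        # Default: corpus-level BLEU with fairseq's own scorer.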

        def score(fdsys):
            with open(args.ref) as fdref:
                scorer = bleu.Scorer(dict.pad(), dict.eos(), dict.unk())
                for sys_tok, ref_tok in zip(readlines(fdsys),
                                            readlines(fdref)):
                    sys_tok = dict.encode_line(sys_tok)
                    ref_tok = dict.encode_line(ref_tok)
                    scorer.add(ref_tok, sys_tok)
                print(scorer.result_string(args.order))

    if args.sys == "-":
        score(sys.stdin)
    else:
        with open(args.sys, "r") as f:
            score(f)
Example #5
def score(fdsys, tofile, refFile):
    dict = dictionary.Dictionary()
    with open(refFile) as fdref:
        scorer = bleu.Scorer(dict.pad(), dict.eos(), dict.unk())
        for sys_tok, ref_tok in zip(readlines(fdsys), readlines(fdref)):
            sys_tok = dict.encode_line(sys_tok)
            ref_tok = dict.encode_line(ref_tok)
            scorer.add(ref_tok, sys_tok)
        print(scorer.result_string(args_score.order))
        with open(tofile, 'a') as f:
            f.write(scorer.result_string(args_score.order) + '\r\n')
Example #6
 def build_nstack_source_dictionary(_src_file):
     d = dictionary.Dictionary()
     print(f'Build dict on src_file: {_src_file}')
     NstackTreeTokenizer.acquire_vocab_multithread(
         _src_file, d, tokenize_line, num_workers=args.workers,
         remove_root=remove_root, take_pos_tag=take_pos_tag, take_nodes=take_nodes,
         no_collapse=no_collapse,
     )
     d.finalize(
         threshold=args.thresholdsrc if src else args.thresholdtgt,
         nwords=args.nwordssrc if src else args.nwordstgt,
         padding_factor=args.padding_factor
     )
     print(f'Finish building src vocabulary: size {len(d)}')
     return d
Example #7
def main():
    parser = argparse.ArgumentParser(
        description='Command-line script for BLEU scoring.')
    parser.add_argument('-s', '--sys', default='-', help='system output')
    parser.add_argument('-r', '--ref', required=True, help='references')
    parser.add_argument('-o',
                        '--order',
                        default=4,
                        metavar='N',
                        type=int,
                        help='consider ngrams up to this order')
    parser.add_argument('--ignore-case',
                        action='store_true',
                        help='case-insensitive scoring')

    args = parser.parse_args()
    print(args)

    assert args.sys == '-' or os.path.exists(args.sys), \
        "System output file {} does not exist".format(args.sys)
    assert os.path.exists(args.ref), \
        "Reference file {} does not exist".format(args.ref)

    dict = dictionary.Dictionary()

    def readlines(fd):
        for line in fd.readlines():
            if args.ignore_case:
                yield line.lower()
            else:
                yield line

    def score(fdsys):
        with open(args.ref) as fdref:
            scorer = bleu.Scorer(dict.pad(), dict.eos(), dict.unk())
            for sys_tok, ref_tok in zip(readlines(fdsys), readlines(fdref)):
                sys_tok = tokenizer.Tokenizer.tokenize(sys_tok, dict)
                ref_tok = tokenizer.Tokenizer.tokenize(ref_tok, dict)
                scorer.add(ref_tok, sys_tok)
            print(scorer.result_string(args.order))

    if args.sys == '-':
        score(sys.stdin)
    else:
        with open(args.sys, 'r') as f:
            score(f)
Example #8
parser.add_argument('--sys',
                    nargs='*',
                    default='',
                    metavar='FILE',
                    help='path to system output')
parser.add_argument('--ref',
                    default='',
                    metavar='FILE',
                    help='path to references')
parser.add_argument('--output',
                    default='',
                    metavar='FILE',
                    help='print outputs into a pretty format')
args = parser.parse_args()

dict = dictionary.Dictionary()
scorer = bleu.Scorer(dict.pad(), dict.eos(), dict.unk())


def dictolist(d):
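    # Flatten a dict keyed by integer sentence index into a list ordered by that index.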
    a = sorted(d.items(), key=lambda i: i[0])
    return [i[1] for i in a]


def load_sys(paths):
    src, tgt, hypos, log_probs = {}, {}, {}, {}
    for path in paths:
        with open(path) as f:
            for line in f:
                if line.startswith(('S-', 'T-', 'H-')):
                    i = int(line[line.find('-') + 1:line.find('\t')])
Example #9
def score(args, trainer, dataset, src_dict, tgt_dict, ref_file):

    begin = time.time()

    # Deep copies are necessary: generating translations alters the target
    # dictionary, which would interfere with the rest of training.
    src_dict = deepcopy(src_dict)
    tgt_dict = deepcopy(tgt_dict)

    model = trainer.get_model()

    # Initialize data iterator
    itr = data.EpochBatchIterator(
        dataset=dataset,
        max_tokens=None,
        max_sentences=max(
            8, min(math.ceil(1024 / args.distributed_world_size), 128)),
        max_positions=args.max_positions,
        ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
        required_batch_size_multiple=8,
        num_shards=args.distributed_world_size,
        shard_id=args.distributed_rank,
    ).next_epoch_itr(shuffle=False)

    # Initialize generator
    gen_timer = StopwatchMeter()
    translator = SequenceGenerator(
        [model],
        tgt_dict.get_metadata(),
        maxlen=args.max_target_positions - 1,  # do not include EOS token
        beam_size=args.beam,
        stop_early=(not args.no_early_stop),
        normalize_scores=(not args.unnormalized),
        len_penalty=args.lenpen,
        unk_penalty=args.unkpen,
        sampling=args.sampling,
        sampling_topk=args.sampling_topk,
        minlen=args.min_len,
    )
    # Generate and compute BLEU
    dict = dictionary.Dictionary()
    num_sentences = 0
    predictions = []
    translations = translator.generate_batched_itr(
        itr,
        maxlen_a=args.max_len_a,
        maxlen_b=args.max_len_b,
        cuda=True,
        timer=gen_timer,
        prefix_size=args.prefix_size,
    )

    for sample_id, src_tokens, target_tokens, hypos in translations:
        # Process input and ground truth
        target_tokens = target_tokens.int().cpu()

        src_str = src_dict.string(src_tokens, args.remove_bpe)
        target_str = tgt_dict.string(target_tokens,
                                     args.remove_bpe,
                                     escape_unk=True)

        # Process top predictions
        for i, hypo in enumerate(hypos[:min(len(hypos), args.nbest)]):
            hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
                hypo_tokens=hypo['tokens'].int().cpu(),
                src_str=src_str,
                alignment=hypo['alignment'].int().cpu()
                if hypo['alignment'] is not None else None,
                align_dict=None,
                tgt_dict=tgt_dict,
                remove_bpe=args.remove_bpe)

            # Score only the top hypothesis
            if i == 0:
                if args.sentencepiece:
                    hypo_str = hypo_str.replace(' ', '').replace('▁', ' ')
                    target_str = target_str.replace(' ', '').replace('▁', ' ')
                sys_tok = tokenizer.Tokenizer.tokenize(
                    (hypo_str.lower()
                     if not args.test_cased_bleu else hypo_str), dict)
                ref_tok = tokenizer.Tokenizer.tokenize(
                    (target_str.lower()
                     if not args.test_cased_bleu else target_str), dict)
                if not args.sentencepiece:
                    hypo_str = tokenizer.Tokenizer.detokenize(hypo_str, 'de')
                predictions.append('{}\t{}'.format(sample_id, hypo_str))

        num_sentences += 1

    if args.distributed_world_size > 1:
        predictions = _all_gather_predictions(predictions)

    with open(os.path.join(args.data, ref_file), 'r') as reference:
        refs = [reference.readlines()]
    # Reducing indexed predictions as strings is more memory efficient than reducing tuples
    predictions = [tuple(item.split('\t')) for item in predictions]
    predictions = [(int(item[0]), item[1]) for item in predictions]
    predictions.sort(key=lambda tup: tup[0])
    predictions = [
        hypo[1] + ('\n' if hypo[1][-1] != '\n' else '') for hypo in predictions
    ]
    sacrebleu_score = sacrebleu.corpus_bleu(
        predictions, refs, lowercase=not args.test_cased_bleu).score
    if args.save_predictions:
        os.makedirs(os.path.join(args.save_dir, 'predictions'), exist_ok=True)
        with open(
                os.path.join(
                    args.save_dir, 'predictions',
                    ref_file + '.pred.update_{}'.format(trainer._num_updates)),
                'w') as f:
            f.write(''.join(predictions))

    DLLogger.log(step=trainer.get_num_updates(),
                 data={
                     'inference tokens/s':
                     float(args.distributed_world_size) / gen_timer.avg
                 },
                 verbosity=0)
    DLLogger.flush()
    if gen_timer.sum != 0:
        print(
            '| Translated {} sentences ({} tokens) in {:.1f}s ({:.2f} sentences/s, {:.2f} tokens/s)'
            .format(len(predictions), gen_timer.n, gen_timer.sum,
                    len(predictions) / gen_timer.sum,
                    float(args.distributed_world_size) / gen_timer.avg))

    print('| Eval completed in: {:.2f}s | {}CASED BLEU {:.2f}'.format(
        time.time() - begin, '' if args.test_cased_bleu else 'UN',
        sacrebleu_score))

    return sacrebleu_score
Example #10
 def build_dictionary(filenames):
     d = dictionary.Dictionary()
     for filename in filenames:
         Tokenizer.add_file_to_dictionary(filename, d, tokenize_line)
     return d
Example #11
def score_target_hypo(args, a, b, c, lenpen, target_outfile, hypo_outfile,
                      write_hypos, normalize):
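    # Rerank n-best lists: combine forward, backward and LM scores with weights a, b, c
    # and a length penalty, keep the best hypothesis per source sentence, and report BLEU.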

    print("lenpen", lenpen, "weight1", a, "weight2", b, "weight3", c)
    gen_output_lst, bitext1_lst, bitext2_lst, lm_res_lst = load_score_files(
        args)
    dict = dictionary.Dictionary()
    scorer = bleu.Scorer(
        bleu.BleuConfig(
            pad=dict.pad(),
            eos=dict.eos(),
            unk=dict.unk(),
        ))

    ordered_hypos = {}
    ordered_targets = {}

    for shard_id in range(len(bitext1_lst)):
        bitext1 = bitext1_lst[shard_id]
        bitext2 = bitext2_lst[shard_id]
        gen_output = gen_output_lst[shard_id]
        lm_res = lm_res_lst[shard_id]

        total = len(bitext1.rescore_source.keys())
        source_lst = []
        hypo_lst = []
        score_lst = []
        reference_lst = []
        j = 1
        best_score = -math.inf

        for i in range(total):
            # length is measured in terms of words, not bpe tokens, since models may not share the same bpe
            target_len = len(bitext1.rescore_hypo[i].split())

            if lm_res is not None:
                lm_score = lm_res.score[i]
            else:
                lm_score = 0

            if bitext2 is not None:
                bitext2_score = bitext2.rescore_score[i]
                bitext2_backwards = bitext2.backwards
            else:
                bitext2_score = None
                bitext2_backwards = None

            score = rerank_utils.get_score(
                a,
                b,
                c,
                target_len,
                bitext1.rescore_score[i],
                bitext2_score,
                lm_score=lm_score,
                lenpen=lenpen,
                src_len=bitext1.source_lengths[i],
                tgt_len=bitext1.target_lengths[i],
                bitext1_backwards=bitext1.backwards,
                bitext2_backwards=bitext2_backwards,
                normalize=normalize,
            )

            if score > best_score:
                best_score = score
                best_hypo = bitext1.rescore_hypo[i]

            if j == gen_output.num_hypos[i] or j == args.num_rescore:
                j = 1
                hypo_lst.append(best_hypo)
                score_lst.append(best_score)
                source_lst.append(bitext1.rescore_source[i])
                reference_lst.append(bitext1.rescore_target[i])

                best_score = -math.inf
                best_hypo = ""
            else:
                j += 1

        gen_keys = list(sorted(gen_output.no_bpe_target.keys()))

        for key in range(len(gen_keys)):
            if args.prefix_len is None:
                assert hypo_lst[key] in gen_output.no_bpe_hypo[
                    gen_keys[key]], ("pred and rescore hypo mismatch: i: " +
                                     str(key) + ", " + str(hypo_lst[key]) +
                                     str(gen_keys[key]) +
                                     str(gen_output.no_bpe_hypo[key]))
                sys_tok = dict.encode_line(hypo_lst[key])
                ref_tok = dict.encode_line(
                    gen_output.no_bpe_target[gen_keys[key]])
                scorer.add(ref_tok, sys_tok)

            else:
                full_hypo = rerank_utils.get_full_from_prefix(
                    hypo_lst[key], gen_output.no_bpe_hypo[gen_keys[key]])
                sys_tok = dict.encode_line(full_hypo)
                ref_tok = dict.encode_line(
                    gen_output.no_bpe_target[gen_keys[key]])
                scorer.add(ref_tok, sys_tok)

        # if only one set of hyper parameters is provided, write the predictions to a file
        if write_hypos:
            # recover the original ids from n best list generation
            for key in range(len(gen_output.no_bpe_target)):
                if args.prefix_len is None:
                    assert hypo_lst[key] in gen_output.no_bpe_hypo[
                        gen_keys[key]], ("pred and rescore hypo mismatch:" +
                                         "i:" + str(key) + str(hypo_lst[key]) +
                                         str(gen_output.no_bpe_hypo[key]))
                    ordered_hypos[gen_keys[key]] = hypo_lst[key]
                    ordered_targets[gen_keys[key]] = gen_output.no_bpe_target[
                        gen_keys[key]]

                else:
                    full_hypo = rerank_utils.get_full_from_prefix(
                        hypo_lst[key], gen_output.no_bpe_hypo[gen_keys[key]])
                    ordered_hypos[gen_keys[key]] = full_hypo
                    ordered_targets[gen_keys[key]] = gen_output.no_bpe_target[
                        gen_keys[key]]

    # write the hypos in the original order from nbest list generation
    if args.num_shards == (len(bitext1_lst)):
        with open(target_outfile, "w") as t:
            with open(hypo_outfile, "w") as h:
                for key in range(len(ordered_hypos)):
                    t.write(ordered_targets[key])
                    h.write(ordered_hypos[key])

    res = scorer.result_string(4)
    if write_hypos:
        print(res)
    score = rerank_utils.parse_bleu_scoring(res)
    return score
Example #12
def main():
    parser = argparse.ArgumentParser(description=(
        'Extract back-translations from the stdout of fairseq-generate. '
        'If there are multiple hypotheses for a source, we only keep the first one. '
    ))
    parser.add_argument('--output', required=True, help='output prefix')
    parser.add_argument('--srclang',
                        required=True,
                        help='source language (extracted from H-* lines)')
    parser.add_argument('--tgtlang',
                        required=True,
                        help='target language (extracted from S-* lines)')
    parser.add_argument('--minlen', type=int, help='min length filter')
    parser.add_argument('--maxlen', type=int, help='max length filter')
    parser.add_argument('--ratio', type=float, help='ratio filter')
    parser.add_argument('files', nargs='*', help='input files')
    args = parser.parse_args()

    dict = dictionary.Dictionary()
    scorer = bleu.Scorer(dict.pad(), dict.eos(), dict.unk())

    def validate(src, tgt):
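        # Keep a sentence pair only if it passes the optional min/max length and length-ratio filters.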
        srclen = len(src.split(' ')) if src != '' else 0
        tgtlen = len(tgt.split(' ')) if tgt != '' else 0
        if ((args.minlen is not None and
             (srclen < args.minlen or tgtlen < args.minlen))
                or (args.maxlen is not None and
                    (srclen > args.maxlen or tgtlen > args.maxlen)) or
            (args.ratio is not None and
             (max(srclen, tgtlen) / float(min(srclen, tgtlen)) > args.ratio))):
            return False
        return True

    def safe_index(toks, index, default):
        try:
            return toks[index]
        except IndexError:
            return default
#    pdb.set_trace()
    with open(args.output + '.' + args.srclang, 'w') as src_h, \
            open(args.output + '.' + args.tgtlang, 'w') as tgt_h:
        for line in tqdm(fileinput.input(args.files)):
            if line.startswith('S-'):
                #                pdb.set_trace()
                tgt = safe_index(line.rstrip().split('\t'), 1, '')
            elif line.startswith('T-'):
                gt = safe_index(line.rstrip().split('\t'), 1, '')
            elif line.startswith('H-'):
                if tgt is not None:
                    if gt is not None:
                        #                    pdb.set_trace()
                        src = safe_index(line.rstrip().split('\t'), 2, '')
                        #                    sent_score = -float(safe_index(line.rstrip().split('\t'), 1, ''))
                        #                    print(str(sent_score) + "#####" + src)
                        #                    scorer.reset(one_init=True)
                        #                    gt_tok = dict.encode_line(gt)
                        #                    tgt_tok = dict.encode_line(tgt)
                        #                    scorer.add(gt_tok, tgt_tok)
                        #                    out = scorer.result_string(1)
                        #                    print(out+' ' + src + ' ' + gt)
                        #                    pdb.set_trace()
                        #                    out = float(out.split(' ')[2].split(',')[0])
                        #                    out = float(out.split('/')[2])
                        #                    if out < 5:
                        #                       tgt = gt
                        if validate(src, tgt):
                            print(src, file=src_h)
                            print(tgt, file=tgt_h)
                        else:
                            if validate(gt, tgt):
                                print(gt, file=src_h)
                                print(tgt, file=tgt_h)


#                        print("##" + str(max(len(tgt.split(' ')), len(gt.split(' '))) / float(min(len(tgt.split(' ')), len(gt.split(' ')))))    + "##" + str(max(len(tgt.split(' ')), len(src.split(' '))) / float(min(len(tgt.split(' ')), len(src.split(' '))))))
#                        print(src, file=src_h)
#                        print(gt, file=tgt_h)
                        tgt = None
                        gt = None
Example #13
 def build_dictionary(filenames, src_lang=None, trg_lang=None):
     d = dictionary.Dictionary(src_lang=src_lang, trg_lang=trg_lang)
     for filename in filenames:
         Tokenizer.add_file_to_dictionary(filename, d, tokenize_line)
     return d
Example #14
def score(args, trainer, task, epoch_itr, subset):

    begin = time.time()

    if subset not in task.datasets:
        task.load_dataset(subset)

    # Deep copies are necessary: generating translations alters the target
    # dictionary, which would interfere with the rest of training.
    src_dict = deepcopy(task.source_dictionary)
    tgt_dict = deepcopy(task.target_dictionary)

    model = trainer.get_model()

    # Initialize data iterator
    itr = data.EpochBatchIterator(
        dataset=task.dataset(subset),
        max_tokens=None,
        max_sentences=max(
            8, min(math.ceil(1024 / args.distributed_world_size), 128)),
        max_positions=model.max_positions(),
        ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
        required_batch_size_multiple=8,
        num_shards=args.distributed_world_size,
        shard_id=args.distributed_rank,
    ).next_epoch_itr(shuffle=False)

    # Initialize generator
    gen_timer = StopwatchMeter()
    translator = SequenceGenerator(
        [model],
        tgt_dict,
        beam_size=args.beam,
        stop_early=(not args.no_early_stop),
        normalize_scores=(not args.unnormalized),
        len_penalty=args.lenpen,
        unk_penalty=args.unkpen,
        sampling=args.sampling,
        sampling_topk=args.sampling_topk,
        minlen=args.min_len,
    )
    # Generate and compute BLEU
    dict = dictionary.Dictionary()
    scorer = bleu.Scorer(dict.pad(), dict.eos(), dict.unk())
    num_sentences = 0
    has_target = True
    predictions = []
    with progress_bar.build_progress_bar(args, itr) as progress:
        translations = translator.generate_batched_itr(
            progress,
            maxlen_a=args.max_len_a,
            maxlen_b=args.max_len_b,
            cuda=True,
            timer=gen_timer,
            prefix_size=args.prefix_size,
        )

        wps_meter = TimeMeter()
        for sample_id, src_tokens, target_tokens, hypos in translations:
            # Process input and ground truth
            has_target = target_tokens is not None
            target_tokens = target_tokens.int().cpu() if has_target else None

            src_str = src_dict.string(src_tokens, args.remove_bpe)
            if has_target:
                target_str = tgt_dict.string(target_tokens,
                                             args.remove_bpe,
                                             escape_unk=True)

            # Process top predictions
            for i, hypo in enumerate(hypos[:min(len(hypos), args.nbest)]):
                hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
                    hypo_tokens=hypo['tokens'].int().cpu(),
                    src_str=src_str,
                    alignment=hypo['alignment'].int().cpu()
                    if hypo['alignment'] is not None else None,
                    align_dict=None,
                    tgt_dict=tgt_dict,
                    remove_bpe=args.remove_bpe)

                # Score only the top hypothesis
                if has_target and i == 0:
                    if args.sentencepiece:
                        hypo_str = hypo_str.replace(' ', '').replace('▁', ' ')
                        target_str = target_str.replace(' ',
                                                        '').replace('▁', ' ')
                    sys_tok = tokenizer.Tokenizer.tokenize(
                        (hypo_str.lower() if args.ignore_case else hypo_str),
                        dict)
                    ref_tok = tokenizer.Tokenizer.tokenize(
                        (target_str.lower()
                         if args.ignore_case else target_str), dict)
                    scorer.add(ref_tok, sys_tok)
                    if not args.sentencepiece:
                        hypo_str = tokenizer.Tokenizer.detokenize(
                            hypo_str, 'de')
                    predictions.append('{}\t{}'.format(sample_id, hypo_str))

            wps_meter.update(src_tokens.size(0))
            progress.log({'wps': round(wps_meter.avg)})
            num_sentences += 1

    if args.distributed_world_size > 1:
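        # Merge the per-worker scorer state and predictions before computing corpus-level scores.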
        _all_gather_bleu_scorer(scorer)
        predictions = _all_gather_predictions(predictions)

    with open(os.path.join(args.data, 'sacrebleu_reference.de'),
              'r') as reference:
        refs = [reference.readlines()]
    # Reducing indexed predictions as strings is more memory efficient than reducing tuples
    predictions = [tuple(item.split('\t')) for item in predictions]
    predictions = [(int(item[0]), item[1]) for item in predictions]
    predictions.sort(key=lambda tup: tup[0])
    predictions = [
        hypo[1] + ('\n' if hypo[1][-1] != '\n' else '') for hypo in predictions
    ]
    sacrebleu_score = sacrebleu.corpus_bleu(predictions,
                                            refs,
                                            lowercase=args.ignore_case)
    print(f'| Detokenized {sacrebleu_score}')
    if gen_timer.sum != 0:
        print(
            '| Translated {} sentences ({} tokens) in {:.1f}s ({:.2f} sentences/s, {:.2f} tokens/s)'
            .format(num_sentences, gen_timer.n, gen_timer.sum,
                    num_sentences / gen_timer.sum, 1. / gen_timer.avg))
    if has_target:
        print('| Generate {} with beam={}: {}'.format(subset, args.beam,
                                                      scorer.result_string()))

    print('| Eval completed in: {:.2f}s'.format(time.time() - begin))

    return scorer.score(order=4), sacrebleu_score.score
Example #15
def score(args, trainer, task, epoch_itr, subset):

    begin = time.time()

    if subset not in task.datasets:
        task.load_dataset(subset)

    # Deep copies are necessary: generating translations alters the target
    # dictionary, which would interfere with the rest of training.
    src_dict = deepcopy(task.source_dictionary)
    tgt_dict = deepcopy(task.target_dictionary)

    model = trainer.get_model()

    #mlperf_log.transformer_print(key=mlperf_log.EVAL_SIZE, value=task.dataset(subset).__len__())
    # Initialize data iterator
    itr = data.EpochBatchIterator(
        dataset=task.dataset(subset),
        max_tokens=None,
        max_sentences=max(
            8, min(math.ceil(1024 / args.distributed_world_size), 128)),
        max_positions=model.max_positions(),
        ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
        required_batch_size_multiple=8,
        num_shards=args.distributed_world_size,
        shard_id=args.distributed_rank,
    ).next_epoch_itr(shuffle=False)

    # Initialize generator
    gen_timer = StopwatchMeter()
    translator = SequenceGenerator(
        [model],
        tgt_dict,
        beam_size=args.beam,
        stop_early=(not args.no_early_stop),
        normalize_scores=(not args.unnormalized),
        len_penalty=args.lenpen,
        unk_penalty=args.unkpen,
        sampling=args.sampling,
        sampling_topk=args.sampling_topk,
        minlen=args.min_len,
    )
    # Generate and compute BLEU
    dict = dictionary.Dictionary()
    scorer = bleu.Scorer(dict.pad(), dict.eos(), dict.unk())
    num_sentences = 0
    has_target = True
    if args.log_translations:
        log = open(
            os.path.join(
                args.save_dir,
                'translations_epoch{}_{}'.format(epoch_itr.epoch,
                                                 args.distributed_rank)), 'w+')
    with progress_bar.build_progress_bar(args, itr) as progress:
        translations = translator.generate_batched_itr(
            progress,
            maxlen_a=args.max_len_a,
            maxlen_b=args.max_len_b,
            cuda=True,
            timer=gen_timer,
            prefix_size=args.prefix_size,
        )

        wps_meter = TimeMeter()
        for sample_id, src_tokens, target_tokens, hypos in translations:
            # Process input and ground truth
            has_target = target_tokens is not None
            target_tokens = target_tokens.int().cpu() if has_target else None

            src_str = src_dict.string(src_tokens, args.remove_bpe)
            if has_target:
                target_str = tgt_dict.string(target_tokens,
                                             args.remove_bpe,
                                             escape_unk=True)

            if args.log_translations:
                log.write('S-{}\t{}\n'.format(sample_id, src_str))
                if has_target:
                    log.write('T-{}\t{}\n'.format(sample_id, target_str))

            # Process top predictions
            for i, hypo in enumerate(hypos[:min(len(hypos), args.nbest)]):
                hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
                    hypo_tokens=hypo['tokens'].int().cpu(),
                    src_str=src_str,
                    alignment=hypo['alignment'].int().cpu()
                    if hypo['alignment'] is not None else None,
                    align_dict=None,
                    tgt_dict=tgt_dict,
                    remove_bpe=args.remove_bpe)
                if args.log_translations:
                    log.write('H-{}\t{}\t{}\n'.format(sample_id, hypo['score'],
                                                      hypo_str))
                    # log.write(str(hypo_tokens))
                    log.write('P-{}\t{}\n'.format(
                        sample_id, ' '.join(
                            map(
                                lambda x: '{:.4f}'.format(x),
                                hypo['positional_scores'].tolist(),
                            ))))

                # Score only the top hypothesis
                if has_target and i == 0:
                    sys_tok = tokenizer.Tokenizer.tokenize(
                        (hypo_str.lower() if args.ignore_case else hypo_str),
                        dict)
                    ref_tok = tokenizer.Tokenizer.tokenize(
                        (target_str.lower()
                         if args.ignore_case else target_str), dict)
                    scorer.add(ref_tok, sys_tok)

            wps_meter.update(src_tokens.size(0))
            progress.log({'wps': round(wps_meter.avg)})
            num_sentences += 1

    if args.distributed_world_size > 1:
        _all_gather_bleu_scorer(scorer)
    if args.log_translations:
        log.close()
    if gen_timer.sum != 0:
        print(
            '| Translated {} sentences ({} tokens) in {:.1f}s ({:.2f} sentences/s, {:.2f} tokens/s)'
            .format(num_sentences, gen_timer.n, gen_timer.sum,
                    num_sentences / gen_timer.sum, 1. / gen_timer.avg))
    if has_target:
        print('| Generate {} with beam={}: {}'.format(subset, args.beam,
                                                      scorer.result_string()))

    print('| Eval completed in: {:.2f}s'.format(time.time() - begin))

    return scorer.score(order=4)
Example #16
def score_target_hypo(args, a, b, c, lenpen, target_outfile, hypo_outfile,
                      score_outfile, lm_outfile, fw_outfile, bw_outfile,
                      write_hypos, normalize):

    print("lenpen", lenpen, "weight1", a, "weight2", b, "weight3", c,
          "target_outfile", target_outfile, "hypo_outfile", hypo_outfile,
          "lm_outfile", lm_outfile)
    gen_output_lst, bitext1_lst, bitext2_lst, lm_res_lst = load_score_files(
        args)
    dict = dictionary.Dictionary()
    if args.sacrebleu:
        scorer = bleu.SacrebleuScorer()
    else:
        scorer = bleu.Scorer(dict.pad(), dict.eos(), dict.unk())

    ordered_hypos = {}
    ordered_targets = {}
    ordered_scores = {}
    ordered_lm = {}
    ordered_fw = {}
    ordered_bw = {}

    for shard_id in range(len(bitext1_lst)):
        bitext1 = bitext1_lst[shard_id]
        bitext2 = bitext2_lst[shard_id]
        gen_output = gen_output_lst[shard_id]
        lm_res = lm_res_lst[shard_id]

        total = len(bitext1.rescore_source.keys())
        source_lst = []
        hypo_lst = []
        score_lst = []
        lm_lst = []
        fw_lst = []
        bw_lst = []
        reference_lst = []
        j = 1
        best_score = -math.inf

        for i in range(total):
            # length is measured in terms of words, not bpe tokens, since models may not share the same bpe
            target_len = len(bitext1.rescore_hypo[i].split())

            if lm_res is not None and i in lm_res.score:
                lm_score = lm_res.score[i]
            else:
                lm_score = 0

            if bitext2 is not None:
                bitext2_score = bitext2.rescore_score[i]
                bitext2_backwards = bitext2.backwards
            else:
                bitext2_score = None
                bitext2_backwards = None

            score = rerank_utils.get_score(a,
                                           b,
                                           c,
                                           target_len,
                                           bitext1.rescore_score[i],
                                           bitext2_score,
                                           lm_score=lm_score,
                                           lenpen=lenpen,
                                           src_len=bitext1.source_lengths[i],
                                           tgt_len=bitext1.target_lengths[i],
                                           bitext1_backwards=bitext1.backwards,
                                           bitext2_backwards=bitext2_backwards,
                                           normalize=normalize)

            if score > best_score:
                best_score = score
                best_hypo = bitext1.rescore_hypo[i]
                best_lm = lm_score
                best_fw = bitext1.rescore_score[i]
                best_bw = bitext2_score

            if j == gen_output.num_hypos[i] or j == args.num_rescore:
                j = 1

                hypo_lst.append(best_hypo)
                score_lst.append(best_score)
                lm_lst.append(best_lm)
                fw_lst.append(best_fw)
                bw_lst.append(best_bw)
                source_lst.append(bitext1.rescore_source[i])
                reference_lst.append(bitext1.rescore_target[i])

                best_score = -math.inf
                best_hypo = ""
                best_lm = -math.inf
            else:
                j += 1

        gen_keys = list(sorted(gen_output.no_bpe_target.keys()))

        for key in range(len(gen_keys)):
            if args.prefix_len is None:
                assert hypo_lst[key] in gen_output.no_bpe_hypo[
                    gen_keys[key]], ("pred and rescore hypo mismatch: i: " +
                                     str(key) + ", " + str(hypo_lst[key]) +
                                     str(gen_keys[key]) +
                                     str(gen_output.no_bpe_hypo[key]))
                sys_tok = dict.encode_line(hypo_lst[key])
                ref_tok = dict.encode_line(
                    gen_output.no_bpe_target[gen_keys[key]])
                if args.sacrebleu:
                    scorer.add_string(gen_output.no_bpe_target[gen_keys[key]],
                                      hypo_lst[key])
                else:
                    scorer.add(ref_tok, sys_tok)

            else:
                full_hypo = rerank_utils.get_full_from_prefix(
                    hypo_lst[key], gen_output.no_bpe_hypo[gen_keys[key]])
                sys_tok = dict.encode_line(full_hypo)
                ref_tok = dict.encode_line(
                    gen_output.no_bpe_target[gen_keys[key]])
                if args.sacrebleu:
                    scorer.add_string(gen_output.no_bpe_target[gen_keys[key]],
                                      hypo_lst[key])
                else:
                    scorer.add(ref_tok, sys_tok)

        # if only one set of hyper parameters is provided, write the predictions to a file
        if write_hypos:
            # recover the original ids from n best list generation
            for key in range(len(gen_output.no_bpe_target)):
                if args.prefix_len is None:
                    assert hypo_lst[key] in gen_output.no_bpe_hypo[gen_keys[key]], \
                        "pred and rescore hypo mismatch:"+"i:"+str(key)+str(hypo_lst[key]) + str(gen_output.no_bpe_hypo[key])
                    ordered_hypos[gen_keys[key]] = hypo_lst[key]
                    ordered_targets[gen_keys[key]] = gen_output.no_bpe_target[
                        gen_keys[key]]
                    ordered_scores[gen_keys[key]] = score_lst[key]
                    ordered_lm[gen_keys[key]] = lm_lst[key]
                    ordered_fw[gen_keys[key]] = fw_lst[key]
                    ordered_bw[gen_keys[key]] = bw_lst[key]

                else:
                    full_hypo = rerank_utils.get_full_from_prefix(
                        hypo_lst[key], gen_output.no_bpe_hypo[gen_keys[key]])
                    ordered_hypos[gen_keys[key]] = full_hypo
                    ordered_targets[gen_keys[key]] = gen_output.no_bpe_target[
                        gen_keys[key]]
                    ordered_scores[gen_keys[key]] = score_lst[key]
                    ordered_lm[gen_keys[key]] = lm_lst[key]
                    ordered_fw[gen_keys[key]] = fw_lst[key]
                    ordered_bw[gen_keys[key]] = bw_lst[key]

                # print("Target = " + ordered_targets[gen_keys[key]] + " Hypothesis = " + ordered_hypos[gen_keys[key]])

    # write the hypos in the original order from nbest list generation

    # print(ordered_hypos)
    # print(len(ordered_hypos))
    # print(ordered_scores)
    # print(len(ordered_scores))

    if args.num_shards == (len(bitext1_lst)):
        with open(target_outfile, 'a') as t:
            with open(hypo_outfile, 'a') as h:
                with open(score_outfile, 'a') as s:
                    with open(lm_outfile, 'a') as l:
                        with open(fw_outfile, 'a') as f:
                            with open(bw_outfile, 'a') as b:
                                for key in range(len(ordered_hypos)):
                                    t.write(ordered_targets[key])
                                    h.write(ordered_hypos[key])
                                    s.write(str(ordered_scores[key]) + "\n")
                                    l.write(str(ordered_lm[key]) + "\n")
                                    f.write(str(ordered_fw[key]) + "\n")
                                    b.write(str(ordered_bw[key]) + "\n")

    print(scorer)
    res = scorer.result_string(4)
    if write_hypos:
        print(res)

    if args.sacrebleu:
        score = res.score
    else:
        score = rerank_utils.parse_bleu_scoring(res)
    return score
Example #17
 def build_dictionary(filenames):
     d = dictionary.Dictionary()
     for filename in filenames:
         dictionary.Dictionary.add_file_to_dictionary(filename, d, tokenize_line, args.workers)
     return d
Example #18
 def build_dictionary(tokenizer, filenames, max_length=None):
     d = dictionary.Dictionary()
     for filename in filenames:
         tokenizer.add_file_to_dictionary(filename, d)
     return d
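
The examples above share one pattern: build a Dictionary, encode each hypothesis and reference line into token IDs (via encode_line or a tokenizer), feed the pairs to a bleu.Scorer, and print result_string. Below is a minimal sketch of that pattern; the import paths are not shown in the examples, so treating the modules as fairseq's fairseq.data.dictionary and fairseq.bleu is an assumption about the surrounding project.

from fairseq import bleu
from fairseq.data import dictionary


def corpus_bleu(sys_lines, ref_lines, order=4):
    # Fresh dictionary; encode_line() adds unseen words on the fly.
    d = dictionary.Dictionary()
    scorer = bleu.Scorer(d.pad(), d.eos(), d.unk())
    for sys_line, ref_line in zip(sys_lines, ref_lines):
        sys_tok = d.encode_line(sys_line)
        ref_tok = d.encode_line(ref_line)
        scorer.add(ref_tok, sys_tok)
    return scorer.result_string(order)

Example #11 shows an alternative constructor that wraps the same pad/eos/unk values in a bleu.BleuConfig instead of passing them positionally.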