def __init__(self, score_type, bucket_cutoffs=None, case_insensitive=False):
  self.score_type = score_type
  self.scorer = scorers.create_scorer_from_profile(score_type)
  if bucket_cutoffs is None:
    bucket_cutoffs = [x * self.scorer.scale / 10.0 for x in range(1, 10)]
  self.set_bucket_cutoffs(bucket_cutoffs, num_type='float')
  self.case_insensitive = case_insensitive
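# Illustrative sketch (not part of the original module): when no cutoffs are given,
# the constructor above spaces them evenly over the scorer's scale. For a scorer
# whose `scale` is 100 (assumed here for the example), that yields the deciles.
scale = 100.0  # assumed scorer.scale for this example
default_cutoffs = [x * scale / 10.0 for x in range(1, 10)]
assert default_cutoffs == [10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0, 90.0]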
@classmethod
def setUpClass(cls):
  cls.ref, cls.out1, cls.out2 = _get_example_data()
  cls.ids = list(range(len(cls.ref)))
  cls.scorer = scorers.create_scorer_from_profile("bleu", case_insensitive=False)
  cls.cache_stats1 = cls.scorer.cache_stats(cls.ref, cls.out1)
  cls.cache_stats2 = cls.scorer.cache_stats(cls.ref, cls.out2)
  cls.n_random_retries = 10
@classmethod
def setUpClass(cls) -> None:
  example_path = os.path.join(compare_mt_root, "example")
  filenames = ["ted.ref.eng", "ted.sys1.eng", "ted.orig.slk"]
  cls.ref, cls.out, cls.src = [
    load_tokens(os.path.join(example_path, name)) for name in filenames
  ]
  cls.scorer = scorers.create_scorer_from_profile("gleu", case_insensitive=False)
def generate_score_report(ref, outs,
                          score_type='bleu',
                          bootstrap=0, prob_thresh=0.05,
                          meteor_directory=None, options=None,
                          title=None,
                          case_insensitive=False):
  """
  Generate a report comparing overall scores of system(s) in both plain text and graphs.

  Args:
    ref: Tokens from the reference
    outs: Tokens from the output file(s)
    score_type: A string specifying the scoring type (bleu/length)
    bootstrap: Number of samples for the significance test (0 to disable)
    prob_thresh: P-value threshold for the significance test
    meteor_directory: Path to the directory of the METEOR code
    options: Options when using an external program
    title: A string specifying the caption of the printed table
    case_insensitive: A boolean specifying whether to turn on the case-insensitive option
  """
  bootstrap = int(bootstrap)
  prob_thresh = float(prob_thresh)
  case_insensitive = True if case_insensitive == 'True' else False

  scorer = scorers.create_scorer_from_profile(score_type,
                                              case_insensitive=case_insensitive,
                                              meteor_directory=meteor_directory,
                                              options=options)

  scores, strs = zip(*[scorer.score_corpus(ref, out) for out in outs])

  if bootstrap != 0:
    direcs = []
    for i in range(len(scores)):
      for j in range(i + 1, len(scores)):
        direcs.append((i, j))
    wins, sys_stats = sign_utils.eval_with_paired_bootstrap(ref, outs, scorer, direcs,
                                                            num_samples=bootstrap)
    wins = list(zip(direcs, wins))
  else:
    wins = sys_stats = direcs = None

  reporter = reporters.ScoreReport(scorer=scorer, scores=scores, strs=strs,
                                   wins=wins, sys_stats=sys_stats,
                                   prob_thresh=prob_thresh, title=title)
  reporter.generate_report(output_fig_file=f'score-{score_type}-{bootstrap}',
                           output_fig_format='pdf',
                           output_directory='outputs')
  return reporter
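# Hedged usage sketch (illustrative, not part of the original source): score two
# systems against one reference with corpus-level BLEU. The file names are
# placeholders taken from the example directory used elsewhere in this code base;
# a nonzero `bootstrap` would additionally run the paired significance test.
ref = corpus_utils.load_tokens('example/ted.ref.eng')    # reference tokens
out1 = corpus_utils.load_tokens('example/ted.sys1.eng')  # system 1 tokens
out2 = corpus_utils.load_tokens('example/ted.sys2.eng')  # system 2 tokens (assumed file name)
generate_score_report(ref, [out1, out2],
                      score_type='bleu',
                      title='BLEU comparison')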
def generate_sentence_examples(ref, outs, src=None,
                               score_type='sentbleu',
                               report_length=10,
                               compare_directions='0-1',
                               title=None,
                               case_insensitive=False):
  """
  Generate examples of sentences that satisfy some criterion, usually that one system's score is better.

  Args:
    ref: Tokens from the reference
    outs: Tokens from the output file(s)
    src: Tokens from the source (optional)
    score_type: The type of scorer to use
    report_length: Number of sentences to print for each system being better or worse
    compare_directions: A string specifying which systems to compare
    title: A string specifying the caption of the printed table
    case_insensitive: A boolean specifying whether to turn on the case-insensitive option
  """
  report_length = int(report_length)
  case_insensitive = True if case_insensitive == 'True' else False

  scorer = scorers.create_scorer_from_profile(score_type, case_insensitive=case_insensitive)

  direcs = arg_utils.parse_compare_directions(compare_directions)

  scorediff_lists = []
  for (left, right) in direcs:
    scorediff_list = []
    deduplicate_set = set()
    for i, (o1, o2, r) in enumerate(zip(outs[left], outs[right], ref)):
      if (tuple(o1), tuple(o2), tuple(r)) in deduplicate_set:
        continue
      deduplicate_set.add((tuple(o1), tuple(o2), tuple(r)))
      s1, str1 = scorer.score_sentence(r, o1)
      s2, str2 = scorer.score_sentence(r, o2)
      scorediff_list.append((s2 - s1, s1, s2, str1, str2, i))
    scorediff_list.sort()
    scorediff_lists.append(scorediff_list)

  reporter = reporters.SentenceExampleReport(report_length=report_length,
                                             scorediff_lists=scorediff_lists,
                                             scorer=scorer,
                                             ref=ref, outs=outs, src=src,
                                             compare_directions=direcs,
                                             title=title)
  reporter.generate_report()
  return reporter
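# Hedged usage sketch (illustrative only; reuses ref/out1/out2 from the score-report
# sketch above): print the 5 sentences where system 2 most outperforms system 1, and
# vice versa, under sentence-level BLEU. '0-1' is the function's default direction,
# comparing outs[0] against outs[1].
generate_sentence_examples(ref, [out1, out2],
                           score_type='sentbleu',
                           report_length=5,
                           compare_directions='0-1')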
def generate_sentence_bucketed_report(ref, outs,
                                      bucket_type='score', bucket_cutoffs=None,
                                      statistic_type='count',
                                      score_measure='bleu',
                                      case_insensitive=False):
  """
  Generate a report of sentences by bucket in both plain text and graphs.

  Args:
    ref: Tokens from the reference
    outs: Tokens from the output file(s)
    bucket_type: The type of bucketing method to use
    bucket_cutoffs: Cutoff values that define the buckets
    statistic_type: Whether to report a 'count' of sentences or a 'score' per bucket
    score_measure: If using 'score' as either bucket_type or statistic_type, which scorer to use
    case_insensitive: A boolean specifying whether to turn on the case-insensitive option
  """
  case_insensitive = True if case_insensitive == 'True' else False

  bucketer = bucketers.create_sentence_bucketer_from_profile(bucket_type,
                                                             bucket_cutoffs=bucket_cutoffs,
                                                             score_type=score_measure,
                                                             case_insensitive=case_insensitive)
  bcs = [bucketer.create_bucketed_corpus(out, ref=ref) for out in outs]

  if statistic_type == 'count':
    scorer = None
    aggregator = lambda out, ref: len(out)
  elif statistic_type == 'score':
    scorer = scorers.create_scorer_from_profile(score_measure, case_insensitive=case_insensitive)
    aggregator = lambda out, ref: scorer.score_corpus(ref, out)[0]
  else:
    raise ValueError(f'Illegal statistic_type {statistic_type}')

  stats = [[aggregator(out, ref) for (out, ref) in bc] for bc in bcs]

  reporter = reporters.SentenceReport(bucketer=bucketer,
                                      sys_stats=stats,
                                      statistic_type=statistic_type,
                                      scorer=scorer)
  reporter.generate_report(output_fig_file=f'sentence-{statistic_type}-{score_measure}',
                           output_fig_format='pdf',
                           output_directory='outputs')
  return reporter
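# Hedged usage sketch (illustrative only; reuses ref/out1/out2 from the earlier
# sketch): instead of counting sentences per bucket, report a BLEU score for each
# bucket. bucket_type='length' is assumed here to be a length-based sentence
# bucketer profile; the clearly valid default is 'score'.
generate_sentence_bucketed_report(ref, [out1, out2],
                                  bucket_type='length',
                                  statistic_type='score',
                                  score_measure='bleu')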
@classmethod
def setUpClass(cls):
  cls.ref, cls.out, _ = _get_example_data()
  cls.scorer = scorers.create_scorer_from_profile("length")
@classmethod
def setUpClass(cls):
  cls.ref, cls.out, _ = _get_example_data_detokenized()
  cls.scorer = scorers.create_scorer_from_profile("sacrebleu")
def generate_sentence_bucketed_report(ref, outs,
                                      bucket_type='score', bucket_cutoffs=None,
                                      statistic_type='count',
                                      score_measure='bleu',
                                      label_set=None,
                                      ref_labels=None, out_labels=None,
                                      title=None,
                                      case_insensitive=False):
  """
  Generate a report of sentences by bucket in both plain text and graphs.

  Args:
    ref: Tokens from the reference
    outs: Tokens from the output file(s)
    bucket_type: The type of bucketing method to use
    score_measure: If using 'score' as either bucket_type or statistic_type, which scorer to use
    ref_labels: Either the filename of a file full of reference labels, or a list of strings
                corresponding to `ref`. Overrides out_labels if specified.
    out_labels: Output labels
    title: A string specifying the caption of the printed table
    case_insensitive: A boolean specifying whether to turn on the case-insensitive option
  """
  case_insensitive = True if case_insensitive == 'True' else False

  if ref_labels is not None:
    ref_labels = corpus_utils.load_tokens(ref_labels) if type(ref_labels) == str else ref_labels
    if len(ref_labels) != len(ref):
      raise ValueError(f'The number of labels should be equal to the number of sentences.')
  elif out_labels is not None:
    out_labels = arg_utils.parse_files(out_labels)
    if len(out_labels) != len(outs):
      raise ValueError(f'The number of output files should be equal to the number of output labels.')
    out_labels = [corpus_utils.load_tokens(out_label) if type(out_label) == str else out_label
                  for out_label in out_labels]
    for out, out_label in zip(outs, out_labels):
      if len(out_label) != len(out):
        raise ValueError(f'The number of labels should be equal to the number of sentences.')

  bucketer = bucketers.create_sentence_bucketer_from_profile(bucket_type,
                                                             bucket_cutoffs=bucket_cutoffs,
                                                             score_type=score_measure,
                                                             label_set=label_set,
                                                             case_insensitive=case_insensitive)
  bcs = [bucketer.create_bucketed_corpus(out, ref=ref,
                                         ref_labels=ref_labels if ref_labels else None,
                                         out_labels=out_labels[i] if out_labels else None)
         for i, out in enumerate(outs)]

  if statistic_type == 'count':
    scorer = None
    aggregator = lambda out, ref: len(out)
  elif statistic_type == 'score':
    scorer = scorers.create_scorer_from_profile(score_measure, case_insensitive=case_insensitive)
    aggregator = lambda out, ref: scorer.score_corpus(ref, out)[0]
  else:
    raise ValueError(f'Illegal statistic_type {statistic_type}')

  stats = [[aggregator(out, ref) for (out, ref) in bc] for bc in bcs]

  reporter = reporters.SentenceReport(bucketer=bucketer,
                                      sys_stats=stats,
                                      statistic_type=statistic_type,
                                      scorer=scorer,
                                      title=title)
  reporter.generate_report(output_fig_file=f'sentence-{statistic_type}-{score_measure}',
                           output_fig_format='pdf',
                           output_directory='outputs')
  return reporter
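# Hedged usage sketch (illustrative only; reuses ref/out1/out2 from the earlier
# sketch): bucket sentences by a label attached to each reference sentence and count
# how many fall in each bucket. 'example/ted.ref.labels' is a hypothetical file with
# one label per reference sentence, and bucket_type='label' is assumed to name the
# label-based sentence bucketer profile.
generate_sentence_bucketed_report(ref, [out1, out2],
                                  bucket_type='label',
                                  ref_labels='example/ted.ref.labels',
                                  statistic_type='count')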
def generate_sentence_examples(ref, outs, src=None,
                               score_type='sentbleu',
                               report_length=10,
                               compare_directions='0-1',
                               title=None,
                               case_insensitive=False,
                               to_cache=False,
                               cache_dicts=None):
  """
  Generate examples of sentences that satisfy some criterion, usually that one system's score is better.

  Args:
    ref: Tokens from the reference
    outs: Tokens from the output file(s)
    src: Tokens from the source (optional)
    score_type: The type of scorer to use
    report_length: Number of sentences to print for each system being better or worse
    compare_directions: A string specifying which systems to compare
    title: A string specifying the caption of the printed table
    case_insensitive: A boolean specifying whether to turn on the case-insensitive option
    to_cache: Return a list of computed statistics if True
    cache_dicts: A list of dictionaries that store cached statistics for each output
  """
  # check and set parameters
  report_length = int(report_length)
  if type(case_insensitive) == str:
    case_insensitive = True if case_insensitive == 'True' else False

  # compute statistics
  scorer = scorers.create_scorer_from_profile(score_type, case_insensitive=case_insensitive)

  cache_key_list = ['scores', 'strs']
  scores, strs = cache_utils.extract_cache_dicts(cache_dicts, cache_key_list, len(outs))
  src = [None for _ in ref] if src is None else src

  if cache_dicts is None:
    scores, strs = [], []
    for out in outs:
      scores_i, strs_i = [], []
      for (r, o, s) in zip(ref, out, src):
        score, string = scorer.score_sentence(r, o, s)
        scores_i.append(score)
        strs_i.append(string)
      scores.append(scores_i)
      strs.append(strs_i)

  if to_cache:
    cache_dict = cache_utils.return_cache_dict(cache_key_list, [scores, strs])
    return cache_dict

  direcs = arg_utils.parse_compare_directions(compare_directions)

  scorediff_lists = []
  for (left, right) in direcs:
    scorediff_list = []
    deduplicate_set = set()
    for i, (o1, o2, r) in enumerate(zip(outs[left], outs[right], ref)):
      if (tuple(o1), tuple(o2), tuple(r)) in deduplicate_set:
        continue
      deduplicate_set.add((tuple(o1), tuple(o2), tuple(r)))
      s1, str1 = scores[left][i], strs[left][i]
      s2, str2 = scores[right][i], strs[right][i]
      scorediff_list.append((s2 - s1, s1, s2, str1, str2, i))
    scorediff_list.sort()
    scorediff_lists.append(scorediff_list)

  # generate reports
  reporter = reporters.SentenceExampleReport(report_length=report_length,
                                             scorediff_lists=scorediff_lists,
                                             scorer=scorer,
                                             ref=ref, outs=outs, src=src,
                                             compare_directions=direcs,
                                             title=title)
  reporter.generate_report()
  return reporter
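# Hedged caching sketch (illustrative only; reuses ref/out1/out2 from the earlier
# sketch): the to_cache/cache_dicts pair lets per-sentence scores be computed once
# per system and reused later, so the report does not re-score every sentence.
# The pattern assumed here is one cached call per single output, then a combined call.
cache1 = generate_sentence_examples(ref, [out1], to_cache=True)  # cache stats for system 1
cache2 = generate_sentence_examples(ref, [out2], to_cache=True)  # cache stats for system 2
generate_sentence_examples(ref, [out1, out2],
                           compare_directions='0-1',
                           cache_dicts=[cache1, cache2])          # reuse the cached scores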
def generate_sentence_bucketed_report(ref, outs, src=None,
                                      bucket_type='score', bucket_cutoffs=None,
                                      statistic_type='count',
                                      score_measure='sentbleu',
                                      label_set=None,
                                      ref_labels=None, out_labels=None,
                                      title=None,
                                      case_insensitive=False,
                                      output_bucket_details=False,
                                      to_cache=False,
                                      cache_dicts=None):
  """
  Generate a report of sentences by bucket in both plain text and graphs.

  Args:
    ref: Tokens from the reference
    outs: Tokens from the output file(s)
    bucket_type: The type of bucketing method to use
    score_measure: If using 'score' as either bucket_type or statistic_type, which scorer to use
    ref_labels: Either the filename of a file full of reference labels, or a list of strings
                corresponding to `ref`. Overrides out_labels if specified.
    out_labels: Output labels
    title: A string specifying the caption of the printed table
    case_insensitive: A boolean specifying whether to turn on the case-insensitive option
    output_bucket_details: A boolean specifying whether to output the number of sentences and a
                           bootstrapped confidence interval for each bucket
    to_cache: Return a list of computed statistics if True
    cache_dicts: A list of dictionaries that store cached statistics for each output
  """
  # check and set parameters
  if type(case_insensitive) == str:
    case_insensitive = True if case_insensitive == 'True' else False
  if type(output_bucket_details) == str:
    output_bucket_details = True if output_bucket_details == 'True' else False

  if ref_labels is not None:
    ref_labels = corpus_utils.load_tokens(ref_labels) if type(ref_labels) == str else ref_labels
    if len(ref_labels) != len(ref):
      raise ValueError(f'The number of labels should be equal to the number of sentences.')
  elif out_labels is not None:
    out_labels = arg_utils.parse_files(out_labels)
    if len(out_labels) != len(outs):
      raise ValueError(f'The number of output files should be equal to the number of output labels.')
    out_labels = [corpus_utils.load_tokens(out_label) if type(out_label) == str else out_label
                  for out_label in out_labels]
    for out, out_label in zip(outs, out_labels):
      if len(out_label) != len(out):
        raise ValueError(f'The number of labels should be equal to the number of sentences.')

  # compute statistics
  bucketer = bucketers.create_sentence_bucketer_from_profile(bucket_type,
                                                             bucket_cutoffs=bucket_cutoffs,
                                                             score_type=score_measure,
                                                             label_set=label_set,
                                                             case_insensitive=case_insensitive)
  src = [None for _ in ref] if src is None else src

  if statistic_type == 'count':
    scorer = None
    if bucket_type != 'score' and bucket_type != 'lengthdiff':
      ref = ref_labels = None
    aggregator = lambda out, ref, src: len(out)
  elif statistic_type == 'score':
    scorer = scorers.create_scorer_from_profile(score_measure, case_insensitive=case_insensitive)
    aggregator = lambda out, ref, src: scorer.score_corpus(ref, out, src)[0]
  else:
    raise ValueError(f'Illegal statistic_type {statistic_type}')

  cache_key_list = ['stats']
  stats = cache_utils.extract_cache_dicts(cache_dicts, cache_key_list, len(outs))

  if cache_dicts is None:
    bcs = [bucketer.create_bucketed_corpus(out, ref=ref, src=src,
                                           ref_labels=ref_labels if ref_labels else None,
                                           out_labels=out_labels[i] if out_labels else None)
           for i, out in enumerate(outs)]
    stats = [[aggregator(out, ref, src) for (out, ref, src) in bc] for bc in bcs]

  if output_bucket_details and statistic_type == 'score':
    bucket_cnt_calculator = lambda out, ref, src: len(out)
    bucket_interval_calculator = lambda out, ref, src: sign_utils.eval_with_paired_bootstrap(
        ref, [out], src, scorer, None)[1][0]
    if cache_dicts is not None:  # we don't cache bcs
      bcs = [bucketer.create_bucketed_corpus(out, ref=ref, src=src,
                                             ref_labels=ref_labels if ref_labels else None,
                                             out_labels=out_labels[i] if out_labels else None)
             for i, out in enumerate(outs)]
    bucket_cnts = [bucket_cnt_calculator(out, ref, src) for (out, ref, src) in bcs[0]]
    bucket_intervals = [[bucket_interval_calculator(out, ref, src) for (out, ref, src) in bc]
                        for bc in bcs]
  else:
    bucket_cnts = bucket_intervals = None

  if to_cache:
    cache_dict = cache_utils.return_cache_dict(cache_key_list, [stats])
    return cache_dict

  # generate reports
  reporter = reporters.SentenceReport(bucketer=bucketer,
                                      sys_stats=stats,
                                      statistic_type=statistic_type,
                                      scorer=scorer,
                                      bucket_cnts=bucket_cnts,
                                      bucket_intervals=bucket_intervals,
                                      title=title)
  reporter.generate_report(output_fig_file=f'sentence-{statistic_type}-{score_measure}',
                           output_fig_format='pdf',
                           output_directory='outputs')
  return reporter
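# Hedged usage sketch (illustrative only; reuses ref/out1/out2 from the earlier
# sketch): with statistic_type='score' and output_bucket_details=True, each score
# bucket is reported together with its sentence count and a bootstrapped confidence
# interval, in addition to the bucket score itself.
generate_sentence_bucketed_report(ref, [out1, out2],
                                  bucket_type='score',
                                  statistic_type='score',
                                  score_measure='sentbleu',
                                  output_bucket_details=True)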
def generate_score_report(ref, outs, src=None,
                          score_type='bleu',
                          bootstrap=0, prob_thresh=0.05,
                          meteor_directory=None, options=None,
                          title=None,
                          case_insensitive=False,
                          to_cache=False,
                          cache_dicts=None):
  """
  Generate a report comparing overall scores of system(s) in both plain text and graphs.

  Args:
    ref: Tokens from the reference
    outs: Tokens from the output file(s)
    src: Tokens from the source
    score_type: A string specifying the scoring type (bleu/length)
    bootstrap: Number of samples for the significance test (0 to disable)
    prob_thresh: P-value threshold for the significance test
    meteor_directory: Path to the directory of the METEOR code
    options: Options when using an external program
    title: A string specifying the caption of the printed table
    case_insensitive: A boolean specifying whether to turn on the case-insensitive option
    to_cache: Return a list of computed statistics if True
    cache_dicts: A list of dictionaries that store cached statistics for each output
  """
  # check and set parameters
  bootstrap = int(bootstrap)
  prob_thresh = float(prob_thresh)
  if type(case_insensitive) == str:
    case_insensitive = True if case_insensitive == 'True' else False

  # compute statistics
  scorer = scorers.create_scorer_from_profile(score_type,
                                              case_insensitive=case_insensitive,
                                              meteor_directory=meteor_directory,
                                              options=options)

  cache_key_list = ['scores', 'strs', 'sign_stats']
  scores, strs, sign_stats = cache_utils.extract_cache_dicts(cache_dicts, cache_key_list, len(outs))

  if cache_dicts is None:
    scores, strs = zip(*[scorer.score_corpus(ref, out, src=src) for out in outs])

  if to_cache:
    cache_dict = cache_utils.return_cache_dict(
        cache_key_list,
        [scores, strs, [scorer.cache_stats(ref, outs[0], src=src)]])
    return cache_dict

  if bootstrap != 0:
    direcs = []
    for i in range(len(scores)):
      for j in range(i + 1, len(scores)):
        direcs.append((i, j))
    wins, sys_stats = sign_utils.eval_with_paired_bootstrap(ref, outs, src, scorer, direcs,
                                                            num_samples=bootstrap,
                                                            cache_stats=sign_stats)
    wins = list(zip(direcs, wins))
  else:
    wins = sys_stats = None

  # generate reports
  reporter = reporters.ScoreReport(scorer=scorer, scores=scores, strs=strs,
                                   wins=wins, sys_stats=sys_stats,
                                   prob_thresh=prob_thresh, title=title)
  reporter.generate_report(output_fig_file=f'score-{score_type}-{bootstrap}',
                           output_fig_format='pdf',
                           output_directory='outputs')
  return reporter
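# Hedged caching sketch (illustrative only; reuses ref/out1/out2 from the earlier
# sketch): cache corpus and significance-test statistics ('sign_stats') once per
# system, then combine the caches so the paired bootstrap can reuse them instead of
# re-scoring every resampled corpus from scratch.
cache1 = generate_score_report(ref, [out1], score_type='bleu', to_cache=True)
cache2 = generate_score_report(ref, [out2], score_type='bleu', to_cache=True)
generate_score_report(ref, [out1, out2],
                      score_type='bleu',
                      bootstrap=1000,
                      cache_dicts=[cache1, cache2])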