def generate_sentence_examples(ref, outs, src=None, score_type='sentbleu',
                               report_length=10, compare_directions='0-1',
                               title=None, case_insensitive=False):
    """Generate examples of sentences that satisfy some criterion, usually
    that the score of one system is better than the other.

    NOTE(review): a later definition of the same name in this file (the
    cache-aware variant) shadows this one at import time — confirm which
    definition is meant to survive.

    Args:
        ref: Tokens from the reference
        outs: Tokens from the output file(s)
        src: Tokens from the source (optional)
        score_type: The type of scorer to use
        report_length: Number of sentences to print for each system being
            better or worse
        compare_directions: A string specifying which systems to compare
        title: A string specifying the caption of the printed table
        case_insensitive: A boolean (or the command-line string
            'True'/'False') specifying whether to turn on the case
            insensitive option

    Returns:
        The generated SentenceExampleReport.
    """
    report_length = int(report_length)
    # BUG FIX: only coerce when a string was passed (e.g. from the command
    # line). The original compared unconditionally against 'True', which
    # silently turned a real boolean True into False.
    if isinstance(case_insensitive, str):
        case_insensitive = case_insensitive == 'True'

    scorer = scorers.create_scorer_from_profile(
        score_type, case_insensitive=case_insensitive)

    direcs = arg_utils.parse_compare_directions(compare_directions)

    scorediff_lists = []
    for (left, right) in direcs:
        scorediff_list = []
        deduplicate_set = set()
        for i, (o1, o2, r) in enumerate(zip(outs[left], outs[right], ref)):
            # Build the dedup key once; skip repeated (out1, out2, ref)
            # triples so identical sentences are only reported once.
            key = (tuple(o1), tuple(o2), tuple(r))
            if key in deduplicate_set:
                continue
            deduplicate_set.add(key)
            s1, str1 = scorer.score_sentence(r, o1)
            s2, str2 = scorer.score_sentence(r, o2)
            scorediff_list.append((s2 - s1, s1, s2, str1, str2, i))
        # Sort by score difference so the most extreme examples come first.
        scorediff_list.sort()
        scorediff_lists.append(scorediff_list)

    reporter = reporters.SentenceExampleReport(
        report_length=report_length, scorediff_lists=scorediff_lists,
        scorer=scorer, ref=ref, outs=outs, src=src,
        compare_directions=direcs, title=title)
    reporter.generate_report()
    return reporter
def generate_ngram_report(ref, outs, min_ngram_length=1, max_ngram_length=4,
                          report_length=50, alpha=1.0, compare_type='match',
                          ref_labels=None, out_labels=None,
                          compare_directions='0-1', case_insensitive=False):
    """Generate a report comparing aggregate n-gram statistics in both plain
    text and graphs.

    Args:
        ref: Tokens from the reference
        outs: Tokens from the output file(s)
        min_ngram_length: minimum n-gram length
        max_ngram_length: maximum n-gram length
        report_length: the number of n-grams to report
        alpha: when sorting n-grams for salient features, the smoothing
            coefficient. A higher smoothing coefficient will result in more
            frequent phenomena (sometimes this is good).
        compare_type: what type of statistic to compare (match: n-grams that
            match the reference, over: over-produced ngrams, under:
            under-produced ngrams)
        ref_labels: either a filename of a file full of reference labels, or
            a list of strings corresponding to `ref`. If specified, will
            aggregate statistics over labels instead of n-grams.
        out_labels: output labels. must be specified if ref_labels is
            specified.
        compare_directions: A string specifying which systems to compare
        case_insensitive: A boolean (or the command-line string
            'True'/'False') specifying whether to turn on the case
            insensitive option

    Returns:
        The generated NgramReport.
    """
    min_ngram_length = int(min_ngram_length)
    max_ngram_length = int(max_ngram_length)
    report_length = int(report_length)
    # float() handles both numeric and command-line string input, so the
    # original's later dead re-check (`if type(alpha) == str`) is removed.
    alpha = float(alpha)
    # BUG FIX: only coerce when a string was passed. The original compared
    # unconditionally against 'True', turning a real boolean True into False.
    if isinstance(case_insensitive, str):
        case_insensitive = case_insensitive == 'True'

    if out_labels is not None:
        out_labels = arg_utils.parse_files(out_labels)
        if len(out_labels) != len(outs):
            raise ValueError(
                'The number of output files should be equal to the number of output labels.')

    if isinstance(ref_labels, str):
        # Record the label file names so the report can display them.
        label_files_str = f' ref_labels={ref_labels},'
        for i, out_label in enumerate(out_labels):
            label_files_str += f' out{i}_labels={out_label},'
        label_files = label_files_str
    else:
        label_files = None

    # Lowercasing is skipped when labels come from files: labels aggregate
    # statistics by label rather than by surface n-gram.
    if not isinstance(ref_labels, str) and case_insensitive:
        ref = corpus_utils.lower(ref)
        outs = [corpus_utils.lower(out) for out in outs]

    ref_labels = (corpus_utils.load_tokens(ref_labels)
                  if isinstance(ref_labels, str) else ref_labels)
    out_labels = [corpus_utils.load_tokens(out_labels[i])
                  if out_labels is not None else None
                  for i in range(len(outs))]

    totals, matches, overs, unders = zip(*[
        ngram_utils.compare_ngrams(ref, out, ref_labels=ref_labels,
                                   out_labels=out_label,
                                   min_length=min_ngram_length,
                                   max_length=max_ngram_length)
        for out, out_label in zip(outs, out_labels)])

    direcs = arg_utils.parse_compare_directions(compare_directions)
    scores = []
    for (left, right) in direcs:
        if compare_type == 'match':
            scores.append(stat_utils.extract_salient_features(
                matches[left], matches[right], alpha=alpha))
        elif compare_type == 'over':
            scores.append(stat_utils.extract_salient_features(
                overs[left], overs[right], alpha=alpha))
        elif compare_type == 'under':
            scores.append(stat_utils.extract_salient_features(
                unders[left], unders[right], alpha=alpha))
        else:
            raise ValueError(f'Illegal compare_type "{compare_type}"')
    # Sort each feature dict by salience score, most salient first.
    scorelist = [sorted(score.items(), key=operator.itemgetter(1),
                        reverse=True) for score in scores]

    reporter = reporters.NgramReport(
        scorelist=scorelist, report_length=report_length,
        min_ngram_length=min_ngram_length,
        max_ngram_length=max_ngram_length,
        matches=matches, compare_type=compare_type, alpha=alpha,
        compare_directions=direcs, label_files=label_files)
    reporter.generate_report(
        output_fig_file=f'ngram-min{min_ngram_length}-max{max_ngram_length}-{compare_type}',
        output_fig_format='pdf',
        output_directory='outputs')
    return reporter
def generate_sentence_examples(ref, outs, src=None, score_type='sentbleu',
                               report_length=10, compare_directions='0-1',
                               title=None, case_insensitive=False,
                               to_cache=False, cache_dicts=None):
    """Generate examples of sentences that satisfy some criterion, usually
    that the score of one system is better than the other.

    Args:
        ref: Tokens from the reference
        outs: Tokens from the output file(s)
        src: Tokens from the source (optional)
        score_type: The type of scorer to use
        report_length: Number of sentences to print for each system being
            better or worse
        compare_directions: A string specifying which systems to compare
        title: A string specifying the caption of the printed table
        case_insensitive: A boolean (or the command-line string
            'True'/'False') specifying whether to turn on the case
            insensitive option
        to_cache: Return a dict of computed statistics if True
        cache_dicts: A list of dictionaries that store cached statistics for
            each output

    Returns:
        A cache dict when `to_cache` is True; otherwise the generated
        SentenceExampleReport.
    """
    # check and set parameters
    report_length = int(report_length)
    if isinstance(case_insensitive, str):
        case_insensitive = case_insensitive == 'True'

    # compute statistics
    scorer = scorers.create_scorer_from_profile(
        score_type, case_insensitive=case_insensitive)

    cache_key_list = ['scores', 'strs']
    scores, strs = cache_utils.extract_cache_dicts(
        cache_dicts, cache_key_list, len(outs))

    src = [None] * len(ref) if src is None else src
    if cache_dicts is None:
        # No cache supplied: score every (ref, out, src) triple per system.
        scores, strs = [], []
        for out in outs:
            scores_i, strs_i = [], []
            for r, o, s in zip(ref, out, src):
                score, string = scorer.score_sentence(r, o, s)
                scores_i.append(score)
                strs_i.append(string)
            scores.append(scores_i)
            strs.append(strs_i)

    if to_cache:
        # Caller only wants the statistics for later reuse — no report.
        return cache_utils.return_cache_dict(cache_key_list, [scores, strs])

    direcs = arg_utils.parse_compare_directions(compare_directions)
    scorediff_lists = []
    for (left, right) in direcs:
        scorediff_list = []
        deduplicate_set = set()
        for i, (o1, o2, r) in enumerate(zip(outs[left], outs[right], ref)):
            # Build the dedup key once; skip repeated (out1, out2, ref)
            # triples so identical sentences are only reported once.
            key = (tuple(o1), tuple(o2), tuple(r))
            if key in deduplicate_set:
                continue
            deduplicate_set.add(key)
            # Scores come from the (possibly cached) per-sentence lists,
            # indexed by the original sentence position.
            s1, str1 = scores[left][i], strs[left][i]
            s2, str2 = scores[right][i], strs[right][i]
            scorediff_list.append((s2 - s1, s1, s2, str1, str2, i))
        # Sort by score difference so the most extreme examples come first.
        scorediff_list.sort()
        scorediff_lists.append(scorediff_list)

    # generate reports
    reporter = reporters.SentenceExampleReport(
        report_length=report_length, scorediff_lists=scorediff_lists,
        scorer=scorer, ref=ref, outs=outs, src=src,
        compare_directions=direcs, title=title)
    reporter.generate_report()
    return reporter