Code Example #1
def generate_score_report(ref,
                          outs,
                          score_type='bleu',
                          bootstrap=0,
                          prob_thresh=0.05,
                          meteor_directory=None,
                          options=None,
                          title=None,
                          case_insensitive=False):
    """
  Generate a report comparing overall scores of system(s) in both plain text and graphs.

  Args:
    ref: Tokens from the reference
    outs: Tokens from the output file(s)
    score_type: A string specifying the scoring type (bleu/length)
    bootstrap: Number of samples for significance test (0 to disable)
    prob_thresh: P-value threshold for significance test
    meteor_directory: Path to the directory of the METEOR code
    options: Options when using external program
    compare_directions: A string specifying which systems to compare 
    title: A string specifying the caption of the printed table
    case_insensitive: A boolean specifying whether to turn on the case insensitive option
  """
    bootstrap = int(bootstrap)
    prob_thresh = float(prob_thresh)
    if type(case_insensitive) == str:
        case_insensitive = True if case_insensitive == 'True' else False

    scorer = scorers.create_scorer_from_profile(
        score_type,
        case_insensitive=case_insensitive,
        meteor_directory=meteor_directory,
        options=options)

    scores, strs = zip(*[scorer.score_corpus(ref, out) for out in outs])

    if bootstrap != 0:
        direcs = []
        for i in range(len(scores)):
            for j in range(i + 1, len(scores)):
                direcs.append((i, j))
        wins, sys_stats = sign_utils.eval_with_paired_bootstrap(
            ref, outs, scorer, direcs, num_samples=bootstrap)
        wins = list(zip(direcs, wins))
    else:
        wins = sys_stats = direcs = None

    reporter = reporters.ScoreReport(scorer=scorer,
                                     scores=scores,
                                     strs=strs,
                                     wins=wins,
                                     sys_stats=sys_stats,
                                     prob_thresh=prob_thresh,
                                     title=title)
    reporter.generate_report(output_fig_file=f'score-{score_type}-{bootstrap}',
                             output_fig_format='pdf',
                             output_directory='outputs')
    return reporter
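
The function expects ref as a list of tokenized reference sentences and outs as one such token list per system, and it writes its figure under the outputs directory. The following is a minimal usage sketch, not taken from the original source: the toy sentences and the bootstrap=1000 setting are illustrative, and the sibling modules referenced above (scorers, reporters, sign_utils) are assumed to be importable in the surrounding package.

# Illustrative usage only: toy tokenized data for two systems.
ref = [['the', 'cat', 'sat', 'on', 'the', 'mat'],
       ['a', 'dog', 'barked']]
outs = [
    [['the', 'cat', 'sat', 'on', 'a', 'mat'], ['a', 'dog', 'barked']],  # system 1
    [['cat', 'sat', 'on', 'mat'], ['the', 'dog', 'barked']],            # system 2
]
reporter = generate_score_report(ref, outs,
                                 score_type='bleu',
                                 bootstrap=1000,   # paired bootstrap with 1000 samples
                                 prob_thresh=0.05,
                                 title='BLEU comparison of two systems')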
Code Example #2
def generate_sentence_bucketed_report(ref,
                                      outs,
                                      src=None,
                                      bucket_type='score',
                                      bucket_cutoffs=None,
                                      statistic_type='count',
                                      score_measure='sentbleu',
                                      label_set=None,
                                      ref_labels=None,
                                      out_labels=None,
                                      title=None,
                                      case_insensitive=False,
                                      output_bucket_details=False,
                                      to_cache=False,
                                      cache_dicts=None):
    """
  Generate a report of sentences by bucket in both plain text and graphs

  Args:
    ref: Tokens from the reference
    outs: Tokens from the output file(s)
    bucket_type: The type of bucketing method to use
    score_measure: If using 'score' as either bucket_type or statistic_type, which scorer to use
    ref_labels: either a filename of a file full of reference labels, or a list of strings corresponding to `ref`. Would overwrite out_labels if specified.
    out_labels: output labels. 
    title: A string specifying the caption of the printed table
    case_insensitive: A boolean specifying whether to turn on the case insensitive option
    output_bucket_details: A boolean specifying whether to output the number of words in each bucket
    to_cache: Return a list of computed statistics if True
    cache_dicts: A list of dictionaries that store cached statistics for each output
  """
    # check and set parameters
    if type(case_insensitive) == str:
        case_insensitive = True if case_insensitive == 'True' else False
    if type(output_bucket_details) == str:
        output_bucket_details = True if output_bucket_details == 'True' else False

    if ref_labels is not None:
        ref_labels = corpus_utils.load_tokens(ref_labels) if type(
            ref_labels) == str else ref_labels
        if len(ref_labels) != len(ref):
            raise ValueError(
                f'The number of labels should be equal to the number of sentences.'
            )

    elif out_labels is not None:
        out_labels = arg_utils.parse_files(out_labels)
        if len(out_labels) != len(outs):
            raise ValueError(
                f'The number of output files should be equal to the number of output labels.'
            )

        out_labels = [
            corpus_utils.load_tokens(out_label)
            if type(out_label) == str else out_label
            for out_label in out_labels
        ]
        for out, out_label in zip(outs, out_labels):
            if len(out_label) != len(out):
                raise ValueError(
                    f'The number of labels should be equal to the number of sentences.'
                )

    # compute statistics
    bucketer = bucketers.create_sentence_bucketer_from_profile(
        bucket_type,
        bucket_cutoffs=bucket_cutoffs,
        score_type=score_measure,
        label_set=label_set,
        case_insensitive=case_insensitive)

    src = [None for _ in ref] if src is None else src

    if statistic_type == 'count':
        scorer = None
        if bucket_type != 'score' and bucket_type != 'lengthdiff':
            ref = ref_labels = None
        aggregator = lambda out, ref, src: len(out)
    elif statistic_type == 'score':
        scorer = scorers.create_scorer_from_profile(
            score_measure, case_insensitive=case_insensitive)
        aggregator = lambda out, ref, src: scorer.score_corpus(ref, out, src)[0]
    else:
        raise ValueError(f'Illegal statistic_type {statistic_type}')

    cache_key_list = ['stats']
    stats = cache_utils.extract_cache_dicts(cache_dicts, cache_key_list,
                                            len(outs))

    if cache_dicts is None:
        bcs = [
            bucketer.create_bucketed_corpus(
                out,
                ref=ref,
                src=src,
                ref_labels=ref_labels if ref_labels else None,
                out_labels=out_labels[i] if out_labels else None)
            for i, out in enumerate(outs)
        ]
        stats = [[aggregator(out, ref, src) for (out, ref, src) in bc]
                 for bc in bcs]

    if output_bucket_details and statistic_type == 'score':
        bucket_cnt_calculator = lambda out, ref, src: len(out)
        bucket_interval_calculator = lambda out, ref, src: sign_utils.eval_with_paired_bootstrap(
            ref, [out], src, scorer, None)[1][0]
        if cache_dicts is not None:  # we don't cache bcs
            bcs = [
                bucketer.create_bucketed_corpus(
                    out,
                    ref=ref,
                    src=src,
                    ref_labels=ref_labels if ref_labels else None,
                    out_labels=out_labels[i] if out_labels else None)
                for i, out in enumerate(outs)
            ]
        bucket_cnts = [
            bucket_cnt_calculator(out, ref, src) for (out, ref, src) in bcs[0]
        ]
        bucket_intervals = [[
            bucket_interval_calculator(out, ref, src) for (out, ref, src) in bc
        ] for bc in bcs]
    else:
        bucket_cnts = bucket_intervals = None

    if to_cache:
        cache_dict = cache_utils.return_cache_dict(cache_key_list, [stats])
        return cache_dict

    # generate reports
    reporter = reporters.SentenceReport(bucketer=bucketer,
                                        sys_stats=stats,
                                        statistic_type=statistic_type,
                                        scorer=scorer,
                                        bucket_cnts=bucket_cnts,
                                        bucket_intervals=bucket_intervals,
                                        title=title)

    reporter.generate_report(
        output_fig_file=f'sentence-{statistic_type}-{score_measure}',
        output_fig_format='pdf',
        output_directory='outputs')
    return reporter
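
As with the score report, callers pass tokenized corpora; bucket_type selects how sentences are grouped and statistic_type selects what is reported per bucket. A hedged usage sketch follows, with toy data that is illustrative rather than taken from the original source.

# Illustrative usage only: bucket sentences by sentence-level BLEU and report
# how many sentences from the system output fall into each bucket.
ref = [['the', 'cat', 'sat'], ['a', 'dog', 'barked'], ['hello', 'world']]
outs = [
    [['the', 'cat', 'sat'], ['a', 'dog', 'ran'], ['hi', 'world']],
]
reporter = generate_sentence_bucketed_report(ref, outs,
                                             bucket_type='score',
                                             statistic_type='count',
                                             score_measure='sentbleu',
                                             title='Sentences by sentBLEU bucket')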
Code Example #3
def generate_score_report(ref,
                          outs,
                          src=None,
                          score_type='bleu',
                          bootstrap=0,
                          prob_thresh=0.05,
                          meteor_directory=None,
                          options=None,
                          title=None,
                          case_insensitive=False,
                          to_cache=False,
                          cache_dicts=None):
    """
  Generate a report comparing overall scores of system(s) in both plain text and graphs.

  Args:
    ref: Tokens from the reference
    outs: Tokens from the output file(s)
    src: Tokens for the source 
    score_type: A string specifying the scoring type (bleu/length)
    bootstrap: Number of samples for significance test (0 to disable)
    prob_thresh: P-value threshold for significance test
    meteor_directory: Path to the directory of the METEOR code
    options: Options when using external program
    compare_directions: A string specifying which systems to compare 
    title: A string specifying the caption of the printed table
    case_insensitive: A boolean specifying whether to turn on the case insensitive option
    to_cache: Return a list of computed statistics if True
    cache_dicts: A list of dictionaries that store cached statistics for each output
  """
    # check and set parameters
    bootstrap = int(bootstrap)
    prob_thresh = float(prob_thresh)
    if type(case_insensitive) == str:
        case_insensitive = True if case_insensitive == 'True' else False

    # compute statistics
    scorer = scorers.create_scorer_from_profile(
        score_type,
        case_insensitive=case_insensitive,
        meteor_directory=meteor_directory,
        options=options)

    cache_key_list = ['scores', 'strs', 'sign_stats']
    scores, strs, sign_stats = cache_utils.extract_cache_dicts(
        cache_dicts, cache_key_list, len(outs))
    if cache_dicts is None:
        scores, strs = zip(
            *[scorer.score_corpus(ref, out, src=src) for out in outs])

    if to_cache:
        cache_dict = cache_utils.return_cache_dict(
            cache_key_list,
            [scores, strs, [scorer.cache_stats(ref, outs[0], src=src)]])
        return cache_dict

    if bootstrap != 0:
        direcs = []
        for i in range(len(scores)):
            for j in range(i + 1, len(scores)):
                direcs.append((i, j))
        wins, sys_stats = sign_utils.eval_with_paired_bootstrap(
            ref,
            outs,
            src,
            scorer,
            direcs,
            num_samples=bootstrap,
            cache_stats=sign_stats)
        wins = list(zip(direcs, wins))
    else:
        wins = sys_stats = None

    # generate reports
    reporter = reporters.ScoreReport(scorer=scorer,
                                     scores=scores,
                                     strs=strs,
                                     wins=wins,
                                     sys_stats=sys_stats,
                                     prob_thresh=prob_thresh,
                                     title=title)
    reporter.generate_report(output_fig_file=f'score-{score_type}-{bootstrap}',
                             output_fig_format='pdf',
                             output_directory='outputs')
    return reporter
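
Code Example #3 extends #1 with source-side input and a caching path: with to_cache=True the function returns the computed statistics instead of a report, and cache_dicts lets a later call reuse them. The two-step pattern below is a hedged sketch based on the cache_dicts docstring ("one dictionary per output"), not a workflow confirmed by the original source; ref, outs, and src are assumed to be tokenized corpora as in the earlier examples.

# Illustrative usage only.
# Step 1: compute and cache statistics once per system output.
cache_dicts = [
    generate_score_report(ref, [out], src=src, score_type='bleu', to_cache=True)
    for out in outs
]

# Step 2: reuse the cached scores/strs/sign_stats when producing the full report.
reporter = generate_score_report(ref, outs, src=src,
                                 score_type='bleu',
                                 bootstrap=1000,
                                 cache_dicts=cache_dicts)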