def generate_src_word_accuracy_report(ref, outs, src, ref_align_file=None,
                                      acc_type='rec', bucket_type='freq',
                                      bucket_cutoffs=None, freq_count_file=None,
                                      freq_corpus_file=None, label_set=None,
                                      src_labels=None, title=None,
                                      case_insensitive=False):
  """
  Generate a report for source word analysis in both plain text and graphs.

  Args:
    ref: Tokens from the reference
    outs: Tokens from the output file(s)
    src: Tokens from the source
    ref_align_file: Alignment file for the reference
    acc_type: The type of accuracy to show (prec/rec/fmeas). Can also have
      multiple separated by '+'. Only 'rec' is supported for source analysis.
    bucket_type: A string specifying the way to bucket words together to
      calculate F-measure (freq/tag)
    bucket_cutoffs: The boundaries between buckets, specified as a
      colon-separated string.
    freq_corpus_file: When using "freq" as a bucketer, which corpus to use to
      calculate frequency. By default this uses the frequency in the reference
      test set, but it's often more informative to use the frequency in the
      training set, in which case you specify the path of the source side of
      the training corpus.
    freq_count_file: An alternative to freq_corpus that uses a count file in
      "word\tfreq" format.
    label_set: The set of labels to use when bucketing by tag.
    src_labels: either a filename of a file full of source labels, or a list
      of strings corresponding to `ref`.
    title: A string specifying the caption of the printed table
    case_insensitive: A boolean specifying whether to turn on the case
      insensitive option

  Returns:
    The generated reporters.WordReport.

  Raises:
    ValueError: If acc_type is not 'rec', or src/ref_align_file are missing.
  """
  # BUG FIX: the original `True if case_insensitive == 'True' else False`
  # recognized only the *string* 'True', so a genuine boolean True passed by
  # a programmatic caller was silently turned into False. Coerce only when a
  # string (e.g. from command-line parsing) was supplied.
  if isinstance(case_insensitive, str):
    case_insensitive = (case_insensitive == 'True')

  if acc_type != 'rec':
    raise ValueError(
        "Source word analysis can only use recall as an accuracy type")
  if not src or not ref_align_file:
    raise ValueError(
        "Must specify the source and the alignment file when performing source analysis."
    )

  ref_align = corpus_utils.load_alignments(ref_align_file)

  bucketer = bucketers.create_word_bucketer_from_profile(
      bucket_type,
      bucket_cutoffs=bucket_cutoffs,
      freq_count_file=freq_count_file,
      freq_corpus_file=freq_corpus_file,
      freq_data=src,
      label_set=label_set,
      case_insensitive=case_insensitive)
  statistics, examples = bucketer.calc_statistics_and_examples(
      ref, outs, src=src, src_labels=src_labels, ref_aligns=ref_align)

  reporter = reporters.WordReport(
      bucketer=bucketer,
      statistics=statistics,
      examples=examples,
      src_sents=src,
      ref_sents=ref,
      ref_aligns=ref_align,
      out_sents=outs,
      src_labels=src_labels,
      acc_type=acc_type,
      header="Source Word Accuracy Analysis",
      title=title)
  # 'src-word-acc' is a constant file name; the original f-string had no
  # placeholders, so a plain literal is used.
  reporter.generate_report(output_fig_file='src-word-acc',
                           output_fig_format='pdf',
                           output_directory='outputs')
  return reporter
def generate_src_word_accuracy_report(ref, outs, src, ref_align_file=None,
                                      acc_type='rec', bucket_type='freq',
                                      bucket_cutoffs=None, freq_count_file=None,
                                      freq_corpus_file=None, label_set=None,
                                      src_labels=None, title=None,
                                      case_insensitive=False,
                                      output_bucket_details=False,
                                      to_cache=False, cache_dicts=None):
  """
  Generate a report for source word analysis in both plain text and graphs.

  Args:
    ref: Tokens from the reference
    outs: Tokens from the output file(s)
    src: Tokens from the source
    ref_align_file: Alignment file for the reference
    acc_type: The type of accuracy to show (prec/rec/fmeas). Can also have
      multiple separated by '+'. Only 'rec' is supported for source analysis.
    bucket_type: A string specifying the way to bucket words together to
      calculate F-measure (freq/tag)
    bucket_cutoffs: The boundaries between buckets, specified as a
      colon-separated string.
    freq_corpus_file: When using "freq" as a bucketer, which corpus to use to
      calculate frequency. By default this uses the frequency in the reference
      test set, but it's often more informative to use the frequency in the
      training set, in which case you specify the path of the source side of
      the training corpus.
    freq_count_file: An alternative to freq_corpus that uses a count file in
      "word\tfreq" format.
    label_set: The set of labels to use when bucketing by tag.
    src_labels: either a filename of a file full of source labels, or a list
      of strings corresponding to `ref`.
    title: A string specifying the caption of the printed table
    case_insensitive: A boolean specifying whether to turn on the case
      insensitive option
    output_bucket_details: A boolean specifying whether to output the number
      of words in each bucket
    to_cache: Return a dict of computed statistics if True (no report is
      generated in that case)
    cache_dicts: A list of dictionaries that store cached statistics for each
      output

  Returns:
    A cache dict when `to_cache` is True, otherwise the generated
    reporters.WordReport.

  Raises:
    ValueError: If acc_type is not 'rec', or src/ref_align_file are missing.
  """
  # check and set parameters
  # isinstance() is preferred over `type(x) == str`; the comparison to the
  # string 'True' directly yields the desired boolean.
  if isinstance(case_insensitive, str):
    case_insensitive = (case_insensitive == 'True')
  if isinstance(output_bucket_details, str):
    output_bucket_details = (output_bucket_details == 'True')
  if acc_type != 'rec':
    raise ValueError(
        "Source word analysis can only use recall as an accuracy type")
  if not src or not ref_align_file:
    raise ValueError(
        "Must specify the source and the alignment file when performing source analysis."
    )
  if isinstance(src_labels, str):
    # A string means a path to a label file; load it into token lists.
    src_labels = corpus_utils.load_tokens(src_labels)
  ref_align = corpus_utils.load_alignments(ref_align_file)

  # compute statistics
  bucketer = bucketers.create_word_bucketer_from_profile(
      bucket_type,
      bucket_cutoffs=bucket_cutoffs,
      freq_count_file=freq_count_file,
      freq_corpus_file=freq_corpus_file,
      freq_data=src,
      label_set=label_set,
      case_insensitive=case_insensitive)
  cache_key_list = [
      'statistics', 'my_ref_total_list', 'my_out_totals_list',
      'my_out_matches_list'
  ]
  statistics, my_ref_total_list, my_out_totals_list, my_out_matches_list = \
      cache_utils.extract_cache_dicts(cache_dicts, cache_key_list, len(outs))
  if cache_dicts is not None:
    # Each cache dict holds per-output stats; the ref totals are shared, and
    # the per-output totals/matches are re-joined along axis 1.
    my_ref_total_list = my_ref_total_list[0]
    my_out_totals_list = list(np.concatenate(my_out_totals_list, 1))
    my_out_matches_list = list(np.concatenate(my_out_matches_list, 1))
  else:
    statistics, my_ref_total_list, my_out_totals_list, my_out_matches_list = \
        bucketer.calc_statistics(
            ref, outs, src=src, src_labels=src_labels, ref_aligns=ref_align)
  examples = bucketer.calc_examples(len(ref), len(outs), statistics,
                                    my_ref_total_list, my_out_matches_list)
  bucket_cnts, bucket_intervals = (
      bucketer.calc_bucket_details(my_ref_total_list, my_out_totals_list,
                                   my_out_matches_list)
      if output_bucket_details else (None, None))

  if to_cache:
    # Wrap the per-output lists a level deeper so they can be concatenated
    # back together (see the cache_dicts branch above) on a later call.
    cache_dict = cache_utils.return_cache_dict(cache_key_list, [
        statistics, [my_ref_total_list], [my_out_totals_list],
        [my_out_matches_list]
    ])
    return cache_dict

  # generate reports
  reporter = reporters.WordReport(
      bucketer=bucketer,
      statistics=statistics,
      examples=examples,
      bucket_cnts=bucket_cnts,
      bucket_intervals=bucket_intervals,
      src_sents=src,
      ref_sents=ref,
      ref_aligns=ref_align,
      out_sents=outs,
      src_labels=src_labels,
      acc_type=acc_type,
      header="Source Word Accuracy Analysis",
      title=title)
  # 'src-word-acc' is a constant file name; the original f-string had no
  # placeholders, so a plain literal is used.
  reporter.generate_report(output_fig_file='src-word-acc',
                           output_fig_format='pdf',
                           output_directory='outputs')
  return reporter