def multiple_systems_report(
    test_set,
    sys_sents_paths,
    orig_sents_path=None,
    refs_sents_paths=None,
    report_path='easse_report.html',
    tokenizer='13a',
    lowercase=True,
    metrics=DEFAULT_METRICS,
    system_names=None,
):
    '''
    Create a HTML report file comparing multiple systems with automatic metrics, plots and samples.
    '''
    # Load every system's output sentences, one list of sentences per system.
    sys_sents_list = []
    for sys_path in sys_sents_paths:
        sys_sents_list.append(read_lines(sys_path))
    orig_sents, refs_sents = get_orig_and_refs_sents(test_set, orig_sents_path, refs_sents_paths)
    # Default the displayed system names to the output file names.
    if system_names is None:
        system_names = [Path(sys_path).name for sys_path in sys_sents_paths]
    write_multiple_systems_html_report(
        report_path,
        orig_sents,
        sys_sents_list,
        refs_sents,
        system_names=system_names,
        test_set=test_set,
        lowercase=lowercase,
        tokenizer=tokenizer,
        metrics=metrics,
    )
def test_corpus_sari():
    """Corpus SARI on the ACCESS output must match the score reported in MUSS."""
    # Reference scores from MUSS https://arxiv.org/abs/2005.00352
    source_sents = get_orig_sents('turkcorpus_test')
    reference_sents = get_refs_sents('turkcorpus_test')
    outputs_dir = get_system_outputs_dir('turkcorpus_test')
    access_sents = read_lines(outputs_dir / "ACCESS")
    score = sari.corpus_sari(source_sents, access_sents, reference_sents)
    assert score == pytest.approx(41.381013)
def test_corpus_sari_plain():
    """Corpus SARI with the 'plain' tokenizer must match the known scores for each system."""
    orig_sents = read_lines(DATA_DIR / "test_sets/turk/test.8turkers.tok.norm")
    # The Turk corpus provides 8 reference simplifications per source sentence.
    ref_sents = [
        read_lines(DATA_DIR / f"test_sets/turk/test.8turkers.tok.turk.{n}")
        for n in range(8)
    ]
    # Expected SARI score per system output file (was 6 copy-pasted check blocks).
    expected_scores = {
        "Dress-Ls.tok.low": 36.73586275692667,
        "Dress.tok.low": 36.5859900146575,
        "EncDecA.tok.low": 34.73946658449856,
        "Hybrid.tok.low": 31.008109926854227,
        "PBMT-R.tok.low": 37.817966679481013,
        "SBMT-SARI.tok.low": 39.360477024519125,
    }
    for filename, expected_score in expected_scores.items():
        hyp_sents = read_lines(DATA_DIR / f"system_outputs/turk/lower/{filename}")
        sari_score = sari.corpus_sari(orig_sents, hyp_sents, ref_sents, tokenizer='plain')
        # Include the filename so a failure identifies the offending system.
        assert sari_score == pytest.approx(expected_score), filename
def get_sys_sents(test_set, sys_sents_path=None):
    """Return the system sentences to be evaluated, from a file or from stdin."""
    if sys_sents_path is None:
        # No file given: read the system output from stdin instead.
        with click.get_text_stream('stdin', encoding='utf-8') as system_output_file:
            return system_output_file.read().splitlines()
    return read_lines(sys_sents_path)
def get_orig_and_refs_sents(test_set, orig_sents_path=None, refs_sents_paths=None):
    """Return (orig_sents, refs_sents) for a named test set, or from custom paths.

    For test_set == 'custom', orig_sents_path and refs_sents_paths are required;
    refs_sents_paths may be a list of paths or a single comma-separated string.
    """
    # Get original and reference sentences
    if test_set == 'custom':
        assert orig_sents_path is not None
        assert refs_sents_paths is not None
        # isinstance instead of type() == str: also accepts str subclasses.
        if isinstance(refs_sents_paths, str):
            refs_sents_paths = refs_sents_paths.split(',')
        orig_sents = read_lines(orig_sents_path)
        refs_sents = [read_lines(ref_sents_path) for ref_sents_path in refs_sents_paths]
    else:
        orig_sents = get_orig_sents(test_set)
        refs_sents = get_refs_sents(test_set)
    # Final checks: every reference file must align line-by-line with the source.
    # Message added for consistency with the other definition of this function.
    assert all(
        len(orig_sents) == len(ref_sents) for ref_sents in refs_sents
    ), f'Not same number of lines for test_set={test_set}, orig_sents_path={orig_sents_path}, refs_sents_paths={refs_sents_paths}'  # noqa: E501
    return orig_sents, refs_sents
# NOTE(review): this file contains two definitions of get_orig_and_refs_sents;
# in Python the later definition silently wins — consider removing the other.
def get_orig_and_refs_sents(test_set, orig_sents_path=None, refs_sents_paths=None):
    """Return (orig_sents, refs_sents) for a named test set, or from custom paths.

    For test_set == "custom", orig_sents_path and refs_sents_paths are required;
    refs_sents_paths may be a list of paths or a single comma-separated string.
    """
    # Get original and reference sentences
    if test_set == "custom":
        assert orig_sents_path is not None
        assert refs_sents_paths is not None
        # isinstance instead of type() == str: also accepts str subclasses.
        if isinstance(refs_sents_paths, str):
            refs_sents_paths = refs_sents_paths.split(",")
        orig_sents = read_lines(orig_sents_path)
        refs_sents = [read_lines(ref_sents_path) for ref_sents_path in refs_sents_paths]
    else:
        orig_sents = get_orig_sents(test_set)
        refs_sents = get_refs_sents(test_set)
    # Final checks: every reference file must align line-by-line with the source.
    assert all(
        len(orig_sents) == len(ref_sents) for ref_sents in refs_sents
    ), f'Not same number of lines for test_set={test_set}, orig_sents_path={orig_sents_path}, refs_sents_paths={refs_sents_paths}'  # noqa: E501
    return orig_sents, refs_sents
def get_sents(test_set, orig_sents_path=None, sys_sents_path=None, refs_sents_paths=None):
    """Return (orig_sents, sys_sents, refs_sents) for evaluation.

    System sentences come from sys_sents_path, or from stdin when it is None.
    For a non-custom test_set the orig/refs paths must not be given — they are
    looked up in TEST_SETS_PATHS. All sentence lists must have equal length.
    """
    if sys_sents_path is not None:
        sys_sents = read_lines(sys_sents_path)
    else:
        # read the system output
        with click.get_text_stream('stdin', encoding='utf-8') as system_output_file:
            sys_sents = system_output_file.read().splitlines()
    # isinstance instead of type() == str: also accepts str subclasses.
    if isinstance(refs_sents_paths, str):
        # Allow a single comma-separated string of reference paths.
        refs_sents_paths = refs_sents_paths.split(',')
    if test_set != 'custom':
        # Named test sets define their own paths; custom paths would be ignored.
        assert orig_sents_path is None
        assert refs_sents_paths is None
        orig_sents_path = TEST_SETS_PATHS[(test_set, 'orig')]
        refs_sents_paths = TEST_SETS_PATHS[(test_set, 'refs')]
    assert orig_sents_path is not None
    assert refs_sents_paths is not None
    orig_sents = read_lines(orig_sents_path)
    refs_sents = [read_lines(ref_sents_path) for ref_sents_path in refs_sents_paths]
    # Sanity checks: every file must align line-by-line.
    assert len(sys_sents) == len(orig_sents)
    assert all(len(sys_sents) == len(ref_sents) for ref_sents in refs_sents)
    return orig_sents, sys_sents, refs_sents
def test_corpus_sari_tokenize():
    """Corpus SARI with the default tokenizer must match the known scores for each system."""
    orig_sents = read_lines(DATA_DIR / "test_sets/turk/test.8turkers.tok.norm")
    # The Turk corpus provides 8 reference simplifications per source sentence.
    ref_sents = [
        read_lines(DATA_DIR / f"test_sets/turk/test.8turkers.tok.turk.{n}")
        for n in range(8)
    ]
    # Expected SARI score per system output file (was 6 copy-pasted check blocks).
    expected_scores = {
        "Dress-Ls.tok.low": 37.266058818588216,
        "Dress.tok.low": 37.08210095744638,
        "EncDecA.tok.low": 35.65754396121206,
        "Hybrid.tok.low": 31.39665078989411,
        "PBMT-R.tok.low": 38.558843050332037,
        "SBMT-SARI.tok.low": 39.964857928109127,
    }
    for filename, expected_score in expected_scores.items():
        hyp_sents = read_lines(DATA_DIR / f"system_outputs/turk/lower/{filename}")
        sari_score = sari.corpus_sari(orig_sents, hyp_sents, ref_sents)
        # Include the filename so a failure identifies the offending system.
        assert sari_score == pytest.approx(expected_score), filename
def test_corpus_sari_legacy():
    """Legacy corpus SARI on the legacy Turk corpus must match the known scores."""
    orig_sents = get_orig_sents('turkcorpus_test_legacy')
    refs_sents = get_refs_sents('turkcorpus_test_legacy')
    system_outputs_dir = get_system_outputs_dir('turkcorpus_test')
    # Expected SARI score per system output file (was 6 copy-pasted check blocks).
    expected_scores = {
        "Dress-Ls.tok.low": 37.266058818588216,
        "Dress.tok.low": 37.08210095744638,
        "EncDecA.tok.low": 35.65754396121206,
        "Hybrid.tok.low": 31.39665078989411,
        "PBMT-R.tok.low": 38.558843050332037,
        "SBMT-SARI.tok.low": 39.964857928109127,
    }
    for filename, expected_score in expected_scores.items():
        hyp_sents = read_lines(system_outputs_dir / f"tok.low/{filename}")
        sari_score = sari.corpus_sari(orig_sents, hyp_sents, refs_sents, legacy=True)
        # Include the filename so a failure identifies the offending system.
        assert sari_score == pytest.approx(expected_score), filename
def report(test_set, input_path=None, report_path='report.html', tokenizer='13a', metrics=','.join(DEFAULT_METRICS)):
    """
    Create a HTML report file with automatic metrics, plots and samples.
    """
    if input_path is not None:
        sys_output = read_lines(input_path)
    else:
        # read the system output
        with click.get_text_stream('stdin', encoding='utf-8') as system_output_file:
            sys_output = system_output_file.read().splitlines()
    if test_set in ['turk', 'turk_valid']:
        lowercase = False
        phase = 'test' if test_set == 'turk' else 'valid'
        refs_sents = get_turk_refs_sents(phase=phase)
        orig_sents = get_turk_orig_sents(phase=phase)
    elif test_set == 'hsplit':
        # HSplit only provides references for the first 70 sentences.
        sys_output = sys_output[:70]
        lowercase = True
        refs_sents = get_hsplit_refs_sents()
        orig_sents = get_hsplit_orig_sents()
    else:
        # Fail fast with a clear message: previously an unknown test_set fell
        # through and crashed below with a NameError on undefined variables.
        raise ValueError(f'Unknown test_set: {test_set}')
    write_html_report(
        report_path,
        orig_sents,
        sys_output,
        refs_sents,
        test_set_name=test_set,
        lowercase=lowercase,
        tokenizer=tokenizer,
        metrics=metrics,
    )
def evaluate_system_output(
    test_set,
    input_path=None,
    tokenizer='13a',
    metrics=','.join(VALID_METRICS),
    analysis=False,
    quality_estimation=False,
):
    """
    Evaluate a system output with automatic metrics.

    test_set: one of 'turk', 'turk_valid', 'pwkp', 'pwkp_valid', 'hsplit'.
    input_path: file with the system output; when None, reads from stdin.
    tokenizer: tokenizer name passed through to each metric.
    metrics: comma-separated metric names (subset of VALID_METRICS).
    analysis: also print a word-level operation analysis.
    quality_estimation: also print quality-estimation scores.

    Results are printed with click.echo; nothing is returned.
    """
    if input_path is not None:
        sys_output = read_lines(input_path)
    else:
        # read the system output
        with click.get_text_stream('stdin', encoding='utf-8') as system_output_file:
            sys_output = system_output_file.read().splitlines()
    # get the metrics that need to be computed
    metrics = metrics.split(',')
    # Only load orig/refs sentences when some requested metric needs them.
    load_orig_sents = ('sari' in metrics) or ('samsa' in metrics) or analysis or quality_estimation
    load_refs_sents = ('sari' in metrics) or ('bleu' in metrics) or analysis
    # get the references from the test set
    # NOTE(review): an unrecognized test_set leaves lowercase/orig_sents/refs_sents
    # unbound and crashes below with a NameError — consider validating up front.
    if test_set in ['turk', 'turk_valid']:
        lowercase = False
        phase = 'test' if test_set == 'turk' else 'valid'
        if load_orig_sents:
            orig_sents = get_turk_orig_sents(phase=phase)
        if load_refs_sents:
            refs_sents = get_turk_refs_sents(phase=phase)
    if test_set in ['pwkp', 'pwkp_valid']:
        lowercase = True
        phase = 'test' if test_set == 'pwkp' else 'valid'
        if load_orig_sents:
            orig_sents = get_pwkp_orig_sents(phase=phase)
        if load_refs_sents:
            refs_sents = get_pwkp_refs_sents(phase=phase)
    if test_set == 'hsplit':
        # HSplit only provides references for the first 70 sentences.
        sys_output = sys_output[:70]
        lowercase = True
        if load_orig_sents:
            orig_sents = get_hsplit_orig_sents()
        if load_refs_sents:
            refs_sents = get_hsplit_refs_sents()
    # Sanity checks: system output must align line-by-line with the test set.
    if load_orig_sents:
        assert len(sys_output) == len(orig_sents)
    if load_refs_sents:
        assert len(sys_output) == len(refs_sents[0])
    # compute each metric
    if 'bleu' in metrics:
        bleu_score = sacrebleu.corpus_bleu(sys_output, refs_sents, force=True, tokenize=tokenizer, lowercase=lowercase).score
        click.echo(f'BLEU: {bleu_score:.2f}')
    if 'sari' in metrics:
        sari_score = corpus_sari(orig_sents, sys_output, refs_sents, tokenizer=tokenizer, lowercase=lowercase)
        click.echo(f'SARI: {sari_score:.2f}')
    if 'samsa' in metrics:
        samsa_score = corpus_samsa(orig_sents, sys_output, tokenizer=tokenizer, verbose=True, lowercase=lowercase)
        click.echo(f'SAMSA: {samsa_score:.2f}')
    if 'fkgl' in metrics:
        # FKGL only needs the system output itself, not orig/refs.
        fkgl_score = corpus_fkgl(sys_output, tokenizer=tokenizer)
        click.echo(f'FKGL: {fkgl_score:.2f}')
    if analysis:
        word_level_analysis = corpus_analyse_operations(orig_sents, sys_output, refs_sents, verbose=False, as_str=True)
        click.echo(f'Word-level Analysis: {word_level_analysis}')
    if quality_estimation:
        quality_estimation_scores = corpus_quality_estimation(
            orig_sents, sys_output, tokenizer=tokenizer, lowercase=lowercase
        )
        quality_estimation_scores = {k: round(v, 2) for k, v in quality_estimation_scores.items()}
        click.echo(f'Quality estimation: {quality_estimation_scores}')
def get_turk_orig_sents(phase):
    """Return the Turk corpus source sentences for the given phase ('valid' or 'test')."""
    assert phase in ['valid', 'test']
    # On disk the validation split is named 'tune'.
    file_phase = 'tune' if phase == 'valid' else phase
    return read_lines(DATA_DIR / f'test_sets/turk/{file_phase}.8turkers.tok.norm')
def get_turk_refs_sents(phase):
    """Return the 8 Turk reference simplifications for the given phase ('valid' or 'test')."""
    assert phase in ['valid', 'test']
    # On disk the validation split is named 'tune'.
    file_phase = 'tune' if phase == 'valid' else phase
    refs_sents = []
    for i in range(8):
        refs_sents.append(read_lines(DATA_DIR / f'test_sets/turk/{file_phase}.8turkers.tok.turk.{i}'))
    return refs_sents
def get_orig_sents(test_set):
    """Return the original (source) sentences of a registered test set."""
    # Deprecated names are transparently mapped to their current equivalents.
    canonical_test_set = maybe_map_deprecated_test_set_to_new_test_set(test_set)
    orig_path = TEST_SETS_PATHS[(canonical_test_set, 'orig')]
    return read_lines(orig_path)
def get_refs_sents(test_set):
    """Return the reference sentence lists of a registered test set, one list per reference file."""
    # Deprecated names are transparently mapped to their current equivalents.
    canonical_test_set = maybe_map_deprecated_test_set_to_new_test_set(test_set)
    refs_sents = []
    for ref_sents_path in TEST_SETS_PATHS[(canonical_test_set, 'refs')]:
        refs_sents.append(read_lines(ref_sents_path))
    return refs_sents
def get_hsplit_refs_sents():
    """Return the 4 HSplit reference files (hsplit.tok.1 .. hsplit.tok.4) as sentence lists."""
    return [read_lines(DATA_DIR / f'test_sets/hsplit/hsplit.tok.{i}') for i in range(1, 5)]
def get_pwkp_refs_sents(phase):
    """Return the single PWKP reference as a one-element list of sentence lists."""
    assert phase in ['valid', 'test']
    ref_path = DATA_DIR / f'test_sets/pwkp/pwkp.{phase}.dst'
    # Wrapped in a list for consistency with multi-reference test sets.
    return [read_lines(ref_path)]
def get_pwkp_orig_sents(phase):
    """Return the PWKP source sentences for the given phase ('valid' or 'test')."""
    assert phase in ['valid', 'test']
    src_path = DATA_DIR / f'test_sets/pwkp/pwkp.{phase}.src'
    return read_lines(src_path)