def evaluate_sentences(comps, simps, simp_preds, calc_simp_pred_bleu=False, calc_comp_simp_bleu=False):
    comps = [sent.lower() for sent in comps]
    if isinstance(simps[0], list):
        simps = [[sent.lower() for sent in l] for l in simps]
    else:
        simps = [sent.lower() for sent in simps]  # lowercase here too, consistent with the other inputs
    simp_preds = [sent.lower() for sent in simp_preds]
    if isinstance(simps[0], list):
        refs = simps
    else:
        refs = [simps]
    bleu = corpus_bleu(simp_preds, refs, force=True, tokenizer='none', lowercase=True)
    sari = corpus_sari(comps, simp_preds, refs, tokenizer='none', lowercase=True)
    fkgl = corpus_fkgl(simp_preds, tokenizer='none')
    result = (bleu, sari, fkgl)
    if calc_simp_pred_bleu:
        result = result + (corpus_bleu(simp_preds, [comps]),)
    if calc_comp_simp_bleu:
        result = result + (corpus_bleu(comps, refs),)
    return result
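A minimal usage sketch for evaluate_sentences, assuming EASSE is installed; the import paths below are EASSE's public modules, and the sentences are invented toy data.

# Hypothetical usage; bring the metric functions the body relies on into scope.
from easse.bleu import corpus_bleu
from easse.fkgl import corpus_fkgl
from easse.sari import corpus_sari

comps = ["the feline reposed upon the rug because it was fatigued ."]
simps = [["the cat lay on the rug because it was tired ."]]  # one list per reference set
simp_preds = ["the cat sat on the rug because it was tired ."]
bleu, sari_score, fkgl = evaluate_sentences(comps, simps, simp_preds)
print(f"BLEU={bleu:.2f} SARI={sari_score:.2f} FKGL={fkgl:.2f}")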
def get_all_scores(
    orig_sents: List[str],
    sys_sents: List[str],
    refs_sents: List[List[str]],
    lowercase: bool = False,
    tokenizer: str = '13a',
    metrics: List[str] = DEFAULT_METRICS,
):
    scores = OrderedDict()
    if 'bleu' in metrics:
        scores['BLEU'] = corpus_bleu(sys_sents, refs_sents, force=True, tokenize=tokenizer, lowercase=lowercase).score
    if 'sari' in metrics:
        scores['SARI'] = corpus_sari(orig_sents, sys_sents, refs_sents, tokenizer=tokenizer, lowercase=lowercase)
    if 'samsa' in metrics:
        from easse.samsa import corpus_samsa
        scores['SAMSA'] = corpus_samsa(orig_sents, sys_sents, tokenizer=tokenizer, verbose=True, lowercase=lowercase)
    if 'fkgl' in metrics:
        scores['FKGL'] = corpus_fkgl(sys_sents, tokenizer=tokenizer)
    quality_estimation_scores = corpus_quality_estimation(orig_sents, sys_sents, tokenizer=tokenizer, lowercase=lowercase)
    scores = add_dicts(scores, quality_estimation_scores)
    return {key: round(value, 2) for key, value in scores.items()}
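A hedged example call, assuming the helpers get_all_scores references (corpus_bleu, corpus_sari, corpus_fkgl, corpus_quality_estimation, add_dicts, DEFAULT_METRICS) are already in scope; the sentences are toy data, and SAMSA is skipped because it needs extra dependencies.

scores = get_all_scores(
    orig_sents=["the feline reposed upon the rug ."],
    sys_sents=["the cat sat on the rug ."],
    refs_sents=[["the cat lay on the rug ."]],
    metrics=['bleu', 'sari', 'fkgl'],
)
print(scores)  # rounded metric scores plus the quality-estimation features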
def test_corpus_sari():
    orig_sents = get_orig_sents('turkcorpus_test')
    refs_sents = get_refs_sents('turkcorpus_test')
    system_outputs_dir = get_system_outputs_dir('turkcorpus_test')
    hyp_sents = read_lines(system_outputs_dir / "ACCESS")
    sari_score = sari.corpus_sari(orig_sents, hyp_sents, refs_sents)
    assert sari_score == pytest.approx(41.381013)  # Scores from MUSS https://arxiv.org/abs/2005.00352
def compute_sari(source, decoded_sents, refs):
    from easse.sari import corpus_sari
    score_list = []
    for source_sent, decoded, ref in zip(source, decoded_sents, refs):
        # corpus_sari expects lists of sentences, and a list of reference lists
        decoded = [decoded]
        ref = [[ref]]
        source_sent = [source_sent]
        sari_score = corpus_sari(orig_sents=source_sent, sys_sents=decoded, refs_sents=ref)
        score_list.append(sari_score)
    # Average the per-sentence SARI scores into a single tensor
    score = get_cuda(T.FloatTensor(score_list)).sum() / len(score_list)
    return score
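A sketch of how compute_sari might be called, assuming torch is imported as T; get_cuda is not defined in the snippet, so a minimal stand-in is included here.

import torch as T

def get_cuda(tensor):
    # Minimal stand-in for the helper assumed by compute_sari.
    return tensor.cuda() if T.cuda.is_available() else tensor

source = ["the feline reposed upon the rug ."]
decoded_sents = ["the cat sat on the rug ."]
refs = ["the cat lay on the rug ."]
print(compute_sari(source, decoded_sents, refs))  # mean per-sentence SARI as a 0-dim tensor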
def evaluate_system_output(
    test_set,
    sys_sents_path=None,
    orig_sents_path=None,
    refs_sents_paths=None,
    tokenizer='13a',
    metrics=','.join(DEFAULT_METRICS),
    analysis=False,
    quality_estimation=False,
):
    """
    Evaluate a system output with automatic metrics.
    """
    # get the metrics that need to be computed
    metrics = metrics.split(',')
    orig_sents, sys_sents, refs_sents = get_sents(test_set, orig_sents_path, sys_sents_path, refs_sents_paths)
    lowercase = is_test_set_lowercase(test_set)
    # compute each metric
    if 'bleu' in metrics:
        bleu_score = sacrebleu.corpus_bleu(sys_sents, refs_sents, force=True, tokenize=tokenizer, lowercase=lowercase).score
        click.echo(f'BLEU: {bleu_score:.2f}')
    if 'sari' in metrics:
        sari_score = corpus_sari(orig_sents, sys_sents, refs_sents, tokenizer=tokenizer, lowercase=lowercase)
        click.echo(f'SARI: {sari_score:.2f}')
    if 'samsa' in metrics:
        samsa_score = corpus_samsa(orig_sents, sys_sents, tokenizer=tokenizer, verbose=True, lowercase=lowercase)
        click.echo(f'SAMSA: {samsa_score:.2f}')
    if 'fkgl' in metrics:
        fkgl_score = corpus_fkgl(sys_sents, tokenizer=tokenizer)
        click.echo(f'FKGL: {fkgl_score:.2f}')
    if analysis:
        word_level_analysis = corpus_analyse_operations(orig_sents, sys_sents, refs_sents, verbose=False, as_str=True)
        click.echo(f'Word-level Analysis: {word_level_analysis}')
    if quality_estimation:
        quality_estimation_scores = corpus_quality_estimation(orig_sents, sys_sents, tokenizer=tokenizer, lowercase=lowercase)
        quality_estimation_scores = {k: round(v, 2) for k, v in quality_estimation_scores.items()}
        click.echo(f'Quality estimation: {quality_estimation_scores}')
def test_corpus_sari_plain():
    orig_sents = read_lines(DATA_DIR / "test_sets/turk/test.8turkers.tok.norm")
    ref_sents = []
    for n in range(8):
        ref_lines = read_lines(DATA_DIR / f"test_sets/turk/test.8turkers.tok.turk.{n}")
        ref_sents.append(ref_lines)

    hyp_sents = read_lines(DATA_DIR / "system_outputs/turk/lower/Dress-Ls.tok.low")
    sari_score = sari.corpus_sari(orig_sents, hyp_sents, ref_sents, tokenizer='plain')
    assert sari_score == pytest.approx(36.73586275692667)

    hyp_sents = read_lines(DATA_DIR / "system_outputs/turk/lower/Dress.tok.low")
    sari_score = sari.corpus_sari(orig_sents, hyp_sents, ref_sents, tokenizer='plain')
    assert sari_score == pytest.approx(36.5859900146575)

    hyp_sents = read_lines(DATA_DIR / "system_outputs/turk/lower/EncDecA.tok.low")
    sari_score = sari.corpus_sari(orig_sents, hyp_sents, ref_sents, tokenizer='plain')
    assert sari_score == pytest.approx(34.73946658449856)

    hyp_sents = read_lines(DATA_DIR / "system_outputs/turk/lower/Hybrid.tok.low")
    sari_score = sari.corpus_sari(orig_sents, hyp_sents, ref_sents, tokenizer='plain')
    assert sari_score == pytest.approx(31.008109926854227)

    hyp_sents = read_lines(DATA_DIR / "system_outputs/turk/lower/PBMT-R.tok.low")
    sari_score = sari.corpus_sari(orig_sents, hyp_sents, ref_sents, tokenizer='plain')
    assert sari_score == pytest.approx(37.817966679481013)

    hyp_sents = read_lines(DATA_DIR / "system_outputs/turk/lower/SBMT-SARI.tok.low")
    sari_score = sari.corpus_sari(orig_sents, hyp_sents, ref_sents, tokenizer='plain')
    assert sari_score == pytest.approx(39.360477024519125)
def test_corpus_sari_legacy():
    orig_sents = get_orig_sents('turkcorpus_test_legacy')
    refs_sents = get_refs_sents('turkcorpus_test_legacy')
    system_outputs_dir = get_system_outputs_dir('turkcorpus_test')

    hyp_sents = read_lines(system_outputs_dir / "tok.low/Dress-Ls.tok.low")
    sari_score = sari.corpus_sari(orig_sents, hyp_sents, refs_sents, legacy=True)
    assert sari_score == pytest.approx(37.266058818588216)

    hyp_sents = read_lines(system_outputs_dir / "tok.low/Dress.tok.low")
    sari_score = sari.corpus_sari(orig_sents, hyp_sents, refs_sents, legacy=True)
    assert sari_score == pytest.approx(37.08210095744638)

    hyp_sents = read_lines(system_outputs_dir / "tok.low/EncDecA.tok.low")
    sari_score = sari.corpus_sari(orig_sents, hyp_sents, refs_sents, legacy=True)
    assert sari_score == pytest.approx(35.65754396121206)

    hyp_sents = read_lines(system_outputs_dir / "tok.low/Hybrid.tok.low")
    sari_score = sari.corpus_sari(orig_sents, hyp_sents, refs_sents, legacy=True)
    assert sari_score == pytest.approx(31.39665078989411)

    hyp_sents = read_lines(system_outputs_dir / "tok.low/PBMT-R.tok.low")
    sari_score = sari.corpus_sari(orig_sents, hyp_sents, refs_sents, legacy=True)
    assert sari_score == pytest.approx(38.558843050332037)

    hyp_sents = read_lines(system_outputs_dir / "tok.low/SBMT-SARI.tok.low")
    sari_score = sari.corpus_sari(orig_sents, hyp_sents, refs_sents, legacy=True)
    assert sari_score == pytest.approx(39.964857928109127)
def get_rf_from_dev(dev_df, preds_dev, max_depth=None, random_state=19):
    preds_df = preds_dev.copy()
    dev_df_grouped = dev_df.groupby("input").agg({
        "output": list,
        "cosine_sim": list,
        "rouge_l": list,
        "input_len": max,
        "output_len": list,
    }).reset_index()
    # Each input has 5 predictions in preds_dev, so repeat its reference list
    # 5 times and flatten to align references with the prediction rows
    preds_df["ref"] = [
        l for sublist in dev_df_grouped["output"].apply(lambda x: [x] * 5).tolist() for l in sublist
    ]
    # Wrap each reference sentence in its own list to match corpus_sari's refs_sents format
    preds_df["ref"] = preds_df["ref"].apply(lambda x: [[i] for i in x])
    preds_df["pred_len"] = preds_df["pred"].apply(lambda x: len(get_word_tokens(x)))
    preds_df["input_len"] = preds_df["input"].apply(lambda x: len(get_word_tokens(x)))
    preds_df["sari"] = preds_df.apply(
        lambda x: corpus_sari(orig_sents=[x["input"]], sys_sents=[x["pred"]], refs_sents=x["ref"]),
        axis=1,
    )
    rf = RandomForestRegressor(n_estimators=1000, max_depth=max_depth, n_jobs=-1, random_state=random_state)
    X_train = preds_df[["cosine_sim", "rouge_l", "input_len", "pred_len"]]
    y_train = preds_df["sari"]
    rf.fit(X_train, y_train)
    return rf, preds_df
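A hedged inference sketch: once fitted, the forest estimates SARI from the four shallow features alone, so no references are needed at prediction time. dev_df and preds_dev are assumed to be pandas DataFrames with the columns used above and five candidate predictions per input.

rf, preds_df = get_rf_from_dev(dev_df, preds_dev)
X_new = preds_df[["cosine_sim", "rouge_l", "input_len", "pred_len"]]  # same columns and order as training
preds_df["sari_estimate"] = rf.predict(X_new)
print(preds_df[["pred", "sari", "sari_estimate"]].head())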
def test_corpus_sari_tokenize():
    orig_sents = read_lines(DATA_DIR / "test_sets/turk/test.8turkers.tok.norm")
    ref_sents = []
    for n in range(8):
        ref_lines = read_lines(DATA_DIR / f"test_sets/turk/test.8turkers.tok.turk.{n}")
        ref_sents.append(ref_lines)

    hyp_sents = read_lines(DATA_DIR / "system_outputs/turk/lower/Dress-Ls.tok.low")
    sari_score = sari.corpus_sari(orig_sents, hyp_sents, ref_sents)
    assert sari_score == pytest.approx(37.266058818588216)

    hyp_sents = read_lines(DATA_DIR / "system_outputs/turk/lower/Dress.tok.low")
    sari_score = sari.corpus_sari(orig_sents, hyp_sents, ref_sents)
    assert sari_score == pytest.approx(37.08210095744638)

    hyp_sents = read_lines(DATA_DIR / "system_outputs/turk/lower/EncDecA.tok.low")
    sari_score = sari.corpus_sari(orig_sents, hyp_sents, ref_sents)
    assert sari_score == pytest.approx(35.65754396121206)

    hyp_sents = read_lines(DATA_DIR / "system_outputs/turk/lower/Hybrid.tok.low")
    sari_score = sari.corpus_sari(orig_sents, hyp_sents, ref_sents)
    assert sari_score == pytest.approx(31.39665078989411)

    hyp_sents = read_lines(DATA_DIR / "system_outputs/turk/lower/PBMT-R.tok.low")
    sari_score = sari.corpus_sari(orig_sents, hyp_sents, ref_sents)
    assert sari_score == pytest.approx(38.558843050332037)

    hyp_sents = read_lines(DATA_DIR / "system_outputs/turk/lower/SBMT-SARI.tok.low")
    sari_score = sari.corpus_sari(orig_sents, hyp_sents, ref_sents)
    assert sari_score == pytest.approx(39.964857928109127)
def evaluate_system_output(
    test_set,
    sys_sents_path=None,
    orig_sents_path=None,
    refs_sents_paths=None,
    tokenizer='13a',
    lowercase=True,
    metrics=DEFAULT_METRICS,
    analysis=False,
    quality_estimation=False,
):
    '''
    Evaluate a system output with automatic metrics.
    '''
    sys_sents = get_sys_sents(test_set, sys_sents_path)
    orig_sents, refs_sents = get_orig_and_refs_sents(test_set, orig_sents_path, refs_sents_paths)
    # compute each metric
    metrics_scores = {}
    if 'bleu' in metrics:
        metrics_scores['bleu'] = corpus_bleu(sys_sents, refs_sents, force=True, tokenizer=tokenizer, lowercase=lowercase)
    if 'sent_bleu' in metrics:
        metrics_scores['sent_bleu'] = corpus_averaged_sentence_bleu(sys_sents, refs_sents, tokenizer=tokenizer, lowercase=lowercase)
    if 'sari' in metrics:
        metrics_scores['sari'] = corpus_sari(orig_sents, sys_sents, refs_sents, tokenizer=tokenizer, lowercase=lowercase)
    if 'sari_legacy' in metrics:
        metrics_scores['sari_legacy'] = corpus_sari(orig_sents, sys_sents, refs_sents, tokenizer=tokenizer, lowercase=lowercase, legacy=True)
    if 'samsa' in metrics:
        from easse.samsa import corpus_samsa
        metrics_scores['samsa'] = corpus_samsa(orig_sents, sys_sents, tokenizer=tokenizer, lowercase=lowercase, verbose=True)
    if 'fkgl' in metrics:
        metrics_scores['fkgl'] = corpus_fkgl(sys_sents, tokenizer=tokenizer)
    if 'f1_token' in metrics:
        metrics_scores['f1_token'] = corpus_f1_token(sys_sents, refs_sents, tokenizer=tokenizer, lowercase=lowercase)
    if analysis:
        metrics_scores['word_level_analysis'] = corpus_analyse_operations(orig_sents, sys_sents, refs_sents, verbose=False, as_str=True)
    if quality_estimation:
        metrics_scores['quality_estimation'] = corpus_quality_estimation(orig_sents, sys_sents, tokenizer=tokenizer, lowercase=lowercase)
    return metrics_scores
def get_qualitative_examples_html(orig_sents, sys_sents, refs_sents):
    title_key_print = [
        ('Randomly sampled simplifications',
         lambda c, s, refs: 0,
         lambda value: ''),
        ('Best simplifications according to SARI',
         lambda c, s, refs: -corpus_sari([c], [s], [refs]),
         lambda value: f'SARI={-value:.2f}'),
        ('Worst simplifications according to SARI',
         lambda c, s, refs: corpus_sari([c], [s], [refs]),
         lambda value: f'SARI={value:.2f}'),
        ('Simplifications with the most compression',
         lambda c, s, refs: get_compression_ratio(c, s),
         lambda value: f'compression_ratio={value:.2f}'),
        ('Simplifications with a high amount of paraphrasing',
         lambda c, s, refs: get_levenshtein_similarity(c, s) / get_compression_ratio(c, s),
         lambda value: f'levenshtein_similarity={value:.2f}'),
        ('Simplifications with the most sentence splits (if any)',
         lambda c, s, refs: -(count_sentences(s) - count_sentences(c)),
         lambda value: f'#sentence_splits={-value:.2f}'),
    ]

    def get_one_sample_html(orig_sent, sys_sent, ref_sents, sort_key, print_func):
        orig_sent, sys_sent, *ref_sents = [html.escape(sent) for sent in [orig_sent, sys_sent, *ref_sents]]
        doc = Doc()
        with doc.tag('div', klass='mb-2 p-1'):
            # Sort key
            with doc.tag('div', klass='text-muted small'):
                doc.asis(print_func(sort_key(orig_sent, sys_sent, ref_sents)))
            with doc.tag('div', klass='ml-2'):
                orig_sent_bold, sys_sent_bold = make_differing_words_bold(orig_sent, sys_sent, make_text_bold_html)
                # Source
                with doc.tag('div'):
                    doc.asis(orig_sent_bold)
                # Prediction
                with doc.tag('div'):
                    doc.asis(sys_sent_bold)
                # References
                collapse_id = get_random_html_id()
                with doc.tag('div', klass='position-relative'):
                    with doc.tag('a', ('data-toggle', 'collapse'), ('href', f'#{collapse_id}'), klass='stretched-link small'):
                        doc.text('References')
                    with doc.tag('div', klass='collapse', id=collapse_id):
                        for ref_sent in ref_sents:  # was `refs`, which only resolved through the enclosing scope
                            _, ref_sent_bold = make_differing_words_bold(orig_sent, ref_sent, make_text_bold_html)
                            with doc.tag('div', klass='text-muted'):
                                doc.asis(ref_sent_bold)
        return doc.getvalue()

    doc = Doc()
    for title, sort_key, print_func in title_key_print:
        with doc.tag('div', klass='container-fluid mt-4 p-2 border'):
            collapse_id = get_random_html_id()
            with doc.tag('a', ('data-toggle', 'collapse'), ('href', f'#{collapse_id}')):
                doc.line('h3', klass='m-2', text_content=title)
            # Now let's print the examples
            sample_generator = sorted(
                zip(orig_sents, sys_sents, zip(*refs_sents)),
                key=lambda args: sort_key(*args),
            )
            # Samples displayed by default
            with doc.tag('div', klass='collapse', id=collapse_id):
                n_samples = 50
                for i, (orig_sent, sys_sent, refs) in enumerate(sample_generator):
                    if i >= n_samples:
                        break
                    doc.asis(get_one_sample_html(orig_sent, sys_sent, refs, sort_key, print_func))
    return doc.getvalue()
def get_relative_sari(orig_sent, sys_sents, refs_sents, system_idx):
    saris = [corpus_sari([orig_sent], [sys_sent], refs_sents) for sys_sent in sys_sents]
    return saris[system_idx] / np.average(saris)
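A small example, assuming numpy is imported as np and corpus_sari comes from easse.sari; the sentences are invented. A return value above 1.0 means the chosen system beats the average SARI of all systems on that sentence.

import numpy as np
from easse.sari import corpus_sari

orig_sent = "the feline reposed upon the rug ."
sys_sents = ["the cat sat on the rug .", "feline reposed upon rug"]
refs_sents = [["the cat lay on the rug ."]]
print(get_relative_sari(orig_sent, sys_sents, refs_sents, system_idx=0))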
# ... inside a loop over the example file, each row is cleaned and appended:
row = (row.replace("\n", "").replace(",", " ").replace("article: ", "")
          .replace("ref: ", "").replace("dec: ", "").replace('"', ''))
pair += row + ","
test_file.close()
example_sentences.close()

###############################
file = open(csv_file_name, 'r', encoding='utf-8')
file2 = open(score_file_name, 'w', encoding='utf-8')
for i, row in enumerate(file):
    if i == 0:
        # Write the CSV header once
        file2.write("article,reference,decoded,rouge1,rouge2,rouge_L,sari\n")
        continue
    row = row.split("\n")[0]
    row = row.split(",")
    rouge_score = rg1.get_scores(row[2], row[1])  # was `rough_score`
    sari_score = corpus_sari(orig_sents=[row[0]], sys_sents=[row[2]], refs_sents=[[row[1]]])
    pair = (row[0] + "," + row[1] + "," + row[2] + ","
            + str(rouge_score[0]['rouge-1']['f']) + ","
            + str(rouge_score[0]['rouge-2']['f']) + ","
            + str(rouge_score[0]['rouge-l']['f']) + ","
            + str(sari_score) + "\n")
    file2.write(pair)
print("score file with name", score_file_name, "written to disk")
file2.close()
file.close()
def evaluate_system_output(
    test_set,
    sys_sents_path=None,
    orig_sents_path=None,
    refs_sents_paths=None,
    tokenizer="13a",
    lowercase=True,
    metrics=DEFAULT_METRICS,
    analysis=False,
    quality_estimation=False,
):
    """
    Evaluate a system output with automatic metrics.
    """
    for metric in metrics:
        assert metric in VALID_METRICS, f'"{metric}" not a valid metric. Valid metrics: {VALID_METRICS}'
    sys_sents = get_sys_sents(test_set, sys_sents_path)
    orig_sents, refs_sents = get_orig_and_refs_sents(test_set, orig_sents_path, refs_sents_paths)
    # compute each metric
    metrics_scores = {}
    if "bleu" in metrics:
        metrics_scores["bleu"] = corpus_bleu(sys_sents, refs_sents, force=True, tokenizer=tokenizer, lowercase=lowercase)
    if "sent_bleu" in metrics:
        metrics_scores["sent_bleu"] = corpus_averaged_sentence_bleu(sys_sents, refs_sents, tokenizer=tokenizer, lowercase=lowercase)
    if "sari" in metrics:
        metrics_scores["sari"] = corpus_sari(orig_sents, sys_sents, refs_sents, tokenizer=tokenizer, lowercase=lowercase)
    if "sari_legacy" in metrics:
        metrics_scores["sari_legacy"] = corpus_sari(orig_sents, sys_sents, refs_sents, tokenizer=tokenizer, lowercase=lowercase, legacy=True)
    if "sari_by_operation" in metrics:
        (
            metrics_scores["sari_add"],
            metrics_scores["sari_keep"],
            metrics_scores["sari_del"],
        ) = get_corpus_sari_operation_scores(orig_sents, sys_sents, refs_sents, tokenizer=tokenizer, lowercase=lowercase)
    if "samsa" in metrics:
        from easse.samsa import corpus_samsa  # Inline import to use EASSE without installing all dependencies
        metrics_scores["samsa"] = corpus_samsa(orig_sents, sys_sents, tokenizer=tokenizer, lowercase=lowercase, verbose=True)
    if "fkgl" in metrics:
        metrics_scores["fkgl"] = corpus_fkgl(sys_sents, tokenizer=tokenizer)
    if "f1_token" in metrics:
        metrics_scores["f1_token"] = corpus_f1_token(sys_sents, refs_sents, tokenizer=tokenizer, lowercase=lowercase)
    if "bertscore" in metrics:
        from easse.bertscore import corpus_bertscore  # Inline import to use EASSE without installing all dependencies
        (
            metrics_scores["bertscore_precision"],
            metrics_scores["bertscore_recall"],
            metrics_scores["bertscore_f1"],
        ) = corpus_bertscore(sys_sents, refs_sents, tokenizer=tokenizer, lowercase=lowercase)
    if analysis:
        from easse.annotation.word_level import WordOperationAnnotator  # Inline import to use EASSE without installing all dependencies
        word_operation_annotator = WordOperationAnnotator(tokenizer=tokenizer, lowercase=lowercase, verbose=True)
        metrics_scores["word_level_analysis"] = word_operation_annotator.analyse_operations(orig_sents, sys_sents, refs_sents, as_str=True)
    if quality_estimation:
        metrics_scores["quality_estimation"] = corpus_quality_estimation(orig_sents, sys_sents, tokenizer=tokenizer, lowercase=lowercase)
    return metrics_scores
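A hedged invocation sketch for this version: 'turkcorpus_test' matches the test-set names used in the tests above, and system_output.txt is a hypothetical path holding one simplification per line.

scores = evaluate_system_output(
    "turkcorpus_test",
    sys_sents_path="system_output.txt",  # hypothetical file
    metrics=["bleu", "sari", "fkgl"],
)
print({name: round(score, 2) for name, score in scores.items()})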
def get_qualitative_html_examples(orig_sents, sys_sents, refs_sents):
    title_key_print = [
        ('Randomly sampled simplifications',
         lambda c, s, refs: 0,
         lambda value: ''),
        ('Best simplifications according to SARI',
         lambda c, s, refs: -corpus_sari([c], [s], [refs]),
         lambda value: f'SARI={-value:.2f}'),
        ('Worst simplifications according to SARI',
         lambda c, s, refs: corpus_sari([c], [s], [refs]),
         lambda value: f'SARI={value:.2f}'),
        ('Simplifications with only one differing word',
         lambda c, s, refs: -(count_words(c) == count_words(s) == len(get_lcs(to_words(c), to_words(s))) + 1),
         lambda value: ''),
        ('Simplifications with the most compression',
         lambda c, s, refs: get_compression_ratio(c, s),
         lambda value: f'compression_ratio={value:.2f}'),
        ('Simplifications that are longer than the source',
         lambda c, s, refs: -get_compression_ratio(c, s),
         lambda value: f'compression_ratio={-value:.2f}'),
        ('Simplifications that paraphrase the source',
         lambda c, s, refs: get_levenshtein_similarity(c, s) / get_compression_ratio(c, s),
         lambda value: f'levenshtein_similarity={value:.2f}'),
        ('Simplifications that are the most similar to the source (excluding exact matches)',
         lambda c, s, refs: -get_levenshtein_similarity(c, s) * int(c != s),
         lambda value: f'levenshtein_similarity={-value:.2f}'),
        ('Simplifications with the most sentence splits (if there are any)',
         lambda c, s, refs: -count_sentence_splits(c, s),
         lambda value: f'nb_sentences_ratio={-value:.2f}'),
    ]

    def get_one_sample_html(orig_sent, sys_sent, ref_sents, sort_key, print_func):
        doc = Doc()
        with doc.tag('div', klass='mb-2 p-1'):
            # Sort key
            with doc.tag('div', klass='text-muted small'):
                doc.asis(print_func(sort_key(orig_sent, sys_sent, ref_sents)))
            with doc.tag('div', klass='ml-2'):
                orig_sent_bold, sys_sent_bold = make_differing_words_bold(orig_sent, sys_sent, make_text_bold_html)
                # Source
                with doc.tag('div'):
                    doc.asis(orig_sent_bold)
                # Prediction
                with doc.tag('div'):
                    doc.asis(sys_sent_bold)
                # References
                collapse_id = get_random_html_id()
                with doc.tag('div', klass='position-relative'):
                    with doc.tag('a', ('data-toggle', 'collapse'), ('href', f'#{collapse_id}'), klass='stretched-link small'):
                        doc.text('References')
                    with doc.tag('div', klass='collapse', id=collapse_id):
                        for ref_sent in ref_sents:  # was `refs`, which only resolved through the enclosing scope
                            _, ref_sent_bold = make_differing_words_bold(orig_sent, ref_sent, make_text_bold_html)
                            with doc.tag('div', klass='text-muted'):
                                doc.asis(ref_sent_bold)
        return doc.getvalue()

    doc = Doc()
    for title, sort_key, print_func in title_key_print:
        # stretched-link needs position-relative
        with doc.tag('div', klass='container-fluid mt-4 p-2 position-relative border'):
            doc.line('h3', klass='m-2', text_content=title)
            # Make whole div clickable to collapse / uncollapse examples
            collapse_id = get_random_html_id()
            with doc.tag('a', ('data-toggle', 'collapse'), ('href', f'#{collapse_id}'), klass='stretched-link'):
                pass  # doc.stag and doc.line don't seem to work with stretched-link
            # Now let's print the examples
            sample_generator = sorted(
                zip(orig_sents, sys_sents, zip(*refs_sents)),
                key=lambda args: sort_key(*args),
            )
            # Samples displayed by default
            with doc.tag('div', klass='collapse show', id=collapse_id):
                n_samples = 10
                for i, (orig_sent, sys_sent, refs) in enumerate(sample_generator):
                    if i >= n_samples:
                        break
                    doc.asis(get_one_sample_html(orig_sent, sys_sent, refs, sort_key, print_func))
    return doc.getvalue()
def evaluate_system_output(
    test_set,
    input_path=None,
    tokenizer='13a',
    metrics=','.join(VALID_METRICS),
    analysis=False,
    quality_estimation=False,
):
    """
    Evaluate a system output with automatic metrics.
    """
    if input_path is not None:
        sys_output = read_lines(input_path)
    else:
        # read the system output
        with click.get_text_stream('stdin', encoding='utf-8') as system_output_file:
            sys_output = system_output_file.read().splitlines()
    # get the metrics that need to be computed
    metrics = metrics.split(',')
    load_orig_sents = ('sari' in metrics) or ('samsa' in metrics) or analysis or quality_estimation
    load_refs_sents = ('sari' in metrics) or ('bleu' in metrics) or analysis
    # get the references from the test set
    if test_set in ['turk', 'turk_valid']:
        lowercase = False
        phase = 'test' if test_set == 'turk' else 'valid'
        if load_orig_sents:
            orig_sents = get_turk_orig_sents(phase=phase)
        if load_refs_sents:
            refs_sents = get_turk_refs_sents(phase=phase)
    if test_set in ['pwkp', 'pwkp_valid']:
        lowercase = True
        phase = 'test' if test_set == 'pwkp' else 'valid'
        if load_orig_sents:
            orig_sents = get_pwkp_orig_sents(phase=phase)
        if load_refs_sents:
            refs_sents = get_pwkp_refs_sents(phase=phase)
    if test_set == 'hsplit':
        sys_output = sys_output[:70]
        lowercase = True
        if load_orig_sents:
            orig_sents = get_hsplit_orig_sents()
        if load_refs_sents:
            refs_sents = get_hsplit_refs_sents()
    if load_orig_sents:
        assert len(sys_output) == len(orig_sents)
    if load_refs_sents:
        assert len(sys_output) == len(refs_sents[0])
    # compute each metric
    if 'bleu' in metrics:
        bleu_score = sacrebleu.corpus_bleu(sys_output, refs_sents, force=True, tokenize=tokenizer, lowercase=lowercase).score
        click.echo(f'BLEU: {bleu_score:.2f}')
    if 'sari' in metrics:
        sari_score = corpus_sari(orig_sents, sys_output, refs_sents, tokenizer=tokenizer, lowercase=lowercase)
        click.echo(f'SARI: {sari_score:.2f}')
    if 'samsa' in metrics:
        samsa_score = corpus_samsa(orig_sents, sys_output, tokenizer=tokenizer, verbose=True, lowercase=lowercase)
        click.echo(f'SAMSA: {samsa_score:.2f}')
    if 'fkgl' in metrics:
        fkgl_score = corpus_fkgl(sys_output, tokenizer=tokenizer)
        click.echo(f'FKGL: {fkgl_score:.2f}')
    if analysis:
        word_level_analysis = corpus_analyse_operations(orig_sents, sys_output, refs_sents, verbose=False, as_str=True)
        click.echo(f'Word-level Analysis: {word_level_analysis}')
    if quality_estimation:
        quality_estimation_scores = corpus_quality_estimation(orig_sents, sys_output, tokenizer=tokenizer, lowercase=lowercase)
        quality_estimation_scores = {k: round(v, 2) for k, v in quality_estimation_scores.items()}
        click.echo(f'Quality estimation: {quality_estimation_scores}')