def make_differing_words_bold(orig_sent, sys_sent, make_bold):
    '''Returns the two sentences with differing words in bold.

    Both sentences are split with `to_words`, and the words they share are
    obtained with `get_lcs` (presumably a longest common subsequence -- confirm
    against its definition). Every run of words that is not part of that shared
    sequence is passed through `make_bold`.

    Args:
        orig_sent: Source sentence.
        sys_sent: Sentence to compare against the source.
        make_bold: Callable taking a string and returning it marked up as bold.

    Returns:
        A tuple `(formatted_orig_sent, formatted_sys_sent)` of strings.
    '''

    def format_words(words, mutual_words):
        '''Makes all words bold except the mutual ones.

        `mutual_words` must occur in `words` in the same order; otherwise
        `next` exhausts the iterator and raises StopIteration.
        '''
        remaining_words = iter(words)
        chunks = []
        for mutual_word in mutual_words:
            # Gather the run of differing words preceding the next mutual word
            differing_words = []
            word = next(remaining_words)
            while word != mutual_word:
                differing_words.append(word)
                word = next(remaining_words)
            if differing_words:
                chunks.append(make_bold(' '.join(differing_words)))
            chunks.append(word)
        # Add remaining words, but only if there are any: an unconditional call
        # would emit an empty bold span at the end of the sentence.
        trailing_words = list(remaining_words)
        if trailing_words:
            chunks.append(make_bold(' '.join(trailing_words)))
        return ' '.join(chunks).strip()

    orig_words = to_words(orig_sent)
    sys_words = to_words(sys_sent)
    mutual_words = get_lcs(orig_words, sys_words)
    return format_words(orig_words, mutual_words), format_words(sys_words, mutual_words)
def truncate(sentence):
    '''Returns the first 80% of the words of the sentence, followed by a period.

    The number of kept words is rounded down with `int`.
    '''
    tokens = to_words(sentence)
    n_kept = int(len(tokens) * 0.8)
    kept_tokens = tokens[:n_kept]
    shortened = ' '.join(kept_tokens)
    return shortened + '.'
def get_qualitative_html_examples(orig_sents, sys_sents, refs_sents, n_samples=10):
    '''Returns an HTML fragment with example simplifications grouped by criterion.

    For each (title, sort key, print function) entry, the samples are sorted by
    the sort key in ascending order and the first `n_samples` are rendered in a
    collapsible section. Each sample shows the formatted sort value, the source
    and the system output with differing words in bold, and a collapsible list
    of references.

    Args:
        orig_sents: Source sentences.
        sys_sents: System outputs, aligned with `orig_sents`.
        refs_sents: Sequence of reference sets, each aligned with `orig_sents`
            (it is transposed with `zip(*refs_sents)` to get per-sample references).
        n_samples: Maximum number of samples displayed per section.

    Returns:
        The HTML as a string.

    NOTE(review): sentences are inserted with `doc.asis`; assuming `Doc` is
    yattag's, they are not HTML-escaped -- confirm the inputs are trusted.
    '''
    # Each entry: (section title, sort key (lower values come first), formatter of the sort value)
    title_key_print = [
        # Constant key: the sort is stable, so samples keep their input order
        ('Randomly sampled simplifications',
         lambda c, s, refs: 0,
         lambda value: ''),
        ('Best simplifications according to SARI',
         lambda c, s, refs: -corpus_sari([c], [s], [refs]),
         lambda value: f'SARI={-value:.2f}'),
        ('Worst simplifications according to SARI',
         lambda c, s, refs: corpus_sari([c], [s], [refs]),
         lambda value: f'SARI={value:.2f}'),
        # Negated boolean: matching samples get -1 and therefore sort first
        ('Simplifications with only one differing word',
         lambda c, s, refs: -(count_words(c) == count_words(s) == len(get_lcs(to_words(c), to_words(s))) + 1),
         lambda value: ''),
        ('Simplifications with the most compression',
         lambda c, s, refs: get_compression_ratio(c, s),
         lambda value: f'compression_ratio={value:.2f}'),
        ('Simplifications that are longer than the source',
         lambda c, s, refs: -get_compression_ratio(c, s),
         lambda value: f'compression_ratio={-value:.2f}'),
        ('Simplifications that paraphrase the source',
         lambda c, s, refs: get_levenshtein_similarity(c, s) / get_compression_ratio(c, s),
         lambda value: f'levenshtein_similarity={value:.2f}'),
        # Exact matches get a key of 0, which sorts after every negative similarity
        ('Simplifications that are the most similar to the source (excluding exact matches)',
         lambda c, s, refs: -get_levenshtein_similarity(c, s) * int(c != s),
         lambda value: f'levenshtein_similarity={-value:.2f}'),
        ('Simplifications with the most sentence splits (if there are any)',
         lambda c, s, refs: -count_sentence_splits(c, s),
         lambda value: f'nb_sentences_ratio={-value:.2f}'),
    ]

    def get_one_sample_html(orig_sent, sys_sent, ref_sents, sort_key, print_func):
        '''Returns the HTML of one sample: sort value, source, prediction and references.'''
        doc = Doc()
        with doc.tag('div', klass='mb-2 p-1'):
            # Sort key
            with doc.tag('div', klass='text-muted small'):
                doc.asis(print_func(sort_key(orig_sent, sys_sent, ref_sents)))
            with doc.tag('div', klass='ml-2'):
                orig_sent_bold, sys_sent_bold = make_differing_words_bold(orig_sent, sys_sent, make_text_bold_html)
                # Source
                with doc.tag('div'):
                    doc.asis(orig_sent_bold)
                # Prediction
                with doc.tag('div'):
                    doc.asis(sys_sent_bold)
                # References
                collapse_id = get_random_html_id()
                with doc.tag('div', klass='position-relative'):
                    with doc.tag('a', ('data-toggle', 'collapse'), ('href', f'#{collapse_id}'),
                                 klass='stretched-link small'):
                        doc.text('References')
                    with doc.tag('div', klass='collapse', id=collapse_id):
                        # Iterate over the parameter, not over a variable captured
                        # from the enclosing scope.
                        for ref_sent in ref_sents:
                            _, ref_sent_bold = make_differing_words_bold(orig_sent, ref_sent, make_text_bold_html)
                            with doc.tag('div', klass='text-muted'):
                                doc.asis(ref_sent_bold)
        return doc.getvalue()

    doc = Doc()
    for title, sort_key, print_func in title_key_print:
        # stretched-link needs position-relative
        with doc.tag('div', klass='container-fluid mt-4 p-2 position-relative border'):
            doc.line('h3', klass='m-2', text_content=title)
            # Make whole div clickable to collapse / uncollapse examples
            collapse_id = get_random_html_id()
            with doc.tag('a', ('data-toggle', 'collapse'), ('href', f'#{collapse_id}'), klass='stretched-link'):
                pass  # doc.stag and doc.line don't seem to work with stretched-link
            # Now let's print the examples
            sorted_samples = sorted(
                zip(orig_sents, sys_sents, zip(*refs_sents)),
                key=lambda sample: sort_key(*sample),
            )
            # Samples displayed by default
            with doc.tag('div', klass='collapse show', id=collapse_id):
                for orig_sent, sys_sent, ref_sents in sorted_samples[:n_samples]:
                    doc.asis(get_one_sample_html(orig_sent, sys_sent, ref_sents, sort_key, print_func))
    return doc.getvalue()