def visualize_ner_tags(self, display_index=range(5), save_display_html: bool = False,
                       save_all_html: bool = True, **kwargs):
    """Render the NER tags of ``self.sentences`` as HTML and show a preview.

    Args:
        display_index: Indices into ``self.sentences`` selecting the
            sentences rendered in the displayed preview.
        save_display_html: When true, write the preview HTML to
            ``sentences_true_example.html`` under ``self.path``.
        save_all_html: When true, render every sentence and write the result
            to ``sentences_true_all.html`` under ``self.path``.
        **kwargs: Forwarded unchanged to ``render_ner_html``.
    """
    preview_sentences = [self.sentences[idx] for idx in display_index]
    preview_html = render_ner_html(preview_sentences, **kwargs)
    display(HTML(preview_html))

    if save_display_html:
        preview_file = self.path / 'sentences_true_example.html'
        preview_file.write_text(preview_html)

    if save_all_html:
        # The full export is rendered separately from the preview subset.
        complete_html = render_ner_html(self.sentences, **kwargs)
        complete_file = self.path / 'sentences_true_all.html'
        complete_file.write_text(complete_html)
def test_html_rendering():
    """Check that labelled spans are rendered as coloured entities in the HTML page."""
    text = ("Boris Johnson has been elected new Conservative leader in "
            "a ballot of party members and will become the "
            "next UK prime minister. &")
    sentence = Sentence(text)

    # (token slice, NER label) pairs, labelled and printed in this order.
    labelled_slices = (
        (slice(0, 2), "PER"),
        (slice(6, 7), "MISC"),
        (slice(19, 20), "LOC"),
    )
    for token_slice, ner_label in labelled_slices:
        print(sentence[token_slice].add_label("ner", ner_label))

    colors = {
        "PER": "#F7FF53",
        "ORG": "#E8902E",
        "LOC": "yellow",
        "MISC": "#4647EB",
        "O": "#ddd",
    }
    actual = render_ner_html([sentence], colors=colors)

    # Build the expected markup piece by piece, in reading order.
    person = TAGGED_ENTITY.format(
        color="#F7FF53", entity="Boris Johnson", label="PER")
    party = TAGGED_ENTITY.format(
        color="#4647EB", entity="Conservative", label="MISC")
    country = TAGGED_ENTITY.format(color="yellow", entity="UK", label="LOC")
    rendered_sentence = (
        person
        + " has been elected new "
        + party
        + " leader in a ballot of party members and will become the next "
        + country
        + " prime minister. &"
    )
    paragraph = PARAGRAPH.format(sentence=rendered_sentence)
    expected_res = HTML_PAGE.format(text=paragraph, title="Flair")

    assert expected_res == actual
def test_html_rendering():
    """Check the rendered HTML for a sentence whose spans and text are mocked."""
    text = (
        "Boris Johnson has been elected new Conservative leader in a ballot of party members and will become the "
        "next UK prime minister. &")

    # (label, start offset, end offset) for each mocked entity span.
    span_specs = (
        ("PER", 0, 13),
        ("MISC", 35, 47),
        ("LOC", 109, 111),
    )
    mocked_spans = [
        mock_ner_span(text, tag, begin, end) for tag, begin, end in span_specs
    ]

    sent = Sentence()
    sent.get_spans = MagicMock(return_value=mocked_spans)
    sent.to_original_text = MagicMock(return_value=text)

    colors = {
        "PER": "#F7FF53",
        "ORG": "#E8902E",
        "LOC": "yellow",
        "MISC": "#4647EB",
        "O": "#ddd",
    }
    actual = render_ner_html([sent], colors=colors)

    # Build the expected markup piece by piece, in reading order.
    person = TAGGED_ENTITY.format(
        color="#F7FF53", entity="Boris Johnson", label="PER")
    party = TAGGED_ENTITY.format(
        color="#4647EB", entity="Conservative", label="MISC")
    country = TAGGED_ENTITY.format(color="yellow", entity="UK", label="LOC")
    rendered_sentence = (
        person
        + " has been elected new "
        + party
        + " leader in a ballot of party members and will become the next "
        + country
        + " prime minister. &"
    )
    expected_res = HTML_PAGE.format(
        text=PARAGRAPH.format(sentence=rendered_sentence),
        title="Flair",
    )

    assert expected_res == actual
def main(data_folder: str, output_folder: str, model_folder: str) -> None:
    """Tag every ``.txt`` file of a folder and write one HTML page per file.

    Each line of an input file is passed to the tagger as one sentence. The
    tagged sentences are rendered with ``render_ner_html`` using the
    module-level ``colors`` mapping (defined outside this block), and the page
    is written to ``<output_folder>/<case name>.html``.

    Args:
        data_folder: Folder scanned (non-recursively) for ``.txt`` files.
        output_folder: Folder receiving the generated HTML pages.
        model_folder: Folder containing the ``best-model.pt`` tagger file.
    """
    txt_suffix = ".txt"

    nlp: Language = spacy.blank('fr')
    nlp.tokenizer = get_tokenizer(nlp)
    tokenizer = build_spacy_tokenizer(nlp)

    filenames = [
        filename for filename in os.listdir(data_folder)
        if filename.endswith(txt_suffix)
    ]

    tagger: SequenceTagger = SequenceTagger.load(
        os.path.join(model_folder, 'best-model.pt'))

    for filename in tqdm(iterable=filenames, unit=" txt", desc="anonymize cases"):
        with open(os.path.join(data_folder, filename), 'r') as input_f:
            lines = input_f.readlines()

        sentences = tagger.predict(sentences=lines,
                                   mini_batch_size=32,
                                   verbose=False,
                                   use_tokenizer=tokenizer)

        # Strip only the trailing ".txt". Splitting on the first dot would map
        # "a.b.txt" and "a.c.txt" to the same "a.html" and overwrite results.
        case_name = filename[:-len(txt_suffix)]
        page_html = render_ner_html(sentences, colors=colors, title=case_name)

        with open(os.path.join(output_folder, case_name + ".html"), "w") as output:
            output.write(page_html)
def predict(self, sentences: Union[str, Sentence, List[Sentence], List[str]],
            display_html: bool = True, html_file: str = None,
            display_str: bool = False, **kwargs):
    """Run the model on the given text and optionally display or save the result.

    Args:
        sentences: A single ``Sentence``, a raw string (split into sentences
            with ``split_single``), or a list of either. A list is treated as
            a list of strings when its first element is a string.
        display_html: When true, display the rendered HTML.
        html_file: When set, file name under ``self.path`` to which the
            rendered HTML is written.
        display_str: When true, print each sentence's tagged string.
        **kwargs: Forwarded unchanged to ``render_ner_html``.

    Returns:
        None. Empty input (an empty list, or text yielding no sentence)
        returns immediately without running the model or writing anything.
    """
    # isinstance (rather than an exact type comparison) also accepts subclasses.
    if isinstance(sentences, Sentence):
        sentences = [sentences]
    elif isinstance(sentences, str):
        sentences = split_single(sentences)

    # Nothing to predict. This also protects the sentences[0] lookup below,
    # which would raise IndexError on an empty list.
    if not sentences:
        return

    if isinstance(sentences[0], str):
        sentences = [Sentence(s, use_tokenizer=True) for s in sentences]

    self.model.predict(sentences)

    if display_html or html_file:
        html = render_ner_html(sentences, **kwargs)
        if display_html:
            display(HTML(html))
        if html_file:
            (self.path / html_file).write_text(html)

    if display_str:
        for sentence in sentences:
            print(sentence.to_tagged_string())
def main(data_folder: str, model_folder: str, top_n: int) -> None:
    """Tag the first paragraphs of each XML case and write them to ``sentence.html``.

    Only files holding strictly more than ``top_n`` paragraphs contribute, and
    from those only the non-empty paragraphs among the first ``top_n`` are
    kept. The rendering uses the module-level ``colors`` mapping (defined
    outside this block).

    Args:
        data_folder: Folder scanned (non-recursively) for ``.xml`` files.
        model_folder: Folder containing the ``best-model.pt`` tagger file.
        top_n: Number of leading paragraphs considered per file.

    Raises:
        Exception: If no sentence could be collected.
    """
    print(f"keep only top {top_n} examples per file")

    nlp: Language = spacy.blank('fr')
    nlp.tokenizer = get_tokenizer(nlp)
    tokenizer = build_spacy_tokenizer(nlp)

    xml_names = [name for name in os.listdir(data_folder) if name.endswith(".xml")]

    sentences: List[Sentence] = []
    with tqdm(total=len(xml_names), unit=" XML", desc="Parsing XML") as progress_bar:
        for xml_name in xml_names:
            xml_path = os.path.join(data_folder, xml_name)
            paragraphs: List[Paragraph] = get_paragraph_from_file(
                path=xml_path,
                keep_paragraph_without_annotation=True)
            if len(paragraphs) > top_n:
                sentences.extend(
                    Sentence(text=paragraph.text, tokenizer=tokenizer)
                    for paragraph in paragraphs[:top_n]
                    if len(paragraph.text) > 0
                )
            # One tick per file, matching the bar's total.
            progress_bar.update()

    if not sentences:
        raise Exception(
            "No example loaded, causes: no cases in provided path or sample size is to high"
        )

    model_path = os.path.join(model_folder, 'best-model.pt')
    tagger: SequenceTagger = SequenceTagger.load(model_path)
    # The sentences passed in are rendered afterwards; the return value is unused.
    _ = tagger.predict(sentences=sentences,
                       mini_batch_size=32,
                       verbose=True,
                       embedding_storage_mode="cpu")

    print("prepare html")
    page_html = render_ner_html(sentences, colors=colors)

    print("write html")
    with open("sentence.html", "w") as writer:
        writer.write(page_html)
def predict_flair(model, text):
    """Tag ``text`` with ``model`` and return the NER rendering as an HTML fragment.

    Args:
        model: Object exposing ``predict(sentence)``; it is called on the
            sentence built from ``text``.
        text: Raw text to analyse.

    Returns:
        The value returned by ``render_ner_html`` called with
        ``wrap_page=False`` and the module-level ``colors`` mapping (defined
        outside this block).
    """
    # Build the sentence from the argument. The previous code read the global
    # ``manual_user_input`` and silently ignored ``text``.
    sentence = Sentence(text)
    model.predict(sentence)
    return render_ner_html(sentence, colors=colors, wrap_page=False)
# not the first one, put empty word.text = "" inside_parenthesis = True elif tag != "O" and not any([True for i in to_skip if i in tag]): inside_parenthesis = False if word.text.lower() not in replacement_dict: replacement_dict[word.text.lower()] = pseudo[len(replacement_dict)] word.text = replacement_dict[word.text.lower()] else: inside_parenthesis = False colors = { "ETABLISSEMENT": "#35c2b2", "ADDRESS": "#FFAE62", "ORGANIZATION": "#FFB990", "SITE": "#ff8800", "HOPITAL": "#edddcb", "MEDIA": "#e966c4", "MAIL": "#1688cb", "ETAT": "#00c5ed", "RESIDENCE": "#94bce1", "PERSONNE_DE_JUSTICE": "#89B2C4", "GROUPE": "#9cae64", "DATE": "#F9E17D", "NUMEROS": "#F8485E", "PERS": "#FA7268", "FONDS": "#C3FF1F", } st.write(render_ner_html(sentences=paragraphs, colors=colors, wrap_page=False), unsafe_allow_html=True)