def main(data_folder: str, output_folder: str, model_folder: str) -> None:
    """Run the NER tagger over every ``.txt`` case file and emit one annotated HTML page per case.

    :param data_folder: folder holding the input ``.txt`` case files
    :param output_folder: folder where ``<case>.html`` pages are written
    :param model_folder: folder containing the Flair ``best-model.pt`` checkpoint
    """
    nlp: Language = spacy.blank('fr')
    nlp.tokenizer = get_tokenizer(nlp)
    tokenizer = build_spacy_tokenizer(nlp)
    filenames = [name for name in os.listdir(data_folder) if name.endswith(".txt")]
    tagger: SequenceTagger = SequenceTagger.load(os.path.join(model_folder, 'best-model.pt'))
    for filename in tqdm(iterable=filenames, unit=" txt", desc="anonymize cases"):
        with open(os.path.join(data_folder, filename), 'r') as input_f:
            # one Flair sentence per input line, tokenized with the custom spaCy tokenizer
            sentences = tagger.predict(sentences=input_f.readlines(),
                                       mini_batch_size=32,
                                       verbose=False,
                                       use_tokenizer=tokenizer)
        case_name = filename.split('.')[0]
        page_html = render_ner_html(sentences, colors=colors, title=case_name)
        with open(os.path.join(output_folder, case_name + ".html"), "w") as output:
            output.write(page_html)
def main(data_folder: str, model_folder: str, dev_size: float, nb_epochs: int) -> None:
    """Train a Flair BiLSTM-CRF NER tagger on the corpus built from ``data_folder``.

    :param data_folder: folder with the annotated training data
    :param model_folder: destination folder for checkpoints / best model
    :param dev_size: fraction of documents reserved for the dev split
    :param nb_epochs: maximum number of training epochs
    """
    nlp = spacy.blank('fr')
    nlp.tokenizer = get_tokenizer(nlp)
    corpus: Corpus = prepare_flair_train_test_corpus(spacy_model=nlp,
                                                     data_folder=data_folder,
                                                     dev_size=dev_size)
    tag_dictionary = corpus.make_tag_dictionary(tag_type='ner')
    print(tag_dictionary.idx2item)
    # French word embeddings stacked with forward/backward contextual character embeddings
    stacked: StackedEmbeddings = StackedEmbeddings(embeddings=[
        WordEmbeddings('fr'),
        FlairEmbeddings('fr-forward'),
        FlairEmbeddings('fr-backward'),
    ])
    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=stacked,
                                            use_crf=True,
                                            tag_dictionary=tag_dictionary,
                                            tag_type='ner')
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)
    trainer.train(model_folder,
                  max_epochs=nb_epochs,
                  mini_batch_size=32,
                  embeddings_storage_mode="cpu",
                  checkpoint=False)
def main(data_folder: str, model_folder: str, dev_size: float) -> None:
    """Evaluate a saved Flair NER model and print, per sentence, entities it over-predicts.

    Only a whitelist of entity types is compared, and predictions are kept
    only above a 0.8 confidence score.

    :param data_folder: folder with the annotated data
    :param model_folder: folder containing ``best-model.pt``
    :param dev_size: fraction of documents used for the test split
    """
    nlp = spacy.blank('fr')
    nlp.tokenizer = get_tokenizer(nlp)
    corpus: Corpus = prepare_flair_train_test_corpus(spacy_model=nlp,
                                                     data_folder=data_folder,
                                                     dev_size=dev_size)
    tagger: SequenceTagger = SequenceTagger.load(model=os.path.join(model_folder, 'best-model.pt'))
    test_results, _ = tagger.evaluate(data_loader=DataLoader(corpus.test,
                                                             batch_size=32,
                                                             num_workers=10),
                                      embeddings_storage_mode="cpu")
    print(test_results.detailed_results)
    sentences_original = corpus.train.sentences + corpus.test.sentences
    sentences_predict = copy.deepcopy(sentences_original)
    # clean tokens in case there is a bug
    for sentence in sentences_predict:
        for token in sentence:
            token.tags = {}
    _ = tagger.predict(sentences=sentences_predict,
                       mini_batch_size=32,
                       embedding_storage_mode="cpu",
                       verbose=True)
    entities_to_keep = ["PERS", "ADDRESS", "ORGANIZATION", "JUDGE_CLERK", "LAWYER"]
    pairs = zip(sentences_original, sentences_predict)
    for index, (sentence_original, sentence_predict) in enumerate(pairs):  # type: int, (Sentence, Sentence)
        expected_entities_text = {f"{s.text} {s.tag}"
                                  for s in sentence_original.get_spans('ner')
                                  if s.tag in entities_to_keep}
        # low-confidence predictions (score <= 0.8) are ignored
        predicted_entities_text = {f"{s.text} {s.tag}"
                                   for s in sentence_predict.get_spans('ner')
                                   if s.tag in entities_to_keep and s.score > 0.8}
        diff_expected = expected_entities_text - predicted_entities_text
        diff_predicted = predicted_entities_text - expected_entities_text
        if diff_predicted:  # (len(diff_expected) > 0) or
            print("------------")
            print(f"source {index}: [{sentence_original.to_plain_string()}]")
            print(f"expected missing: [{diff_expected}]")
            print(f"predicted missing: [{diff_predicted}]")
            print(
                f"common: [{set(predicted_entities_text).intersection(set(expected_entities_text))}]"
            )
def main(data_folder: str, model_folder: str, dev_size: float,
         entities_to_remove: List[str]) -> None:
    """Evaluate a saved Flair NER model and print, per sentence, entities it over-predicts.

    Entity types listed in ``entities_to_remove`` are excluded from the comparison.

    :param data_folder: folder with the annotated data
    :param model_folder: folder containing ``best-model.pt``
    :param dev_size: fraction of documents used for the test split
    :param entities_to_remove: entity tags to ignore on both gold and predicted sides
    """
    nlp = spacy.blank('fr')
    nlp.tokenizer = get_tokenizer(nlp)
    corpus: Corpus = prepare_flair_train_test_corpus(spacy_model=nlp,
                                                     data_folder=data_folder,
                                                     dev_size=dev_size)
    # flair.device = torch.device('cpu')  # (4mn 28)
    tagger: SequenceTagger = SequenceTagger.load(model=os.path.join(model_folder, 'best-model.pt'))
    test_results, _ = tagger.evaluate(data_loader=DataLoader(corpus.test, batch_size=32))
    print(test_results.detailed_results)
    sentences_original = corpus.train.sentences + corpus.test.sentences
    sentences_predict = copy.deepcopy(sentences_original)
    # clean tokens in case there is a bug
    for sentence in sentences_predict:
        for token in sentence:
            token.tags = {}
    _ = tagger.predict(sentences=sentences_predict,
                       mini_batch_size=32,
                       embedding_storage_mode="none",
                       verbose=True)
    pairs = zip(sentences_original, sentences_predict)
    for index, (sentence_original, sentence_predict) in enumerate(pairs):  # type: int, (Sentence, Sentence)
        expected_entities_text = {f"{s.text} {s.tag}"
                                  for s in sentence_original.get_spans('ner')
                                  if s.tag not in entities_to_remove}
        predicted_entities_text = {f"{s.text} {s.tag}"
                                   for s in sentence_predict.get_spans('ner')
                                   if s.tag not in entities_to_remove}
        diff_expected = expected_entities_text - predicted_entities_text
        diff_predicted = predicted_entities_text - expected_entities_text
        if diff_predicted:  # (len(diff_expected) > 0) or
            print("------------")
            print(f"source {index}: [{sentence_original.to_plain_string()}]")
            print(f"expected missing: [{diff_expected}]")
            print(f"predicted missing: [{diff_predicted}]")
            print(
                f"common: [{set(predicted_entities_text).intersection(set(expected_entities_text))}]"
            )
def main(data_folder: str, model_folder: str, dev_size: float,
         nb_segment: Optional[int], segment: Optional[int]) -> None:
    """Evaluate a saved Flair NER model on the dev split and print per-sentence prediction diffs.

    :param data_folder: folder with the annotated data
    :param model_folder: folder containing ``best-model.pt``
    :param dev_size: fraction of documents used for the dev split
    :param nb_segment: total number of data segments (optional)
    :param segment: index of the segment to evaluate (optional)
    """
    nlp = spacy.blank(name="fr")
    nlp.tokenizer = get_tokenizer(nlp)
    corpus: Corpus = prepare_flair_train_dev_corpus(spacy_model=nlp,
                                                    data_folder=data_folder,
                                                    dev_size=dev_size,
                                                    nb_segment=nb_segment,
                                                    segment=segment)
    print(corpus)
    # flair.device = torch.device('cpu')  # (4mn 28)
    tagger: SequenceTagger = SequenceTagger.load(model=os.path.join(model_folder, "best-model.pt"))
    test_results, _ = tagger.evaluate(sentences=corpus.dev, mini_batch_size=32)
    print(test_results.detailed_results)
    sentences_predict = copy.deepcopy(corpus.dev.sentences)
    # clean tokens in case there is a bug
    for sentence in sentences_predict:
        for token in sentence:
            token.tags = {}
    _ = tagger.predict(sentences=sentences_predict,
                       mini_batch_size=32,
                       embedding_storage_mode="none",
                       verbose=True)

    def span_to_str(span: Span) -> str:
        # "12" for a one-token span, "12-15" otherwise (token offsets)
        first = span.tokens[0].idx
        last = span.tokens[-1].idx
        token_position = f"{first}" if first == last else f"{first}-{last}"
        return f"{span.text} [{span.tag}] ({token_position})"

    pairs = zip(corpus.dev, sentences_predict)
    for index, (sentence_original, sentence_predict) in enumerate(pairs):  # type: int, (Sentence, Sentence)
        gold_spans = [span_to_str(span=s) for s in sentence_original.get_spans("ner")]
        pred_spans = [span_to_str(span=s) for s in sentence_predict.get_spans("ner")]
        diff_expected = [item for item in gold_spans if item not in pred_spans]
        diff_predicted = [item for item in pred_spans if item not in gold_spans]
        common_expected_predicted = [item for item in pred_spans if item in gold_spans]
        if diff_predicted:
            print("------------")
            print(f"source {index}: [{sentence_original.to_plain_string()}]")
            print(f"expected missing: [{diff_expected}]")
            print(f"predicted missing: [{diff_predicted}]")
            print(f"common: [{common_expected_predicted}]")
def main(data_folder: str, model_folder: str, dev_size: float, nb_epochs: int,
         nb_segment: Optional[int], segment: Optional[int]) -> None:
    """Train a Flair BiLSTM-CRF NER tagger on a train/dev corpus built from ``data_folder``.

    :param data_folder: folder with the annotated data
    :param model_folder: destination folder for checkpoints / best model
    :param dev_size: fraction of documents reserved for the dev split
    :param nb_epochs: maximum number of training epochs
    :param nb_segment: total number of data segments (optional)
    :param segment: index of the segment to train on (optional)
    """
    nlp = spacy.blank(name="fr")
    nlp.tokenizer = get_tokenizer(nlp)
    corpus: Corpus = prepare_flair_train_dev_corpus(spacy_model=nlp,
                                                    data_folder=data_folder,
                                                    dev_size=dev_size,
                                                    nb_segment=nb_segment,
                                                    segment=segment)
    tag_dictionary = corpus.make_tag_dictionary(tag_type="ner")
    print(tag_dictionary.idx2item)
    # French word embeddings stacked with forward/backward contextual character embeddings
    stacked_embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=[
        WordEmbeddings("fr"),
        FlairEmbeddings("fr-forward"),
        FlairEmbeddings("fr-backward"),
    ])
    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=stacked_embeddings,
                                            use_crf=True,
                                            tag_dictionary=tag_dictionary,
                                            tag_type="ner")
    trainer: ModelTrainer = ModelTrainer(model=tagger, corpus=corpus, use_tensorboard=False)
    # TODO optimize LR https://github.com/flairNLP/flair/blob/master/resources/docs/TUTORIAL_8_MODEL_OPTIMIZATION.md
    trainer.train(model_folder,
                  max_epochs=nb_epochs,
                  learning_rate=0.1,
                  mini_batch_size=32,
                  embeddings_storage_mode="cpu",
                  checkpoint=False)
def main(data_folder: str, model_folder: str, top_n: int) -> None:
    """Tag the first ``top_n`` paragraphs of each XML file and render all predictions to ``sentence.html``.

    Bug fix: the previous guard ``if len(paragraphs) > top_n`` silently skipped
    every file holding ``top_n`` paragraphs or fewer, contradicting the stated
    intent of keeping the top ``top_n`` examples *per file*. Now every file
    contributes up to ``top_n`` non-empty paragraphs.

    :param data_folder: folder holding the input ``.xml`` files
    :param model_folder: folder containing the Flair ``best-model.pt`` checkpoint
    :param top_n: maximum number of paragraphs kept per file
    :raises Exception: when no example at all could be loaded
    """
    print(f"keep only top {top_n} examples per file")
    nlp: Language = spacy.blank('fr')
    nlp.tokenizer = get_tokenizer(nlp)
    tokenizer = build_spacy_tokenizer(nlp)
    filenames = [
        filename for filename in os.listdir(data_folder)
        if filename.endswith(".xml")
    ]
    sentences: List[Sentence] = list()
    with tqdm(total=len(filenames), unit=" XML", desc="Parsing XML") as progress_bar:
        for filename in filenames:
            paragraphs: List[Paragraph] = get_paragraph_from_file(
                path=os.path.join(data_folder, filename),
                keep_paragraph_without_annotation=True)
            # take at most top_n paragraphs; previously files with <= top_n
            # paragraphs were dropped entirely
            for paragraph in paragraphs[:top_n]:
                if len(paragraph.text) > 0:
                    sentences.append(Sentence(text=paragraph.text, tokenizer=tokenizer))
            progress_bar.update()
    if len(sentences) == 0:
        raise Exception(
            "No example loaded, causes: no cases in provided path or sample size is too high"
        )
    tagger: SequenceTagger = SequenceTagger.load(
        os.path.join(model_folder, 'best-model.pt'))
    _ = tagger.predict(sentences=sentences,
                       mini_batch_size=32,
                       verbose=True,
                       embedding_storage_mode="cpu")
    print("prepare html")
    page_html = render_ner_html(sentences, colors=colors)
    print("write html")
    with open("sentence.html", "w") as writer:
        writer.write(page_html)
from flair.data import Sentence, build_spacy_tokenizer from flair.models import SequenceTagger import sys sys.path.append('../anonymisation-master') from ner.model_factory import get_tokenizer from resources.config_provider import get_config_default from xml_extractions.extract_node_values import Paragraph, get_paragraph_from_file random.seed(5) tagger: SequenceTagger = SequenceTagger.load( 'resources/flair_ner/ca/best-model.pt') config_training = get_config_default() nlp = spacy.blank('fr') nlp.tokenizer = get_tokenizer(nlp) tokenizer = build_spacy_tokenizer(nlp) xml_train_path = "../similar_legal_case/data/jurica_original_xml/arrets-juri" # config_training["xml_train_path"] files = [ os.path.join(path, name) for path, _, files in os.walk(xml_train_path) for name in files ] random.shuffle(files) with open("./resources/training_data/generated_annotations.txt", mode='w') as generated_text: with open("./resources/training_data/generated_annotations.ent", mode='w') as generated_entities: for filename in files: if filename.endswith(".xml"):
def main(data_folder: str, model_path: Optional[str], output_model: Optional[str],
         dev_size: float, nb_epochs: int, print_diff: bool) -> None:
    """Fine-tune (or train from scratch) a spaCy NER model on annotated ``.txt`` files.

    The annotated files are shuffled and split into a dev set (``dev_size``
    fraction) and a train set, then the model is updated for ``nb_epochs``
    epochs with compounding minibatch sizes, evaluating on the dev set
    periodically and after each epoch.

    Improvement: the train/dev split now uses list slicing instead of an
    O(n^2) ``file not in dev_file_names`` scan over a list — equivalent
    result (dev is the prefix of the shuffled list, train is the remainder).

    :param data_folder: folder holding the annotated ``.txt`` files
    :param model_path: existing model to fine-tune, or None to start fresh
    :param output_model: where to save the trained model, or None to skip saving
    :param dev_size: fraction of documents reserved for evaluation
    :param nb_epochs: number of training epochs
    :param print_diff: forwarded to ``spacy_evaluate`` to print per-example diffs
    """
    nlp = get_empty_model(load_labels_for_training=True)
    if model_path is not None:
        nlp = nlp.from_disk(path=model_path)
        nlp.tokenizer = get_tokenizer(nlp)  # replace tokenizer
        nlp.begin_training()
        # ner = nlp.get_pipe("ner")
        # ner.model.learn_rate = 0.0001
    else:
        nlp.tokenizer = get_tokenizer(nlp)  # replace tokenizer
        nlp.begin_training()

    all_annotated_files: List[str] = [
        os.path.join(data_folder, filename)
        for filename in os.listdir(data_folder)
        if filename.endswith(".txt")
    ]
    random.shuffle(all_annotated_files)
    # dev = prefix of the shuffled list, train = the remainder
    nb_doc_dev_set: int = int(len(all_annotated_files) * dev_size)
    dev_file_names = all_annotated_files[:nb_doc_dev_set]
    train_file_names = all_annotated_files[nb_doc_dev_set:]
    # train_file_names = ["./resources/training_data/generated_annotations.txt"] + train_file_names

    content_to_rate = load_content(txt_paths=train_file_names)
    content_to_rate_test = load_content(txt_paths=dev_file_names)
    print(
        f"nb PERS entities {sum([1 for _, offsets in content_to_rate for o in offsets if o.type == 'PERS'])}"
    )
    if model_path is not None:
        print("evaluation without fine tuning")
        spacy_evaluate(nlp, content_to_rate_test, print_diff)

    optimizer: Optimizer = nlp.resume_training()
    for epoch in range(nb_epochs):
        print(f"------- {epoch} -------")
        random.shuffle(content_to_rate)
        losses = {}
        batches = minibatch(content_to_rate, size=compounding(4., 16., 1.001))
        for batch_id, batch in enumerate(
                tqdm(iterable=batches, unit=" batches", desc="Training")):
            try:
                batch_gold = convert_batch_to_gold_dataset(model=nlp, batch=batch)
                texts, manual_annotations = zip(*batch_gold)  # type: List[str], List[GoldParse]
                nlp.update(texts, manual_annotations, drop=0.5, losses=losses, sgd=optimizer)
                # periodic sanity-check evaluation during the epoch
                if batch_id % 10000 == 0:
                    spacy_evaluate(model=nlp,
                                   dev=content_to_rate_test,
                                   print_diff=print_diff)
            except Exception as e:
                # deliberate best-effort: a single bad batch must not abort training
                print(f"got exception [{e}] on batch id {batch_id}")
        print(f"Epoch {epoch + 1}\nLoss: {losses}\n")
        spacy_evaluate(model=nlp, dev=content_to_rate_test, print_diff=print_diff)
    if output_model is not None:
        nlp.to_disk(output_model)