def main(data_folder: str, output_folder: str, model_folder: str) -> None:
    """Tag every .txt file in data_folder with the Flair NER model and write
    an HTML rendering of the predictions to output_folder (one .html per
    input file, named after the source file).

    :param data_folder: folder containing the .txt case files to anonymize
    :param output_folder: destination folder for the rendered .html pages
    :param model_folder: folder containing the trained 'best-model.pt'
    """
    nlp: Language = spacy.blank('fr')
    nlp.tokenizer = get_tokenizer(nlp)
    tokenizer = build_spacy_tokenizer(nlp)
    filenames = [
        filename for filename in os.listdir(data_folder)
        if filename.endswith(".txt")
    ]
    tagger: SequenceTagger = SequenceTagger.load(
        os.path.join(model_folder, 'best-model.pt'))

    for filename in tqdm(iterable=filenames,
                         unit=" txt",
                         desc="anonymize cases"):
        with open(os.path.join(data_folder, filename), 'r') as input_f:
            # one Sentence per input line; the spaCy tokenizer is reused
            sentences = tagger.predict(sentences=input_f.readlines(),
                                       mini_batch_size=32,
                                       verbose=False,
                                       use_tokenizer=tokenizer)
            # splitext keeps the full base name even when it contains dots
            # (filename.split('.')[0] would truncate "a.b.txt" to "a")
            case_name = os.path.splitext(filename)[0]
            page_html = render_ner_html(sentences,
                                        colors=colors,
                                        title=case_name)

            with open(os.path.join(output_folder, case_name + ".html"),
                      "w") as output:
                output.write(page_html)
def main(data_folder: str, model_folder: str, dev_size: float,
         nb_epochs: int) -> None:
    """Train a Flair BiLSTM-CRF NER tagger on the corpus in data_folder and
    save checkpoints under model_folder.

    :param data_folder: folder containing the annotated training corpus
    :param model_folder: destination folder for training output
    :param dev_size: fraction of the data held out as the dev set
    :param nb_epochs: maximum number of training epochs
    """
    nlp = spacy.blank('fr')
    nlp.tokenizer = get_tokenizer(nlp)

    corpus: Corpus = prepare_flair_train_test_corpus(
        spacy_model=nlp, data_folder=data_folder, dev_size=dev_size)
    tag_dictionary = corpus.make_tag_dictionary(tag_type='ner')
    print(tag_dictionary.idx2item)

    # Classic word embeddings stacked with contextual Flair embeddings
    # in both reading directions.
    stacked = StackedEmbeddings(embeddings=[
        WordEmbeddings('fr'),
        FlairEmbeddings('fr-forward'),
        FlairEmbeddings('fr-backward'),
    ])

    tagger = SequenceTagger(hidden_size=256,
                            embeddings=stacked,
                            use_crf=True,
                            tag_dictionary=tag_dictionary,
                            tag_type='ner')

    ModelTrainer(tagger, corpus).train(model_folder,
                                       max_epochs=nb_epochs,
                                       mini_batch_size=32,
                                       embeddings_storage_mode="cpu",
                                       checkpoint=False)
# Exemple #3 (0)  -- pagination artifact from the scraped examples page
def main(data_folder: str, model_folder: str, dev_size: float) -> None:
    """Evaluate a trained Flair NER model on the test split, then re-predict
    the whole corpus and print, sentence by sentence, the high-confidence
    predicted entities that are absent from the gold annotation.

    :param data_folder: folder containing the annotated corpus
    :param model_folder: folder containing the trained 'best-model.pt'
    :param dev_size: fraction of the data held out as the dev set
    """
    nlp = spacy.blank('fr')
    nlp.tokenizer = get_tokenizer(nlp)

    corpus: Corpus = prepare_flair_train_test_corpus(spacy_model=nlp,
                                                     data_folder=data_folder,
                                                     dev_size=dev_size)

    tagger: SequenceTagger = SequenceTagger.load(
        model=os.path.join(model_folder, 'best-model.pt'))

    test_results, _ = tagger.evaluate(data_loader=DataLoader(corpus.test,
                                                             batch_size=32,
                                                             num_workers=10),
                                      embeddings_storage_mode="cpu")
    print(test_results.detailed_results)

    sentences_original = corpus.train.sentences + corpus.test.sentences
    sentences_predict = copy.deepcopy(sentences_original)
    # clean tokens in case there is a bug
    for s in sentences_predict:
        for t in s:
            t.tags = {}

    # NOTE(review): predict() uses "embedding_storage_mode" while evaluate()
    # above uses "embeddings_storage_mode" -- confirm both keywords match the
    # installed flair version.
    _ = tagger.predict(sentences=sentences_predict,
                       mini_batch_size=32,
                       embedding_storage_mode="cpu",
                       verbose=True)

    entities_to_keep = [
        "PERS", "ADDRESS", "ORGANIZATION", "JUDGE_CLERK", "LAWYER"
    ]
    for index, (sentence_original, sentence_predict) \
            in enumerate(zip(sentences_original, sentences_predict)):  # type: int, (Sentence, Sentence)
        expected_entities_text = {
            f"{s.text} {s.tag}"
            for s in sentence_original.get_spans('ner')
            if s.tag in entities_to_keep
        }
        # keep only confident predictions (score > 0.8)
        predicted_entities_text = {
            f"{s.text} {s.tag}"
            for s in sentence_predict.get_spans('ner')
            if s.tag in entities_to_keep and s.score > 0.8
        }

        diff_expected = expected_entities_text - predicted_entities_text
        diff_predicted = predicted_entities_text - expected_entities_text

        if diff_predicted:  # (len(diff_expected) > 0) or
            print("------------")
            print(f"source {index}: [{sentence_original.to_plain_string()}]")
            print(f"expected missing: [{diff_expected}]")
            print(f"predicted missing: [{diff_predicted}]")
            # both operands are already sets; no need to re-wrap in set()
            print(
                f"common: [{predicted_entities_text & expected_entities_text}]"
            )
def main(data_folder: str, model_folder: str, dev_size: float,
         entities_to_remove: List[str]) -> None:
    """Evaluate a trained Flair NER model on the test split, then re-predict
    the whole corpus and print, sentence by sentence, the predicted entities
    (excluding entities_to_remove) that are absent from the gold annotation.

    :param data_folder: folder containing the annotated corpus
    :param model_folder: folder containing the trained 'best-model.pt'
    :param dev_size: fraction of the data held out as the dev set
    :param entities_to_remove: entity tags ignored in the comparison
    """
    nlp = spacy.blank('fr')
    nlp.tokenizer = get_tokenizer(nlp)

    corpus: Corpus = prepare_flair_train_test_corpus(spacy_model=nlp,
                                                     data_folder=data_folder,
                                                     dev_size=dev_size)
    # flair.device = torch.device('cpu')  # (4mn 28)
    tagger: SequenceTagger = SequenceTagger.load(
        model=os.path.join(model_folder, 'best-model.pt'))
    test_results, _ = tagger.evaluate(
        data_loader=DataLoader(corpus.test, batch_size=32))
    print(test_results.detailed_results)

    sentences_original = corpus.train.sentences + corpus.test.sentences
    sentences_predict = copy.deepcopy(sentences_original)
    # clean tokens in case there is a bug
    for s in sentences_predict:
        for t in s:
            t.tags = {}

    _ = tagger.predict(sentences=sentences_predict,
                       mini_batch_size=32,
                       embedding_storage_mode="none",
                       verbose=True)

    for index, (sentence_original, sentence_predict) \
            in enumerate(zip(sentences_original, sentences_predict)):  # type: int, (Sentence, Sentence)
        expected_entities_text = {
            f"{s.text} {s.tag}"
            for s in sentence_original.get_spans('ner')
            if s.tag not in entities_to_remove
        }
        predicted_entities_text = {
            f"{s.text} {s.tag}"
            for s in sentence_predict.get_spans('ner')
            if s.tag not in entities_to_remove
        }

        diff_expected = expected_entities_text - predicted_entities_text
        diff_predicted = predicted_entities_text - expected_entities_text

        if diff_predicted:  # (len(diff_expected) > 0) or
            print("------------")
            print(f"source {index}: [{sentence_original.to_plain_string()}]")
            print(f"expected missing: [{diff_expected}]")
            print(f"predicted missing: [{diff_predicted}]")
            # both operands are already sets; no need to re-wrap in set()
            print(
                f"common: [{predicted_entities_text & expected_entities_text}]"
            )
# Exemple #5 (0)  -- pagination artifact from the scraped examples page
def main(
    data_folder: str, model_folder: str, dev_size: float, nb_segment: Optional[int], segment: Optional[int]
) -> None:
    """Evaluate a trained Flair NER model on the dev split, then re-predict it
    and print, sentence by sentence, the predicted entities absent from the
    gold annotation.

    :param data_folder: folder containing the annotated corpus
    :param model_folder: folder containing the trained 'best-model.pt'
    :param dev_size: fraction of the data held out as the dev set
    :param nb_segment: total number of corpus segments, or None
    :param segment: index of the segment to use, or None
    """
    nlp = spacy.blank(name="fr")
    nlp.tokenizer = get_tokenizer(nlp)

    corpus: Corpus = prepare_flair_train_dev_corpus(
        spacy_model=nlp, data_folder=data_folder, dev_size=dev_size, nb_segment=nb_segment, segment=segment
    )
    print(corpus)
    # flair.device = torch.device('cpu')  # (4mn 28)
    tagger: SequenceTagger = SequenceTagger.load(model=os.path.join(model_folder, "best-model.pt"))
    test_results, _ = tagger.evaluate(sentences=corpus.dev, mini_batch_size=32)
    print(test_results.detailed_results)

    sentences_predict = copy.deepcopy(corpus.dev.sentences)
    # clean tokens in case there is a bug
    for s in sentences_predict:
        for t in s:
            t.tags = {}

    _ = tagger.predict(sentences=sentences_predict, mini_batch_size=32, embedding_storage_mode="none", verbose=True)

    def span_to_str(span: Span) -> str:
        """Render a span as "text [TAG] (start)" or "text [TAG] (start-end)"."""
        start_token = span.tokens[0].idx
        end_token = span.tokens[-1].idx  # negative index instead of len()-1
        token_position = f"{start_token}" if start_token == end_token else f"{start_token}-{end_token}"
        return f"{span.text} [{span.tag}] ({token_position})"

    for index, (sentence_original, sentence_predict) in enumerate(
        zip(corpus.dev, sentences_predict)
    ):  # type: int, (Sentence, Sentence)
        expected_entities_text = [span_to_str(span=s) for s in sentence_original.get_spans("ner")]
        predicted_entities_text = [span_to_str(span=s) for s in sentence_predict.get_spans("ner")]

        # hoisted set lookups avoid O(n^2) list scans while preserving the
        # original lists' order and duplicates in the printed output
        expected_set = set(expected_entities_text)
        predicted_set = set(predicted_entities_text)
        diff_expected = [i for i in expected_entities_text if i not in predicted_set]
        diff_predicted = [i for i in predicted_entities_text if i not in expected_set]
        common_expected_predicted = [i for i in predicted_entities_text if i in expected_set]

        if len(diff_predicted) > 0:
            print("------------")
            print(f"source {index}: [{sentence_original.to_plain_string()}]")
            print(f"expected missing: [{diff_expected}]")
            print(f"predicted missing: [{diff_predicted}]")
            print(f"common: [{common_expected_predicted}]")
# Exemple #6 (0)  -- pagination artifact from the scraped examples page
def main(
    data_folder: str,
    model_folder: str,
    dev_size: float,
    nb_epochs: int,
    nb_segment: Optional[int],
    segment: Optional[int],
) -> None:
    """Train a Flair BiLSTM-CRF NER tagger on a train/dev split of the corpus
    and save checkpoints under model_folder.

    :param data_folder: folder containing the annotated training corpus
    :param model_folder: destination folder for training output
    :param dev_size: fraction of the data held out as the dev set
    :param nb_epochs: maximum number of training epochs
    :param nb_segment: total number of corpus segments, or None
    :param segment: index of the segment to use, or None
    """
    nlp = spacy.blank(name="fr")
    nlp.tokenizer = get_tokenizer(nlp)

    corpus: Corpus = prepare_flair_train_dev_corpus(
        spacy_model=nlp, data_folder=data_folder, dev_size=dev_size, nb_segment=nb_segment, segment=segment
    )
    tag_dictionary = corpus.make_tag_dictionary(tag_type="ner")
    print(tag_dictionary.idx2item)

    # Classic word embeddings stacked with contextual Flair embeddings
    # in both reading directions.
    stacked = StackedEmbeddings(embeddings=[
        WordEmbeddings("fr"),
        FlairEmbeddings("fr-forward"),
        FlairEmbeddings("fr-backward"),
    ])

    tagger = SequenceTagger(
        hidden_size=256, embeddings=stacked, use_crf=True, tag_dictionary=tag_dictionary, tag_type="ner"
    )

    trainer = ModelTrainer(model=tagger, corpus=corpus, use_tensorboard=False)

    # TODO optimize LR https://github.com/flairNLP/flair/blob/master/resources/docs/TUTORIAL_8_MODEL_OPTIMIZATION.md
    trainer.train(
        model_folder,
        max_epochs=nb_epochs,
        learning_rate=0.1,
        mini_batch_size=32,
        embeddings_storage_mode="cpu",
        checkpoint=False,
    )
def main(data_folder: str, model_folder: str, top_n: int) -> None:
    """Render NER predictions for the first top_n paragraphs of each XML file
    in data_folder into a single sentence.html page.

    :param data_folder: folder containing the .xml case files
    :param model_folder: folder containing the trained 'best-model.pt'
    :param top_n: number of paragraphs kept per file
    :raises Exception: when no sentence could be loaded at all
    """
    print(f"keep only top {top_n} examples per file")
    nlp: Language = spacy.blank('fr')
    nlp.tokenizer = get_tokenizer(nlp)
    tokenizer = build_spacy_tokenizer(nlp)
    filenames = [
        filename for filename in os.listdir(data_folder)
        if filename.endswith(".xml")
    ]
    sentences: List[Sentence] = []
    with tqdm(total=len(filenames), unit=" XML",
              desc="Parsing XML") as progress_bar:
        for filename in filenames:
            paragraphs: List[Paragraph] = get_paragraph_from_file(
                path=os.path.join(data_folder, filename),
                keep_paragraph_without_annotation=True)
            # NOTE(review): files with <= top_n paragraphs are skipped
            # entirely -- confirm this is intended rather than taking
            # paragraphs[:top_n] unconditionally.
            if len(paragraphs) > top_n:
                for paragraph in paragraphs[:top_n]:
                    if len(paragraph.text) > 0:
                        s = Sentence(text=paragraph.text, tokenizer=tokenizer)
                        sentences.append(s)
            progress_bar.update()
    if len(sentences) == 0:
        # fixed typo in the error message ("to high" -> "too high")
        raise Exception(
            "No example loaded, causes: no cases in provided path or sample size is too high"
        )

    tagger: SequenceTagger = SequenceTagger.load(
        os.path.join(model_folder, 'best-model.pt'))
    _ = tagger.predict(sentences=sentences,
                       mini_batch_size=32,
                       verbose=True,
                       embedding_storage_mode="cpu")

    print("prepare html")
    page_html = render_ner_html(sentences, colors=colors)
    print("write html")
    with open("sentence.html", "w") as writer:
        writer.write(page_html)
from flair.data import Sentence, build_spacy_tokenizer
from flair.models import SequenceTagger
import sys
sys.path.append('../anonymisation-master')
from ner.model_factory import get_tokenizer
from resources.config_provider import get_config_default
from xml_extractions.extract_node_values import Paragraph, get_paragraph_from_file

# Fixed seed so the file shuffle below is reproducible across runs.
random.seed(5)

# Court-of-appeal ("ca") NER model trained earlier with Flair.
tagger: SequenceTagger = SequenceTagger.load(
    'resources/flair_ner/ca/best-model.pt')

config_training = get_config_default()
# Blank French pipeline; only the custom tokenizer is used here.
nlp = spacy.blank('fr')
nlp.tokenizer = get_tokenizer(nlp)
tokenizer = build_spacy_tokenizer(nlp)

xml_train_path = "../similar_legal_case/data/jurica_original_xml/arrets-juri"  # config_training["xml_train_path"]
# Recursively collect every file path under the XML corpus root.
files = [
    os.path.join(path, name) for path, _, files in os.walk(xml_train_path)
    for name in files
]
random.shuffle(files)

with open("./resources/training_data/generated_annotations.txt",
          mode='w') as generated_text:
    with open("./resources/training_data/generated_annotations.ent",
              mode='w') as generated_entities:
        for filename in files:
            if filename.endswith(".xml"):
# Exemple #9 (0)  -- pagination artifact from the scraped examples page
def main(data_folder: str, model_path: Optional[str],
         output_model: Optional[str], dev_size: float, nb_epochs: int,
         print_diff: bool) -> None:
    """Fine-tune (or train from scratch) the spaCy NER model on the annotated
    .txt files found in data_folder.

    :param data_folder: folder containing the annotated .txt training files
    :param model_path: existing spaCy model to fine-tune, or None to start fresh
    :param output_model: path to save the trained model, or None to skip saving
    :param dev_size: fraction of the files held out for evaluation
    :param nb_epochs: number of training epochs
    :param print_diff: forwarded to spacy_evaluate to print per-example diffs
    """
    nlp = get_empty_model(load_labels_for_training=True)
    if model_path is not None:
        nlp = nlp.from_disk(path=model_path)
        nlp.tokenizer = get_tokenizer(nlp)  # replace tokenizer
        nlp.begin_training()
        # ner = nlp.get_pipe("ner")
        # ner.model.learn_rate = 0.0001
    else:
        nlp.tokenizer = get_tokenizer(nlp)  # replace tokenizer
        nlp.begin_training()

    all_annotated_files: List[str] = [
        os.path.join(data_folder, filename)
        for filename in os.listdir(data_folder) if filename.endswith(".txt")
    ]
    random.shuffle(all_annotated_files)

    nb_doc_dev_set: int = int(len(all_annotated_files) * dev_size)

    dev_file_names = all_annotated_files[:nb_doc_dev_set]

    # set membership avoids the O(n^2) "file not in list" scan
    dev_file_set = set(dev_file_names)
    train_file_names = [
        file for file in all_annotated_files if file not in dev_file_set
    ]
    # train_file_names = ["./resources/training_data/generated_annotations.txt"] + train_file_names

    content_to_rate = load_content(txt_paths=train_file_names)
    content_to_rate_test = load_content(txt_paths=dev_file_names)

    # generator expression: no throwaway list inside sum()
    nb_pers = sum(1 for _, offsets in content_to_rate
                  for o in offsets if o.type == 'PERS')
    print(f"nb PERS entities {nb_pers}")

    if model_path is not None:
        print("evaluation without fine tuning")
        spacy_evaluate(nlp, content_to_rate_test, print_diff)

    optimizer: Optimizer = nlp.resume_training()

    for epoch in range(nb_epochs):
        print(f"------- {epoch}  -------")
        random.shuffle(content_to_rate)
        losses = {}
        batches = minibatch(content_to_rate, size=compounding(4., 16., 1.001))
        for batch_id, batch in enumerate(
                tqdm(iterable=batches, unit=" batches", desc="Training")):
            try:
                batch_gold = convert_batch_to_gold_dataset(model=nlp,
                                                           batch=batch)
                texts, manual_annotations = zip(
                    *batch_gold)  # type: List[str], List[GoldParse]
                nlp.update(texts,
                           manual_annotations,
                           drop=0.5,
                           losses=losses,
                           sgd=optimizer)

                # periodic mid-epoch evaluation (also fires on batch 0)
                if batch_id % 10000 == 0:
                    spacy_evaluate(model=nlp,
                                   dev=content_to_rate_test,
                                   print_diff=print_diff)
            except Exception as e:
                # deliberate best-effort: a bad batch must not abort training
                print(f"got exception [{e}] on batch id {batch_id}")

        print(f"Epoch {epoch + 1}\nLoss: {losses}\n")
        spacy_evaluate(model=nlp,
                       dev=content_to_rate_test,
                       print_diff=print_diff)

    if output_model is not None:
        nlp.to_disk(output_model)