def test_standoff_to_flair_sents():
    corpus = CorpusLoader().load_corpus(DUMMY_CORPUS)
    tokenizer = TokenizerFactory().tokenizer('ons')
    docs = corpus.train
    sents, parsed_docs = flair_utils.standoff_to_flair_sents(docs, tokenizer)

    assert len(sents) == 14
    assert len(parsed_docs) == 14

    bio_tags = [token.get_tag('ner').value for token in sents[0]]
    token_texts = [token.text for token in sents[0]]
    assert token_texts == ['Linders', ',', 'Xandro', '<']
    assert bio_tags == ['B-Name', 'I-Name', 'I-Name', 'O']

    bio_tags = [token.get_tag('ner').value for token in sents[1]]
    token_texts = [token.text for token in sents[1]]
    assert token_texts == ['*****@*****.**']
    assert bio_tags == ['B-Email']

    bio_tags = [token.get_tag('ner').value for token in sents[2]]
    token_texts = [token.text for token in sents[2]]
    assert token_texts == ['>', '<SPACE>', '07', 'apr', '.', '<SPACE>']
    assert bio_tags == [
        'O',
        'O',
        'B-Date',
        'I-Date',
        'O',
        'O',
    ]
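
# A minimal sketch of the token/tag API exercised above (assuming flair <= 0.8,
# where Token.add_tag/get_tag exist): BIO tags are attached per token under the
# 'ner' tag type.
from flair.data import Sentence

_sent = Sentence('Linders , Xandro')
for _token, _tag in zip(_sent, ['B-Name', 'I-Name', 'I-Name']):
    _token.add_tag('ner', _tag)
assert [t.get_tag('ner').value for t in _sent] == ['B-Name', 'I-Name', 'I-Name']
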
def test_flair_sentence_with_whitespace_tokens():
    text = 'Mw geniet zichtbaar.  Maarten is de afgelopen periode veelal afwezig.'
    annotation = Annotation(text='Maarten',
                            start=text.index('Maarten'),
                            end=text.index('Maarten') + len('Maarten'),
                            tag='PERSON')
    doc = Document(name='', text=text, annotations=[annotation])

    tokenizer = TokenizerFactory().tokenizer('ons')
    flair_sents, docs = flair_utils.standoff_to_flair_sents([doc], tokenizer)

    # spaCy collapses a run of consecutive whitespace into a single whitespace
    # token. These tokens should be retained in the Flair sentence; otherwise it
    # is impossible to reconstruct the original document from the tokenized
    # representation.
    assert [token.text for token in flair_sents[0]] == [
        'Mw', 'geniet', 'zichtbaar', '.', '<SPACE>']

    spacy_doc = docs[0].spacy_doc
    spacy_sents = list(spacy_doc.sents)
    assert len(flair_sents) == 2
    assert len(spacy_sents) == 2

    assert len(flair_sents[0]) == 5
    assert len(spacy_sents[0]) == 5
    assert len(flair_sents[1]) == 8
    assert len(spacy_sents[1]) == 8
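
    # The '<SPACE>' token stands in for the literal run of whitespace in the
    # source text ('zichtbaar.  Maarten' contains two spaces), which is what
    # makes reconstructing the original document from the tokens possible.
    assert '  ' in spacy_doc.text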
Example #3
def main(args, model_dir):
    logger.info('Args = {}'.format(args))
    corpus = CorpusLoader().load_corpus(CORPUS_PATH[args.corpus])
    tokenizer = TokenizerFactory().tokenizer(args.corpus)
    logger.info('Loaded corpus: {}'.format(corpus))

    logger.info('Get sentences...')
    train_sents, _ = flair_utils.standoff_to_flair_sents(corpus.train, tokenizer, verbose=True)
    dev_sents, _ = flair_utils.standoff_to_flair_sents(corpus.dev, tokenizer, verbose=True)
    test_sents, test_docs = flair_utils.standoff_to_flair_sents(corpus.test,
                                                                tokenizer, verbose=True)

    train_sents = train_sents + dev_sents
    train_sents_filtered = list(filter(lambda sent: not _ignore_sentence(sent), train_sents))

    sample_size = int(len(train_sents_filtered) * args.train_sample_frac)
    rs = RandomState(seed=args.random_seed)
    train_sents_sample = rs.choice(train_sents_filtered, replace=False, size=sample_size).tolist()
    logger.info('Training with a fraction of the training data: {} of {} sentences ({:.0%})',
                sample_size, len(train_sents_filtered), args.train_sample_frac)

    # We need to pass some dev data, otherwise flair raises a ZeroDivisionError
    # See: https://github.com/zalandoresearch/flair/issues/1139
    # We simply split the training sample in half and instruct Flair to train_with_dev (see below).
    half = len(train_sents_sample) // 2
    flair_corpus = flair_utils.FilteredCorpus(train=train_sents_sample[:half],
                                              dev=train_sents_sample[half:],
                                              test=test_sents,
                                              ignore_sentence=_ignore_sentence)
    logger.info(flair_corpus)

    logger.info('Train model...')
    tagger = run_bilstmcrf.get_model(flair_corpus,
                                     corpus_name=args.corpus,
                                     embedding_lang=args.embedding_lang,
                                     pooled_contextual_embeddings=True)

    trainer = ModelTrainer(tagger, flair_corpus)
    trainer.train(join(model_dir, 'flair'),
                  max_epochs=150,
                  monitor_train=False,
                  train_with_dev=True,
                  save_final_model=args.save_final_model)

    logger.info('Make predictions...')
    run_bilstmcrf.make_predictions(tagger, flair_corpus)

    logger.info('Start evaluation...')
    evaluator = Evaluator(gold=corpus.test,
                          predicted=flair_utils.flair_sents_to_standoff(test_sents, test_docs))

    entity_level_metric = evaluator.entity_level()
    logger.info('\n{}', entity_level_metric)
    entity_level_metric.to_csv(join(model_dir, 'scores_entity.csv'))
    evaluator.token_level().to_csv(join(model_dir, 'scores_token.csv'))
    evaluator.token_level_blind().to_csv(join(model_dir, 'scores_token_blind.csv'))
    logger.info('Done.')
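
# _ignore_sentence is defined elsewhere in this script; a hypothetical sketch
# in its spirit (not the shipped implementation): skip metadata lines such as
# '=== Answer: 123 ===' that carry no PHI.
def _ignore_sentence_sketch(sent):
    return sent.to_plain_string().startswith('===')
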
Example #4
def main(args, model_dir):
    logger.info('Args = {}'.format(args))
    corpus = CorpusLoader().load_corpus(CORPUS_PATH[args.corpus])
    tokenizer = TokenizerFactory().tokenizer(args.corpus)
    logger.info('Loaded corpus: {}'.format(corpus))

    logger.info('Get sentences...')
    train_sents, _ = tagging_utils.standoff_to_sents(corpus.train, tokenizer, verbose=True)
    dev_sents, _ = tagging_utils.standoff_to_sents(corpus.dev, tokenizer, verbose=True)
    test_sents, test_docs = tagging_utils.standoff_to_sents(corpus.test, tokenizer, verbose=True)

    train_sents = train_sents + dev_sents
    train_sents_filtered = list(filter(_is_not_meta_sentence, train_sents))

    sample_size = int(len(train_sents_filtered) * args.train_sample_frac)
    rs = RandomState(seed=args.random_seed)
    train_sents_sample = rs.choice(train_sents_filtered, replace=False, size=sample_size).tolist()
    logger.info('Training with a fraction of the training data: {} of {} sentences ({:.0%})',
                sample_size, len(train_sents_filtered), args.train_sample_frac)

    logger.info('Compute features...')
    feature_extractor, meta_sentence_filter = crf_util.FEATURE_EXTRACTOR[args.feature_extractor]
    X_train, y_train = crf_labeler.sents_to_features_and_labels(train_sents_sample,
                                                                feature_extractor)
    X_test, _ = crf_labeler.sents_to_features_and_labels(test_sents, feature_extractor)

    logger.info('len(X_train) = {}'.format(len(X_train)))
    logger.info('len(y_train) = {}'.format(len(y_train)))
    logger.info('len(X_test) = {}'.format(len(X_test)))

    crf = crf_labeler.SentenceFilterCRF(
        ignore_sentence=meta_sentence_filter,
        ignored_label='O',
        algorithm='lbfgs',
        c1=0.1,
        c2=0.1,
        max_iterations=100,
        all_possible_transitions=True
    )
    logger.info('Start training... {}'.format(crf))
    crf.fit(X_train, y_train)

    logger.info('CRF classes: {}'.format(crf.classes_))

    logger.info('Make predictions...')
    y_pred_test = crf.predict(X_test)

    logger.info('Start evaluation...')
    evaluator = Evaluator(gold=corpus.test,
                          predicted=tagging_utils.sents_to_standoff(y_pred_test, test_docs))

    entity_level_metric = evaluator.entity_level()
    logger.info('\n{}', entity_level_metric)
    entity_level_metric.to_csv(join(model_dir, 'scores_entity.csv'))
    evaluator.token_level().to_csv(join(model_dir, 'scores_token.csv'))
    evaluator.token_level_blind().to_csv(join(model_dir, 'scores_token_blind.csv'))
    logger.info('Done.')
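
# Shape sketch for the CRF inputs above (an assumption, following
# sklearn-crfsuite conventions): X is a list of sentences, each a list of
# per-token feature dicts; y mirrors it with one BIO label per token.
X_example = [[{'word.lower()': 'jan', 'word.istitle()': True},
              {'word.lower()': 'jansen', 'word.istitle()': True}]]
y_example = [['B-Name', 'I-Name']]
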
Example #5
def main(args):
    logger.info('Load data...')
    documents, num_tokens = load_data()

    logger.info('Initialize taggers...')
    tokenizer_crf = TokenizerFactory().tokenizer(corpus='ons', disable=())
    tokenizer_bilstm = TokenizerFactory().tokenizer(corpus='ons',
                                                    disable=("tagger", "ner"))

    taggers = [('DEDUCE', DeduceTagger(verbose=True)),
               ('CRF',
                CRFTagger(model='model_crf_ons_tuned-v0.1.0',
                          tokenizer=tokenizer_crf,
                          verbose=True)),
               ('BiLSTM-CRF (large)',
                FlairTagger(model='model_bilstmcrf_ons_large-v0.1.0',
                            tokenizer=tokenizer_bilstm,
                            mini_batch_size=args.bilstmcrf_large_batch_size,
                            verbose=True)),
               ('BiLSTM-CRF (fast)',
                FlairTagger(model='model_bilstmcrf_ons_fast-v0.1.0',
                            tokenizer=tokenizer_bilstm,
                            mini_batch_size=args.bilstmcrf_fast_batch_size,
                            verbose=True))]

    benchmark_results = []
    tagger_names = []
    for tagger_name, tagger in taggers:
        logger.info(f'Benchmark inference for tagger: {tagger_name}')
        scores = benchmark_tagger(tagger, documents, num_tokens)
        benchmark_results.append(scores)
        tagger_names.append(tagger_name)

    df = pd.DataFrame(data=benchmark_results, index=tagger_names)
    df.to_csv(f'{args.benchmark_name}.csv')
    logger.info('\n{}', df)
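
# benchmark_tagger is defined elsewhere; a hypothetical sketch consistent with
# its use above: time one full annotate() pass and derive token throughput.
import time

def benchmark_tagger_sketch(tagger, documents, num_tokens):
    start = time.perf_counter()
    tagger.annotate(documents)
    elapsed = time.perf_counter() - start
    return {'seconds': elapsed, 'tokens_per_second': num_tokens / elapsed}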
Example #6
def main(args):
    corpus_loader = CorpusLoader()
    corpus = corpus_loader.load_corpus(CORPUS_PATH[args.corpus])
    logger.info(corpus)

    df_train = annotations_to_pandas(corpus.train)
    df_train['part'] = 'train'
    df_dev = annotations_to_pandas(corpus.dev)
    df_dev['part'] = 'dev'
    df_test = annotations_to_pandas(corpus.test)
    df_test['part'] = 'test'
    df_all = pd.concat([df_train, df_dev, df_test])

    df_absolute = df_all \
        .groupby(['tag', 'part']) \
        .size() \
        .unstack() \
        .sort_values(by='train', ascending=False) \
        .fillna(0) \
        .astype(int)
    df_absolute = df_absolute.add_suffix('_phi')

    df_normalized = df_absolute / df_absolute.sum()
    df_normalized = df_normalized.add_suffix('_normalized')
    df_normalized = df_normalized.fillna(0)

    df_phi_stats = pd.concat([df_absolute, df_normalized], axis=1)
    df_phi_stats['all_phi'] = df_all.tag.value_counts()
    df_phi_stats['all_phi_normalized'] = df_all.tag.value_counts(normalize=True)

    df_phi_stats.round(3).sort_values('all_phi', ascending=False).to_csv(
        join(OUT_DIR, 'phi_distribution.csv'))

    with open(join(OUT_DIR, 'phi_summary.txt'), 'w') as f:
        print('All PHI: {}'.format(len(df_all)), file=f)
        print('PHI by data split:', file=f)
        print(df_phi_stats[['dev_phi', 'test_phi', 'train_phi']].sum(), file=f)

    plot_phi_distribution(df_phi_stats, num_total=len(df_all))

    all_documents = corpus.train + corpus.dev + corpus.test
    tokenizer = TokenizerFactory().tokenizer(args.corpus, disable=['tagger', 'ner'])
    doc_stats = analyze_documents(all_documents, tokenizer)

    with open(join(OUT_DIR, 'corpus_statistics.txt'), 'w') as f:
        write_doc_stats(doc_stats, file=f)
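
# annotations_to_pandas is defined elsewhere; a hypothetical sketch matching
# its use above (one row per annotation, with at least a 'tag' column):
def annotations_to_pandas_sketch(docs):
    return pd.DataFrame([{'doc': doc.name, 'tag': ann.tag, 'text': ann.text}
                         for doc in docs for ann in doc.annotations])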
Example #7
def main(args):
    logger.info('Args = {}'.format(args))
    corpus = CorpusLoader().load_corpus(CORPUS_PATH[args.corpus])
    tokenizer = TokenizerFactory().tokenizer(args.corpus)
    logger.info('Loaded corpus: {}'.format(corpus))

    model_dir = train_utils.model_dir(corpus.name, args.run_id)
    os.makedirs(model_dir, exist_ok=True)

    logger.info('Get sentences...')
    docs = list(itertools.chain(corpus.train, corpus.dev))
    sents, _ = tagging_utils.standoff_to_sents(docs, tokenizer, verbose=True)

    logger.info('Compute features...')
    feature_extractor, meta_sentence_filter = crf_util.FEATURE_EXTRACTOR[
        args.feature_extractor]
    X, y = crf_labeler.sents_to_features_and_labels(sents, feature_extractor)

    logger.info('len(X) = {}'.format(len(X)))
    logger.info('len(y) = {}'.format(len(y)))

    crf = crf_labeler.SentenceFilterCRF(ignore_sentence=meta_sentence_filter,
                                        ignored_label='O',
                                        algorithm='lbfgs',
                                        c1=0.1,
                                        c2=0.1,
                                        max_iterations=100,
                                        all_possible_transitions=True)

    logger.info('Start learning curve computation...')
    with parallel_backend('multiprocessing'):
        # scikit-learn switched its default parallel backend to 'loky' in 0.21, which
        # does not appear to be supported by sklearn_crfsuite. Therefore, we fall back
        # to the legacy 'multiprocessing' backend.
        plot_learning_curve(crf,
                            'CRF learning curve (sentences: N={})'.format(
                                len(X)),
                            X,
                            y,
                            out_dir=model_dir,
                            cv=5,
                            n_jobs=12)
    logger.info('Done...')
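
# parallel_backend comes from joblib (scikit-learn re-exports it as
# sklearn.utils.parallel_backend); minimal usage sketch:
from joblib import parallel_backend

with parallel_backend('multiprocessing'):
    pass  # any scikit-learn call in this block uses the legacy backend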
Example #8
def test_flair_sents_to_standoff():
    corpus = CorpusLoader().load_corpus(DUMMY_CORPUS)
    tokenizer = TokenizerFactory().tokenizer('ons')
    docs_expected = corpus.train

    sents, parsed_docs = flair_utils.standoff_to_flair_sents(
        docs_expected, tokenizer)
    docs_actual = flair_utils.flair_sents_to_standoff(sents, parsed_docs)

    assert len(docs_actual) == 1
    assert len(docs_expected) == 1

    assert len(docs_actual[0].annotations) == 16
    assert len(docs_expected[0].annotations) == 16

    for ann_expected, ann_actual in zip(docs_expected[0].annotations,
                                        docs_actual[0].annotations):
        assert ann_expected.text == ann_actual.text
        assert ann_expected.tag == ann_actual.tag
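
# The same round-trip on a synthetic document, in the style of the test above
# (a sketch; it assumes round-tripping preserves annotation text and tag, as
# the assertions above check):
def test_roundtrip_synthetic_doc():
    tokenizer = TokenizerFactory().tokenizer('ons')
    doc = Document(name='', text='Jan Jansen', annotations=[
        Annotation(text='Jan Jansen', start=0, end=10, tag='Name')])
    sents, parsed_docs = flair_utils.standoff_to_flair_sents([doc], tokenizer)
    docs_restored = flair_utils.flair_sents_to_standoff(sents, parsed_docs)
    assert docs_restored[0].annotations[0].text == 'Jan Jansen'
    assert docs_restored[0].annotations[0].tag == 'Name'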
Example #9
def main(args):
    logger.info('Args = {}'.format(args))
    corpus = CorpusLoader().load_corpus(CORPUS_PATH[args.corpus])
    tokenizer = TokenizerFactory().tokenizer(args.corpus)
    logger.info('Loaded corpus: {}'.format(corpus))

    model_dir = train_utils.model_dir(corpus.name, args.run_id)
    os.makedirs(model_dir, exist_ok=True)

    logger.info('Get sentences...')
    train_sents, train_docs = tagging_utils.standoff_to_sents(corpus.train,
                                                              tokenizer,
                                                              verbose=True)
    dev_sents, dev_docs = tagging_utils.standoff_to_sents(corpus.dev,
                                                          tokenizer,
                                                          verbose=True)
    test_sents, test_docs = tagging_utils.standoff_to_sents(corpus.test,
                                                            tokenizer,
                                                            verbose=True)

    logger.info('Compute features...')
    feature_extractor, meta_sentence_filter = crf_util.FEATURE_EXTRACTOR[
        args.feature_extractor]
    X_train, y_train = crf_labeler.sents_to_features_and_labels(
        train_sents, feature_extractor)
    X_dev, y_dev = crf_labeler.sents_to_features_and_labels(
        dev_sents, feature_extractor)
    X_test, y_test = crf_labeler.sents_to_features_and_labels(
        test_sents, feature_extractor)

    logger.info('len(X_train) = {}'.format(len(X_train)))
    logger.info('len(y_train) = {}'.format(len(y_train)))
    logger.info('len(X_dev) = {}'.format(len(X_dev)))
    logger.info('len(X_test) = {}'.format(len(X_test)))

    X_train_combined = X_train + X_dev
    y_train_combined = y_train + y_dev

    train_indices = [-1] * len(X_train)
    dev_indices = [0] * len(X_dev)
    test_fold = train_indices + dev_indices

    labels = list(set(label for sent in y_train_combined for label in sent))
    labels.remove('O')
    logger.info('Labels: {}'.format(labels))
    f1_scorer = make_scorer(flat_f1_score, labels=labels, average='micro')

    crf = crf_labeler.SentenceFilterCRF(ignore_sentence=meta_sentence_filter,
                                        ignored_label='O',
                                        algorithm='lbfgs',
                                        max_iterations=100,
                                        all_possible_transitions=True)

    ps = PredefinedSplit(test_fold)
    rs = RandomizedSearchCV(crf,
                            PARAM_SPACE,
                            cv=ps,
                            verbose=1,
                            n_jobs=args.n_jobs,
                            n_iter=args.n_iter,
                            scoring=f1_scorer,
                            return_train_score=True)

    logger.info('Start RandomizedSearchCV... {}'.format(crf))
    with parallel_backend('multiprocessing'):
        rs.fit(X_train_combined, y_train_combined)

    logger.info('best params: {}'.format(rs.best_params_))
    logger.info('best CV score: {}'.format(rs.best_score_))
    logger.info('model size: {:0.2f}M'.format(rs.best_estimator_.size_ /
                                              1000000))

    logger.info('Make predictions...')
    crf = rs.best_estimator_
    y_pred_train = crf.predict(X_train)
    y_pred_dev = crf.predict(X_dev)
    y_pred_test = crf.predict(X_test)

    train_utils.save_predictions(
        corpus_name=corpus.name,
        run_id=args.run_id,
        train=tagging_utils.sents_to_standoff(y_pred_train, train_docs),
        dev=tagging_utils.sents_to_standoff(y_pred_dev, dev_docs),
        test=tagging_utils.sents_to_standoff(y_pred_test, test_docs))
    _save_model_aritfacts(rs, model_dir, y_test, y_pred_test)
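
# How the PredefinedSplit above works: fold index -1 keeps a sample in the
# training set of every split, while fold index 0 marks the held-out dev
# samples, yielding exactly one train/dev split for RandomizedSearchCV.
from sklearn.model_selection import PredefinedSplit

_ps = PredefinedSplit([-1, -1, 0, 0])
_train_idx, _dev_idx = next(_ps.split())
assert list(_train_idx) == [0, 1] and list(_dev_idx) == [2, 3]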
Example #10
from deidentify.base import Annotation, Document
from deidentify.taggers import FlairTagger
from deidentify.tokenizer import TokenizerFactory

tokenizer = TokenizerFactory().tokenizer(corpus='ons')
tagger = FlairTagger(model='model_bilstmcrf_ons_fast-v0.2.0',
                     tokenizer=tokenizer)


def test_annotate():
    doc = Document(
        name='',
        text='Hij werd op 10 oktober door arts Peter de Visser ontslagen van de kliniek.',
        annotations=[])

    anns = tagger.annotate([doc])[0].annotations
    assert anns == [
        Annotation(text='10 oktober',
                   start=12,
                   end=22,
                   tag='Date',
                   doc_id='',
                   ann_id='T0'),
        Annotation(text='Peter de Visser',
                   start=33,
                   end=48,
                   tag='Name',
                   doc_id='',
                   ann_id='T1')
    ]
Example #11
def main(args):
    logger.info('Args = {}'.format(args))
    corpus = CorpusLoader().load_corpus(CORPUS_PATH[args.corpus])
    tokenizer = TokenizerFactory().tokenizer(args.corpus)

    logger.info('Loaded corpus: {}'.format(corpus))
    model_dir = train_utils.model_dir(corpus.name, args.run_id)
    os.makedirs(model_dir, exist_ok=True)

    logger.info('Get sentences...')
    train_sents, train_docs = flair_utils.standoff_to_flair_sents(corpus.train,
                                                                  tokenizer,
                                                                  verbose=True)
    dev_sents, dev_docs = flair_utils.standoff_to_flair_sents(corpus.dev,
                                                              tokenizer,
                                                              verbose=True)
    test_sents, test_docs = flair_utils.standoff_to_flair_sents(corpus.test,
                                                                tokenizer,
                                                                verbose=True)

    flair_corpus = flair_utils.FilteredCorpus(train=train_sents,
                                              dev=dev_sents,
                                              test=test_sents,
                                              ignore_sentence=_ignore_sentence)
    logger.info(flair_corpus)

    if args.model_file:
        logger.info('Load existing model from {}'.format(args.model_file))
        tagger = SequenceTagger.load(args.model_file)
    else:
        logger.info('Train model...')
        tagger = get_model(
            flair_corpus,
            corpus_name=args.corpus,
            embedding_lang=args.embedding_lang,
            pooled_contextual_embeddings=args.pooled_contextual_embeddings,
            contextual_forward_path=args.contextual_forward_path,
            contextual_backward_path=args.contextual_backward_path)

    if args.fine_tune or not args.model_file:
        trainer = ModelTrainer(tagger, flair_corpus)
        trainer.train(join(model_dir, 'flair'),
                      max_epochs=150,
                      monitor_train=False,
                      train_with_dev=args.train_with_dev)

        if not args.train_with_dev:
            # Model performance is judged by dev data, so we also pick the best performing model
            # according to the dev score to make our final predictions.
            tagger = SequenceTagger.load(
                join(model_dir, 'flair', 'best-model.pt'))
        else:
            # Training is stopped if train loss converges - here, we do not have a "best model" and
            # use the final model to make predictions.
            pass

    logger.info('Make predictions...')
    make_predictions(tagger, flair_corpus)

    train_utils.save_predictions(
        corpus_name=corpus.name,
        run_id=args.run_id,
        train=flair_utils.flair_sents_to_standoff(train_sents, train_docs),
        dev=flair_utils.flair_sents_to_standoff(dev_sents, dev_docs),
        test=flair_utils.flair_sents_to_standoff(test_sents, test_docs))
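
# get_model is defined elsewhere in this script; a hypothetical sketch of what
# such a factory typically assembles (flair <= 0.8 API; the configuration here
# is illustrative, not the shipped one):
from flair.embeddings import FlairEmbeddings, StackedEmbeddings, WordEmbeddings
from flair.models import SequenceTagger

def get_model_sketch(corpus, embedding_lang='nl'):
    embeddings = StackedEmbeddings([
        WordEmbeddings(embedding_lang),
        FlairEmbeddings(f'{embedding_lang}-forward'),
        FlairEmbeddings(f'{embedding_lang}-backward'),
    ])
    return SequenceTagger(hidden_size=256,
                          embeddings=embeddings,
                          tag_dictionary=corpus.make_tag_dictionary(tag_type='ner'),
                          tag_type='ner',
                          use_crf=True)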
Example #12
from deidentify.base import Document
from deidentify.taggers import FlairTagger
from deidentify.tokenizer import TokenizerFactory

text = (
    "Dit is stukje tekst met daarin de naam Jan Jansen. De patient J. Jansen (e: "
    "[email protected], t: 06-12345678) is 64 jaar oud en woonachtig in Utrecht. Hij werd op 10 "
    "oktober door arts Peter de Visser ontslagen van de kliniek van het UMCU."
)

# Wrap text in document
documents = [
    Document(name='doc_01', text=text)
]

# Select downloaded model
model = 'model_bilstmcrf_ons_fast-v0.2.0'

# Instantiate tokenizer
tokenizer = TokenizerFactory().tokenizer(corpus='ons', disable=("tagger", "ner"))

# Load tagger with a downloaded model file and tokenizer
tagger = FlairTagger(model=model, tokenizer=tokenizer, verbose=False)

# Annotate your documents
annotated_docs = tagger.annotate(documents)


from pprint import pprint

first_doc = annotated_docs[0]
pprint(first_doc.annotations)


from deidentify.util import mask_annotations
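
# The example is truncated here; a plausible continuation, assuming
# mask_annotations takes an annotated Document and returns a copy in which
# each annotated span is replaced by a placeholder for its tag:
masked_doc = mask_annotations(first_doc)
print(masked_doc.text)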
Example #13
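# The original snippet is truncated; a plausible reconstruction of the missing
# test setup, based on the FilteredCorpus signature used in the training
# scripts above and the fixture strings in the assertions below:
from flair.data import Sentence

def test_filtered_corpus():
    def ignore_sentence(sent):
        return sent.to_plain_string().startswith('===')

    filtered_corpus = flair_utils.FilteredCorpus(
        train=[Sentence('this is should be included'), Sentence('=== Answer: 123 ===')],
        dev=[Sentence('this is should be included'), Sentence('=== Answer: 456 ===')],
        test=[Sentence('this is should be included'), Sentence('and this as well')],
        ignore_sentence=ignore_sentence)
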
    assert len(filtered_corpus.train) == 1
    assert filtered_corpus.train[0].to_plain_string() == 'this is should be included'
    assert len(filtered_corpus.dev) == 1
    assert filtered_corpus.dev[0].to_plain_string() == 'this is should be included'
    assert len(filtered_corpus.test) == 2
    assert filtered_corpus.test[0].to_plain_string() == 'this is should be included'
    assert filtered_corpus.test[1].to_plain_string() == 'and this as well'

    assert len(filtered_corpus.train_ignored) == 1
    assert filtered_corpus.train_ignored[0].to_plain_string() == '=== Answer: 123 ==='
    assert len(filtered_corpus.dev_ignored) == 1
    assert filtered_corpus.dev_ignored[0].to_plain_string() == '=== Answer: 456 ==='
    assert len(filtered_corpus.test_ignored) == 0


if __name__ == '__main__':
    corpus = CorpusLoader().load_corpus(DUMMY_CORPUS)
    tokenizer = TokenizerFactory().tokenizer('ons')
    docs = corpus.train
    sents, parsed_docs = flair_utils.standoff_to_flair_sents(docs, tokenizer)

    print(repr(docs[0].text))
    for sent in sents:
        print(' '.join([repr(token.text) for token in sent.tokens]))
Example #14
import spacy

from deidentify.base import Document
from deidentify.taggers import FlairTagger
from deidentify.tokenizer import TokenizerFactory


def anonymize(input_file):
    with open(input_file, "r", encoding="utf-8") as f:
        text = f.read()

    # Wrap text in document
    documents = [Document(name='doc_01', text=text)]

    # Select downloaded model
    model = 'models/model_bilstmcrf_ons_fast-v0.1.0/final-model.pt'

    nlp = spacy.load('de_core_news_sm')

    # Instantiate tokenizer
    tokenizer = TokenizerFactory().tokenizer(corpus='germeval',
                                             disable=("tagger", "ner"),
                                             model=nlp)

    # Load tagger with a downloaded model file and tokenizer
    tagger = FlairTagger(model=model, tokenizer=tokenizer, verbose=False)

    # Annotate your documents
    annotated_doc = tagger.annotate(documents)[0]

    # spaCy NER extraction
    ners = nlp(text)

    filtered_annotations = []

    # Dict for storing spaCy and deidentify tag correspondences
    tag_dict = {
        "PER": "Name",
        "LOC": "Address",
        "ORG": "Organization_Company",
        "MISC": "Other"
    }

    # Add all spaCy-detected NEs to the list
    for ent in ners.ents:
        filtered_annotations.append({
            "text": ent.text,
            "start": ent.start_char,
            "end": ent.end_char,
            "tag": tag_dict[ent.label_]
        })

    for ann in annotated_doc.annotations:
        # Discard names; they have a high likelihood of false positives since
        # nouns are capitalized in German, unlike in Dutch.
        if ann.tag == "Name":
            continue
        # Don't add the entity if it overlaps with a spaCy entity - spaCy makes
        # fewer mistakes here.
        if any(ent.start_char <= ann.end <= ent.end_char for ent in ners.ents) or \
                any(ann.start <= ent.end_char <= ann.end for ent in ners.ents):
            continue
        filtered_annotations.append({
            "text": ann.text,
            "start": ann.start,
            "end": ann.end,
            "tag": ann.tag
        })

    filtered_annotations.sort(key=lambda x: x["start"])

    masked_output = mask_annotations(annotated_doc.text, filtered_annotations)
    print(masked_output)
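
# mask_annotations as called above takes raw text plus annotation dicts (note:
# deidentify.util.mask_annotations operates on a Document instead); a plausible
# local implementation under that assumption:
def mask_annotations(text, annotations):
    parts, cursor = [], 0
    for ann in annotations:  # assumed sorted by start and non-overlapping
        parts.append(text[cursor:ann['start']])
        parts.append('[{}]'.format(ann['tag'].upper()))
        cursor = ann['end']
    parts.append(text[cursor:])
    return ''.join(parts)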