def test_standoff_to_flair_sents():
    corpus = CorpusLoader().load_corpus(DUMMY_CORPUS)
    tokenizer = TokenizerFactory().tokenizer('ons')
    docs = corpus.train

    sents, parsed_docs = flair_utils.standoff_to_flair_sents(docs, tokenizer)

    assert len(sents) == 14
    assert len(parsed_docs) == 14

    bio_tags = [token.get_tag('ner').value for token in sents[0]]
    token_texts = [token.text for token in sents[0]]
    assert token_texts == ['Linders', ',', 'Xandro', '<']
    assert bio_tags == ['B-Name', 'I-Name', 'I-Name', 'O']

    bio_tags = [token.get_tag('ner').value for token in sents[1]]
    token_texts = [token.text for token in sents[1]]
    assert token_texts == ['*****@*****.**']
    assert bio_tags == ['B-Email']

    bio_tags = [token.get_tag('ner').value for token in sents[2]]
    token_texts = [token.text for token in sents[2]]
    assert token_texts == ['>', '<SPACE>', '07', 'apr', '.', '<SPACE>']
    assert bio_tags == ['O', 'O', 'B-Date', 'I-Date', 'O', 'O']
def test_flair_sentence_with_whitespace_tokens():
    text = 'Mw geniet zichtbaar. Maarten is de afgelopen periode veelal afwezig.'
    annotation = Annotation(text='Maarten',
                            start=text.index('Maarten'),
                            end=text.index('Maarten') + len('Maarten'),
                            tag='PERSON')
    doc = Document(name='', text=text, annotations=[annotation])

    tokenizer = TokenizerFactory().tokenizer('ons')
    flair_sents, docs = flair_utils.standoff_to_flair_sents([doc], tokenizer)

    # spaCy adds consecutive whitespace tokens as a single whitespace. These should be retained
    # in the Flair sentence, otherwise it's not possible to reconstruct the original document from
    # the tokenized representation.
    assert [token.text for token in flair_sents[0]] == ['Mw', 'geniet', 'zichtbaar', '.', '<SPACE>']

    spacy_doc = docs[0].spacy_doc
    spacy_sents = list(spacy_doc.sents)

    assert len(flair_sents) == 2
    assert len(spacy_sents) == 2
    assert len(flair_sents[0]) == 5
    assert len(spacy_sents[0]) == 5
    assert len(flair_sents[1]) == 8
    assert len(spacy_sents[1]) == 8
def main(args, model_dir):
    logger.info('Args = {}'.format(args))

    corpus = CorpusLoader().load_corpus(CORPUS_PATH[args.corpus])
    tokenizer = TokenizerFactory().tokenizer(args.corpus)
    logger.info('Loaded corpus: {}'.format(corpus))

    logger.info('Get sentences...')
    train_sents, _ = flair_utils.standoff_to_flair_sents(corpus.train, tokenizer, verbose=True)
    dev_sents, _ = flair_utils.standoff_to_flair_sents(corpus.dev, tokenizer, verbose=True)
    test_sents, test_docs = flair_utils.standoff_to_flair_sents(corpus.test, tokenizer, verbose=True)

    train_sents = train_sents + dev_sents
    train_sents_filtered = list(filter(lambda sent: not _ignore_sentence(sent), train_sents))

    sample_size = int(len(train_sents_filtered) * args.train_sample_frac)
    rs = RandomState(seed=args.random_seed)
    train_sents_sample = rs.choice(train_sents_filtered, replace=False, size=sample_size).tolist()
    logger.info('Train with fraction of training data: {} sents out of {} sentences (frac={})',
                sample_size, len(train_sents_filtered), args.train_sample_frac)

    # We need to pass some dev data, otherwise flair raises a ZeroDivisionError
    # See: https://github.com/zalandoresearch/flair/issues/1139
    # We just split the training sample into half and instruct Flair to train_with_dev (see below).
    half = len(train_sents_sample) // 2
    flair_corpus = flair_utils.FilteredCorpus(train=train_sents_sample[:half],
                                              dev=train_sents_sample[half:],
                                              test=test_sents,
                                              ignore_sentence=_ignore_sentence)
    logger.info(flair_corpus)

    logger.info('Train model...')
    tagger = run_bilstmcrf.get_model(flair_corpus,
                                     corpus_name=args.corpus,
                                     embedding_lang=args.embedding_lang,
                                     pooled_contextual_embeddings=True)

    trainer = ModelTrainer(tagger, flair_corpus)
    trainer.train(join(model_dir, 'flair'),
                  max_epochs=150,
                  monitor_train=False,
                  train_with_dev=True,
                  save_final_model=args.save_final_model)

    logger.info('Make predictions...')
    run_bilstmcrf.make_predictions(tagger, flair_corpus)

    logger.info('Start evaluation...')
    evaluator = Evaluator(gold=corpus.test,
                          predicted=flair_utils.flair_sents_to_standoff(test_sents, test_docs))

    entity_level_metric = evaluator.entity_level()
    logger.info('\n{}', entity_level_metric)
    entity_level_metric.to_csv(join(model_dir, 'scores_entity.csv'))
    evaluator.token_level().to_csv(join(model_dir, 'scores_token.csv'))
    evaluator.token_level_blind().to_csv(join(model_dir, 'scores_token_blind.csv'))
    logger.info('Done.')
def main(args, model_dir):
    logger.info('Args = {}'.format(args))

    corpus = CorpusLoader().load_corpus(CORPUS_PATH[args.corpus])
    tokenizer = TokenizerFactory().tokenizer(args.corpus)
    logger.info('Loaded corpus: {}'.format(corpus))

    logger.info('Get sentences...')
    train_sents, _ = tagging_utils.standoff_to_sents(corpus.train, tokenizer, verbose=True)
    dev_sents, _ = tagging_utils.standoff_to_sents(corpus.dev, tokenizer, verbose=True)
    test_sents, test_docs = tagging_utils.standoff_to_sents(corpus.test, tokenizer, verbose=True)

    train_sents = train_sents + dev_sents
    train_sents_filtered = list(filter(_is_not_meta_sentence, train_sents))

    sample_size = int(len(train_sents_filtered) * args.train_sample_frac)
    rs = RandomState(seed=args.random_seed)
    train_sents_sample = rs.choice(train_sents_filtered, replace=False, size=sample_size).tolist()
    logger.info('Train with fraction of training data: {} sents out of {} sentences (frac={})',
                sample_size, len(train_sents_filtered), args.train_sample_frac)

    logger.info('Compute features...')
    feature_extractor, meta_sentence_filter = crf_util.FEATURE_EXTRACTOR[args.feature_extractor]
    X_train, y_train = crf_labeler.sents_to_features_and_labels(train_sents_sample, feature_extractor)
    X_test, _ = crf_labeler.sents_to_features_and_labels(test_sents, feature_extractor)

    logger.info('len(X_train) = {}'.format(len(X_train)))
    logger.info('len(y_train) = {}'.format(len(y_train)))
    logger.info('len(X_test) = {}'.format(len(X_test)))

    crf = crf_labeler.SentenceFilterCRF(
        ignore_sentence=meta_sentence_filter,
        ignored_label='O',
        algorithm='lbfgs',
        c1=0.1,
        c2=0.1,
        max_iterations=100,
        all_possible_transitions=True
    )

    logger.info('Start training... {}'.format(crf))
    crf.fit(X_train, y_train)
    logger.info('CRF classes: {}'.format(crf.classes_))

    logger.info('Make predictions...')
    y_pred_test = crf.predict(X_test)

    logger.info('Start evaluation...')
    evaluator = Evaluator(gold=corpus.test,
                          predicted=tagging_utils.sents_to_standoff(y_pred_test, test_docs))

    entity_level_metric = evaluator.entity_level()
    logger.info('\n{}', entity_level_metric)
    entity_level_metric.to_csv(join(model_dir, 'scores_entity.csv'))
    evaluator.token_level().to_csv(join(model_dir, 'scores_token.csv'))
    evaluator.token_level_blind().to_csv(join(model_dir, 'scores_token_blind.csv'))
    logger.info('Done.')
def main(args):
    logger.info('Load data...')
    documents, num_tokens = load_data()

    logger.info('Initialize taggers...')
    tokenizer_crf = TokenizerFactory().tokenizer(corpus='ons', disable=())
    tokenizer_bilstm = TokenizerFactory().tokenizer(corpus='ons', disable=("tagger", "ner"))

    taggers = [
        ('DEDUCE', DeduceTagger(verbose=True)),
        ('CRF', CRFTagger(model='model_crf_ons_tuned-v0.1.0',
                          tokenizer=tokenizer_crf,
                          verbose=True)),
        ('BiLSTM-CRF (large)', FlairTagger(model='model_bilstmcrf_ons_large-v0.1.0',
                                           tokenizer=tokenizer_bilstm,
                                           mini_batch_size=args.bilstmcrf_large_batch_size,
                                           verbose=True)),
        ('BiLSTM-CRF (fast)', FlairTagger(model='model_bilstmcrf_ons_fast-v0.1.0',
                                          tokenizer=tokenizer_bilstm,
                                          mini_batch_size=args.bilstmcrf_fast_batch_size,
                                          verbose=True)),
    ]

    benchmark_results = []
    tagger_names = []

    for tagger_name, tagger in taggers:
        logger.info(f'Benchmark inference for tagger: {tagger_name}')
        scores = benchmark_tagger(tagger, documents, num_tokens)
        benchmark_results.append(scores)
        tagger_names.append(tagger_name)

    df = pd.DataFrame(data=benchmark_results, index=tagger_names)
    df.to_csv(f'{args.benchmark_name}.csv')
    logger.info('\n{}', df)
def main(args):
    corpus_loader = CorpusLoader()
    corpus = corpus_loader.load_corpus(CORPUS_PATH[args.corpus])
    logger.info(corpus)

    df_train = annotations_to_pandas(corpus.train)
    df_train['part'] = 'train'
    df_dev = annotations_to_pandas(corpus.dev)
    df_dev['part'] = 'dev'
    df_test = annotations_to_pandas(corpus.test)
    df_test['part'] = 'test'
    df_all = pd.concat([df_train, df_dev, df_test])

    df_absolute = df_all \
        .groupby(['tag', 'part']) \
        .size() \
        .unstack() \
        .sort_values(by='train', ascending=False) \
        .fillna(0) \
        .astype(int)
    df_absolute = df_absolute.add_suffix('_phi')

    df_normalized = df_absolute / df_absolute.sum()
    df_normalized = df_normalized.add_suffix('_normalized')
    df_normalized = df_normalized.fillna(0)

    df_phi_stats = pd.concat([df_absolute, df_normalized], axis=1)
    df_phi_stats['all_phi'] = df_all.tag.value_counts()
    df_phi_stats['all_phi_normalized'] = df_all.tag.value_counts(normalize=True)
    df_phi_stats.round(3).sort_values('all_phi', ascending=False).to_csv(
        join(OUT_DIR, 'phi_distribution.csv'))

    with open(join(OUT_DIR, 'phi_summary.txt'), 'w') as f:
        print('All PHI: {}'.format(len(df_all)), file=f)
        print('PHI by data split:', file=f)
        print(df_phi_stats[['dev_phi', 'test_phi', 'train_phi']].sum(), file=f)

    plot_phi_distribution(df_phi_stats, num_total=len(df_all))

    all_documents = corpus.train + corpus.dev + corpus.test
    tokenizer = TokenizerFactory().tokenizer(args.corpus, disable=['tagger', 'ner'])
    doc_stats = analyze_documents(all_documents, tokenizer)

    with open(join(OUT_DIR, 'corpus_statistics.txt'), 'w') as f:
        write_doc_stats(doc_stats, file=f)
def main(args):
    logger.info('Args = {}'.format(args))

    corpus = CorpusLoader().load_corpus(CORPUS_PATH[args.corpus])
    tokenizer = TokenizerFactory().tokenizer(args.corpus)
    logger.info('Loaded corpus: {}'.format(corpus))

    model_dir = train_utils.model_dir(corpus.name, args.run_id)
    os.makedirs(model_dir, exist_ok=True)

    logger.info('Get sentences...')
    docs = list(itertools.chain(corpus.train, corpus.dev))
    sents, _ = tagging_utils.standoff_to_sents(docs, tokenizer, verbose=True)

    logger.info('Compute features...')
    feature_extractor, meta_sentence_filter = crf_util.FEATURE_EXTRACTOR[args.feature_extractor]
    X, y = crf_labeler.sents_to_features_and_labels(sents, feature_extractor)

    logger.info('len(X) = {}'.format(len(X)))
    logger.info('len(y) = {}'.format(len(y)))

    crf = crf_labeler.SentenceFilterCRF(ignore_sentence=meta_sentence_filter,
                                        ignored_label='O',
                                        algorithm='lbfgs',
                                        c1=0.1,
                                        c2=0.1,
                                        max_iterations=100,
                                        all_possible_transitions=True)

    logger.info('Start learning curve computation...')
    # scikit-learn changed the default parallel backend to 'loky' in 0.21. It appears that this
    # is not supported by sklearn_crfsuite. Therefore, we switch to the legacy 'multiprocessing'
    # parallel backend.
    with parallel_backend('multiprocessing'):
        plot_learning_curve(crf,
                            'CRF learning curve (sentences: N={})'.format(len(X)),
                            X, y,
                            out_dir=model_dir,
                            cv=5,
                            n_jobs=12)

    logger.info('Done...')
def test_flair_sents_to_standoff():
    corpus = CorpusLoader().load_corpus(DUMMY_CORPUS)
    tokenizer = TokenizerFactory().tokenizer('ons')
    docs_expected = corpus.train

    sents, parsed_docs = flair_utils.standoff_to_flair_sents(docs_expected, tokenizer)
    docs_actual = flair_utils.flair_sents_to_standoff(sents, parsed_docs)

    assert len(docs_actual) == 1
    assert len(docs_expected) == 1
    assert len(docs_actual[0].annotations) == 16
    assert len(docs_expected[0].annotations) == 16

    for ann_expected, ann_actual in zip(docs_expected[0].annotations, docs_actual[0].annotations):
        assert ann_expected.text == ann_actual.text
        assert ann_expected.tag == ann_actual.tag
def main(args):
    logger.info('Args = {}'.format(args))

    corpus = CorpusLoader().load_corpus(CORPUS_PATH[args.corpus])
    tokenizer = TokenizerFactory().tokenizer(args.corpus)
    logger.info('Loaded corpus: {}'.format(corpus))

    model_dir = train_utils.model_dir(corpus.name, args.run_id)
    os.makedirs(model_dir, exist_ok=True)

    logger.info('Get sentences...')
    train_sents, train_docs = tagging_utils.standoff_to_sents(corpus.train, tokenizer, verbose=True)
    dev_sents, dev_docs = tagging_utils.standoff_to_sents(corpus.dev, tokenizer, verbose=True)
    test_sents, test_docs = tagging_utils.standoff_to_sents(corpus.test, tokenizer, verbose=True)

    logger.info('Compute features...')
    feature_extractor, meta_sentence_filter = crf_util.FEATURE_EXTRACTOR[args.feature_extractor]
    X_train, y_train = crf_labeler.sents_to_features_and_labels(train_sents, feature_extractor)
    X_dev, y_dev = crf_labeler.sents_to_features_and_labels(dev_sents, feature_extractor)
    X_test, y_test = crf_labeler.sents_to_features_and_labels(test_sents, feature_extractor)

    logger.info('len(X_train) = {}'.format(len(X_train)))
    logger.info('len(y_train) = {}'.format(len(y_train)))
    logger.info('len(X_dev) = {}'.format(len(X_dev)))
    logger.info('len(X_test) = {}'.format(len(X_test)))

    X_train_combined = X_train + X_dev
    y_train_combined = y_train + y_dev

    # PredefinedSplit semantics: -1 keeps a sample in the training set for every split, while 0
    # assigns it to the single validation fold. Hyperparameters are thus selected on the dev set.
    train_indices = [-1] * len(X_train)
    dev_indices = [0] * len(X_dev)
    test_fold = train_indices + dev_indices

    labels = list(set(label for sent in y_train_combined for label in sent))
    labels.remove('O')
    logger.info('Labels: {}'.format(labels))

    f1_scorer = make_scorer(flat_f1_score, labels=labels, average='micro')

    crf = crf_labeler.SentenceFilterCRF(ignore_sentence=meta_sentence_filter,
                                        ignored_label='O',
                                        algorithm='lbfgs',
                                        max_iterations=100,
                                        all_possible_transitions=True)

    ps = PredefinedSplit(test_fold)
    rs = RandomizedSearchCV(crf, PARAM_SPACE,
                            cv=ps,
                            verbose=1,
                            n_jobs=args.n_jobs,
                            n_iter=args.n_iter,
                            scoring=f1_scorer,
                            return_train_score=True)

    logger.info('Start RandomizedSearchCV... {}'.format(crf))
    with parallel_backend('multiprocessing'):
        rs.fit(X_train_combined, y_train_combined)

    logger.info('best params: {}'.format(rs.best_params_))
    logger.info('best CV score: {}'.format(rs.best_score_))
    logger.info('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

    logger.info('Make predictions...')
    crf = rs.best_estimator_
    y_pred_train = crf.predict(X_train)
    y_pred_dev = crf.predict(X_dev)
    y_pred_test = crf.predict(X_test)

    train_utils.save_predictions(
        corpus_name=corpus.name,
        run_id=args.run_id,
        train=tagging_utils.sents_to_standoff(y_pred_train, train_docs),
        dev=tagging_utils.sents_to_standoff(y_pred_dev, dev_docs),
        test=tagging_utils.sents_to_standoff(y_pred_test, test_docs))

    _save_model_aritfacts(rs, model_dir, y_test, y_pred_test)
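# PARAM_SPACE is referenced by the randomized search above but not defined in this excerpt.
# A plausible search space over the L-BFGS regularization terms is sketched below (an
# assumption for illustration, not necessarily the values used by this project).
import scipy.stats

PARAM_SPACE = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}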
from deidentify.base import Annotation, Document
from deidentify.taggers import FlairTagger
from deidentify.tokenizer import TokenizerFactory

tokenizer = TokenizerFactory().tokenizer(corpus='ons')
tagger = FlairTagger(model='model_bilstmcrf_ons_fast-v0.2.0', tokenizer=tokenizer)


def test_annotate():
    doc = Document(
        name='',
        text='Hij werd op 10 oktober door arts Peter de Visser ontslagen van de kliniek.',
        annotations=[])

    anns = tagger.annotate([doc])[0].annotations

    assert anns == [
        Annotation(text='10 oktober', start=12, end=22, tag='Date', doc_id='', ann_id='T0'),
        Annotation(text='Peter de Visser', start=33, end=48, tag='Name', doc_id='', ann_id='T1')
    ]
def main(args):
    logger.info('Args = {}'.format(args))

    corpus = CorpusLoader().load_corpus(CORPUS_PATH[args.corpus])
    tokenizer = TokenizerFactory().tokenizer(args.corpus)
    logger.info('Loaded corpus: {}'.format(corpus))

    model_dir = train_utils.model_dir(corpus.name, args.run_id)
    os.makedirs(model_dir, exist_ok=True)

    logger.info('Get sentences...')
    train_sents, train_docs = flair_utils.standoff_to_flair_sents(corpus.train, tokenizer, verbose=True)
    dev_sents, dev_docs = flair_utils.standoff_to_flair_sents(corpus.dev, tokenizer, verbose=True)
    test_sents, test_docs = flair_utils.standoff_to_flair_sents(corpus.test, tokenizer, verbose=True)

    flair_corpus = flair_utils.FilteredCorpus(train=train_sents,
                                              dev=dev_sents,
                                              test=test_sents,
                                              ignore_sentence=_ignore_sentence)
    logger.info(flair_corpus)

    if args.model_file:
        logger.info('Load existing model from {}'.format(args.model_file))
        tagger = SequenceTagger.load(args.model_file)
    else:
        logger.info('Train model...')
        tagger = get_model(
            flair_corpus,
            corpus_name=args.corpus,
            embedding_lang=args.embedding_lang,
            pooled_contextual_embeddings=args.pooled_contextual_embeddings,
            contextual_forward_path=args.contextual_forward_path,
            contextual_backward_path=args.contextual_backward_path)

    if args.fine_tune or not args.model_file:
        trainer = ModelTrainer(tagger, flair_corpus)
        trainer.train(join(model_dir, 'flair'),
                      max_epochs=150,
                      monitor_train=False,
                      train_with_dev=args.train_with_dev)

        if not args.train_with_dev:
            # Model performance is judged by dev data, so we also pick the best performing model
            # according to the dev score to make our final predictions.
            tagger = SequenceTagger.load(join(model_dir, 'flair', 'best-model.pt'))
        else:
            # Training is stopped if train loss converges - here, we do not have a "best model"
            # and use the final model to make predictions.
            pass

    logger.info('Make predictions...')
    make_predictions(tagger, flair_corpus)

    train_utils.save_predictions(
        corpus_name=corpus.name,
        run_id=args.run_id,
        train=flair_utils.flair_sents_to_standoff(train_sents, train_docs),
        dev=flair_utils.flair_sents_to_standoff(dev_sents, dev_docs),
        test=flair_utils.flair_sents_to_standoff(test_sents, test_docs))
from deidentify.base import Document
from deidentify.taggers import FlairTagger
from deidentify.tokenizer import TokenizerFactory

text = (
    "Dit is stukje tekst met daarin de naam Jan Jansen. De patient J. Jansen (e: "
    "[email protected], t: 06-12345678) is 64 jaar oud en woonachtig in Utrecht. Hij werd op 10 "
    "oktober door arts Peter de Visser ontslagen van de kliniek van het UMCU."
)

# Wrap text in document
documents = [
    Document(name='doc_01', text=text)
]

# Select downloaded model
model = 'model_bilstmcrf_ons_fast-v0.2.0'

# Instantiate tokenizer
tokenizer = TokenizerFactory().tokenizer(corpus='ons', disable=("tagger", "ner"))

# Load tagger with a downloaded model file and tokenizer
tagger = FlairTagger(model=model, tokenizer=tokenizer, verbose=False)

# Annotate your documents
annotated_docs = tagger.annotate(documents)

from pprint import pprint

first_doc = annotated_docs[0]
pprint(first_doc.annotations)

from deidentify.util import mask_annotations
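# The snippet above imports mask_annotations but stops short of using it. A minimal
# continuation, assuming mask_annotations takes an annotated Document and returns a copy
# whose text has each annotation replaced by a placeholder such as [NAME] or [DATE]:
masked_doc = mask_annotations(first_doc)
print(masked_doc.text)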
    assert len(filtered_corpus.train) == 1
    assert filtered_corpus.train[0].to_plain_string() == 'this is should be included'
    assert len(filtered_corpus.dev) == 1
    assert filtered_corpus.dev[0].to_plain_string() == 'this is should be included'
    assert len(filtered_corpus.test) == 2
    assert filtered_corpus.test[0].to_plain_string() == 'this is should be included'
    assert filtered_corpus.test[1].to_plain_string() == 'and this as well'

    assert len(filtered_corpus.train_ignored) == 1
    assert filtered_corpus.train_ignored[0].to_plain_string() == '=== Answer: 123 ==='
    assert len(filtered_corpus.dev_ignored) == 1
    assert filtered_corpus.dev_ignored[0].to_plain_string() == '=== Answer: 456 ==='
    assert len(filtered_corpus.test_ignored) == 0


if __name__ == '__main__':
    corpus = CorpusLoader().load_corpus(DUMMY_CORPUS)
    tokenizer = TokenizerFactory().tokenizer('ons')
    docs = corpus.train
    sents, parsed_docs = flair_utils.standoff_to_flair_sents(docs, tokenizer)

    print(repr(docs[0].text))
    for sent in sents:
        print(' '.join([repr(token.text) for token in sent.tokens]))
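# For illustration only: the training scripts above reference an _ignore_sentence predicate
# that is not shown in these excerpts. Judging from the FilteredCorpus assertions above, it
# flags metadata markers such as '=== Answer: 123 ===' so that they are excluded from training.
# A minimal, hypothetical sketch of such a predicate (not the project's actual implementation):
import re

META_SENTENCE_PATTERN = re.compile(r'^===.*===$')


def _ignore_sentence(sent):
    """Return True for metadata sentences that should be kept out of training (hypothetical)."""
    return bool(META_SENTENCE_PATTERN.match(sent.to_plain_string().strip()))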
import spacy

from deidentify.base import Document
from deidentify.taggers import FlairTagger
from deidentify.tokenizer import TokenizerFactory


def anonymize(input_file):
    with open(input_file, "r", encoding="utf-8") as f:
        text = f.read()

    # Wrap text in document
    documents = [Document(name='doc_01', text=text)]

    # Select downloaded model
    model = 'models/model_bilstmcrf_ons_fast-v0.1.0/final-model.pt'

    nlp = spacy.load('de_core_news_sm')

    # Instantiate tokenizer
    tokenizer = TokenizerFactory().tokenizer(corpus='germeval', disable=("tagger", "ner"), model=nlp)

    # Load tagger with a downloaded model file and tokenizer
    tagger = FlairTagger(model=model, tokenizer=tokenizer, verbose=False)

    # Annotate your documents
    annotated_doc = tagger.annotate(documents)[0]

    # spaCy NER extraction
    ners = nlp(text)

    filtered_annotations = []

    # Dict mapping spaCy NER labels to the corresponding deidentify tags
    tag_dict = {
        "PER": "Name",
        "LOC": "Address",
        "ORG": "Organization_Company",
        "MISC": "Other"
    }

    # Add all spaCy-detected named entities to the list
    for ent in ners.ents:
        filtered_annotations.append({
            "text": ent.text,
            "start": ent.start_char,
            "end": ent.end_char,
            "tag": tag_dict[ent.label_]
        })

    for ann in annotated_doc.annotations:
        # discard names; they have a high likelihood of false positives since
        # nouns are capitalized in German, unlike in Dutch
        if ann.tag == "Name":
            continue

        # don't add the entity if it overlaps with spaCy's - spaCy makes fewer mistakes
        if any(ent.start_char <= ann.end <= ent.end_char for ent in ners.ents) or \
                any(ann.start <= ent.end_char <= ann.end for ent in ners.ents):
            continue

        filtered_annotations.append({
            "text": ann.text,
            "start": ann.start,
            "end": ann.end,
            "tag": ann.tag
        })

    filtered_annotations.sort(key=lambda x: x["start"])

    # mask_annotations is assumed to be defined or imported elsewhere; it is called here with the
    # raw text and a list of annotation dicts (note that this signature differs from the
    # Document-based masking helper used in the snippet further above).
    masked_output = mask_annotations(annotated_doc.text, filtered_annotations)
    print(masked_output)