def test_concepts_performance(events_service, concepts_service, test_results):
    """Runs the remote concepts processor over every gold-annotated JSON
    document under ``$BIOMEDICUS_TEST_DATA/concepts`` and records CUI-level
    precision/recall plus call timings into ``test_results``.
    """
    data_dir = Path(os.environ['BIOMEDICUS_TEST_DATA']) / 'concepts'
    # 'any' mode: a detected concept counts if any gold concept at the same
    # location shares its CUI (and vice versa for the reverse direction).
    recall_metric = Accuracy(name='recall', mode='any', fields=['cui'])
    precision_metric = Accuracy(name='precision', mode='any', fields=['cui'])
    with EventsClient(address=events_service) as client, \
            Pipeline(
                RemoteProcessor(processor_id='biomedicus-concepts',
                                address=concepts_service),
                # Forward direction: how many gold concepts were found.
                LocalProcessor(Metrics(recall_metric,
                                       tested='umls_concepts',
                                       target='gold_concepts'),
                               component_id='metrics', client=client),
                # Reverse direction: how many detected concepts were correct.
                LocalProcessor(Metrics(precision_metric,
                                       tested='gold_concepts',
                                       target='umls_concepts'),
                               component_id='metrics_reverse', client=client)
            ) as pipeline:
        for json_path in data_dir.glob('**/*.json'):
            with JsonSerializer.file_to_event(json_path, client=client) as event:
                pipeline.run(event.documents['plaintext'])
        print('Precision:', precision_metric.value)
        print('Recall:', recall_metric.value)
        timing_info = pipeline.processor_timer_stats()[0].timing_info
        test_results['Concepts'] = {
            'Precision': precision_metric.value,
            'Recall': recall_metric.value,
            'Remote Call Duration': str(timing_info['remote_call'].mean),
            'Process Method Duration': str(timing_info['process_method'].mean)
        }
        assert recall_metric.value > 0.6
def test_deserialization():
    """Deserializes the checked-in ``event.json`` fixture and verifies the
    event id, metadata, document text, and all three label indices
    round-trip exactly.
    """
    fixture = Path(__file__).parent / 'event.json'
    # Fix: use the event as a context manager so its backing resources are
    # released even when an assertion fails (the sibling tests all do this).
    with JsonSerializer.file_to_event(fixture) as event:
        assert event.event_id == '12345'
        assert event.metadata['foo'] == 'bar'
        d = event.documents['plaintext']
        assert d.text == "The quick brown fox jumps over the lazy dog."
        assert len(d.get_label_indices_info()) == 3
        assert d.get_label_index("one") == [
            GenericLabel(start_index=0, end_index=10, a="b"),
            GenericLabel(start_index=12, end_index=25, a="c"),
            GenericLabel(start_index=26, end_index=52, a="d"),
            GenericLabel(start_index=53, end_index=85, a="e"),
        ]
        assert d.get_label_index("two") == [
            GenericLabel(start_index=0, end_index=10, x=1),
            GenericLabel(start_index=3, end_index=9, x=3),
            GenericLabel(start_index=4, end_index=25, x=2),
            GenericLabel(start_index=5, end_index=25, x=4),
        ]
        assert d.get_label_index("three") == [
            GenericLabel(start_index=0, end_index=10, x=True),
            GenericLabel(start_index=3, end_index=9, x=True),
            GenericLabel(start_index=4, end_index=25, x=False),
            GenericLabel(start_index=5, end_index=25, x=False),
        ]
def test_sentence_performance(events_service, sentences_service, test_results):
    """Runs the remote sentence detector over every gold-annotated JSON file
    under ``$BIOMEDICUS_TEST_DATA/sentences`` and records first-token
    precision/recall/F1 plus call timings into ``test_results``.
    """
    data_dir = Path(os.environ['BIOMEDICUS_TEST_DATA']) / 'sentences'
    confusion = metrics.FirstTokenConfusion()
    with EventsClient(address=events_service) as client, Pipeline(
            RemoteProcessor(processor_id='biomedicus-sentences',
                            address=sentences_service),
            LocalProcessor(metrics.Metrics(confusion, tested='sentences',
                                           target='Sentence'),
                           component_id='metrics', client=client)) as pipeline:
        for json_path in data_dir.glob('**/*.json'):
            with JsonSerializer.file_to_event(json_path, client=client) as event:
                run_results = pipeline.run(event.documents['plaintext'])
                # run_results[0] is the remote processor, [1] the metrics step.
                per_doc_f1 = run_results[1].results['first_token_confusion']['f1']
                elapsed = run_results[0].timing_info['process_method']
                print('F1 for event - "{}": {:0.3f} - elapsed: {}'.format(
                    event.event_id, per_doc_f1, elapsed))
        print('Overall Precision:', confusion.precision)
        print('Overall Recall:', confusion.recall)
        print('Overall F1:', confusion.f1)
        pipeline.print_times()
        timing_info = pipeline.processor_timer_stats()[0].timing_info
        test_results['Sentences'] = {
            'Precision': confusion.precision,
            'Recall': confusion.recall,
            'F1': confusion.f1,
            'Remote Call Duration': str(timing_info['remote_call'].mean),
            'Process Method Duration': str(timing_info['process_method'].mean)
        }
        assert confusion.f1 > 0.85
def test_tnt_performance(events_service, pos_tags_service, test_results):
    """Runs the remote TnT POS tagger over every gold-annotated JSON file
    under ``$BIOMEDICUS_TEST_DATA/pos_tags`` and records tag accuracy plus
    call timings into ``test_results``.
    """
    data_dir = Path(os.environ['BIOMEDICUS_TEST_DATA']) / 'pos_tags'
    tag_accuracy = Accuracy()
    with EventsClient(address=events_service) as client, Pipeline(
            RemoteProcessor(processor_id='biomedicus-tnt-tagger',
                            address=pos_tags_service,
                            params={'token_index': 'gold_tags'}),
            LocalProcessor(Metrics(tag_accuracy, tested='pos_tags',
                                   target='gold_tags'),
                           component_id='metrics', client=client)
    ) as pipeline:
        for json_path in data_dir.glob('**/*.json'):
            with JsonSerializer.file_to_event(json_path, client=client) as event:
                run_results = pipeline.run(event.documents['gold'])
                print('Accuracy for event - ', event.event_id, ':',
                      run_results[1].results['accuracy'])
        print('Accuracy:', tag_accuracy.value)
        pipeline.print_times()
        timing_info = pipeline.processor_timer_stats()[0].timing_info
        test_results['TnT Pos Tagger'] = {
            'Accuracy': tag_accuracy.value,
            'Remote Call Duration': str(timing_info['remote_call'].mean),
            'Process Method Duration': str(timing_info['process_method'].mean)
        }
        assert tag_accuracy.value > 0.9
def test_acronyms_performance(events_service, acronyms_service, test_results):
    """Runs the remote acronym expander over every gold-annotated JSON file
    under ``$BIOMEDICUS_PHI_TEST_DATA/acronyms`` and records top-sense /
    any-sense accuracy, detection precision/recall, and call timings into
    ``test_results``.
    """
    data_dir = Path(os.environ['BIOMEDICUS_PHI_TEST_DATA']) / 'acronyms'
    top_score_accuracy = Accuracy(name='top_score_accuracy',
                                  fields=['expansion'])
    any_accuracy = Accuracy(name='any_accuracy', mode='any',
                            fields=['expansion'])
    # 'location' mode scores detection only (did we label the right span),
    # ignoring which expansion was chosen.
    detection_recall = Accuracy(name='detection_recall', mode='location',
                                fields=['expansion'])
    detection_precision = Accuracy(name='detection_precision',
                                   mode='location', fields=['expansion'])
    with EventsClient(address=events_service) as client, Pipeline(
            RemoteProcessor(processor_id='biomedicus-acronyms',
                            address=acronyms_service),
            LocalProcessor(Metrics(top_score_accuracy, detection_recall,
                                   tested='acronyms',
                                   target='gold_acronyms'),
                           component_id='top_score_metrics', client=client),
            LocalProcessor(Metrics(detection_precision,
                                   tested='gold_acronyms',
                                   target='acronyms'),
                           component_id='top_score_reverse', client=client),
            LocalProcessor(Metrics(any_accuracy,
                                   tested='all_acronym_senses',
                                   target='gold_acronyms'),
                           component_id='all_senses_metrics',
                           client=client)) as pipeline:
        for json_path in data_dir.glob('**/*.json'):
            with JsonSerializer.file_to_event(json_path, client=client) as event:
                pipeline.run(event.documents['plaintext'])
        print('Top Sense Accuracy:', top_score_accuracy.value)
        print('Any Sense Accuracy:', any_accuracy.value)
        print('Detection Recall:', detection_recall.value)
        print('Detection Precision:', detection_precision.value)
        pipeline.print_times()
        timing_info = pipeline.processor_timer_stats(
            'biomedicus-acronyms').timing_info
        test_results['acronyms'] = {
            'Top sense accuracy': top_score_accuracy.value,
            'Any sense accuracy': any_accuracy.value,
            'Detection Recall': detection_recall.value,
            'Detection Precision': detection_precision.value,
            'Remote Call Duration': str(timing_info['remote_call'].mean),
            'Process Method Duration': str(timing_info['process_method'].mean)
        }
        assert top_score_accuracy.value > 0.4
        assert any_accuracy.value > 0.4
        assert detection_recall.value > 0.65
def test_json_serializer():
    """Serializes an in-memory event to JSON and deserializes it back,
    asserting event id, metadata, document text, and all three label
    indices (including label-valued reference fields) survive the trip.
    """
    event = Event(event_id='1')
    event.metadata['foo'] = "bar"
    document = Document('plaintext', text='Some text.')
    event.add_document(document)
    first = mtap.GenericLabel(start_index=0, end_index=5, x=10)
    second = mtap.GenericLabel(start_index=6, end_index=10, x=15)
    document.add_labels('one', [first, second])
    # 'two' contains labels that reference other labels via field `b`.
    document.add_labels('two', [
        mtap.GenericLabel(start_index=0, end_index=25, a='b', b=first),
        mtap.GenericLabel(start_index=26, end_index=42, a='c', b=second)
    ])
    document.add_labels('three', [
        mtap.GenericLabel(start_index=0, end_index=10, foo=True),
        mtap.GenericLabel(start_index=11, end_index=15, foo=False)
    ], distinct=True)
    with TemporaryFile('w+') as tf:
        JsonSerializer.event_to_file(event, tf)
        tf.flush()
        tf.seek(0)
        round_tripped = JsonSerializer.file_to_event(tf)
    assert round_tripped.event_id == event.event_id
    assert round_tripped.metadata['foo'] == 'bar'
    doc = round_tripped.documents['plaintext']
    assert doc.text == document.text
    assert doc.labels['one'] == [first, second]
    assert doc.labels['two'] == [
        mtap.GenericLabel(start_index=0, end_index=25, a='b', b=first),
        mtap.GenericLabel(start_index=26, end_index=42, a='c', b=second)
    ]
    assert doc.labels['three'] == [
        mtap.GenericLabel(start_index=0, end_index=10, foo=True),
        mtap.GenericLabel(start_index=11, end_index=15, foo=False)
    ]
def test_normalization(events_service, normalization_processor):
    """Runs the remote normalizer over the ``97_95.json`` fixture document
    and spot-checks the normalized form of a few known tokens.
    """
    # Known token -> expected normalized form; tokens not listed here are
    # not checked, exactly as in the original per-token if chain.
    expected = {
        "according": "accord",
        "expressing": "express",
        "receiving": "receive",
        "days": "day",
    }
    with EventsClient(address=events_service) as client, \
            Pipeline(
                RemoteProcessor(processor_id='biomedicus_normalizer',
                                address=normalization_processor)
            ) as pipeline, \
            JsonSerializer.file_to_event(
                Path(__file__).parent / '97_95.json',
                client=client) as event:
        document = event.documents['plaintext']
        pipeline.run(document)
        for norm_form in document.get_label_index('norm_forms'):
            if norm_form.text in expected:
                assert norm_form.norm == expected[norm_form.text]
def test_json_serializer():
    """Serializes an in-memory event and inspects the raw JSON produced,
    asserting the exact on-disk structure of the event, its metadata, the
    document text, and each label index (including the ``distinct`` flag).
    """
    event = Event(event_id='1')
    event.metadata['foo'] = "bar"
    document = Document('plaintext', text='Some text.')
    event.add_document(document)
    document.add_labels('one', [
        mtap.GenericLabel(start_index=0, end_index=5, x=10),
        mtap.GenericLabel(start_index=6, end_index=10, x=15)
    ])
    document.add_labels('two', [
        mtap.GenericLabel(start_index=0, end_index=25, a='b'),
        mtap.GenericLabel(start_index=26, end_index=42, a='c')
    ])
    document.add_labels('three', [
        mtap.GenericLabel(start_index=0, end_index=10, foo=True),
        mtap.GenericLabel(start_index=11, end_index=15, foo=False)
    ], distinct=True)
    with TemporaryFile('w+') as tf:
        JsonSerializer.event_to_file(event, tf)
        tf.flush()
        tf.seek(0)
        serialized = json.load(tf)
    assert serialized['event_id'] == '1'
    assert serialized['metadata']['foo'] == 'bar'
    doc_json = serialized['documents']['plaintext']
    assert doc_json['text'] == 'Some text.'
    indices = doc_json['label_indices']
    assert len(indices) == 3
    expected_one = {
        'json_labels': [
            {'start_index': 0, 'end_index': 5, 'x': 10},
            {'start_index': 6, 'end_index': 10, 'x': 15},
        ],
        'distinct': False
    }
    expected_two = {
        'json_labels': [
            {'start_index': 0, 'end_index': 25, 'a': 'b'},
            {'start_index': 26, 'end_index': 42, 'a': 'c'},
        ],
        'distinct': False
    }
    expected_three = {
        'json_labels': [
            {'start_index': 0, 'end_index': 10, 'foo': True},
            {'start_index': 11, 'end_index': 15, 'foo': False},
        ],
        'distinct': True
    }
    assert indices['one'] == expected_one
    assert indices['two'] == expected_two
    assert indices['three'] == expected_three