Example #1
0
def test_concepts_performance(events_service, concepts_service, test_results):
    input_dir = Path(os.environ['BIOMEDICUS_TEST_DATA']) / 'concepts'
    recall = Accuracy(name='recall', mode='any', fields=['cui'])
    precision = Accuracy(name='precision', mode='any', fields=['cui'])
    with EventsClient(address=events_service) as client, \
            Pipeline(
                RemoteProcessor(processor_id='biomedicus-concepts', address=concepts_service),
                LocalProcessor(Metrics(recall, tested='umls_concepts', target='gold_concepts'),
                               component_id='metrics', client=client),
                LocalProcessor(Metrics(precision, tested='gold_concepts', target='umls_concepts'),
                               component_id='metrics_reverse', client=client)
            ) as pipeline:
        for test_file in input_dir.glob('**/*.json'):
            with JsonSerializer.file_to_event(test_file,
                                              client=client) as event:
                document = event.documents['plaintext']
                pipeline.run(document)

    print('Precision:', precision.value)
    print('Recall:', recall.value)
    timing_info = pipeline.processor_timer_stats()[0].timing_info
    test_results['Concepts'] = {
        'Precision': precision.value,
        'Recall': recall.value,
        'Remote Call Duration': str(timing_info['remote_call'].mean),
        'Process Method Duration': str(timing_info['process_method'].mean)
    }
    assert recall.value > 0.6
Example #2
0
def test_deserialization():
    f = Path(__file__).parent / 'event.json'
    event = JsonSerializer.file_to_event(f)
    assert event.event_id == '12345'
    assert event.metadata['foo'] == 'bar'
    d = event.documents['plaintext']
    assert d.text == "The quick brown fox jumps over the lazy dog."
    assert len(d.get_label_indices_info()) == 3
    assert d.get_label_index("one") == [
        GenericLabel(start_index=0, end_index=10, a="b"),
        GenericLabel(start_index=12, end_index=25, a="c"),
        GenericLabel(start_index=26, end_index=52, a="d"),
        GenericLabel(start_index=53, end_index=85, a="e"),
    ]
    assert d.get_label_index("two") == [
        GenericLabel(start_index=0, end_index=10, x=1),
        GenericLabel(start_index=3, end_index=9, x=3),
        GenericLabel(start_index=4, end_index=25, x=2),
        GenericLabel(start_index=5, end_index=25, x=4),
    ]
    assert d.get_label_index("three") == [
        GenericLabel(start_index=0, end_index=10, x=True),
        GenericLabel(start_index=3, end_index=9, x=True),
        GenericLabel(start_index=4, end_index=25, x=False),
        GenericLabel(start_index=5, end_index=25, x=False),
    ]
Example #3
0
def test_sentence_performance(events_service, sentences_service, test_results):
    input_dir = Path(os.environ['BIOMEDICUS_TEST_DATA']) / 'sentences'

    confusion = metrics.FirstTokenConfusion()
    with EventsClient(address=events_service) as client, Pipeline(
            RemoteProcessor(processor_id='biomedicus-sentences',
                            address=sentences_service),
            LocalProcessor(metrics.Metrics(confusion,
                                           tested='sentences',
                                           target='Sentence'),
                           component_id='metrics',
                           client=client)) as pipeline:
        for test_file in input_dir.glob('**/*.json'):
            with JsonSerializer.file_to_event(test_file,
                                              client=client) as event:
                document = event.documents['plaintext']
                results = pipeline.run(document)
                print('F1 for event - "{}": {:0.3f} - elapsed: {}'.format(
                    event.event_id,
                    results[1].results['first_token_confusion']['f1'],
                    results[0].timing_info['process_method']))

        print('Overall Precision:', confusion.precision)
        print('Overall Recall:', confusion.recall)
        print('Overall F1:', confusion.f1)
        pipeline.print_times()
        timing_info = pipeline.processor_timer_stats()[0].timing_info
        test_results['Sentences'] = {
            'Precision': confusion.precision,
            'Recall': confusion.recall,
            'F1': confusion.f1,
            'Remote Call Duration': str(timing_info['remote_call'].mean),
            'Process Method Duration': str(timing_info['process_method'].mean)
        }
        assert confusion.f1 > 0.85
def test_tnt_performance(events_service, pos_tags_service, test_results):
    input_dir = Path(os.environ['BIOMEDICUS_TEST_DATA']) / 'pos_tags'
    accuracy = Accuracy()
    with EventsClient(address=events_service) as client, Pipeline(
            RemoteProcessor(processor_id='biomedicus-tnt-tagger', address=pos_tags_service,
                            params={'token_index': 'gold_tags'}),
            LocalProcessor(Metrics(accuracy, tested='pos_tags', target='gold_tags'),
                           component_id='metrics', client=client)
    ) as pipeline:
        for test_file in input_dir.glob('**/*.json'):
            event = JsonSerializer.file_to_event(test_file, client=client)
            with event:
                document = event.documents['gold']
                results = pipeline.run(document)
                print('Accuracy for event - ', event.event_id, ':', results[1].results['accuracy'])

        print('Accuracy:', accuracy.value)
        pipeline.print_times()
        timing_info = pipeline.processor_timer_stats()[0].timing_info
        test_results['TnT Pos Tagger'] = {
            'Accuracy': accuracy.value,
            'Remote Call Duration': str(timing_info['remote_call'].mean),
            'Process Method Duration': str(timing_info['process_method'].mean)
        }
        assert accuracy.value > 0.9
def test_acronyms_performance(events_service, acronyms_service, test_results):
    input_dir = Path(os.environ['BIOMEDICUS_PHI_TEST_DATA']) / 'acronyms'
    top_score_accuracy = Accuracy(name='top_score_accuracy',
                                  fields=['expansion'])
    any_accuracy = Accuracy(name='any_accuracy',
                            mode='any',
                            fields=['expansion'])
    detection_recall = Accuracy(name='detection_recall',
                                mode='location',
                                fields=['expansion'])
    detection_precision = Accuracy(name='detection_precision',
                                   mode='location',
                                   fields=['expansion'])
    with EventsClient(address=events_service) as client, Pipeline(
            RemoteProcessor(processor_id='biomedicus-acronyms',
                            address=acronyms_service),
            LocalProcessor(Metrics(top_score_accuracy,
                                   detection_recall,
                                   tested='acronyms',
                                   target='gold_acronyms'),
                           component_id='top_score_metrics',
                           client=client),
            LocalProcessor(Metrics(detection_precision,
                                   tested='gold_acronyms',
                                   target='acronyms'),
                           component_id='top_score_reverse',
                           client=client),
            LocalProcessor(Metrics(any_accuracy,
                                   tested='all_acronym_senses',
                                   target='gold_acronyms'),
                           component_id='all_senses_metrics',
                           client=client)) as pipeline:
        for test_file in input_dir.glob('**/*.json'):
            with JsonSerializer.file_to_event(test_file,
                                              client=client) as event:
                document = event.documents['plaintext']
                pipeline.run(document)

        print('Top Sense Accuracy:', top_score_accuracy.value)
        print('Any Sense Accuracy:', any_accuracy.value)
        print('Detection Recall:', detection_recall.value)
        print('Detection Precision:', detection_precision.value)
        pipeline.print_times()
        timing_info = pipeline.processor_timer_stats(
            'biomedicus-acronyms').timing_info
        test_results['acronyms'] = {
            'Top sense accuracy': top_score_accuracy.value,
            'Any sense accuracy': any_accuracy.value,
            'Detection Recall': detection_recall.value,
            'Detection Precision': detection_precision.value,
            'Remote Call Duration': str(timing_info['remote_call'].mean),
            'Process Method Duration': str(timing_info['process_method'].mean)
        }
        assert top_score_accuracy.value > 0.4
        assert any_accuracy.value > 0.4
        assert detection_recall.value > 0.65
Example #6
0
def test_json_serializer():
    event = Event(event_id='1')
    event.metadata['foo'] = "bar"
    document = Document('plaintext', text='Some text.')
    event.add_document(document)
    one = mtap.GenericLabel(start_index=0, end_index=5, x=10)
    two = mtap.GenericLabel(start_index=6, end_index=10, x=15)
    document.add_labels('one', [one, two])
    document.add_labels('two', [
        mtap.GenericLabel(start_index=0, end_index=25, a='b', b=one),
        mtap.GenericLabel(start_index=26, end_index=42, a='c', b=two)
    ])
    document.add_labels('three', [
        mtap.GenericLabel(start_index=0, end_index=10, foo=True),
        mtap.GenericLabel(start_index=11, end_index=15, foo=False)
    ],
                        distinct=True)

    with TemporaryFile('w+') as tf:
        JsonSerializer.event_to_file(event, tf)
        tf.flush()
        tf.seek(0)
        e = JsonSerializer.file_to_event(tf)

    assert e.event_id == event.event_id
    assert e.metadata['foo'] == 'bar'
    d = e.documents['plaintext']
    assert d.text == document.text
    index_one = d.labels['one']
    assert index_one == [one, two]
    index_two = d.labels['two']
    assert index_two == [
        mtap.GenericLabel(start_index=0, end_index=25, a='b', b=one),
        mtap.GenericLabel(start_index=26, end_index=42, a='c', b=two)
    ]
    index_three = d.labels['three']
    assert index_three == [
        mtap.GenericLabel(start_index=0, end_index=10, foo=True),
        mtap.GenericLabel(start_index=11, end_index=15, foo=False)
    ]
Example #7
0
def test_normalization(events_service, normalization_processor):
    with EventsClient(address=events_service) as client, \
            Pipeline(RemoteProcessor(processor_id='biomedicus_normalizer',
                                     address=normalization_processor)) as pipeline, \
            JsonSerializer.file_to_event(Path(__file__).parent / '97_95.json',
                                         client=client) as event:
        document = event.documents['plaintext']
        pipeline.run(document)
        for norm_form in document.get_label_index('norm_forms'):
            if norm_form.text == "according":
                assert norm_form.norm == "accord"
            if norm_form.text == "expressing":
                assert norm_form.norm == "express"
            if norm_form.text == "receiving":
                assert norm_form.norm == "receive"
            if norm_form.text == "days":
                assert norm_form.norm == "day"
Example #8
0
def test_json_serializer():
    event = Event(event_id='1')
    event.metadata['foo'] = "bar"
    document = Document('plaintext', text='Some text.')
    event.add_document(document)
    document.add_labels('one', [
        mtap.GenericLabel(start_index=0, end_index=5, x=10),
        mtap.GenericLabel(start_index=6, end_index=10, x=15)
    ])
    document.add_labels('two', [
        mtap.GenericLabel(start_index=0, end_index=25, a='b'),
        mtap.GenericLabel(start_index=26, end_index=42, a='c')
    ])
    document.add_labels('three', [
        mtap.GenericLabel(start_index=0, end_index=10, foo=True),
        mtap.GenericLabel(start_index=11, end_index=15, foo=False)
    ],
                        distinct=True)

    with TemporaryFile('w+') as tf:
        JsonSerializer.event_to_file(event, tf)
        tf.flush()
        tf.seek(0)
        o = json.load(tf)

    assert o['event_id'] == '1'
    assert o['metadata']['foo'] == 'bar'
    d = o['documents']['plaintext']
    assert d['text'] == 'Some text.'
    assert len(d['label_indices']) == 3
    assert d['label_indices']['one'] == {
        'json_labels': [{
            'start_index': 0,
            'end_index': 5,
            'x': 10
        }, {
            'start_index': 6,
            'end_index': 10,
            'x': 15
        }],
        'distinct':
        False
    }
    assert d['label_indices']['two'] == {
        'json_labels': [{
            'start_index': 0,
            'end_index': 25,
            'a': 'b'
        }, {
            'start_index': 26,
            'end_index': 42,
            'a': 'c'
        }],
        'distinct':
        False
    }
    assert d['label_indices']['three'] == {
        'json_labels': [{
            'start_index': 0,
            'end_index': 10,
            'foo': True
        }, {
            'start_index': 11,
            'end_index': 15,
            'foo': False
        }],
        'distinct':
        True
    }