Ejemplo n.º 1
0
def test_offset_mapping_matches_pos_mapped_manually():
    """Spot-check two DATE annotations against hand-picked offset_mapping keys.

    For the annotations keyed '87' and '2404', the raw begin/end positions
    must match the literal strings below and the mapped positions must equal
    the offset_mapping values at the keys chosen by hand in this test.
    """
    ingest_file = 'tests/data/i2b2_2016_track-1_reference/0005_gs.xml'
    _, offset_mapping = text_extraction.extract_chars(
        ingest_file,
        namespaces={},
        document_data={'cdata_xpath': './TEXT'},
        skip_chars=r'[\s]')
    strict_starts = text_extraction.extract_annotations_xml(
        ingest_file,
        offset_mapping=offset_mapping,
        annotation_path='./TAGS/DATE',
        tag_name='DateTime',
        begin_attribute='start',
        end_attribute='end')
    ## First annotation:  raw span 87-97
    first = strict_starts['87'][0]
    assert first['begin_pos'] == '87'
    assert first['begin_pos_mapped'] == offset_mapping['87']
    assert first['end_pos'] == '97'
    assert first['end_pos_mapped'] == offset_mapping['97']
    ## Second annotation:  raw span 2404-2410.  The mapped end is compared
    ## against the entry for '2409' (not '2410'), as picked by hand.
    second = strict_starts['2404'][0]
    assert second['begin_pos'] == '2404'
    assert second['begin_pos_mapped'] == offset_mapping['2404']
    assert second['end_pos'] == '2410'
    assert second['end_pos_mapped'] == offset_mapping['2409']
Ejemplo n.º 2
0
def test_offset_mapping_matches_pos_mapped_automatically():
    """Check every DATE annotation's mapped positions against offset_mapping.

    For the first annotation stored under each start key, the test verifies
    that the key equals the raw begin position and that the mapped begin/end
    positions equal the offset_mapping entry found by walking from the raw
    position to the nearest entry that is not None (forward for the begin
    position, backward for the end position).
    """
    ingest_file = 'tests/data/i2b2_2016_track-1_reference/0005_gs.xml'
    document_data = dict(cdata_xpath='./TEXT')
    raw_content, offset_mapping = text_extraction.extract_chars(
        ingest_file,
        namespaces={},
        document_data=document_data,
        skip_chars=r'[\s]')
    strict_starts = text_extraction.extract_annotations_xml(
        ingest_file,
        offset_mapping=offset_mapping,
        annotation_path='./TAGS/DATE',
        tag_name='DateTime',
        begin_attribute='start',
        end_attribute='end')
    for start_key in strict_starts:
        annotation = strict_starts[start_key][0]
        begin_pos = annotation['begin_pos']
        begin_pos_mapped = annotation['begin_pos_mapped']
        end_pos = annotation['end_pos']
        end_pos_mapped = annotation['end_pos_mapped']
        ## dictionary key is set to begin_pos
        assert start_key == begin_pos
        ## mapping works for begin position
        assert begin_pos != begin_pos_mapped
        ## Entries that are None have no mapped offset, so step forward
        ## to the next position that does
        while offset_mapping[begin_pos] is None:
            begin_pos = str(int(begin_pos) + 1)
        assert begin_pos_mapped == offset_mapping[begin_pos]
        ## mapping works for end position
        assert end_pos != end_pos_mapped
        ## Same idea for the end position, but stepping backward
        while offset_mapping[end_pos] is None:
            end_pos = str(int(end_pos) - 1)
        assert end_pos_mapped == offset_mapping[end_pos]
Ejemplo n.º 3
0
def test_empty_extraction_of_doc_content_from_0016_gs():
    """Expect an empty offset mapping when the content XPath is '/dev/null'."""
    ## Point the extractor at a path that doesn't exist so that the
    ## returned mapping comes back empty
    _, offset_mapping = text_extraction.extract_chars(
        'tests/data/i2b2_2016_track-1_reference/0016_gs.xml',
        namespaces={},
        document_data={'cdata_xpath': '/dev/null'},
        skip_chars=r'[\s]')
    assert offset_mapping == {}
Ejemplo n.º 4
0
def test_extracting_doc_content_from_0016_gs_skip_zpipe_char():
    """Check the first seven offset_mapping entries with skip_chars '[z|]'.

    Raw positions 3 through 5 are expected to map to None, with the mapped
    numbering resuming at '3' for raw position 6.
    """
    _, offset_mapping = text_extraction.extract_chars(
        'tests/data/i2b2_2016_track-1_reference/0016_gs.xml',
        namespaces={},
        document_data={'cdata_xpath': './TEXT'},
        skip_chars='[z|]')
    expected_output = {
        '0': '0',
        '1': '1',
        '2': '2',
        '3': None,
        '4': None,
        '5': None,
        '6': '3',
    }
    ## Only the leading entries are compared, not the whole mapping
    for raw_index, mapped_index in expected_output.items():
        assert offset_mapping[raw_index] == mapped_index
Ejemplo n.º 5
0
def test_extracting_doc_content_from_995723_sentences_xmi():
    """Expect an identity offset mapping over positions '0' through '7'.

    The document text is read from the sofaString attribute of the
    ./cas:Sofa element of the XMI fixture.
    """
    cas_namespaces = {'cas': "http:///uima/cas.ecore"}
    sofa_data = {'tag_xpath': './cas:Sofa', 'content_attribute': 'sofaString'}
    _, offset_mapping = text_extraction.extract_chars(
        'tests/data/sentences/995723.sentences.xmi',
        namespaces=cas_namespaces,
        document_data=sofa_data,
        skip_chars=r'[\s]')
    ## Every one of the eight positions maps to itself
    identity_mapping = {str(pos): str(pos) for pos in range(8)}
    assert offset_mapping == identity_mapping
Ejemplo n.º 6
0
def test_brat_standoff_extraction_with_attributes():
    """Extract 'T' annotations as Problem tags with optional attributes.

    Verifies the span and raw text of the annotation keyed at '474'.
    """
    ingest_file = 'tests/data/brat_reference/problems_and_allergens.ann'
    _, offset_mapping = text_extraction.extract_chars(
        ingest_file,
        namespaces={},
        document_data={'format': '.ann .txt'},
        skip_chars=r'[\s]')
    attribute_names = ['Conditional',
                       'Generic',
                       'Historical',
                       'Negated',
                       'NotPatient',
                       'Uncertain']
    strict_starts = text_extraction.extract_annotations_brat_standoff(
        ingest_file,
        offset_mapping=offset_mapping,
        type_prefix='T',
        tag_name='Problem',
        optional_attributes=attribute_names)
    ## Annotation starting at raw offset 474
    problem = strict_starts['474'][0]
    assert problem['begin_pos'] == '474'
    assert problem['end_pos'] == '493'
    assert problem['raw_text'] == 'shortness of breath'
Ejemplo n.º 7
0
def test_brat_standoff_extraction():
    """Extract 'T' annotations from ibm.ann as Organization tags.

    Verifies spans and/or raw text for the annotations keyed at '0', '45',
    and '52'.
    """
    ingest_file = 'tests/data/brat_reference/ibm.ann'
    _, offset_mapping = text_extraction.extract_chars(
        ingest_file,
        namespaces={},
        document_data={'format': '.ann .txt'},
        skip_chars=r'[\s]')
    strict_starts = text_extraction.extract_annotations_brat_standoff(
        ingest_file,
        offset_mapping=offset_mapping,
        type_prefix='T',
        tag_name='Organization')
    ## Annotation starting at raw offset 0
    full_name = strict_starts['0'][0]
    assert full_name['begin_pos'] == '0'
    assert full_name['end_pos'] == '43'
    assert full_name['raw_text'] == 'International Business Machines Corporation'
    ## Annotation starting at raw offset 45
    short_name = strict_starts['45'][0]
    assert short_name['begin_pos'] == '45'
    assert short_name['end_pos'] == '48'
    ## Annotation starting at raw offset 52
    assert strict_starts['52'][0]['raw_text'] == 'Big Blue'
Ejemplo n.º 8
0
def count_chars_profile(reference_ns,
                        reference_dd,
                        reference_folder,
                        test_ns,
                        test_dd,
                        test_folder,
                        args,
                        file_prefix='/',
                        file_suffix='.xml'):
    """
    Extract a character profile for each document and corpus as a whole.

    Reference and test files are paired via collect_files.  For each
    reference file (in sorted order), text_extraction.extract_chars is run
    on it and, when a matching test file exists, on that test file too.
    The extracted values and generated output paths are not used further
    within this function.

    Args:
        reference_ns: namespaces passed to extract_chars for reference files.
        reference_dd: document_data passed to extract_chars for reference
            files.
        reference_folder: folder containing the reference files.
        test_ns: namespaces passed to extract_chars for test files.
        test_dd: document_data passed to extract_chars for test files.
        test_folder: folder containing the test files.
        args: object providing skip_missing_files, progressbar_file,
            progressbar_disabled, reference_out, and test_out.
        file_prefix: forwarded to collect_files.
        file_suffix: forwarded to collect_files.

    Returns:
        None in every case.  The function returns early when collect_files
        fails or when no matching documents are found.
    """
    log.debug("Entering '{}'".format(sys._getframe().f_code.co_name))
    try:
        match_count, file_mapping = collect_files(reference_folder,
                                                  test_folder, file_prefix,
                                                  file_suffix,
                                                  args.skip_missing_files)
    except Exception:
        e = sys.exc_info()[0]
        log.error('Uncaught exception in collect_files:  {}'.format(e))
        ## match_count and file_mapping were never bound, so there is
        ## nothing to process.  Bail out instead of failing on an
        ## unbound name below.
        return None
    ##
    if match_count == 0:
        ## Empty dictionaries evaluate to False so truthiness can tell us
        ## if any reference documents exist
        if file_mapping:
            print('ERROR:  No documents found in test directory:  {}'.format(
                test_folder))
        else:
            print('ERROR:  No documents found in reference directory:  {}'.
                  format(reference_folder))
        return None
    ##
    for reference_filename in tqdm(sorted(file_mapping.keys()),
                                   file=args.progressbar_file,
                                   disable=args.progressbar_disabled):
        ##
        reference_out_file = generate_out_file(args.reference_out,
                                               reference_filename)
        ##
        try:
            reference_chars = \
              text_extraction.extract_chars( '{}/{}'.format( reference_folder ,
                                                             reference_filename ) ,
                                             namespaces = reference_ns ,
                                             document_data = reference_dd )
        except Exception:
            ## Best effort:  log the failure and keep going
            e = sys.exc_info()[0]
            log.error('Uncaught exception in extract_chars:  {}'.format(e))
        test_filename = file_mapping[reference_filename]
        if test_filename is None:
            ## No test file is paired with this reference file
            test_chars = {}
        else:
            ##
            test_out_file = generate_out_file(args.test_out, test_filename)
            ##
            try:
                test_full_path = '{}/{}'.format(test_folder, test_filename)
                test_chars = \
                  text_extraction.extract_chars( test_full_path ,
                                                 namespaces = test_ns ,
                                                 document_data = test_dd )
            except Exception:
                ## Best effort:  log the failure and keep going
                e = sys.exc_info()[0]
                log.error('Uncaught exception in extract_chars:  {}'.format(e))
        ##
    log.debug("-- Leaving '{}'".format(sys._getframe().f_code.co_name))