def test_offset_mapping_matches_pos_mapped_manually():
    ingest_file = 'tests/data/i2b2_2016_track-1_reference/0005_gs.xml'
    document_data = dict( cdata_xpath = './TEXT' )
    raw_content , offset_mapping = \
      text_extraction.extract_chars( ingest_file ,
                                     namespaces = {} ,
                                     document_data = document_data ,
                                     skip_chars = r'[\s]' )
    strict_starts = \
      text_extraction.extract_annotations_xml( ingest_file ,
                                               offset_mapping = offset_mapping ,
                                               annotation_path = './TAGS/DATE' ,
                                               tag_name = 'DateTime' ,
                                               begin_attribute = 'start' ,
                                               end_attribute = 'end' )
    ##
    assert strict_starts[ '87' ][ 0 ][ 'begin_pos' ] == '87'
    assert strict_starts[ '87' ][ 0 ][ 'begin_pos_mapped' ] == offset_mapping[ '87' ]
    assert strict_starts[ '87' ][ 0 ][ 'end_pos' ] == '97'
    assert strict_starts[ '87' ][ 0 ][ 'end_pos_mapped' ] == offset_mapping[ '97' ]
    ##
    assert strict_starts[ '2404' ][ 0 ][ 'begin_pos' ] == '2404'
    assert strict_starts[ '2404' ][ 0 ][ 'begin_pos_mapped' ] == offset_mapping[ '2404' ]
    assert strict_starts[ '2404' ][ 0 ][ 'end_pos' ] == '2410'
    assert strict_starts[ '2404' ][ 0 ][ 'end_pos_mapped' ] == offset_mapping[ '2409' ]
def test_offset_mapping_matches_pos_mapped_automatically():
    ingest_file = 'tests/data/i2b2_2016_track-1_reference/0005_gs.xml'
    document_data = dict( cdata_xpath = './TEXT' )
    raw_content , offset_mapping = \
      text_extraction.extract_chars( ingest_file ,
                                     namespaces = {} ,
                                     document_data = document_data ,
                                     skip_chars = r'[\s]' )
    strict_starts = \
      text_extraction.extract_annotations_xml( ingest_file ,
                                               offset_mapping = offset_mapping ,
                                               annotation_path = './TAGS/DATE' ,
                                               tag_name = 'DateTime' ,
                                               begin_attribute = 'start' ,
                                               end_attribute = 'end' )
    for start_key in strict_starts:
        begin_pos = strict_starts[ start_key ][ 0 ][ 'begin_pos' ]
        begin_pos_mapped = strict_starts[ start_key ][ 0 ][ 'begin_pos_mapped' ]
        end_pos = strict_starts[ start_key ][ 0 ][ 'end_pos' ]
        end_pos_mapped = strict_starts[ start_key ][ 0 ][ 'end_pos_mapped' ]
        ## dictionary key is set to begin_pos
        assert start_key == begin_pos
        ## mapping works for begin position
        assert begin_pos != begin_pos_mapped
        while( offset_mapping[ begin_pos ] is None ):
            begin_pos = str( int( begin_pos ) + 1 )
        assert begin_pos_mapped == offset_mapping[ begin_pos ]
        ## mapping works for end position
        assert end_pos != end_pos_mapped
        while( offset_mapping[ end_pos ] is None ):
            end_pos = str( int( end_pos ) - 1 )
        assert end_pos_mapped == offset_mapping[ end_pos ]
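## Reader's note (derived from the assertions in the two tests above, not
## from the text_extraction module itself):  extract_chars() appears to
## return an offset dictionary keyed by string positions in the raw
## document, where positions matching skip_chars map to None and all other
## positions map to their index in the skip-stripped text.  Annotation
## begin offsets that land on a skipped character seem to be resolved by
## walking forward to the next kept character, while end offsets walk
## backward -- which is why the manual test compares end_pos '2410'
## against offset_mapping[ '2409' ].  A minimal, hypothetical helper
## sketch (not used by the tests above) showing that same resolution:
def _resolve_mapped_offset( offset_mapping , pos , step ):
    ## Walk by `step` (+1 for begin offsets, -1 for end offsets) until a
    ## non-None mapped position is found, then return the mapped value
    while( offset_mapping[ pos ] is None ):
        pos = str( int( pos ) + step )
    return offset_mapping[ pos ]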
def test_empty_extraction_of_doc_content_from_0016_gs():
    ingest_file = 'tests/data/i2b2_2016_track-1_reference/0016_gs.xml'
    ## Look for a path that doesn't exist so that we get an empty return
    test_dd = dict( cdata_xpath = '/dev/null' )
    raw_content , offset_mapping = \
      text_extraction.extract_chars( ingest_file ,
                                     namespaces = {} ,
                                     document_data = test_dd ,
                                     skip_chars = r'[\s]' )
    expected_output = {}
    assert offset_mapping == expected_output
def test_extracting_doc_content_from_0016_gs_skip_zpipe_char():
    ingest_file = 'tests/data/i2b2_2016_track-1_reference/0016_gs.xml'
    test_dd = dict( cdata_xpath = './TEXT' )
    raw_content , offset_mapping = \
      text_extraction.extract_chars( ingest_file ,
                                     namespaces = {} ,
                                     document_data = test_dd ,
                                     skip_chars = '[z|]' )
    expected_output = { '0' : '0' ,
                        '1' : '1' ,
                        '2' : '2' ,
                        '3' : None ,
                        '4' : None ,
                        '5' : None ,
                        '6' : '3' }
    for index in [ '0' , '1' , '2' , '3' , '4' , '5' , '6' ]:
        assert offset_mapping[ index ] == expected_output[ index ]
def test_extracting_doc_content_from_995723_sentences_xmi():
    ingest_file = 'tests/data/sentences/995723.sentences.xmi'
    test_dd = dict( tag_xpath = './cas:Sofa' ,
                    content_attribute = 'sofaString' )
    raw_content , offset_mapping = \
      text_extraction.extract_chars( ingest_file ,
                                     namespaces = { 'cas' : 'http:///uima/cas.ecore' } ,
                                     document_data = test_dd ,
                                     skip_chars = r'[\s]' )
    expected_output = { '0' : '0' ,
                        '1' : '1' ,
                        '2' : '2' ,
                        '3' : '3' ,
                        '4' : '4' ,
                        '5' : '5' ,
                        '6' : '6' ,
                        '7' : '7' }
    assert offset_mapping == expected_output
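## Reader's note (inferred from the three extraction tests above):  the
## document_data argument appears to support two addressing styles -- a
## cdata_xpath for text stored as character data under an element (the
## i2b2 './TEXT' node) and a tag_xpath plus content_attribute pair for
## text stored in an attribute (the sofaString attribute of a UIMA CAS
## XMI sofa).  A non-matching xpath such as '/dev/null' simply yields an
## empty offset mapping rather than raising.  Hypothetical configuration
## sketches, mirroring the dictionaries used above:
##
##     i2b2_dd = dict( cdata_xpath = './TEXT' )
##     xmi_dd = dict( tag_xpath = './cas:Sofa' ,
##                    content_attribute = 'sofaString' )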
def test_brat_standoff_extraction_with_attributes():
    ingest_file = 'tests/data/brat_reference/problems_and_allergens.ann'
    document_data = dict( format = '.ann .txt' )
    raw_content , offset_mapping = \
      text_extraction.extract_chars( ingest_file ,
                                     namespaces = {} ,
                                     document_data = document_data ,
                                     skip_chars = r'[\s]' )
    strict_starts = \
      text_extraction.extract_annotations_brat_standoff( ingest_file ,
                                                         offset_mapping = offset_mapping ,
                                                         type_prefix = 'T' ,
                                                         tag_name = 'Problem' ,
                                                         optional_attributes = [ 'Conditional' ,
                                                                                 'Generic' ,
                                                                                 'Historical' ,
                                                                                 'Negated' ,
                                                                                 'NotPatient' ,
                                                                                 'Uncertain' ] )
    ##
    assert strict_starts[ '474' ][ 0 ][ 'begin_pos' ] == '474'
    assert strict_starts[ '474' ][ 0 ][ 'end_pos' ] == '493'
    assert strict_starts[ '474' ][ 0 ][ 'raw_text' ] == 'shortness of breath'
def test_brat_standoff_extraction():
    ingest_file = 'tests/data/brat_reference/ibm.ann'
    document_data = dict( format = '.ann .txt' )
    raw_content , offset_mapping = \
      text_extraction.extract_chars( ingest_file ,
                                     namespaces = {} ,
                                     document_data = document_data ,
                                     skip_chars = r'[\s]' )
    strict_starts = \
      text_extraction.extract_annotations_brat_standoff( ingest_file ,
                                                         offset_mapping = offset_mapping ,
                                                         type_prefix = 'T' ,
                                                         tag_name = 'Organization' )
    ##
    assert strict_starts[ '0' ][ 0 ][ 'begin_pos' ] == '0'
    assert strict_starts[ '0' ][ 0 ][ 'end_pos' ] == '43'
    assert strict_starts[ '0' ][ 0 ][ 'raw_text' ] == 'International Business Machines Corporation'
    ##
    assert strict_starts[ '45' ][ 0 ][ 'begin_pos' ] == '45'
    assert strict_starts[ '45' ][ 0 ][ 'end_pos' ] == '48'
    ##
    assert strict_starts[ '52' ][ 0 ][ 'raw_text' ] == 'Big Blue'
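## Reader's note (based on the brat standoff assertions above):  the
## dictionary returned by extract_annotations_brat_standoff() appears to
## mirror the XML variant -- keys are string begin offsets, each mapped to
## a list of annotation dictionaries carrying at least 'begin_pos',
## 'end_pos', and 'raw_text'.  A hypothetical lookup sketch:
##
##     for begin_offset , annots in strict_starts.items():
##         for annot in annots:
##             print( annot[ 'begin_pos' ] , annot[ 'end_pos' ] ,
##                    annot[ 'raw_text' ] )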
def count_chars_profile( reference_ns , reference_dd , reference_folder ,
                         test_ns , test_dd , test_folder ,
                         args ,
                         file_prefix = '/' , file_suffix = '.xml' ):
    """
    Extract a character profile for each document and corpus as a whole.
    """
    log.debug( "Entering '{}'".format( sys._getframe().f_code.co_name ) )
    try:
        match_count , file_mapping = collect_files( reference_folder , test_folder ,
                                                    file_prefix , file_suffix ,
                                                    args.skip_missing_files )
    except:
        e = sys.exc_info()[0]
        log.error( 'Uncaught exception in collect_files: {}'.format( e ) )
    ##
    if( match_count == 0 ):
        ## Empty dictionaries evaluate to False so testing bool can tell us if
        ## any reference documents exist
        if( bool( file_mapping ) ):
            print( 'ERROR: No documents found in test directory: {}'.format( test_folder ) )
        else:
            print( 'ERROR: No documents found in reference directory: {}'.format( reference_folder ) )
        return( None )
    ##
    for reference_filename in tqdm( sorted( file_mapping.keys() ) ,
                                    file = args.progressbar_file ,
                                    disable = args.progressbar_disabled ):
        ##
        reference_out_file = generate_out_file( args.reference_out , reference_filename )
        ##
        try:
            reference_chars = \
              text_extraction.extract_chars( '{}/{}'.format( reference_folder ,
                                                             reference_filename ) ,
                                             namespaces = reference_ns ,
                                             document_data = reference_dd )
        except:
            e = sys.exc_info()[0]
            log.error( 'Uncaught exception in extract_chars: {}'.format( e ) )
        test_filename = file_mapping[ reference_filename ]
        if( test_filename is None ):
            test_chars = {}
        else:
            ##
            test_out_file = generate_out_file( args.test_out , test_filename )
            ##
            try:
                test_full_path = '{}/{}'.format( test_folder , test_filename )
                test_chars = \
                  text_extraction.extract_chars( test_full_path ,
                                                 namespaces = test_ns ,
                                                 document_data = test_dd )
            except:
                e = sys.exc_info()[0]
                log.error( 'Uncaught exception in extract_chars: {}'.format( e ) )
    ##
    log.debug( "-- Leaving '{}'".format( sys._getframe().f_code.co_name ) )
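## Hypothetical invocation sketch (not part of the original module).  The
## attribute names on args are assumptions taken from the references inside
## count_chars_profile() above, and the test folder path is made up purely
## for illustration:
##
##     import argparse
##     args = argparse.Namespace( skip_missing_files = True ,
##                                progressbar_file = sys.stderr ,
##                                progressbar_disabled = True ,
##                                reference_out = None ,
##                                test_out = None )
##     count_chars_profile( reference_ns = {} ,
##                          reference_dd = dict( cdata_xpath = './TEXT' ) ,
##                          reference_folder = 'tests/data/i2b2_2016_track-1_reference' ,
##                          test_ns = {} ,
##                          test_dd = dict( cdata_xpath = './TEXT' ) ,
##                          test_folder = '/path/to/system/output' ,
##                          args = args )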