import glob
import json
import logging as log
import os
import sys
import tempfile

from tqdm import tqdm

import args_and_configs
import scoring_metrics
import text_extraction


def test_writing_dictionary_for_datetime_from_0005_gs():
    ingest_file = 'tests/data/i2b2_2016_track-1_reference/0005_gs.xml'
    reference_file = 'tests/data/i2b2_2016_track-1_reference_out/0005_gs.xml'
    config_file = 'config/i2b2_2016_track-1.conf'
    try:
        ## Create a named temporary file and close the low-level descriptor
        ## so extract_annotations() can re-open the file by name.
        tmp_descriptor, tmp_file = tempfile.mkstemp()
        os.close(tmp_descriptor)
        namespaces, document_data, patterns = \
            args_and_configs.process_config(config_file=config_file,
                                            score_key='Short Name',
                                            score_values=['.*'])
        text_extraction.extract_annotations(ingest_file,
                                            namespaces=namespaces,
                                            document_data=document_data,
                                            patterns=patterns,
                                            skip_chars=r'[\s]',
                                            out_file=tmp_file)
        with open(reference_file, 'r') as rf:
            reloaded_reference = json.load(rf)
        with open(tmp_file, 'r') as tf:
            reloaded_test = json.load(tf)
        assert reloaded_reference['annotations'] == \
            reloaded_test['annotations']
        assert reloaded_reference['offset_mapping'] == \
            reloaded_test['offset_mapping']
        assert reloaded_reference['raw_content'] == \
            reloaded_test['raw_content']
    finally:
        os.remove(tmp_file)
def test_contents_of_write_of_dictionary_for_brat_patterns():
    ingest_file = 'tests/data/brat_reference/problems_and_allergens.ann'
    config_file = 'config/brat_problems_allergies_standoff.conf'
    namespaces, document_data, patterns = \
        args_and_configs.process_config(config_file=config_file,
                                        score_key='Short Name',
                                        score_values=['.*'])
    with tempfile.NamedTemporaryFile() as tmpfile_handle:
        assert os.path.exists(tmpfile_handle.name)
        offset_mapping, strict_starts = \
            text_extraction.extract_annotations(ingest_file,
                                                namespaces=namespaces,
                                                document_data=document_data,
                                                patterns=patterns,
                                                skip_chars=r'[\s]',
                                                out_file=tmpfile_handle.name)
        reloaded_json = json.load(tmpfile_handle)
        assert reloaded_json['annotations'] == strict_starts
        ## The reference .ann file contains the standoff entries:
        ## T34    Problem 474 493    shortness of breath
        ## A1    Negated T34
        assert strict_starts['474'][0]['begin_pos'] == '474'
        assert strict_starts['474'][0]['end_pos'] == '493'
        assert strict_starts['474'][0]['raw_text'] == 'shortness of breath'
        assert strict_starts['474'][0]['Historical'] == 'false'
        assert strict_starts['474'][0]['Negated'] == 'true'
        ## The file exists while the with block holds it open...
        assert os.path.exists(tmpfile_handle.name)
    ## ...and is removed automatically once the block exits.
    assert os.path.exists(tmpfile_handle.name) == False
def test_of_presaved_dictionary_for_complex_patterns():
    ingest_file = 'tests/data/i2b2_2016_track-1_reference/0005_gs.xml'
    presaved_file = 'tests/data/i2b2_2016_track-1_reference_out/0005_gs.xml'
    config_file = 'config/i2b2_2016_track-1.conf'
    namespaces, document_data, patterns = \
        args_and_configs.process_config(config_file=config_file,
                                        score_key='Short Name',
                                        score_values=['.*'])
    with open(presaved_file, 'r') as fp:
        reloaded_json = json.load(fp)
    offset_mapping, strict_starts = \
        text_extraction.extract_annotations(ingest_file,
                                            namespaces=namespaces,
                                            document_data=document_data,
                                            patterns=patterns,
                                            skip_chars=r'[\s]',
                                            out_file=None)
    assert reloaded_json['annotations'] == strict_starts
def test_of_identity_read_write_of_dictionary_for_complex_patterns():
    ingest_file = 'tests/data/i2b2_2016_track-1_reference/0005_gs.xml'
    config_file = 'config/i2b2_2016_track-1.conf'
    namespaces, document_data, patterns = \
        args_and_configs.process_config(config_file=config_file,
                                        score_key='Short Name',
                                        score_values=['.*'])
    with tempfile.NamedTemporaryFile() as tmpfile_handle:
        assert os.path.exists(tmpfile_handle.name)
        offset_mapping, strict_starts = \
            text_extraction.extract_annotations(ingest_file,
                                                namespaces=namespaces,
                                                document_data=document_data,
                                                patterns=patterns,
                                                skip_chars=r'[\s]',
                                                out_file=tmpfile_handle.name)
        reloaded_json = json.load(tmpfile_handle)
        assert reloaded_json['annotations'] == strict_starts
        assert os.path.exists(tmpfile_handle.name)
    ## The temporary file is removed automatically when the block exits.
    assert os.path.exists(tmpfile_handle.name) == False
def test_empty_contents_of_write_of_dictionary_for_brat_patterns():
    ingest_file = 'tests/data/brat_reference/ibm.ann'
    config_file = 'config/brat_problems_allergies_standoff.conf'
    namespaces, document_data, patterns = \
        args_and_configs.process_config(config_file=config_file,
                                        score_key='Short Name',
                                        score_values=['.*'])
    with tempfile.NamedTemporaryFile() as tmpfile_handle:
        assert os.path.exists(tmpfile_handle.name)
        offset_mapping, strict_starts = \
            text_extraction.extract_annotations(ingest_file,
                                                namespaces=namespaces,
                                                document_data=document_data,
                                                patterns=patterns,
                                                skip_chars=r'[\s]',
                                                out_file=tmpfile_handle.name)
        ## No annotations match, so the extracted dictionary is empty...
        assert strict_starts == {}
        assert os.path.exists(tmpfile_handle.name)
        ## ...but the out_file is still written, with empty annotations and
        ## the full raw document content.
        with open(tmpfile_handle.name, 'r') as rf:
            reloaded_out_file = json.load(rf)
        assert reloaded_out_file['annotations'] == {}
        assert reloaded_out_file['raw_content'] == \
            'International Business Machines Corporation: IBM is Big Blue\n'
    assert os.path.exists(tmpfile_handle.name) == False
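
## The out_file round-trip tests above all rely on the same serialization
## contract: when out_file is set, extract_annotations() writes a JSON
## dictionary with 'annotations', 'offset_mapping', and 'raw_content' keys.
## A minimal sketch of a reload helper for that structure follows; the
## helper itself is hypothetical and not part of the module under test.
def _reload_extraction_output(out_file):
    with open(out_file, 'r') as fp:
        payload = json.load(fp)
    return (payload['annotations'],
            payload['offset_mapping'],
            payload['raw_content'])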
def score_ref_set(reference_ns, reference_dd, reference_patterns,
                  reference_folder, test_ns, test_dd, test_patterns,
                  test_folder, args, file_prefix='/', file_suffix='.xml'):
    """Score the system output (test) folder against the reference folder."""
    log.debug("Entering '{}'".format(sys._getframe().f_code.co_name))
    score_card = scoring_metrics.new_score_card(
        fuzzy_flags=args.fuzzy_flags,
        normalization_engines=args.scorable_engines)
    ##
    confusion_matrix = {}
    ##########################
    ## get_file_mapping(), create_output_folders(), and generate_out_file()
    ## are helpers defined elsewhere in this module.
    file_mapping = get_file_mapping(reference_folder, test_folder,
                                    file_prefix, file_suffix,
                                    args.skip_missing_files)
    if (file_mapping == None):
        ## There was a problem mapping files between directories, so abort.
        return (None)
    ##########################
    create_output_folders(args.reference_out, args.test_out)
    ##########################
    for reference_filename in tqdm(sorted(file_mapping.keys()),
                                   file=args.progressbar_file,
                                   disable=args.progressbar_disabled):
        ##
        reference_out_file = generate_out_file(args.reference_out,
                                               reference_filename)
        ##
        try:
            reference_full_path = '{}/{}'.format(reference_folder,
                                                 reference_filename)
            reference_om, reference_ss = \
                text_extraction.extract_annotations(reference_full_path,
                                                    namespaces=reference_ns,
                                                    document_data=reference_dd,
                                                    patterns=reference_patterns,
                                                    skip_chars=args.skip_chars,
                                                    out_file=reference_out_file)
        except TypeError as e:
            log.error('TypeError exception in extract_annotations: {}'.format(e))
        except:
            e = sys.exc_info()[0]
            log.error('Uncaught exception in extract_annotations: {}'.format(e))
        test_filename = file_mapping[reference_filename]
        if (test_filename == None):
            test_om = {}
            test_ss = {}
        else:
            ##
            test_out_file = generate_out_file(args.test_out, test_filename)
            ##
            test_full_path = '{}/{}'.format(test_folder, test_filename)
            try:
                test_om, test_ss = \
                    text_extraction.extract_annotations(test_full_path,
                                                        namespaces=test_ns,
                                                        document_data=test_dd,
                                                        patterns=test_patterns,
                                                        skip_chars=args.skip_chars,
                                                        out_file=test_out_file)
            except TypeError as e:
                log.error('TypeError exception in extract_annotations: {}'.format(e))
            except:
                e = sys.exc_info()[0]
                log.error('Uncaught exception in extract_annotations: {}'.format(e))
        ##
        try:
            for fuzzy_flag in args.fuzzy_flags:
                ## Mapped (skipped) characters only matter when a skip_chars
                ## pattern was provided.
                if (args.skip_chars == None):
                    ignore_chars = False
                else:
                    ignore_chars = True
                scoring_metrics.evaluate_positions(
                    reference_filename,
                    confusion_matrix,
                    score_card,
                    reference_ss,
                    test_ss,
                    fuzzy_flag=fuzzy_flag,
                    use_mapped_chars=ignore_chars,
                    scorable_attributes=args.scorable_attributes,
                    scorable_engines=args.scorable_engines,
                    norm_synonyms=args.normalization_synonyms)
        except UnboundLocalError as e:
            log.error('UnboundLocalError exception in evaluate_positions: {}'.format(e))
        except NameError as e:
            log.error('NameError exception in evaluate_positions: {}'.format(e))
        except TypeError as e:
            exc_type, exc_obj, exc_tb = sys.exc_info()
            log.error('TypeError in evaluate_positions ({}): {}'.format(exc_tb.tb_lineno, e))
        except:
            e = sys.exc_info()[0]
            log.error('Uncaught exception in evaluate_positions: {}'.format(e))
    ##
    if (args.csv_out and os.path.exists(args.csv_out)):
        os.remove(args.csv_out)
    ##
    # scoring_metrics.print_counts_summary_shell( confusion_matrix ,
    #                                             file_mapping ,
    #                                             reference_patterns , test_patterns ,
    #                                             args = args )
    if (args.print_confusion_matrix):
        scoring_metrics.print_confusion_matrix_shell(confusion_matrix,
                                                     file_mapping,
                                                     reference_patterns,
                                                     test_patterns,
                                                     args=args)
    if (args.print_metrics):
        scoring_metrics.print_score_summary_shell(score_card,
                                                  file_mapping,
                                                  reference_patterns,
                                                  test_patterns,
                                                  args=args)
    if ('2018 n2c2 track 1' in args.print_custom):
        scoring_metrics.print_2018_n2c2_track1(score_card, file_mapping,
                                               args=args)
    #########
    log.debug("-- Leaving '{}'".format(sys._getframe().f_code.co_name))
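
## Usage sketch (illustrative only): score_ref_set() pulls its knobs off an
## argparse-style namespace.  Every attribute name below is taken from the
## function body above; the values are assumptions chosen to exercise a
## minimal exact-match run that scores a reference folder against itself.
def _example_score_ref_set_call():
    import argparse
    example_args = argparse.Namespace(fuzzy_flags=['exact'],
                                      scorable_attributes=[],
                                      scorable_engines=[],
                                      normalization_synonyms={},
                                      skip_chars=None,
                                      skip_missing_files=True,
                                      reference_out=None,
                                      test_out=None,
                                      csv_out=None,
                                      progressbar_file=sys.stderr,
                                      progressbar_disabled=True,
                                      print_confusion_matrix=False,
                                      print_metrics=True,
                                      print_custom='')
    namespaces, document_data, patterns = \
        args_and_configs.process_config(config_file='config/i2b2_2016_track-1.conf',
                                        score_key='Short Name',
                                        score_values=['.*'])
    ## Scoring a folder against itself should yield exact matches everywhere.
    score_ref_set(namespaces, document_data, patterns,
                  'tests/data/i2b2_2016_track-1_reference',
                  namespaces, document_data, patterns,
                  'tests/data/i2b2_2016_track-1_reference',
                  example_args)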
def count_ref_set(this_ns, this_dd, this_patterns, this_folder, args,
                  file_prefix='/', file_suffix='.xml', set_type=None):
    """Count annotation occurrences in the test folder."""
    log.debug("Entering '{}'".format(sys._getframe().f_code.co_name))
    type_counts = scoring_metrics.new_score_card(fuzzy_flags=['counts'])
    file_list = set([os.path.basename(x)
                     for x in glob.glob(this_folder + file_prefix +
                                        '*' + file_suffix)])
    ##########################
    for this_filename in tqdm(sorted(file_list),
                              file=args.progressbar_file,
                              disable=args.progressbar_disabled):
        try:
            this_full_path = '{}/{}'.format(this_folder, this_filename)
            this_om, this_ss = \
                text_extraction.extract_annotations(this_full_path,
                                                    namespaces=this_ns,
                                                    document_data=this_dd,
                                                    patterns=this_patterns,
                                                    out_file=None)
        except NameError as e:
            log.error('NameError exception in extract_annotations: {}'.format(e))
        except TypeError as e:
            log.error('TypeError exception in extract_annotations: {}'.format(e))
        except:
            e = sys.exc_info()[0]
            log.error('Uncaught exception in extract_annotations: {}'.format(e))
        for this_start in this_ss:
            ## Loop over all entries sharing the same start position
            ## and grab the type and end position.
            for this_entry in this_ss[this_start]:
                this_type = this_entry['type']
                if (this_start == -1):
                    this_end = -1
                    sub_type = this_entry['pivot_value']
                    ## TODO - don't force the pivot value into the attribute name
                    this_type = '{} = "{}"'.format(this_type,
                                                   this_entry['pivot_value'])
                else:
                    this_end = this_entry['end_pos']
                    sub_type = None
                ##
                ##print( '{}\n'.format( this_type ) )
                scoring_metrics.update_score_card('Tally', type_counts,
                                                  'counts', this_filename,
                                                  this_start, this_end,
                                                  this_type,
                                                  pivot_value=sub_type)
    ##
    if (args.csv_out and os.path.exists(args.csv_out)):
        os.remove(args.csv_out)
    ##
    try:
        scoring_metrics.print_counts_summary(type_counts,
                                             sorted(file_list),
                                             this_patterns,
                                             args,
                                             set_type=set_type)
    except AttributeError as e:
        log.error('AttributeError exception in print_counts_summary: {}'.format(e))
    except KeyError as e:
        log.error('KeyError exception in print_counts_summary: {}'.format(e))
    except NameError as e:
        log.error('NameError exception in print_counts_summary: {}'.format(e))
    except TypeError as e:
        log.error('TypeError exception in print_counts_summary: {}'.format(e))
    except:
        e = sys.exc_info()[0]
        log.error('Uncaught exception in print_counts_summary: {}'.format(e))
    #########
    log.debug("-- Leaving '{}'".format(sys._getframe().f_code.co_name))
def test_extract_annotations_overlapping_in_same_file():
    ingest_file = 'tests/data/offset_matching/the_doctors_age_overlapping.xmi'
    namespaces = {'cas': "http:///uima/cas.ecore",
                  'custom': "http:///webanno/custom.ecore"}
    document_data = dict(tag_xpath='./cas:Sofa',
                         content_attribute='sofaString')
    patterns = [{'type': 'Age',
                 'xpath': './custom:PHI[@Time="Age"]',
                 'display_name': 'Age',
                 'short_name': 'Age',
                 'long_name': 'Age',
                 'optional_attributes': [],
                 'begin_attr': 'begin',
                 'end_attr': 'end'},
                {'type': 'DateTime',
                 'xpath': './custom:PHI[@Time="DateTime"]',
                 'display_name': 'DateTime',
                 'short_name': 'DateTime',
                 'long_name': 'DateTime',
                 'optional_attributes': [],
                 'begin_attr': 'begin',
                 'end_attr': 'end'},
                {'type': 'Number',
                 'xpath': './custom:PHI[@Time="Number"]',
                 'display_name': 'Number',
                 'short_name': 'Number',
                 'long_name': 'Number',
                 'optional_attributes': [],
                 'begin_attr': 'begin',
                 'end_attr': 'end'}]
    offset_mapping, annots = \
        text_extraction.extract_annotations(ingest_file,
                                            namespaces=namespaces,
                                            document_data=document_data,
                                            patterns=patterns,
                                            skip_chars=None,
                                            out_file=None)
    ## Overlapping annotations that share a begin offset are grouped in a
    ## list under the same start-position key.
    expected_annots = {'24': [{'type': 'Age',
                               'begin_pos': '24',
                               'end_pos': '27',
                               'raw_text': None},
                              {'type': 'Number',
                               'begin_pos': '24',
                               'end_pos': '27',
                               'raw_text': None}],
                       '41': [{'type': 'DateTime',
                               'begin_pos': '41',
                               'end_pos': '59',
                               'raw_text': None},
                              {'type': 'DateTime',
                               'begin_pos': '41',
                               'end_pos': '54',
                               'raw_text': None}]}
    assert annots == expected_annots
def test_extracting_with_and_without_optional_attributes_called_by_parent():
    ingest_file = 'tests/data/013_Conditional_Problem.xmi'
    config_file = 'config/webanno_problems_allergies_xmi.conf'
    namespaces, document_data, patterns = \
        args_and_configs.process_config(config_file=config_file,
                                        score_key='Short Name',
                                        score_values=['.*'])
    ## Drop the last pattern from the config's pattern list.
    patterns.pop()
    offset_mapping, annots_with_opt_attributes = \
        text_extraction.extract_annotations(ingest_file,
                                            namespaces=namespaces,
                                            document_data=document_data,
                                            patterns=patterns,
                                            skip_chars=None,
                                            out_file=None)
    ## Re-run the extraction with the optional attributes stripped from the
    ## remaining pattern.
    patterns[0]['optional_attributes'] = []
    offset_mapping, annots_without_opt_attributes = \
        text_extraction.extract_annotations(ingest_file,
                                            namespaces=namespaces,
                                            document_data=document_data,
                                            patterns=patterns,
                                            skip_chars=None,
                                            out_file=None)
    expected_output_without_opt_attributes = \
        {'181': [{'type': 'Problem',
                  'begin_pos': '181',
                  'end_pos': '188',
                  'raw_text': None}],
         '218': [{'type': 'Problem',
                  'begin_pos': '218',
                  'end_pos': '224',
                  'raw_text': None}]}
    expected_output_with_opt_attributes = \
        {'181': [{'type': 'Problem',
                  'begin_pos': '181',
                  'end_pos': '188',
                  'raw_text': None,
                  'conditional': 'true',
                  'generic': 'false',
                  'historical': 'false',
                  'negated': 'false',
                  'not_patient': 'true',
                  'uncertain': 'false'}],
         '218': [{'type': 'Problem',
                  'begin_pos': '218',
                  'end_pos': '224',
                  'raw_text': None,
                  'conditional': 'false',
                  'generic': 'false',
                  'historical': 'true',
                  'negated': 'false',
                  'not_patient': 'false',
                  'uncertain': 'true'}]}
    assert annots_with_opt_attributes == \
        expected_output_with_opt_attributes
    assert annots_without_opt_attributes == \
        expected_output_without_opt_attributes
    assert annots_with_opt_attributes != \
        expected_output_without_opt_attributes
    assert annots_without_opt_attributes != \
        expected_output_with_opt_attributes