Example #1
0
def test_writing_dictionary_for_datetime_from_0005_gs():
    """Extract annotations from 0005_gs.xml, write them to a temp file, and
    verify the written annotations, offset mapping, and raw content match
    the presaved reference output (compared as parsed JSON)."""
    ingest_file = 'tests/data/i2b2_2016_track-1_reference/0005_gs.xml'
    reference_file = 'tests/data/i2b2_2016_track-1_reference_out/0005_gs.xml'
    config_file = 'config/i2b2_2016_track-1.conf'
    ## Create the temp file *before* entering the try block: the original
    ## called mkstemp() inside try, so a failure there would leave
    ## tmp_file unbound and the finally clause would raise NameError.
    tmp_descriptor, tmp_file = tempfile.mkstemp()
    os.close(tmp_descriptor)
    try:
        namespaces , document_data , patterns = \
          args_and_configs.process_config( config_file = config_file ,
                                           score_key = 'Short Name' ,
                                           score_values = [ '.*' ] )
        text_extraction.extract_annotations(ingest_file,
                                            namespaces=namespaces,
                                            document_data=document_data,
                                            patterns=patterns,
                                            skip_chars=r'[\s]',
                                            out_file=tmp_file)
        with open(reference_file, 'r') as rf:
            reloaded_reference = json.load(rf)
        with open(tmp_file, 'r') as tf:
            reloaded_test = json.load(tf)
        assert reloaded_reference['annotations'] == \
            reloaded_test['annotations']
        assert reloaded_reference['offset_mapping'] == \
            reloaded_test['offset_mapping']
        assert reloaded_reference['raw_content'] == \
            reloaded_test['raw_content']
    finally:
        ## Always clean up the temp file, even on assertion failure.
        os.remove(tmp_file)
Example #2
0
def test_contents_of_write_of_dictionary_for_brat_patterns():
    """The annotations written to disk must equal the in-memory
    strict_starts dict, and the known brat entry (T34, negated via A1)
    must carry its expected positions, text, and attribute values."""
    ingest_file = 'tests/data/brat_reference/problems_and_allergens.ann'
    config_file = 'config/brat_problems_allergies_standoff.conf'
    namespaces , document_data , patterns = \
      args_and_configs.process_config( config_file = config_file ,
                                       score_key = 'Short Name' ,
                                       score_values = [ '.*' ] )
    with tempfile.NamedTemporaryFile() as out_handle:
        out_path = out_handle.name
        assert os.path.exists(out_path)
        offset_mapping , strict_starts = \
          text_extraction.extract_annotations( ingest_file ,
                                               namespaces = namespaces ,
                                               document_data = document_data ,
                                               patterns = patterns ,
                                               skip_chars = r'[\s]' ,
                                               out_file = out_path )
        reloaded_json = json.load(out_handle)
        assert reloaded_json['annotations'] == strict_starts
        ## Reference standoff lines:
        ##   T34	Problem 474 493	shortness of breath
        ##   A1	Negated T34
        first_annot = strict_starts['474'][0]
        assert first_annot['begin_pos'] == '474'
        assert first_annot['end_pos'] == '493'
        assert first_annot['raw_text'] == 'shortness of breath'
        assert first_annot['Historical'] == 'false'
        assert first_annot['Negated'] == 'true'
        assert os.path.exists(out_path)
    ## NamedTemporaryFile removes the file when the context exits.
    assert os.path.exists(out_path) == False
Example #3
0
def test_of_presaved_dictionary_for_complex_patterns():
    """Annotations extracted live from 0005_gs.xml must equal the
    'annotations' entry of the presaved JSON output for that file."""
    ingest_file = 'tests/data/i2b2_2016_track-1_reference/0005_gs.xml'
    presaved_file = 'tests/data/i2b2_2016_track-1_reference_out/0005_gs.xml'
    config_file = 'config/i2b2_2016_track-1.conf'
    namespaces , document_data , patterns = \
      args_and_configs.process_config( config_file = config_file ,
                                       score_key = 'Short Name' ,
                                       score_values = [ '.*' ] )
    with open(presaved_file, 'r') as presaved_handle:
        expected = json.load(presaved_handle)
    offset_mapping , strict_starts = \
      text_extraction.extract_annotations( ingest_file ,
                                           namespaces = namespaces ,
                                           document_data = document_data ,
                                           patterns = patterns ,
                                           skip_chars = r'[\s]' ,
                                           out_file = None )
    assert expected['annotations'] == strict_starts
Example #4
0
def test_of_identity_read_write_of_dictionary_for_complex_patterns():
    """Writing annotations to a file and reloading them must reproduce the
    in-memory strict_starts dict exactly (identity round-trip)."""
    ingest_file = 'tests/data/i2b2_2016_track-1_reference/0005_gs.xml'
    config_file = 'config/i2b2_2016_track-1.conf'
    namespaces , document_data , patterns = \
      args_and_configs.process_config( config_file = config_file ,
                                       score_key = 'Short Name' ,
                                       score_values = [ '.*' ] )
    with tempfile.NamedTemporaryFile() as out_handle:
        out_path = out_handle.name
        assert os.path.exists(out_path)
        offset_mapping , strict_starts = \
          text_extraction.extract_annotations( ingest_file ,
                                               namespaces = namespaces ,
                                               document_data = document_data ,
                                               patterns = patterns ,
                                               skip_chars = r'[\s]' ,
                                               out_file = out_path )
        reloaded_json = json.load(out_handle)
        assert reloaded_json['annotations'] == strict_starts
        assert os.path.exists(out_path)
    ## NamedTemporaryFile removes the file when the context exits.
    assert os.path.exists(out_path) == False
Example #5
0
def test_empty_contents_of_write_of_dictionary_for_brat_patterns():
    """An annotation-free input still produces an out_file containing an
    empty annotations dict plus the document's raw content."""
    ingest_file = 'tests/data/brat_reference/ibm.ann'
    config_file = 'config/brat_problems_allergies_standoff.conf'
    namespaces , document_data , patterns = \
      args_and_configs.process_config( config_file = config_file ,
                                       score_key = 'Short Name' ,
                                       score_values = [ '.*' ] )
    with tempfile.NamedTemporaryFile() as out_handle:
        out_path = out_handle.name
        assert os.path.exists(out_path)
        offset_mapping , strict_starts = \
          text_extraction.extract_annotations( ingest_file ,
                                               namespaces = namespaces ,
                                               document_data = document_data ,
                                               patterns = patterns ,
                                               skip_chars = r'[\s]' ,
                                               out_file = out_path )
        assert strict_starts == {}
        assert os.path.exists(out_path)
        with open(out_path, 'r') as reread_handle:
            reloaded_out_file = json.load(reread_handle)
        assert reloaded_out_file["annotations"] == {}
        expected_raw = \
            "International Business Machines Corporation: IBM is Big Blue\n"
        assert reloaded_out_file["raw_content"] == expected_raw
    ## NamedTemporaryFile removes the file when the context exits.
    assert os.path.exists(out_path) == False
Example #6
0
def score_ref_set(reference_ns,
                  reference_dd,
                  reference_patterns,
                  reference_folder,
                  test_ns,
                  test_dd,
                  test_patterns,
                  test_folder,
                  args,
                  file_prefix='/',
                  file_suffix='.xml'):
    """
    Score the system output (test) folder against the reference folder.

    For every file pair returned by get_file_mapping(), extract
    annotations from both sides (writing per-file out files), run
    scoring_metrics.evaluate_positions() once per fuzzy flag, and
    finally print whichever summaries args requests.  Returns None
    early if the file mapping could not be built.

    NOTE(review): in the original this docstring sat *after* the first
    statement and was therefore a dead string expression, not a real
    docstring; it has been moved to the top of the function.
    """
    log.debug("Entering '{}'".format(sys._getframe().f_code.co_name))
    score_card = scoring_metrics.new_score_card( fuzzy_flags = \
                                                   args.fuzzy_flags ,
                                                 normalization_engines = \
                                                   args.scorable_engines )
    ##
    confusion_matrix = {}
    ##########################
    file_mapping = get_file_mapping(reference_folder, test_folder, file_prefix,
                                    file_suffix, args.skip_missing_files)
    if file_mapping is None:
        ## There was a problem mapping files between directories so abort
        return None
    ##########################
    create_output_folders(args.reference_out, args.test_out)
    ##########################
    for reference_filename in tqdm(sorted(file_mapping.keys()),
                                   file=args.progressbar_file,
                                   disable=args.progressbar_disabled):
        ## Reset per-file extraction results so a failed extraction can't
        ## silently reuse the previous file's annotations (stale-data bug
        ## in the original).  None makes evaluate_positions() fail into
        ## its existing exception handlers rather than score stale data.
        reference_om = reference_ss = None
        test_om = test_ss = None
        ##
        reference_out_file = generate_out_file(args.reference_out,
                                               reference_filename)
        ##
        try:
            reference_full_path = '{}/{}'.format(reference_folder,
                                                 reference_filename)
            reference_om , reference_ss = \
              text_extraction.extract_annotations( reference_full_path ,
                                                   namespaces = reference_ns ,
                                                   document_data = reference_dd ,
                                                   patterns = reference_patterns ,
                                                   skip_chars = args.skip_chars ,
                                                   out_file = reference_out_file )
        except TypeError as e:
            log.error(
                'TypeError exception in extract_annotations:  {}'.format(e))
        except Exception:
            ## Narrowed from a bare except: so KeyboardInterrupt and
            ## SystemExit still propagate.
            e = sys.exc_info()[0]
            log.error(
                'Uncaught exception in extract_annotations:  {}'.format(e))
        test_filename = file_mapping[reference_filename]
        if test_filename is None:
            ## No matching system-output file: score against empty output.
            test_om = {}
            test_ss = {}
        else:
            ##
            test_out_file = generate_out_file(args.test_out, test_filename)
            ##
            test_full_path = '{}/{}'.format(test_folder, test_filename)
            try:
                test_om , test_ss = \
                  text_extraction.extract_annotations( test_full_path ,
                                                       namespaces = test_ns ,
                                                       document_data = test_dd ,
                                                       patterns = test_patterns ,
                                                       skip_chars = \
                                                         args.skip_chars ,
                                                       out_file = test_out_file )
            except TypeError as e:
                log.error(
                    'TypeError exception in extract_annotations:  {}'.format(
                        e))
            except Exception:
                e = sys.exc_info()[0]
                log.error(
                    'Uncaught exception in extract_annotations:  {}'.format(e))
        ##
        try:
            ## Mapped-character handling applies whenever skip_chars is
            ## set; this is loop-invariant so compute it once per file
            ## rather than once per fuzzy flag.
            ignore_chars = args.skip_chars is not None
            for fuzzy_flag in args.fuzzy_flags:
                scoring_metrics.evaluate_positions( reference_filename ,
                                                    confusion_matrix ,
                                                    score_card ,
                                                    reference_ss ,
                                                    test_ss ,
                                                    fuzzy_flag = fuzzy_flag ,
                                                    use_mapped_chars = \
                                                      ignore_chars ,
                                                    scorable_attributes = \
                                                      args.scorable_attributes ,
                                                    scorable_engines = \
                                                      args.scorable_engines ,
                                                    norm_synonyms =\
                                                      args.normalization_synonyms )
        except UnboundLocalError as e:
            log.error('UnboundLocalError exception in evaluate_positions:  {}'.
                      format(e))
        except NameError as e:
            log.error(
                'NameError exception in evaluate_positions:  {}'.format(e))
        except TypeError as e:
            exc_type, exc_obj, exc_tb = sys.exc_info()
            log.error('TypeError in evaluate_positions ({}):  {}'.format(
                exc_tb.tb_lineno, e))
        except Exception:
            e = sys.exc_info()[0]
            log.error(
                'Uncaught exception in evaluate_positions:  {}'.format(e))
    ##
    ## Start each run with a fresh CSV output file.
    if args.csv_out and os.path.exists(args.csv_out):
        os.remove(args.csv_out)
    ##
    if args.print_confusion_matrix:
        scoring_metrics.print_confusion_matrix_shell(confusion_matrix,
                                                     file_mapping,
                                                     reference_patterns,
                                                     test_patterns,
                                                     args=args)
    if args.print_metrics:
        scoring_metrics.print_score_summary_shell(score_card,
                                                  file_mapping,
                                                  reference_patterns,
                                                  test_patterns,
                                                  args=args)
    if ('2018 n2c2 track 1' in args.print_custom):
        scoring_metrics.print_2018_n2c2_track1(score_card,
                                               file_mapping,
                                               args=args)
    #########
    log.debug("-- Leaving '{}'".format(sys._getframe().f_code.co_name))
Example #7
0
def count_ref_set(this_ns,
                  this_dd,
                  this_patterns,
                  this_folder,
                  args,
                  file_prefix='/',
                  file_suffix='.xml',
                  set_type=None):
    """
    Count annotation occurrences in the given folder.

    Extracts annotations from every matching file, tallies each
    annotation type (or pivot value, for document-level entries at
    start position -1) into a 'counts' score card, and prints the
    counts summary.

    NOTE(review): in the original this docstring sat *after* the first
    statement and was therefore a dead string expression, not a real
    docstring; it has been moved to the top of the function.
    """
    log.debug("Entering '{}'".format(sys._getframe().f_code.co_name))
    type_counts = scoring_metrics.new_score_card(fuzzy_flags=['counts'])
    file_list = {
        os.path.basename(x)
        for x in glob.glob(this_folder + file_prefix + '*' + file_suffix)
    }
    ##########################
    for this_filename in tqdm(sorted(file_list),
                              file=args.progressbar_file,
                              disable=args.progressbar_disabled):
        ## Reset per-file results so a failed extraction can't silently
        ## reuse the previous file's annotations (stale-data bug in the
        ## original); empty dicts make the counting loop a clean no-op.
        this_om , this_ss = {} , {}
        try:
            this_full_path = '{}/{}'.format(this_folder, this_filename)
            this_om , this_ss = \
              text_extraction.extract_annotations( this_full_path ,
                                                   namespaces = this_ns ,
                                                   document_data = this_dd ,
                                                   patterns = this_patterns ,
                                                   out_file = None )
        except NameError as e:
            log.error(
                'NameError exception in extract_annotations:  {}'.format(e))
        except TypeError as e:
            log.error(
                'TypeError exception in extract_annotations:  {}'.format(e))
        except Exception:
            ## Narrowed from a bare except: so KeyboardInterrupt and
            ## SystemExit still propagate.
            e = sys.exc_info()[0]
            log.error(
                'Uncaught exception in extract_annotations:  {}'.format(e))
        for this_start in this_ss:
            ## loop over all entries sharing the same start position
            ## and grab type and end position
            for this_entry in this_ss[this_start]:
                this_type = this_entry['type']
                if (this_start == -1):
                    ## Document-level entry: no span, tally by pivot value.
                    this_end = -1
                    sub_type = this_entry['pivot_value']
                    ## TODO - don't force the pivot value into the attribute name
                    this_type = '{} = "{}"'.format(this_type,
                                                   this_entry['pivot_value'])
                else:
                    this_end = this_entry['end_pos']
                    sub_type = None
                ##
                scoring_metrics.update_score_card('Tally',
                                                  type_counts,
                                                  'counts',
                                                  this_filename,
                                                  this_start,
                                                  this_end,
                                                  this_type,
                                                  pivot_value=sub_type)

    ##
    ## Start each run with a fresh CSV output file.
    if args.csv_out and os.path.exists(args.csv_out):
        os.remove(args.csv_out)
    ##
    try:
        scoring_metrics.print_counts_summary(type_counts,
                                             sorted(file_list),
                                             this_patterns,
                                             args,
                                             set_type=set_type)
    except AttributeError as e:
        log.error(
            'AttributeError exception in print_counts_summary:  {}'.format(e))
    except KeyError as e:
        log.error('KeyError exception in print_counts_summary:  {}'.format(e))
    except NameError as e:
        log.error('NameError exception in print_counts_summary:  {}'.format(e))
    except TypeError as e:
        log.error('TypeError exception in print_counts_summary:  {}'.format(e))
    except Exception:
        e = sys.exc_info()[0]
        log.error('Uncaught exception in print_counts_summary:  {}'.format(e))
    #########
    log.debug("-- Leaving '{}'".format(sys._getframe().f_code.co_name))
Example #8
0
def test_extract_annotations_overlapping_in_same_file():
    """Overlapping annotations that share a start offset must all be kept,
    grouped under that start position in the returned annotation dict."""
    ingest_file = 'tests/data/offset_matching/the_doctors_age_overlapping.xmi'
    namespaces = {
        'cas': "http:///uima/cas.ecore",
        'custom': "http:///webanno/custom.ecore"
    }
    document_data = dict(tag_xpath='./cas:Sofa',
                         content_attribute='sofaString')

    def phi_pattern(label):
        ## The three extraction patterns differ only in their PHI label.
        return {
            'type': label,
            'xpath': './custom:PHI[@Time="' + label + '"]',
            'display_name': label,
            'short_name': label,
            'long_name': label,
            'optional_attributes': [],
            'begin_attr': 'begin',
            'end_attr': 'end'
        }

    patterns = [phi_pattern(label)
                for label in ('Age', 'DateTime', 'Number')]
    offset_mapping , annots = \
      text_extraction.extract_annotations( ingest_file ,
                                           namespaces = namespaces ,
                                           document_data = document_data ,
                                           patterns = patterns ,
                                           skip_chars = None ,
                                           out_file = None )

    def expected_entry(annot_type, begin_pos, end_pos):
        return {
            'type': annot_type,
            'end_pos': end_pos,
            'raw_text': None,
            'begin_pos': begin_pos
        }

    expected_annots = {
        '24': [expected_entry('Age', '24', '27'),
               expected_entry('Number', '24', '27')],
        '41': [expected_entry('DateTime', '41', '59'),
               expected_entry('DateTime', '41', '54')]
    }
    assert annots == expected_annots
Example #9
0
def test_extracting_with_and_without_optional_attributes_called_by_parent():
    """Extraction with the Problem pattern's optional attributes enabled
    must yield the attribute-rich annotations; clearing
    optional_attributes must yield the bare annotations; and the two
    results must differ from each other's expected output."""
    ingest_file = 'tests/data/013_Conditional_Problem.xmi'
    config_file = 'config/webanno_problems_allergies_xmi.conf'
    namespaces , document_data , patterns = \
      args_and_configs.process_config( config_file = config_file ,
                                       score_key = 'Short Name' ,
                                       score_values = [ '.*' ] )
    ## Drop the last configured pattern (as the original test does).
    patterns.pop()

    def run_extraction():
        ## Both runs share everything except the (mutated) patterns list.
        _, annots = \
          text_extraction.extract_annotations( ingest_file ,
                                               namespaces = namespaces ,
                                               document_data = document_data ,
                                               patterns = patterns ,
                                               skip_chars = None ,
                                               out_file = None )
        return annots

    annots_with_opt_attributes = run_extraction()
    patterns[0]['optional_attributes'] = []
    annots_without_opt_attributes = run_extraction()

    def problem_annot(begin_pos, end_pos, **attributes):
        annot = { 'type': 'Problem' ,
                  'begin_pos': begin_pos ,
                  'end_pos': end_pos ,
                  'raw_text': None }
        annot.update(attributes)
        return annot

    expected_output_without_opt_attributes = {
        '181': [ problem_annot('181', '188') ],
        '218': [ problem_annot('218', '224') ]
    }
    expected_output_with_opt_attributes = {
        '181': [ problem_annot('181', '188',
                               conditional='true',
                               generic='false',
                               historical='false',
                               negated='false',
                               not_patient='true',
                               uncertain='false') ],
        '218': [ problem_annot('218', '224',
                               conditional='false',
                               generic='false',
                               historical='true',
                               negated='false',
                               not_patient='false',
                               uncertain='true') ]
    }
    assert annots_with_opt_attributes == \
        expected_output_with_opt_attributes
    assert annots_without_opt_attributes == \
        expected_output_without_opt_attributes
    assert annots_with_opt_attributes != \
        expected_output_without_opt_attributes
    assert annots_without_opt_attributes != \
        expected_output_with_opt_attributes