Ejemplo n.º 1
0
 def test_read_umls_definitions(self):
     """Check definitions loaded from the UMLS fixture against the expected ones.

     Concepts are read first, then definitions are read into the same
     dictionary. Every expected concept must be present in the result; the
     definition text is compared only for expected concepts that declare a
     'definition'.
     """
     fixture_dir = 'tests/fixtures/umls_META'
     loaded = {}
     umls_utils.read_umls_concepts(fixture_dir, loaded)
     umls_utils.read_umls_definitions(fixture_dir, loaded)

     for expected in self.expected_concepts:
         # Look the concept up unconditionally so that a missing concept
         # fails the test even when no definition is expected for it.
         actual = loaded[expected['concept_id']]
         if 'definition' not in expected:
             continue
         assert actual['definition'] == expected['definition']
Ejemplo n.º 2
0
def main(meta_path: str, output_path: str, source: str = None):
    """Read UMLS concepts, print summary statistics and export them as JSONL.

    Concepts, their types and their definitions are loaded through
    ``umls_utils`` into a single dictionary keyed by concept id. Counts
    describing the loaded data are printed, each concept is cleaned up
    (duplicate aliases removed, a canonical name chosen when missing, the
    helper field ``is_from_preferred_source`` dropped) and finally every
    concept is written as one JSON object per line.

    Args:
        meta_path: Location of the UMLS data, passed unchanged to the
            ``umls_utils`` readers.
        output_path: Path of the JSONL file to write (overwritten).
        source: Optional value forwarded to ``umls_utils.read_umls_concepts``
            (presumably restricts concepts to one source vocabulary --
            confirm against ``umls_utils``).

    Raises:
        IndexError: If a concept has neither a canonical name nor any alias,
            because no alias is available to promote.
    """

    concept_details = {}  # dictionary of concept_id -> {
                          #                 'concept_id': str,
                          #                 'canonical_name': str
                          #                 'aliases': List[str]
                          #                 'types': List[str]
                          #                 'definition': str
                          # }

    print('Reading concepts ... ')
    umls_utils.read_umls_concepts(meta_path, concept_details, source)

    print('Reading types ... ')
    umls_utils.read_umls_types(meta_path, concept_details)

    print('Reading definitions ... ')
    umls_utils.read_umls_definitions(meta_path, concept_details)

    without_canonical_name_count = 0
    without_aliases_count = 0
    with_one_alias_count = 0
    with_more_than_one_alias_count = 0
    without_type_count = 0
    with_one_type_count = 0
    with_more_than_one_type_count = 0
    without_definition_count = 0
    with_definition_pref_source_count = 0
    with_definition_other_sources_count = 0
    for concept in concept_details.values():
        alias_count = len(concept['aliases'])
        type_count = len(concept['types'])
        preferred_flag = concept.get('is_from_preferred_source')

        without_canonical_name_count += 1 if 'canonical_name' not in concept else 0
        without_aliases_count += 1 if alias_count == 0 else 0
        with_one_alias_count += 1 if alias_count == 1 else 0
        with_more_than_one_alias_count += 1 if alias_count > 1 else 0
        without_type_count += 1 if type_count == 0 else 0
        with_one_type_count += 1 if type_count == 1 else 0
        # Strictly greater than one: concepts with exactly one type are
        # already counted in `with_one_type_count`.
        with_more_than_one_type_count += 1 if type_count > 1 else 0
        without_definition_count += 1 if 'definition' not in concept else 0
        with_definition_pref_source_count += 1 if preferred_flag == 'Y' else 0
        with_definition_other_sources_count += 1 if preferred_flag == 'N' else 0

    print(f'Number of concepts: {len(concept_details)}')
    print(f'Number of concepts without canonical name (one of the aliases will be used instead): '
          f'{without_canonical_name_count}')
    print(f'Number of concepts with no aliases: {without_aliases_count}')
    print(f'Number of concepts with 1 alias: {with_one_alias_count}')
    print(f'Number of concepts with > 1 alias: {with_more_than_one_alias_count}')
    print(f'Number of concepts with no type: {without_type_count}')
    print(f'Number of concepts with 1 type: {with_one_type_count}')
    print(f'Number of concepts with > 1 type: {with_more_than_one_type_count}')
    print(f'Number of concepts with no definition: {without_definition_count}')
    print(f'Number of concepts with definition from preferred sources: {with_definition_pref_source_count}')
    print(f'Number of concepts with definition from other sources: {with_definition_other_sources_count}')

    print('Deleting unused fields and choosing a canonical name from aliases ... ')
    for concept in concept_details.values():

        # Some concepts have many duplicate aliases. Here we remove them.
        # `dict.fromkeys` keeps the first occurrence of each alias in its
        # original position, so the alias promoted to canonical name below
        # and the exported alias order are the same on every run (a plain
        # `set` would make both depend on hash ordering).
        concept["aliases"] = list(dict.fromkeys(concept["aliases"]))

        # if a concept doesn't have a canonical name, use the first alias instead
        if 'canonical_name' not in concept:
            aliases = concept['aliases']
            concept['canonical_name'] = aliases[0]
            del aliases[0]

        # deleting `is_from_preferred_source`
        concept.pop('is_from_preferred_source', None)

    print('Exporting to the a jsonl file {} ...'.format(output_path))
    with open(output_path, 'w', encoding='utf-8') as fout:

        for value in concept_details.values():
            fout.write(json.dumps(value) + "\n")
    print('DONE.')