Beispiel #1
0
def get_node_curie_id_from_ontology_node_id(ontology_node_id: str,
                                            ontology: ontobio.ontol.Ontology,
                                            uri_to_curie_shortener: callable):
    """Derive the CURIE ID to use for a node from its ontology node ID.

    The ontology node ID is handled according to its prefix:

    * ``http:`` / ``https:`` -- shortened with ``uri_to_curie_shortener``; if
      that returns ``None`` a message is logged and the IRI itself is used.
    * ``OBO:`` -- expanded with ``prefixcommons.expand_uri`` and the result is
      passed to ``uri_to_curie_shortener``.
    * ``UMLS:C`` -- the ``UMLS`` prefix is replaced by ``CUI_PREFIX``.
    * anything else -- used unchanged.

    Finally, any ID that ``is_cui_id`` recognises is rewritten so that its
    prefix is ``CUI_PREFIX``.

    :param ontology_node_id: node identifier as it appears in the ontology
    :param ontology: ontology the node came from; only its ``id`` is used,
        for the log message
    :param uri_to_curie_shortener: callable mapping an IRI to a CURIE
        (it is checked for a ``None`` result in the IRI branch)
    :return: the CURIE ID chosen for the node
    """
    if ontology_node_id.startswith(('http:', 'https:')):
        # The node ID is an IRI: try to shorten it, fall back to the raw IRI.
        curie = uri_to_curie_shortener(ontology_node_id)
        if curie is None:
            kg2_util.log_message(
                message="could not shorten this IRI to a CURIE",
                ontology_name=ontology.id,
                node_curie_id=ontology_node_id,
                output_stream=sys.stderr)
            curie = ontology_node_id
    elif ontology_node_id.startswith('OBO:'):
        expanded_iri = prefixcommons.expand_uri(ontology_node_id)
        curie = uri_to_curie_shortener(expanded_iri)
    elif ontology_node_id.startswith('UMLS:C'):
        local_part = ontology_node_id.split('UMLS:')[1]
        curie = CUI_PREFIX + ':' + local_part
    else:
        curie = ontology_node_id

    # Ensure that all CUI CURIE IDs use the "CUI:" prefix (part of fix for issue #565)
    needs_cui_prefix = (is_cui_id(curie)
                        and get_prefix_from_curie_id(curie) != CUI_PREFIX)
    if needs_cui_prefix:
        curie = CUI_PREFIX + ":" + get_local_id_from_curie_id(curie)

    return curie
Beispiel #2
0
def make_ontology_from_local_file(file_name: str):
    """Load an ontology from a local file, preferring a pickled copy.

    If a file with the same base name as ``file_name`` and a ``.pickle``
    extension exists, the ontology is unpickled from it. Otherwise the
    ontology is created by ontobio's ``OntologyFactory`` from a JSON file.
    When ``file_name`` does not end in ``.json`` it is first converted to
    JSON by running ``owltools``; the converted file is written to a
    temporary location and then moved next to the input file (same base
    name, ``.json`` extension).

    :param file_name: path of the ontology file to load
    :return: the object produced by ``OntologyFactory().create`` or by
        ``pickle.load``
    :raises AssertionError: if ``owltools`` exits with a nonzero status
    """
    file_name_without_ext = os.path.splitext(file_name)[0]
    file_name_with_pickle_ext = file_name_without_ext + ".pickle"
    if not os.path.isfile(file_name_with_pickle_ext):
        # the ontology has not been saved as a pickle file, so we need to load it from a text file
        if not file_name.endswith('.json'):
            # mkstemp creates and opens a placeholder file. Close its
            # descriptor immediately (it was previously leaked) and remove
            # the placeholder once the conversion is over; the placeholder
            # only serves to reserve a unique base name for the JSON output.
            temp_fd, temp_placeholder = tempfile.mkstemp(
                prefix=kg2_util.TEMP_FILE_PREFIX + '-')
            os.close(temp_fd)
            temp_file_name = temp_placeholder + '.json'
            try:
                size = os.path.getsize(file_name)
                kg2_util.log_message(message="Reading ontology file: " +
                                     file_name + "; size: " +
                                     "{0:.2f}".format(size / 1024) + " KiB",
                                     ontology_name=None)
                cp = subprocess.run(
                    ['owltools', file_name, '-o', '-f', 'json',
                     temp_file_name])
                # robot commented out because it is giving a NullPointerException on umls_semantictypes.owl
                # Once robot no longer gives a NullPointerException, we can use it like this:
                #        cp = subprocess.run(['robot', 'convert', '--input', file_name, '--output', temp_file_name])
                # stdout/stderr are not captured by the call above, so these
                # attributes are None unless capturing is added later.
                if cp.stdout is not None:
                    kg2_util.log_message(message="OWL convert result: " +
                                         cp.stdout,
                                         ontology_name=None,
                                         output_stream=sys.stdout)
                if cp.stderr is not None:
                    kg2_util.log_message(message="OWL convert result: " +
                                         cp.stderr,
                                         ontology_name=None,
                                         output_stream=sys.stderr)
                assert cp.returncode == 0, \
                    "owltools failed with exit status " + str(cp.returncode)
                json_file = file_name_without_ext + ".json"
                shutil.move(temp_file_name, json_file)
            finally:
                os.remove(temp_placeholder)
        else:
            json_file = file_name
        size = os.path.getsize(json_file)
        kg2_util.log_message(message="Reading ontology JSON file: " +
                             json_file + "; size: " +
                             "{0:.2f}".format(size / 1024) + " KiB",
                             ontology_name=None)

        ont_return = ontobio.ontol_factory.OntologyFactory().create(
            json_file, ignore_cache=True)
    else:
        size = os.path.getsize(file_name_with_pickle_ext)
        kg2_util.log_message("Reading ontology file: " +
                             file_name_with_pickle_ext + "; size: " +
                             "{0:.2f}".format(size / 1024) + " KiB",
                             ontology_name=None)
        # NOTE: unpickling can execute arbitrary code; the pickle file must
        # come from a trusted source.
        with open(file_name_with_pickle_ext, "rb") as pickle_file:
            ont_return = pickle.load(pickle_file)
    return ont_return
Beispiel #3
0
def delete_ontobio_cache_json(file_name: str):
    """Delete the ontobio JSON cache file associated with ``file_name``.

    The cache file is looked up as ``/tmp/<sha256 hex digest of file_name>``.
    Nothing happens if that path does not exist. If the file vanishes between
    the existence check and the removal (``ENOENT``), a message is logged
    instead of raising; any other ``OSError`` is re-raised.

    :param file_name: name whose SHA-256 digest identifies the cache file
    """
    cache_path = os.path.join(
        "/tmp", hashlib.sha256(file_name.encode()).hexdigest())
    if not os.path.exists(cache_path):
        return

    try:
        kg2_util.log_message(message="Deleting ontobio JSON cache file: " +
                             cache_path)
        os.remove(cache_path)
    except OSError as err:
        if err.errno != errno.ENOENT:
            raise err
        kg2_util.log_message(
            message="Error deleting ontobio JSON cache file: " + cache_path)
Beispiel #4
0
    # Excerpt of a script body: merge edges from one or more edge files into
    # an existing knowledge graph, keeping only edges whose subject and
    # object are both nodes of that graph. The definitions of `args` and
    # `kg_file_name` are not part of this excerpt.
    kg_edges_file_names = args.kgFileNewEdges
    test_mode = args.test
    output_file_name = args.outputFile[0]
    # NOTE(review): the file object passed to json.load is never explicitly
    # closed (same for the edge files below).
    kg = json.load(open(kg_file_name, 'r'))
    # Edges whose subject or object is missing from the graph, accumulated
    # over all edge files.
    kg_orphan_edges = {'edges': []}
    for kg_edges_file_name in kg_edges_file_names:
        kg_orphan_edges_new = []
        ctr_edges_added = 0
        kg_edges_new = json.load(open(kg_edges_file_name, 'r'))
        # Node lookup by ID. NOTE(review): rebuilt for every edge file even
        # though only kg['edges'] is modified inside this loop.
        nodes_dict = {node['id']: node for node in kg['nodes']}
        for rel_dict in kg_edges_new['edges']:
            subject_curie = rel_dict['subject']
            object_curie = rel_dict['object']
            if subject_curie in nodes_dict and object_curie in nodes_dict:
                # both endpoints exist in the graph: keep the edge
                ctr_edges_added += 1
                kg['edges'].append(rel_dict)
            else:
                # at least one endpoint is missing: set the edge aside
                kg_orphan_edges_new.append(rel_dict)
        kg_orphan_edges['edges'] += kg_orphan_edges_new
        kg2_util.log_message("number edges added: " + str(ctr_edges_added),
                             ontology_name=kg_edges_file_name,
                             output_stream=sys.stderr)
        # The count logged here is the running total over all edge files
        # processed so far, not just the current file.
        kg2_util.log_message("number of orphan edges: " +
                             str(len(kg_orphan_edges['edges'])),
                             ontology_name=kg_edges_file_name,
                             output_stream=sys.stderr)
    kg2_util.save_json(kg, output_file_name, test_mode)
    # The orphan edges are saved only when an output path was supplied.
    kg_file_orphan_edges = args.kgFileOrphanEdges
    if kg_file_orphan_edges is not None:
        kg2_util.save_json(kg_orphan_edges, kg_file_orphan_edges, test_mode)
Beispiel #5
0
    arg_parser.add_argument('--outputFile', type=str, nargs='?', default=None)
    arg_parser.add_argument('kgFiles', type=str, nargs='+')
    return arg_parser


# Script entry point (excerpt): read the nodes of every input knowledge-graph
# file into a single dictionary keyed by node ID. The remainder of the script
# is not part of this excerpt.
if __name__ == '__main__':
    args = make_arg_parser().parse_args()
    kg_file_names = args.kgFiles
    test_mode = args.test
    output_file_name = args.outputFile
    # NOTE(review): kg_orphan_edges, rels, ctr_edges_added and edges are
    # initialised here but not used within the visible excerpt.
    kg_orphan_edges = {'nodes': [], 'edges': []}
    nodes = dict()
    rels = dict()
    for kg_file_name in kg_file_names:
        kg2_util.log_message("reading nodes from file",
                             ontology_name=kg_file_name,
                             output_stream=sys.stderr)
        # NOTE(review): the file object passed to json.load is never
        # explicitly closed.
        kg_to_add = json.load(open(kg_file_name, 'r'))
        kg_to_add_nodes = kg_to_add['nodes']
        for node in kg_to_add_nodes:
            node_id = node['id']
            if node_id not in nodes:
                nodes[node_id] = node
            else:
                # a node with this ID was already read: combine the two
                # dictionaries via kg2_util.merge_two_dicts
                nodes[node_id] = kg2_util.merge_two_dicts(nodes[node_id], node)
        # The logged number is the count of nodes read from this file,
        # including nodes that were merged into existing entries.
        kg2_util.log_message("number of nodes added: " +
                             str(len(kg_to_add_nodes)),
                             ontology_name=kg_file_name,
                             output_stream=sys.stderr)
    ctr_edges_added = 0
    edges = []
def get_percent_decrease(num1, num2):
    """Return the fractional decrease from ``num1`` to ``num2``.

    The result is ``(num1 - num2) / num1``, i.e. a fraction rather than a
    percentage: ``0.2`` means ``num2`` is 20% smaller than ``num1``, and a
    negative value means ``num2`` is larger than ``num1``.

    A baseline of zero cannot decrease, so ``0.0`` is returned when ``num1``
    is zero instead of raising ``ZeroDivisionError``.

    :param num1: the baseline (earlier) value
    :param num2: the new (later) value
    :return: the fractional decrease relative to ``num1``
    """
    if num1 == 0:
        return 0.0
    return (num1 - num2) / num1


# Script entry point: compare the per-source edge counts of the previous and
# current build reports and log a warning to stderr for every source whose
# edges disappeared or whose count dropped by more than 20%.
if __name__ == '__main__':
    args = get_args()
    with open(args.previousFile, 'r') as previous_file:
        previous_json = json.load(previous_file)
    with open(args.currentFile, 'r') as current_file:
        current_json = json.load(current_file)
    previous_edge_sources = previous_json['number_of_edges_by_source']
    current_edge_sources = current_json['number_of_edges_by_source']

    for source, edge_count in previous_edge_sources.items():
        current_edge_count = current_edge_sources.get(source, None)
        if current_edge_count is None or current_edge_count == 0:
            # The messages are assembled from adjacent literals so that their
            # text does not depend on source indentation and the source name
            # is reported exactly as it appears in the report.
            message = (f"There are no edges from {source} in this build. "
                       f"There were {edge_count} in the previous build.")
            kg2_util.log_message(message, output_stream=sys.stderr)
            continue
        if get_percent_decrease(edge_count, current_edge_count) > 0.2:
            message = (f"There was a significant drop in edges from {source} "
                       f"in this build. The count dropped from {edge_count} "
                       f"to {current_edge_count}")
            kg2_util.log_message(message, output_stream=sys.stderr)
Beispiel #7
0
        # Excerpt of a per-node loop body: normalise one node dictionary
        # (`node_dict`). The enclosing loop and function are not part of this
        # excerpt.
        assert 'uri' in node_dict
        # NOTE(review): this value of `iri` is overwritten by expand(id)
        # below before it is read within this excerpt.
        iri = node_dict['uri']
        del node_dict['uri']
        assert 'id' in node_dict
        # NOTE(review): `id` shadows the builtin of the same name.
        id = node_dict['id']
        curie_prefix = id.split(':')[0]
        # Look up the provider for this CURIE prefix; an unknown prefix is
        # treated as a fatal error.
        provided_by = KG1_PROVIDED_BY_TO_KG2_PROVIDED_BY_CURIE_IDS.get(
            curie_prefix, None)
        if provided_by is None:
            raise Exception("unable to get provider for CURIE prefix: " +
                            curie_prefix)
        iri = expand(id)
        if iri is None:
            # The failure is only logged; processing continues with
            # iri set to None.
            kg2_util.log_message(
                message='Invalid CURIE ID that cannot be expanded to an IRI',
                ontology_name=kg2_util.CURIE_PREFIX_RTX_KG1 + ':' + ';' +
                provided_by,
                node_curie_id=id,
                output_stream=sys.stderr)
        # A 'symbol' entry, if present, is moved from the node dictionary
        # into the synonym list.
        symbol = node_dict.get('symbol', None)
        synonym_list = []
        if symbol is not None:
            synonym_list.append(symbol)
            del node_dict['symbol']

        # Category labels use spaces instead of underscores; two labels are
        # remapped to Biolink category constants from kg2_util.
        category_label = node_dict['category'].replace('_', ' ')
        if category_label == 'molecular function':
            category_label = kg2_util.BIOLINK_CATEGORY_MOLECULAR_ACTIVITY
        elif category_label == kg2_util.BIOLINK_CATEGORY_MICRORNA and id.startswith(
                kg2_util.CURIE_PREFIX_NCBI_GENE + ':'):
            # a microRNA node identified by an NCBI Gene CURIE is recategorised
            # as a gene, and the biotype is recorded as a synonym
            category_label = kg2_util.BIOLINK_CATEGORY_GENE
            synonym_list.append('Biotype:microRNA')
Beispiel #8
0
def get_rels_dict(nodes: dict, owl_file_information_dict_list: list,
                  uri_to_curie_shortener: callable,
                  map_of_node_ontology_ids_to_curie_ids: dict):
    """Build the dictionary of edges for a list of loaded ontologies.

    For every ontology, each edge of the ontology graph is turned into an
    edge made by ``kg2_util.make_edge`` and stored under the key returned by
    ``make_rel_key``; the first edge created for a key wins. Edges are
    skipped when they touch ``OWL_BASE_CLASS`` or an ID starting with
    ``MYSTERIOUS_BASE_NODE_ID_TO_FILTER``, when either endpoint has no CURIE
    in the map, when they are ``subClassOf`` edges between two ``TUI:``
    nodes, when the predicate cannot be given a CURIE, or when they are
    ``xref`` self-loops. In addition, an ``xref`` edge is created from every
    node to each of its xrefs that is itself a node.

    Side effect: a ``UMLS:hasSTY`` edge is not stored as an edge; instead the
    subject node's ``description`` in ``nodes`` is extended with the object
    node's ID.

    :param nodes: node dictionaries keyed by node CURIE ID; entries are read
        for ``name``, ``description``, ``id``, ``xrefs``, ``provided by`` and
        ``update date``
    :param owl_file_information_dict_list: one dict per ontology, read for
        the keys ``ontology`` and ``id``
    :param uri_to_curie_shortener: callable mapping an IRI to a CURIE; a
        ``None`` result is treated as "no CURIE"
    :param map_of_node_ontology_ids_to_curie_ids: maps ontology node IDs
        (and ontology IDs) to CURIE IDs
    :return: dict mapping relationship keys to edges
    :raises AssertionError: if an edge's data is not a dict or no predicate
        label can be determined
    """
    rels_dict = dict()

    for owl_file_information_dict in owl_file_information_dict_list:
        ontology = owl_file_information_dict['ontology']
        ontology_id = owl_file_information_dict['id']
        ont_graph = ontology.get_graph()
        ontology_curie_id = map_of_node_ontology_ids_to_curie_ids[ontology_id]

        # Resolve the ontology's update date once per ontology. Previously it
        # was assigned inside the edge loop only when the ontology node
        # existed, so it could be unbound (or hold the previous ontology's
        # date) when used below; it now defaults to None for this ontology.
        ontology_update_date = None
        ontology_node = nodes.get(ontology_curie_id, None)
        if ontology_node is not None:
            ontology_update_date = ontology_node['update date']

        # each graph edge tuple is unpacked as (object, subject, data)
        for (object_id, subject_id,
             predicate_dict) in ont_graph.edges(data=True):
            assert isinstance(predicate_dict, dict)

            if subject_id == OWL_BASE_CLASS or object_id == OWL_BASE_CLASS:
                continue

            if subject_id.startswith(MYSTERIOUS_BASE_NODE_ID_TO_FILTER) or \
               object_id.startswith(MYSTERIOUS_BASE_NODE_ID_TO_FILTER):
                continue

            # subject_id and object_id are IDs from the original ontology objects; these may not
            # always be the node curie IDs (e.g., for SNOMED terms). Need to map them
            subject_curie_id = map_of_node_ontology_ids_to_curie_ids.get(
                subject_id, None)
            if subject_curie_id is None:
                kg2_util.log_message(
                    message="ontology node ID has no curie ID in the map",
                    ontology_name=ontology.id,
                    node_curie_id=subject_id,
                    output_stream=sys.stderr)
                continue
            object_curie_id = map_of_node_ontology_ids_to_curie_ids.get(
                object_id, None)
            if object_curie_id is None:
                kg2_util.log_message(
                    message="ontology node ID has no curie ID in the map",
                    ontology_name=ontology.id,
                    node_curie_id=object_id,
                    output_stream=sys.stderr)
                continue

            predicate_label = None
            edge_pred_string = predicate_dict['pred']

            if subject_curie_id.startswith(
                    'TUI:') and object_curie_id.startswith(
                        'TUI:') and edge_pred_string == 'subClassOf':
                continue

            if not edge_pred_string.startswith(
                    'http:') and not edge_pred_string.startswith('https'):
                # edge_pred_string is not a URI; this is the most common case
                if ':' not in edge_pred_string:
                    # edge_pred_string is not a CURIE; this is the most common subcase
                    if edge_pred_string != 'subClassOf':
                        predicate_curie = 'owl:' + edge_pred_string
                    else:
                        predicate_curie = 'rdfs:subClassOf'
                    predicate_label = kg2_util.convert_camel_case_to_snake_case(
                        edge_pred_string)
                else:
                    # edge_pred_string is a CURIE
                    predicate_curie = edge_pred_string
                    predicate_node = nodes.get(predicate_curie, None)
                    if predicate_node is not None:
                        predicate_label = predicate_node['name']
                    else:
                        # predicate has no node object defined; just pull the label out of the CURIE
                        if edge_pred_string.startswith('OBO:'):
                            test_curie = edge_pred_string.replace('OBO:',
                                                                  '').replace(
                                                                      '_', ':')
                            predicate_node = nodes.get(test_curie, None)
                            if predicate_node is None:
                                predicate_label = edge_pred_string.split(
                                    ':')[1].split('#')[-1]
                            else:
                                # the label is left unset here and is filled
                                # in from the predicate node further below
                                predicate_curie = test_curie
                        else:
                            predicate_label = edge_pred_string
                predicate_iri = prefixcommons.expand_uri(predicate_curie)
                predicate_curie_new = uri_to_curie_shortener(predicate_iri)
                if predicate_curie_new is not None:
                    predicate_curie = predicate_curie_new
            else:
                # edge_pred_string is a URI
                predicate_iri = edge_pred_string
                predicate_curie = uri_to_curie_shortener(predicate_iri)

            if predicate_curie is None:
                kg2_util.log_message(message="predicate IRI has no CURIE: " +
                                     predicate_iri,
                                     ontology_name=ontology.id,
                                     output_stream=sys.stderr)
                continue

            if subject_curie_id == object_curie_id and predicate_label == 'xref':
                continue

            if predicate_curie == 'UMLS:hasSTY':
                # record the semantic type in the subject node's description
                # instead of creating an edge
                subject_node = nodes[subject_curie_id]
                object_node = nodes[object_curie_id]
                subject_description = subject_node['description']
                if subject_description is None:
                    subject_description = ''
                subject_node['description'] = '; '.join(
                    list(
                        filter(None, [
                            subject_description,
                            'UMLS Semantic Type: ' + object_node['id']
                        ])))
                continue

            rel_key = make_rel_key(subject_curie_id, predicate_curie,
                                   object_curie_id, ontology_curie_id)

            if predicate_label is None and ':' in predicate_curie:
                # fall back to the name of the predicate's own node, with
                # its first letter lower-cased
                pred_node = nodes.get(predicate_curie, None)
                if pred_node is not None:
                    predicate_label = pred_node['name']
                    if predicate_label[0].isupper():
                        predicate_label = predicate_label[0].lower(
                        ) + predicate_label[1:]

            assert predicate_label is not None
            predicate_label = predicate_label.replace(' ', '_')
            # Only tested on Food and Efo ontologies
            predicate_label = kg2_util.convert_camel_case_to_snake_case(
                predicate_label)
            if rels_dict.get(rel_key, None) is None:
                edge = kg2_util.make_edge(subject_curie_id, object_curie_id,
                                          predicate_iri, predicate_curie,
                                          predicate_label, ontology_id,
                                          ontology_update_date)
                rels_dict[rel_key] = edge

        # Add xref edges between nodes. This pass runs once per ontology over
        # all nodes; an xref edge is created only the first time its key is
        # seen, so it carries the update date of the ontology being processed
        # at that time.
        for node_id, node_dict in nodes.items():
            xrefs = node_dict['xrefs']
            if xrefs is not None:
                for xref_node_id in xrefs:
                    if xref_node_id in nodes and node_id != xref_node_id:
                        provided_by = nodes[node_id]['provided by']
                        key = make_rel_key(node_id, CURIE_OBO_XREF,
                                           xref_node_id, provided_by)
                        if rels_dict.get(key, None) is None:
                            edge = kg2_util.make_edge(node_id, xref_node_id,
                                                      IRI_OBO_XREF,
                                                      CURIE_OBO_XREF, 'xref',
                                                      provided_by,
                                                      ontology_update_date)
                            rels_dict[key] = edge

    return rels_dict
Beispiel #9
0
def make_nodes_dict_from_ontologies_list(
        ontology_info_list: list, curies_to_categories: dict,
        uri_to_curie_shortener: callable,
        category_label_to_iri_mapper: callable):
    ret_dict = dict()
    ontologies_iris_to_curies = dict()

    for ontology_info_dict in ontology_info_list:
        ontology = ontology_info_dict['ontology']
        iri_of_ontology = ontology_info_dict['id']
        assert iri_of_ontology is not None

        ontology_curie_id = uri_to_curie_shortener(iri_of_ontology)
        if ontology_curie_id is None or len(ontology_curie_id) == 0:
            ontology_curie_id = iri_of_ontology
        umls_sver = ontology_info_dict.get('umls-sver', None)
        updated_date = None
        if umls_sver is not None:
            # if you can, parse sver string into a date string
            updated_date = parse_umls_sver_date(umls_sver)
        if updated_date is None:
            updated_date = ontology_info_dict['file last modified timestamp']

        ontology_node = kg2_util.make_node(ontology_curie_id, iri_of_ontology,
                                           ontology_info_dict['title'],
                                           'data source', updated_date,
                                           iri_of_ontology)
        ontology_node['description'] = ontology_info_dict['description']
        ontology_node['ontology node ids'] = [iri_of_ontology]
        ontology_node['xrefs'] = []
        ret_dict[ontology_curie_id] = ontology_node

        ontologies_iris_to_curies[iri_of_ontology] = ontology_curie_id

        for ontology_node_id in ontology.nodes():
            onto_node_dict = ontology.node(ontology_node_id)
            assert onto_node_dict is not None

            if ontology_node_id.startswith(MYSTERIOUS_BASE_NODE_ID_TO_FILTER):
                continue

            node_curie_id = get_node_curie_id_from_ontology_node_id(
                ontology_node_id, ontology, uri_to_curie_shortener)
            assert not node_curie_id.startswith('UMLS:C')  # :DEBUG:

            iri = onto_node_dict.get('id', None)
            if iri is None:
                iri = ontology_node_id

            # Ensure all CUI nodes use a 'umls/cui' IRI (part of fix for #565)
            if is_cui_id(node_curie_id):
                iri = CUI_BASE_IRI + '/' + get_local_id_from_curie_id(
                    node_curie_id)

            if not iri.startswith('http:') and not iri.startswith('https:'):
                iri = prefixcommons.expand_uri(iri)

            if node_curie_id.startswith(
                    'NCBIGene:') or node_curie_id.startswith('HGNC:'):
                iri = prefixcommons.expand_uri(node_curie_id)

            generated_iri = prefixcommons.expand_uri(node_curie_id)
            if generated_iri != node_curie_id:
                if (generated_iri.startswith('http:') or generated_iri.startswith('https:')) and \
                   generated_iri != iri:
                    iri = generated_iri

            node_name = onto_node_dict.get('label', None)
            node_full_name = None

            [node_category_label, ontology_id_of_node_with_category
             ] = get_biolink_category_for_node(ontology_node_id, node_curie_id,
                                               ontology, curies_to_categories,
                                               uri_to_curie_shortener, set(),
                                               False)

            node_deprecated = False
            node_description = None
            node_creation_date = None
            node_update_date = None
            node_replaced_by_curie = None
            node_full_name = None
            node_publications = set()
            node_synonyms = set()
            node_xrefs = set()
            node_tui = None
            node_has_cui = False
            node_tui_category_label = None

            node_meta = onto_node_dict.get('meta', None)
            if node_meta is not None:
                node_deprecated = node_meta.get('deprecated', False)
                node_definition = node_meta.get('definition', None)
                if node_definition is not None:
                    node_description = node_definition['val']
                    if node_description.startswith(
                            'OBSOLETE:') or node_description.startswith(
                                'Obsolete.'):
                        continue

                    node_definition_xrefs = node_definition.get('xrefs', None)
                    if node_definition_xrefs is not None:
                        assert type(node_definition_xrefs) == list
                        for xref in node_definition_xrefs:
                            xref_pub = xref_as_a_publication(xref)
                            if xref_pub is not None:
                                node_publications.add(xref_pub)

                node_synonyms_list = node_meta.get('synonyms', None)
                if node_synonyms_list is not None:
                    for syn_dict in node_synonyms_list:
                        syn_pred = syn_dict['pred']
                        if syn_pred == 'hasExactSynonym':
                            node_synonyms.add(syn_dict['val'])
                            syn_xrefs = syn_dict['xrefs']
                            if len(syn_xrefs) > 0:
                                for syn_xref in syn_xrefs:
                                    syn_xref_pub = xref_as_a_publication(
                                        syn_xref)
                                    if syn_xref_pub is not None:
                                        node_publications.add(syn_xref_pub)

                node_xrefs_list = node_meta.get('xrefs', None)
                if node_xrefs_list is not None:
                    for xref_dict in node_xrefs_list:
                        xref_curie = xref_dict['val']
                        if xref_curie.startswith('MESH:'):
                            xref_curie = xref_curie.replace('MESH:', 'MSH:')
                        elif xref_curie.startswith('UMLS:C'):
                            xref_curie = CUI_PREFIX + ':' + xref_curie.split(
                                'UMLS:')[1]
                        node_xrefs.add(xref_curie)
                basic_property_values = node_meta.get('basicPropertyValues',
                                                      None)
                if basic_property_values is not None:
                    node_tui_list = []
                    for basic_property_value_dict in basic_property_values:
                        bpv_pred = basic_property_value_dict['pred']
                        bpv_pred_curie = uri_to_curie_shortener(bpv_pred)
                        if bpv_pred_curie is None:
                            bpv_pred_curie = bpv_pred
                        bpv_val = basic_property_value_dict['val']
                        if bpv_pred_curie in [
                                'OIO:creation_date', 'dcterms:issued',
                                'HGNC:DATE_CREATED'
                        ]:
                            node_creation_date = bpv_val
                        elif bpv_pred_curie == 'HGNC:DATE_LAST_MODIFIED':
                            node_update_date = bpv_val
                        elif bpv_pred_curie == 'IAL:0100001':
                            assert node_deprecated
                            node_replaced_by_uri = bpv_val
                            node_replaced_by_curie = uri_to_curie_shortener(
                                node_replaced_by_uri)
                        elif bpv_pred_curie == 'UMLS:STY':  # STY_BASE_IRI:
                            node_tui_list.append(bpv_val)
                        elif bpv_pred_curie == 'skos:prefLabel':
                            if not node_curie_id.startswith('HGNC:'):
                                node_name = bpv_val
                            else:
                                node_full_name = bpv_val
                                if node_name is None:
                                    node_name = node_full_name
                        elif bpv_pred_curie == 'skos:altLabel':
                            node_synonyms.add(bpv_val)
                        elif bpv_pred_curie == 'skos:definition':
                            node_description = kg2_util.strip_html(bpv_val)
                        elif bpv_pred_curie == 'HGNC:GENESYMBOL':
                            node_name = bpv_val
                            node_synonyms.add(bpv_val)
                        elif bpv_pred_curie == 'UMLS:cui':
                            node_has_cui = True
                    if len(node_tui_list) == 1:
                        node_tui = node_tui_list[0]
                        node_tui_uri = posixpath.join(
                            'https://identifiers.org/umls/STY', node_tui)
                        node_tui_curie = uri_to_curie_shortener(node_tui_uri)
                        assert node_tui_curie is not None
                        [node_tui_category_label,
                         _] = get_biolink_category_for_node(
                             node_tui_uri, node_tui_curie, ontology,
                             curies_to_categories, uri_to_curie_shortener,
                             set(), True)

                node_comments = node_meta.get('comments', None)
                if node_comments is not None:
                    comments_str = 'COMMENTS: ' + (' // '.join(node_comments))
                    if node_description is not None:
                        node_description += ' // ' + comments_str
                    else:
                        node_description = comments_str

            if node_category_label is None:
                node_type = onto_node_dict.get('type', None)
                if node_type is not None and node_type == 'PROPERTY':
                    node_category_label = 'property'

            if node_category_label is None:
                if not node_deprecated:
                    kg2_util.log_message("Node does not have a category",
                                         ontology.id,
                                         node_curie_id,
                                         output_stream=sys.stderr)
                    node_category_label = 'unknown category'
                else:
                    node_category_label = 'deprecated node'

            if node_has_cui:
                assert node_tui is not None or len(node_tui_list) > 0

                if node_tui_category_label is None:
                    node_tui_category_label = 'unknown category'
                    if node_tui is not None:
                        kg2_util.log_message(
                            message='Node ' + ontology_node_id +
                            ' has CUI whose TUI cannot be mapped to category: '
                            + node_tui)
                    else:
                        kg2_util.log_message(
                            message='Node ' + ontology_node_id +
                            ' has CUI with multiple associated TUIs: ' +
                            ', '.join(node_tui_list))
                else:
                    if node_category_label is None:
                        node_category_label = node_tui_category_label  # override the node category label if we have a TUI
                node_tui_category_iri = category_label_to_iri_mapper(
                    node_tui_category_label)
            ontology_curie_id = ontologies_iris_to_curies[iri_of_ontology]
            source_ontology_information = ret_dict.get(ontology_curie_id, None)
            if source_ontology_information is None:
                kg2_util.log_message(
                    message=
                    "ontology IRI has no information dictionary available",
                    ontology_name=iri_of_ontology,
                    output_stream=sys.stderr)
                assert False
            source_ontology_update_date = source_ontology_information[
                'update date']
            if node_update_date is None:
                node_update_date = source_ontology_update_date

            if node_description is not None:
                node_description_xrefs_match = REGEX_XREF_END_DESCRIP.match(
                    node_description)
                if node_description_xrefs_match is not None:
                    node_description_xrefs_str = node_description_xrefs_match[
                        1]
                    node_description_xrefs_list = node_description_xrefs_str.split(
                        ',')
                    for node_description_xref_str in node_description_xrefs_list:
                        node_description_xref_str = node_description_xref_str.strip(
                        )
                        if ':' in node_description_xref_str:
                            node_xrefs.add(node_description_xref_str)
                node_description_pubs = REGEX_PUBLICATIONS.findall(
                    node_description)
                for pub_curie in node_description_pubs:
                    node_publications.add(pub_curie)

            # deal with node names that are ALLCAPS
            if node_name is not None and node_name.isupper():
                node_name = kg2_util.allcaps_to_only_first_letter_capitalized(
                    node_name)

            node_dict = kg2_util.make_node(node_curie_id, iri, node_name,
                                           node_category_label,
                                           node_update_date, iri_of_ontology)
            node_dict['full name'] = node_full_name
            node_dict['description'] = node_description
            node_dict[
                'creation date'] = node_creation_date  # slot name is not biolink standard
            node_dict[
                'deprecated'] = node_deprecated  # slot name is not biolink standard
            node_dict[
                'replaced by'] = node_replaced_by_curie  # slot name is not biolink standard
            node_dict['ontology node ids'] = [
                ontology_node_id
            ]  # slot name is not biolink standard
            node_dict['xrefs'] = list(
                node_xrefs)  # slot name is not biolink standard
            node_dict['synonym'] = list(
                node_synonyms)  # slot name is not biolink standard
            node_dict['publications'] = list(node_publications)

            # check if we need to make a CUI node
            if node_meta is not None and basic_property_values is not None:
                for basic_property_value_dict in basic_property_values:
                    bpv_pred = basic_property_value_dict['pred']
                    bpv_pred_curie = uri_to_curie_shortener(bpv_pred)
                    bpv_val = basic_property_value_dict['val']
                    if bpv_pred_curie == 'UMLS:cui':  # CUI_BASE_IRI:
                        cui_node_dict = dict(node_dict)
                        cui_uri = bpv_pred + '/' + bpv_val
                        cui_curie = uri_to_curie_shortener(cui_uri)
                        assert cui_curie is not None
                        assert not cui_curie.startswith('UMLS:C')  # :DEBUG:
                        # Skip this CUI if it's identical to the ontology node itself (happens with files created
                        # using 'load_on_cuis' - part of fix for issue #565)
                        if get_local_id_from_curie_id(
                                cui_curie) == get_local_id_from_curie_id(
                                    node_curie_id):
                            continue
                        cui_node_dict['id'] = cui_curie
                        cui_node_dict['iri'] = cui_uri
                        cui_node_dict['synonym'] = []
                        cui_node_dict['category'] = node_tui_category_iri
                        cui_node_dict[
                            'category label'] = node_tui_category_label.replace(
                                ' ', '_')
                        cui_node_dict['ontology node ids'] = []
                        cui_node_dict['provided by'] = CUI_BASE_IRI
                        cui_node_dict['xrefs'] = [
                        ]  # blanking the "xrefs" here is *vital* in order to avoid issue #395
                        cui_node_dict_existing = ret_dict.get(cui_curie, None)
                        if cui_node_dict_existing is not None:
                            cui_node_dict = kg2_util.merge_two_dicts(
                                cui_node_dict, cui_node_dict_existing)
                        ret_dict[cui_curie] = cui_node_dict
                        node_dict_xrefs = node_dict['xrefs']
                        node_dict_xrefs.append(cui_curie)
                        node_dict['xrefs'] = list(set(node_dict_xrefs))
                    elif bpv_pred_curie == 'HGNC:ENTREZGENE_ID':
                        entrez_gene_id = bpv_val
                        entrez_node_dict = dict(node_dict)
                        entrez_curie = 'NCBIGene:' + entrez_gene_id
                        entrez_node_dict['id'] = entrez_curie
                        entrez_node_dict[
                            'iri'] = 'https://identifiers.org/NCBIGene/' + entrez_gene_id
                        ret_dict[entrez_curie] = entrez_node_dict
                        node_dict_xrefs = node_dict['xrefs']
                        node_dict_xrefs.append(entrez_curie)
                        node_dict['xrefs'] = list(set(node_dict_xrefs))
            if node_curie_id in ret_dict:
                node_dict = kg2_util.merge_two_dicts(ret_dict[node_curie_id],
                                                     node_dict)
            ret_dict[node_curie_id] = node_dict
    return ret_dict
Beispiel #10
0
def get_biolink_category_for_node(ontology_node_id: str, node_curie_id: str,
                                  ontology: ontobio.ontol.Ontology,
                                  curies_to_categories: dict,
                                  uri_to_curie_shortener: callable,
                                  ontology_node_ids_previously_seen: set,
                                  get_node_id_of_node_with_category: bool):
    """Find a category for an ontology node, searching its ancestors if needed.

    The lookup order is: (1) the node's CURIE prefix in
    ``curies_to_categories['prefix-mappings']``; (2) the node's full CURIE in
    ``curies_to_categories['term-mappings']``; (3) a recursive search of the
    node's ``subClassOf`` parents, stopping at the first parent that yields a
    category; (4) for CURIEs with the Ensembl prefix, a lookup keyed on the
    first group captured by ``REGEX_ENSEMBL``.

    Args:
        ontology_node_id: identifier of the node as used by ``ontology``.
        node_curie_id: CURIE of the node; ``None`` short-circuits the search.
        ontology: ontology object queried for ``subClassOf`` parents.
        curies_to_categories: dict with 'prefix-mappings' and 'term-mappings'
            sub-dictionaries.
        uri_to_curie_shortener: callable used to obtain the CURIE of each
            parent node.
        ontology_node_ids_previously_seen: set of node ids already visited;
            it is mutated in place (the current node is added), and a node
            that is already in it is not searched again.
        get_node_id_of_node_with_category: if true, the second element of the
            result carries an ontology node id (this node's id, or the id
            reported by the recursive search of a parent when that is not
            ``None``); otherwise the second element is ``None``.

    Returns:
        A two-element list ``[category, ontology_node_id_or_None]``. Returns
        ``[None, None]`` when the node was already visited, is ``OWL_NOTHING``,
        or has no CURIE.

    Raises:
        AssertionError: if Python's recursion limit is hit while walking the
            parent hierarchy (chained from the underlying ``RecursionError``).
    """

    # if we have already looked for a category for this node, return None
    if ontology_node_id in ontology_node_ids_previously_seen:
        return [None, None]

    if ontology_node_id == OWL_NOTHING or node_curie_id is None:
        return [None, None]

    ontology_node_ids_previously_seen.add(ontology_node_id)

    curie_prefix = get_prefix_from_curie_id(node_curie_id)

    # Inelegant hack to ensure that TUI: nodes get mapped to "semantic type" while still enabling us
    # to use get_biolink_category_for_node to determine the specific semantic type of a CUI: based on its
    # TUI: record. Need to think about a more elegant way to do this. [SAR]
    if curie_prefix == 'TUI' and ontology.id.endswith('/umls/STY/'):
        return ['semantic type', None]

    if get_node_id_of_node_with_category:
        ret_ontology_node_id_of_node_with_category = ontology_node_id
    else:
        ret_ontology_node_id_of_node_with_category = None

    curies_to_categories_prefixes = curies_to_categories['prefix-mappings']
    ret_category = curies_to_categories_prefixes.get(curie_prefix, None)
    if ret_category is None:
        # need to walk the ontology hierarchy until we encounter a parent term with a defined biolink category
        curies_to_categories_terms = curies_to_categories['term-mappings']
        ret_category = curies_to_categories_terms.get(node_curie_id, None)
        if ret_category is None:
            for parent_ontology_node_id in ontology.parents(
                    ontology_node_id, ['subClassOf']):
                parent_node_curie_id = get_node_curie_id_from_ontology_node_id(
                    parent_ontology_node_id, ontology, uri_to_curie_shortener)
                try:
                    [ret_category, ontology_node_id_of_node_with_category
                     ] = get_biolink_category_for_node(
                         parent_ontology_node_id, parent_node_curie_id,
                         ontology, curies_to_categories,
                         uri_to_curie_shortener,
                         ontology_node_ids_previously_seen,
                         get_node_id_of_node_with_category)
                except RecursionError as recursion_error:
                    kg2_util.log_message(message="recursion error: " +
                                         ontology_node_id,
                                         ontology_name=ontology.id,
                                         node_curie_id=node_curie_id,
                                         output_stream=sys.stderr)
                    # Raise explicitly rather than `assert False`: asserts are
                    # stripped under `python -O`, which would silently swallow
                    # the error and continue with the next parent. The
                    # exception type stays AssertionError so existing callers
                    # see the same failure as before.
                    raise AssertionError("recursion error: " +
                                         ontology_node_id) from recursion_error
                if get_node_id_of_node_with_category and ontology_node_id_of_node_with_category is not None:
                    ret_ontology_node_id_of_node_with_category = ontology_node_id_of_node_with_category
                if ret_category is not None:
                    break
    if ret_category is None:
        # last resort: derive the category from the structure of an Ensembl identifier
        if node_curie_id.startswith(kg2_util.CURIE_PREFIX_ENSEMBL + ':'):
            curie_suffix = node_curie_id.replace(
                kg2_util.CURIE_PREFIX_ENSEMBL + ':', '')
            ensembl_match = REGEX_ENSEMBL.match(curie_suffix)
            if ensembl_match is not None:
                ensembl_match_letter = ensembl_match[1]
                ret_category = ENSEMBL_LETTER_TO_CATEGORY.get(
                    ensembl_match_letter, None)
                if ret_category is None:
                    kg2_util.log_message(message="unrecognized Ensembl ID: " +
                                         curie_suffix,
                                         ontology_name=ontology.id,
                                         node_curie_id=node_curie_id,
                                         output_stream=sys.stderr)

    return [ret_category, ret_ontology_node_id_of_node_with_category]
Beispiel #11
0
def make_kg2(curies_to_categories: dict,
             uri_to_curie_shortener: callable,
             map_category_label_to_iri: callable,
             owl_urls_and_files: tuple,
             output_file_name: str,
             test_mode: bool = False):
    """Load every configured ontology source, build nodes and edges, and save them as JSON.

    Args:
        curies_to_categories: passed through to
            ``make_nodes_dict_from_ontologies_list``.
        uri_to_curie_shortener: callable passed through to the node and
            relationship builders.
        map_category_label_to_iri: callable passed through to
            ``make_nodes_dict_from_ontologies_list``.
        owl_urls_and_files: iterable of dicts, each read for the keys
            'download', 'url', 'file' and 'title'.
        output_file_name: destination handed to ``kg2_util.save_json``.
        test_mode: flag handed to ``kg2_util.save_json``.

    The saved dictionary has two keys, 'edges' and 'nodes'; the 'xrefs' and
    'ontology node ids' entries are removed from every node before saving.
    """

    ontology_info_dicts = []

    # walk over each ontology source entry supplied by the caller
    for source_info in owl_urls_and_files:
        if not source_info['download']:
            # the file is expected to be present on the local file system already
            local_file_name = source_info['file']
            assert os.path.exists(local_file_name)
        else:
            # fetch the file (unless a local copy exists) and get its local path
            local_file_name = kg2_util.download_file_if_not_exist_locally(
                source_info['url'], source_info['file'])
        # load the file into an ontology object plus its metadata dictionary
        ont, metadata_dict = load_owl_file_return_ontology_and_metadata(
            local_file_name, source_info['url'], source_info['title'])
        metadata_dict['ontology'] = ont
        ontology_info_dicts.append(metadata_dict)

    kg2_util.log_message('Calling make_nodes_dict_from_ontologies_list')
    nodes_dict = make_nodes_dict_from_ontologies_list(
        ontology_info_dicts, curies_to_categories, uri_to_curie_shortener,
        map_category_label_to_iri)

    kg2_util.log_message('Calling make_map_of_node_ontology_ids_to_curie_ids')
    ontology_id_to_curie_map = make_map_of_node_ontology_ids_to_curie_ids(
        nodes_dict)

    kg2_util.log_message('Calling get_rels_dict')
    # get a dictionary of all relationships including xrefs as relationships
    all_rels_dict = get_rels_dict(nodes_dict, ontology_info_dicts,
                                  uri_to_curie_shortener,
                                  ontology_id_to_curie_map)

    edges = list(all_rels_dict.values())
    kg2_util.log_message('Number of edges: ' + str(len(edges)))
    nodes = list(nodes_dict.values())
    kg2_util.log_message('Number of nodes: ' + str(len(nodes)))
    del nodes_dict

    # strip the bookkeeping fields from every node before the graph is saved
    for node in nodes:
        del node['xrefs']
        del node['ontology node ids']

    kg2_dict = {'edges': edges, 'nodes': nodes}

    kg2_util.log_message('Saving JSON file')
    kg2_util.save_json(kg2_dict, output_file_name, test_mode)