Beispiel #1
0
        node_dict['publications'] = []
        node_dict['update_date'] = None
        node_dict['creation_date'] = None
        node_dict['deprecated'] = False
        node_dict['replaced_by'] = None
        node_dict['provided_by'] = provided_by
#    pprint.pprint(nodes_list)
    query_statement = "MATCH (n)-[r]->(m) RETURN n.id, r, m.id"
    if test_mode:
        query_statement += " limit 10000"
    edges_result = query_neo4j(neo4j_auth, neo4j_endpoint_uri, query_statement,
                               test_mode)
    edges_list = [
        kg2_util.merge_two_dicts(
            {
                'subject': result_item_list[0],
                'object': result_item_list[2]
            }, result_item_list[1]['data'])
        for result_item_list in edges_result
    ]
    for edge_dict in edges_list:
        del edge_dict['is_defined_by']
        del edge_dict['seed_node_uuid']
        del edge_dict['source_node_uuid']
        del edge_dict['target_node_uuid']
        predicate_label = edge_dict['relation']
        edge_dict['edge_label'] = predicate_label
        del edge_dict['relation']
        relation_curie = kg2_util.predicate_label_to_curie(
            predicate_label, KG1_RELATION_CURIE_PREFIX)
        if relation_curie == 'bioLink:subclass_of':
Beispiel #2
0
 output_file_name = args.outputFile
 kg_orphan_edges = {'nodes': [], 'edges': []}
 nodes = dict()
 rels = dict()
 for kg_file_name in kg_file_names:
     kg2_util.log_message("reading nodes from file",
                          ontology_name=kg_file_name,
                          output_stream=sys.stderr)
     kg_to_add = json.load(open(kg_file_name, 'r'))
     kg_to_add_nodes = kg_to_add['nodes']
     for node in kg_to_add_nodes:
         node_id = node['id']
         if node_id not in nodes:
             nodes[node_id] = node
         else:
             nodes[node_id] = kg2_util.merge_two_dicts(nodes[node_id], node)
     kg2_util.log_message("number of nodes added: " +
                          str(len(kg_to_add_nodes)),
                          ontology_name=kg_file_name,
                          output_stream=sys.stderr)
 ctr_edges_added = 0
 edges = []
 last_edges_added = 0
 last_orphan_edges = 0
 edge_keys = set()
 for kg_file_name in kg_file_names:
     kg_orphan_edges_new = []
     kg2_util.log_message("reading edges from file",
                          ontology_name=kg_file_name,
                          output_stream=sys.stderr)
     kg_to_add = json.load(open(kg_file_name, 'r'))
Beispiel #3
0
def make_nodes_dict_from_ontologies_list(
        ontology_info_list: list, curies_to_categories: dict,
        uri_to_curie_shortener: callable,
        category_label_to_iri_mapper: callable) -> dict:
    """Build a dict of knowledge-graph node dicts, keyed by CURIE, from ontologies.

    For every entry of ``ontology_info_list`` a 'data source' node is created
    for the ontology itself, followed by one node per ontology term.  A term
    that carries a ``UMLS:cui`` property additionally produces a separate CUI
    node, and a term that carries an ``HGNC:ENTREZGENE_ID`` property
    additionally produces an ``NCBIGene:`` node.  When a CURIE is produced more
    than once, the dicts are combined with ``kg2_util.merge_two_dicts``.

    Terms whose definition text starts with ``OBSOLETE:`` or ``Obsolete.`` and
    terms whose ID starts with ``MYSTERIOUS_BASE_NODE_ID_TO_FILTER`` are
    skipped entirely.

    Args:
        ontology_info_list: one dict per ontology.  The keys read here are
            'ontology' (an object offering ``nodes()``, ``node(node_id)`` and
            an ``id`` attribute -- presumably an ontobio ontology; confirm
            with the caller), 'id' (the ontology IRI, must not be None),
            'title', 'description', 'file last modified timestamp' and,
            optionally, 'umls-sver'.
        curies_to_categories: forwarded unchanged to
            ``get_biolink_category_for_node``.
        uri_to_curie_shortener: callable taking a URI string and returning a
            CURIE string.  A ``None`` return is tolerated for the ontology IRI
            and for property predicates, but asserted against for TUI and CUI
            URIs.
        category_label_to_iri_mapper: callable taking a category label and
            returning the value stored in a CUI node's 'category' slot.

    Returns:
        dict mapping CURIE ID (str) to node dict.

    Raises:
        AssertionError: on any of the internal consistency checks (missing
            ontology IRI, a term CURIE starting with 'UMLS:C', a replaced-by
            property on a non-deprecated term, an unmappable TUI/CUI URI, ...).
            NOTE(review): these checks are plain ``assert`` statements and are
            therefore disabled when Python runs with ``-O``.
        KeyError: if a required key is missing from an info dict, synonym
            dict, xref dict or property-value dict.
    """
    ret_dict = dict()
    # Maps each ontology's IRI to the CURIE under which its 'data source'
    # node was stored in ret_dict.
    ontologies_iris_to_curies = dict()

    for ontology_info_dict in ontology_info_list:
        ontology = ontology_info_dict['ontology']
        iri_of_ontology = ontology_info_dict['id']
        assert iri_of_ontology is not None

        # Fall back to the raw IRI as the key when no CURIE can be made.
        ontology_curie_id = uri_to_curie_shortener(iri_of_ontology)
        if ontology_curie_id is None or len(ontology_curie_id) == 0:
            ontology_curie_id = iri_of_ontology
        # Update date of the ontology: prefer a date parsed from the
        # 'umls-sver' string, otherwise use the file's modification timestamp.
        umls_sver = ontology_info_dict.get('umls-sver', None)
        updated_date = None
        if umls_sver is not None:
            # if you can, parse sver string into a date string
            updated_date = parse_umls_sver_date(umls_sver)
        if updated_date is None:
            updated_date = ontology_info_dict['file last modified timestamp']

        # presumably make_node's positional arguments are (id, iri, name,
        # category label, update date, provided by) -- verify against kg2_util
        ontology_node = kg2_util.make_node(ontology_curie_id, iri_of_ontology,
                                           ontology_info_dict['title'],
                                           'data source', updated_date,
                                           iri_of_ontology)
        ontology_node['description'] = ontology_info_dict['description']
        ontology_node['ontology node ids'] = [iri_of_ontology]
        ontology_node['xrefs'] = []
        ret_dict[ontology_curie_id] = ontology_node

        ontologies_iris_to_curies[iri_of_ontology] = ontology_curie_id

        for ontology_node_id in ontology.nodes():
            onto_node_dict = ontology.node(ontology_node_id)
            assert onto_node_dict is not None

            if ontology_node_id.startswith(MYSTERIOUS_BASE_NODE_ID_TO_FILTER):
                continue

            node_curie_id = get_node_curie_id_from_ontology_node_id(
                ontology_node_id, ontology, uri_to_curie_shortener)
            assert not node_curie_id.startswith('UMLS:C')  # :DEBUG:

            # ---- Determine the node's IRI ----
            # Start from the term's own 'id', or the ontology node ID if the
            # term has none; the rules below may then replace it.
            iri = onto_node_dict.get('id', None)
            if iri is None:
                iri = ontology_node_id

            # Ensure all CUI nodes use a 'umls/cui' IRI (part of fix for #565)
            if is_cui_id(node_curie_id):
                iri = CUI_BASE_IRI + '/' + get_local_id_from_curie_id(
                    node_curie_id)

            # Anything that is not already an http(s) URL is expanded.
            if not iri.startswith('http:') and not iri.startswith('https:'):
                iri = prefixcommons.expand_uri(iri)

            # For NCBIGene and HGNC the IRI is always derived from the CURIE.
            if node_curie_id.startswith(
                    'NCBIGene:') or node_curie_id.startswith('HGNC:'):
                iri = prefixcommons.expand_uri(node_curie_id)

            # If expanding the CURIE changes it and yields an http(s) URL that
            # differs from the IRI chosen so far, the expanded URL wins.
            generated_iri = prefixcommons.expand_uri(node_curie_id)
            if generated_iri != node_curie_id:
                if (generated_iri.startswith('http:') or generated_iri.startswith('https:')) and \
                   generated_iri != iri:
                    iri = generated_iri

            node_name = onto_node_dict.get('label', None)
            node_full_name = None

            # Only the first element of the returned pair is used below.
            [node_category_label, ontology_id_of_node_with_category
             ] = get_biolink_category_for_node(ontology_node_id, node_curie_id,
                                               ontology, curies_to_categories,
                                               uri_to_curie_shortener, set(),
                                               False)

            # ---- Per-node defaults, overwritten from 'meta' below ----
            node_deprecated = False
            node_description = None
            node_creation_date = None
            node_update_date = None
            node_replaced_by_curie = None
            # NOTE(review): node_full_name was already reset a few lines
            # above; this second reset is redundant.
            node_full_name = None
            node_publications = set()
            node_synonyms = set()
            node_xrefs = set()
            node_tui = None
            node_has_cui = False
            node_tui_category_label = None

            # ---- Harvest attributes from the term's 'meta' dict ----
            node_meta = onto_node_dict.get('meta', None)
            if node_meta is not None:
                node_deprecated = node_meta.get('deprecated', False)
                node_definition = node_meta.get('definition', None)
                if node_definition is not None:
                    node_description = node_definition['val']
                    # A term whose definition is marked obsolete is dropped:
                    # this 'continue' skips to the next ontology node, so no
                    # node dict is created for it.
                    if node_description.startswith(
                            'OBSOLETE:') or node_description.startswith(
                                'Obsolete.'):
                        continue

                    # Definition xrefs that xref_as_a_publication recognizes
                    # (non-None result) are recorded as publications.
                    node_definition_xrefs = node_definition.get('xrefs', None)
                    if node_definition_xrefs is not None:
                        # NOTE(review): exact-type check; a list subclass
                        # would fail this assert (isinstance would not).
                        assert type(node_definition_xrefs) == list
                        for xref in node_definition_xrefs:
                            xref_pub = xref_as_a_publication(xref)
                            if xref_pub is not None:
                                node_publications.add(xref_pub)

                # Only synonyms with predicate 'hasExactSynonym' are kept;
                # their xrefs are also scanned for publications.
                node_synonyms_list = node_meta.get('synonyms', None)
                if node_synonyms_list is not None:
                    for syn_dict in node_synonyms_list:
                        syn_pred = syn_dict['pred']
                        if syn_pred == 'hasExactSynonym':
                            node_synonyms.add(syn_dict['val'])
                            syn_xrefs = syn_dict['xrefs']
                            if len(syn_xrefs) > 0:
                                for syn_xref in syn_xrefs:
                                    syn_xref_pub = xref_as_a_publication(
                                        syn_xref)
                                    if syn_xref_pub is not None:
                                        node_publications.add(syn_xref_pub)

                # Xrefs are collected with two prefix rewrites: 'MESH:' is
                # replaced by 'MSH:', and 'UMLS:C...' is re-prefixed with
                # CUI_PREFIX.
                node_xrefs_list = node_meta.get('xrefs', None)
                if node_xrefs_list is not None:
                    for xref_dict in node_xrefs_list:
                        xref_curie = xref_dict['val']
                        if xref_curie.startswith('MESH:'):
                            xref_curie = xref_curie.replace('MESH:', 'MSH:')
                        elif xref_curie.startswith('UMLS:C'):
                            xref_curie = CUI_PREFIX + ':' + xref_curie.split(
                                'UMLS:')[1]
                        node_xrefs.add(xref_curie)
                # basic_property_values is bound only inside this
                # 'node_meta is not None' branch; the CUI/Entrez pass further
                # down relies on short-circuit evaluation to avoid reading it
                # when node_meta is None.
                basic_property_values = node_meta.get('basicPropertyValues',
                                                      None)
                if basic_property_values is not None:
                    # node_tui_list is likewise bound only here; it is read
                    # later only when node_has_cui is True, which can only be
                    # set inside the loop below.
                    node_tui_list = []
                    for basic_property_value_dict in basic_property_values:
                        bpv_pred = basic_property_value_dict['pred']
                        # Use the raw predicate when it cannot be shortened.
                        bpv_pred_curie = uri_to_curie_shortener(bpv_pred)
                        if bpv_pred_curie is None:
                            bpv_pred_curie = bpv_pred
                        bpv_val = basic_property_value_dict['val']
                        if bpv_pred_curie in [
                                'OIO:creation_date', 'dcterms:issued',
                                'HGNC:DATE_CREATED'
                        ]:
                            node_creation_date = bpv_val
                        elif bpv_pred_curie == 'HGNC:DATE_LAST_MODIFIED':
                            node_update_date = bpv_val
                        # NOTE(review): 'IAL:0100001' may be a typo for
                        # 'IAO:0100001' -- confirm against the CURIE map in
                        # use before changing it.
                        elif bpv_pred_curie == 'IAL:0100001':
                            assert node_deprecated
                            node_replaced_by_uri = bpv_val
                            node_replaced_by_curie = uri_to_curie_shortener(
                                node_replaced_by_uri)
                        elif bpv_pred_curie == 'UMLS:STY':  # STY_BASE_IRI:
                            node_tui_list.append(bpv_val)
                        elif bpv_pred_curie == 'skos:prefLabel':
                            # Non-HGNC nodes: the preferred label becomes the
                            # name.  HGNC nodes: it becomes the full name and
                            # is used as the name only if none is set yet.
                            if not node_curie_id.startswith('HGNC:'):
                                node_name = bpv_val
                            else:
                                node_full_name = bpv_val
                                if node_name is None:
                                    node_name = node_full_name
                        elif bpv_pred_curie == 'skos:altLabel':
                            node_synonyms.add(bpv_val)
                        elif bpv_pred_curie == 'skos:definition':
                            node_description = kg2_util.strip_html(bpv_val)
                        elif bpv_pred_curie == 'HGNC:GENESYMBOL':
                            node_name = bpv_val
                            node_synonyms.add(bpv_val)
                        elif bpv_pred_curie == 'UMLS:cui':
                            node_has_cui = True
                    # A TUI is mapped to a category only when the term has
                    # exactly one; with zero or several, node_tui stays None.
                    if len(node_tui_list) == 1:
                        node_tui = node_tui_list[0]
                        node_tui_uri = posixpath.join(
                            'https://identifiers.org/umls/STY', node_tui)
                        node_tui_curie = uri_to_curie_shortener(node_tui_uri)
                        assert node_tui_curie is not None
                        [node_tui_category_label,
                         _] = get_biolink_category_for_node(
                             node_tui_uri, node_tui_curie, ontology,
                             curies_to_categories, uri_to_curie_shortener,
                             set(), True)

                # Comments are appended to (or become) the description.
                node_comments = node_meta.get('comments', None)
                if node_comments is not None:
                    comments_str = 'COMMENTS: ' + (' // '.join(node_comments))
                    if node_description is not None:
                        node_description += ' // ' + comments_str
                    else:
                        node_description = comments_str

            # ---- Category fallbacks ----
            # Terms of type 'PROPERTY' without a category get 'property'.
            if node_category_label is None:
                node_type = onto_node_dict.get('type', None)
                if node_type is not None and node_type == 'PROPERTY':
                    node_category_label = 'property'

            # After this block node_category_label is never None.
            if node_category_label is None:
                if not node_deprecated:
                    kg2_util.log_message("Node does not have a category",
                                         ontology.id,
                                         node_curie_id,
                                         output_stream=sys.stderr)
                    node_category_label = 'unknown category'
                else:
                    node_category_label = 'deprecated node'

            # ---- Category for the CUI node(s) this term will spawn ----
            if node_has_cui:
                assert node_tui is not None or len(node_tui_list) > 0

                if node_tui_category_label is None:
                    node_tui_category_label = 'unknown category'
                    if node_tui is not None:
                        kg2_util.log_message(
                            message='Node ' + ontology_node_id +
                            ' has CUI whose TUI cannot be mapped to category: '
                            + node_tui)
                    else:
                        kg2_util.log_message(
                            message='Node ' + ontology_node_id +
                            ' has CUI with multiple associated TUIs: ' +
                            ', '.join(node_tui_list))
                else:
                    # NOTE(review): node_category_label was forced to a
                    # non-None value by the fallback block above, so this
                    # condition can never be true and the override described
                    # in the trailing comment never happens -- confirm the
                    # intended behavior.
                    if node_category_label is None:
                        node_category_label = node_tui_category_label  # override the node category label if we have a TUI
                # node_tui_category_iri is bound only when node_has_cui is
                # True; it is read only in the 'UMLS:cui' branch further down.
                node_tui_category_iri = category_label_to_iri_mapper(
                    node_tui_category_label)
            # Look up the ontology's own node to inherit its update date.
            # This lookup returns the CURIE stored for iri_of_ontology before
            # the node loop started, so it is the same on every iteration.
            ontology_curie_id = ontologies_iris_to_curies[iri_of_ontology]
            source_ontology_information = ret_dict.get(ontology_curie_id, None)
            if source_ontology_information is None:
                kg2_util.log_message(
                    message=
                    "ontology IRI has no information dictionary available",
                    ontology_name=iri_of_ontology,
                    output_stream=sys.stderr)
                # NOTE(review): under 'python -O' this assert is removed and
                # the subscript below would raise TypeError on None instead.
                assert False
            source_ontology_update_date = source_ontology_information[
                'update date']
            if node_update_date is None:
                node_update_date = source_ontology_update_date

            # ---- Mine the description text for xrefs and publications ----
            if node_description is not None:
                # Group 1 of REGEX_XREF_END_DESCRIP is split on commas; each
                # stripped piece that contains ':' is added as an xref.
                node_description_xrefs_match = REGEX_XREF_END_DESCRIP.match(
                    node_description)
                if node_description_xrefs_match is not None:
                    node_description_xrefs_str = node_description_xrefs_match[
                        1]
                    node_description_xrefs_list = node_description_xrefs_str.split(
                        ',')
                    for node_description_xref_str in node_description_xrefs_list:
                        node_description_xref_str = node_description_xref_str.strip(
                        )
                        if ':' in node_description_xref_str:
                            node_xrefs.add(node_description_xref_str)
                # Every REGEX_PUBLICATIONS hit is recorded as a publication.
                node_description_pubs = REGEX_PUBLICATIONS.findall(
                    node_description)
                for pub_curie in node_description_pubs:
                    node_publications.add(pub_curie)

            # deal with node names that are ALLCAPS
            if node_name is not None and node_name.isupper():
                node_name = kg2_util.allcaps_to_only_first_letter_capitalized(
                    node_name)

            # ---- Assemble the node dict for this term ----
            node_dict = kg2_util.make_node(node_curie_id, iri, node_name,
                                           node_category_label,
                                           node_update_date, iri_of_ontology)
            node_dict['full name'] = node_full_name
            node_dict['description'] = node_description
            node_dict[
                'creation date'] = node_creation_date  # slot name is not biolink standard
            node_dict[
                'deprecated'] = node_deprecated  # slot name is not biolink standard
            node_dict[
                'replaced by'] = node_replaced_by_curie  # slot name is not biolink standard
            node_dict['ontology node ids'] = [
                ontology_node_id
            ]  # slot name is not biolink standard
            node_dict['xrefs'] = list(
                node_xrefs)  # slot name is not biolink standard
            node_dict['synonym'] = list(
                node_synonyms)  # slot name is not biolink standard
            node_dict['publications'] = list(node_publications)

            # check if we need to make a CUI node
            # (second pass over the property values; it also creates an
            # NCBIGene node for each 'HGNC:ENTREZGENE_ID' property)
            if node_meta is not None and basic_property_values is not None:
                for basic_property_value_dict in basic_property_values:
                    bpv_pred = basic_property_value_dict['pred']
                    # Unlike the first pass, a None result is not replaced by
                    # the raw predicate here; None matches neither branch.
                    bpv_pred_curie = uri_to_curie_shortener(bpv_pred)
                    bpv_val = basic_property_value_dict['val']
                    if bpv_pred_curie == 'UMLS:cui':  # CUI_BASE_IRI:
                        # Shallow copy: list values are shared with node_dict
                        # until reassigned below ('publications' stays shared).
                        cui_node_dict = dict(node_dict)
                        cui_uri = bpv_pred + '/' + bpv_val
                        cui_curie = uri_to_curie_shortener(cui_uri)
                        assert cui_curie is not None
                        assert not cui_curie.startswith('UMLS:C')  # :DEBUG:
                        # Skip this CUI if it's identical to the ontology node itself (happens with files created
                        # using 'load_on_cuis' - part of fix for issue #565)
                        if get_local_id_from_curie_id(
                                cui_curie) == get_local_id_from_curie_id(
                                    node_curie_id):
                            # Moves on to the next property value only; the
                            # term's own node is still stored after the loop.
                            continue
                        cui_node_dict['id'] = cui_curie
                        cui_node_dict['iri'] = cui_uri
                        cui_node_dict['synonym'] = []
                        cui_node_dict['category'] = node_tui_category_iri
                        cui_node_dict[
                            'category label'] = node_tui_category_label.replace(
                                ' ', '_')
                        cui_node_dict['ontology node ids'] = []
                        cui_node_dict['provided by'] = CUI_BASE_IRI
                        cui_node_dict['xrefs'] = [
                        ]  # blanking the "xrefs" here is *vital* in order to avoid issue #395
                        # Combine with a CUI node already created for the
                        # same CUI (new dict is the first argument).
                        cui_node_dict_existing = ret_dict.get(cui_curie, None)
                        if cui_node_dict_existing is not None:
                            cui_node_dict = kg2_util.merge_two_dicts(
                                cui_node_dict, cui_node_dict_existing)
                        ret_dict[cui_curie] = cui_node_dict
                        # Cross-reference the CUI from the term's own node.
                        node_dict_xrefs = node_dict['xrefs']
                        node_dict_xrefs.append(cui_curie)
                        node_dict['xrefs'] = list(set(node_dict_xrefs))
                    elif bpv_pred_curie == 'HGNC:ENTREZGENE_ID':
                        entrez_gene_id = bpv_val
                        # NOTE(review): shallow copy -- entrez_node_dict and
                        # node_dict hold the same 'xrefs', 'synonym',
                        # 'publications' and 'ontology node ids' list objects.
                        entrez_node_dict = dict(node_dict)
                        entrez_curie = 'NCBIGene:' + entrez_gene_id
                        entrez_node_dict['id'] = entrez_curie
                        entrez_node_dict[
                            'iri'] = 'https://identifiers.org/NCBIGene/' + entrez_gene_id
                        # Stored directly; any existing entry under this CURIE
                        # is overwritten rather than merged.
                        ret_dict[entrez_curie] = entrez_node_dict
                        # NOTE(review): this append mutates the list that the
                        # Entrez node still references, so the Entrez node
                        # ends up listing its own CURIE in 'xrefs'; node_dict
                        # only gets a fresh list on the following line.
                        # Confirm whether that self-xref is intended.
                        node_dict_xrefs = node_dict['xrefs']
                        node_dict_xrefs.append(entrez_curie)
                        node_dict['xrefs'] = list(set(node_dict_xrefs))
            # Combine with any node already stored under this CURIE
            # (existing dict is the first argument here).
            if node_curie_id in ret_dict:
                node_dict = kg2_util.merge_two_dicts(ret_dict[node_curie_id],
                                                     node_dict)
            ret_dict[node_curie_id] = node_dict
    return ret_dict