node_dict['publications'] = []
node_dict['update_date'] = None
node_dict['creation_date'] = None
node_dict['deprecated'] = False
node_dict['replaced_by'] = None
node_dict['provided_by'] = provided_by

# pprint.pprint(nodes_list)

query_statement = "MATCH (n)-[r]->(m) RETURN n.id, r, m.id"
if test_mode:
    query_statement += " limit 10000"
edges_result = query_neo4j(neo4j_auth, neo4j_endpoint_uri, query_statement, test_mode)
edges_list = [kg2_util.merge_two_dicts({'subject': result_item_list[0],
                                        'object': result_item_list[2]},
                                       result_item_list[1]['data'])
              for result_item_list in edges_result]
for edge_dict in edges_list:
    del edge_dict['is_defined_by']
    del edge_dict['seed_node_uuid']
    del edge_dict['source_node_uuid']
    del edge_dict['target_node_uuid']
    predicate_label = edge_dict['relation']
    edge_dict['edge_label'] = predicate_label
    del edge_dict['relation']
    relation_curie = kg2_util.predicate_label_to_curie(predicate_label,
                                                       KG1_RELATION_CURIE_PREFIX)
    if relation_curie == 'bioLink:subclass_of':
output_file_name = args.outputFile
kg_orphan_edges = {'nodes': [], 'edges': []}
nodes = dict()
rels = dict()
for kg_file_name in kg_file_names:
    kg2_util.log_message("reading nodes from file",
                         ontology_name=kg_file_name,
                         output_stream=sys.stderr)
    kg_to_add = json.load(open(kg_file_name, 'r'))
    kg_to_add_nodes = kg_to_add['nodes']
    for node in kg_to_add_nodes:
        node_id = node['id']
        if node_id not in nodes:
            nodes[node_id] = node
        else:
            nodes[node_id] = kg2_util.merge_two_dicts(nodes[node_id], node)
    kg2_util.log_message("number of nodes added: " + str(len(kg_to_add_nodes)),
                         ontology_name=kg_file_name,
                         output_stream=sys.stderr)
ctr_edges_added = 0
edges = []
last_edges_added = 0
last_orphan_edges = 0
edge_keys = set()
for kg_file_name in kg_file_names:
    kg_orphan_edges_new = []
    kg2_util.log_message("reading edges from file",
                         ontology_name=kg_file_name,
                         output_stream=sys.stderr)
    kg_to_add = json.load(open(kg_file_name, 'r'))
def make_nodes_dict_from_ontologies_list(ontology_info_list: list,
                                         curies_to_categories: dict,
                                         uri_to_curie_shortener: callable,
                                         category_label_to_iri_mapper: callable):
    ret_dict = dict()
    ontologies_iris_to_curies = dict()
    for ontology_info_dict in ontology_info_list:
        ontology = ontology_info_dict['ontology']
        iri_of_ontology = ontology_info_dict['id']
        assert iri_of_ontology is not None
        ontology_curie_id = uri_to_curie_shortener(iri_of_ontology)
        if ontology_curie_id is None or len(ontology_curie_id) == 0:
            ontology_curie_id = iri_of_ontology
        umls_sver = ontology_info_dict.get('umls-sver', None)
        updated_date = None
        if umls_sver is not None:
            # if you can, parse sver string into a date string
            updated_date = parse_umls_sver_date(umls_sver)
        if updated_date is None:
            updated_date = ontology_info_dict['file last modified timestamp']
        ontology_node = kg2_util.make_node(ontology_curie_id,
                                           iri_of_ontology,
                                           ontology_info_dict['title'],
                                           'data source',
                                           updated_date,
                                           iri_of_ontology)
        ontology_node['description'] = ontology_info_dict['description']
        ontology_node['ontology node ids'] = [iri_of_ontology]
        ontology_node['xrefs'] = []
        ret_dict[ontology_curie_id] = ontology_node

        ontologies_iris_to_curies[iri_of_ontology] = ontology_curie_id

        for ontology_node_id in ontology.nodes():
            onto_node_dict = ontology.node(ontology_node_id)
            assert onto_node_dict is not None

            if ontology_node_id.startswith(MYSTERIOUS_BASE_NODE_ID_TO_FILTER):
                continue

            node_curie_id = get_node_curie_id_from_ontology_node_id(ontology_node_id,
                                                                    ontology,
                                                                    uri_to_curie_shortener)
            assert not node_curie_id.startswith('UMLS:C')  # :DEBUG:
            iri = onto_node_dict.get('id', None)
            if iri is None:
                iri = ontology_node_id

            # Ensure all CUI nodes use a 'umls/cui' IRI (part of fix for #565)
            if is_cui_id(node_curie_id):
                iri = CUI_BASE_IRI + '/' + get_local_id_from_curie_id(node_curie_id)

            if not iri.startswith('http:') and not iri.startswith('https:'):
                iri = prefixcommons.expand_uri(iri)
            if node_curie_id.startswith('NCBIGene:') or node_curie_id.startswith('HGNC:'):
                iri = prefixcommons.expand_uri(node_curie_id)
            generated_iri = prefixcommons.expand_uri(node_curie_id)
            if generated_iri != node_curie_id:
                if (generated_iri.startswith('http:') or generated_iri.startswith('https:')) and \
                   generated_iri != iri:
                    iri = generated_iri

            node_name = onto_node_dict.get('label', None)
            node_full_name = None

            [node_category_label,
             ontology_id_of_node_with_category] = get_biolink_category_for_node(ontology_node_id,
                                                                                node_curie_id,
                                                                                ontology,
                                                                                curies_to_categories,
                                                                                uri_to_curie_shortener,
                                                                                set(),
                                                                                False)

            node_deprecated = False
            node_description = None
            node_creation_date = None
            node_update_date = None
            node_replaced_by_curie = None
            node_full_name = None
            node_publications = set()
            node_synonyms = set()
            node_xrefs = set()
            node_tui = None
            node_has_cui = False
            node_tui_category_label = None

            node_meta = onto_node_dict.get('meta', None)
            if node_meta is not None:
                node_deprecated = node_meta.get('deprecated', False)
                node_definition = node_meta.get('definition', None)
                if node_definition is not None:
                    node_description = node_definition['val']
                    if node_description.startswith('OBSOLETE:') or node_description.startswith('Obsolete.'):
                        continue

                    node_definition_xrefs = node_definition.get('xrefs', None)
                    if node_definition_xrefs is not None:
                        assert type(node_definition_xrefs) == list
                        for xref in node_definition_xrefs:
                            xref_pub = xref_as_a_publication(xref)
                            if xref_pub is not None:
                                node_publications.add(xref_pub)

                node_synonyms_list = node_meta.get('synonyms', None)
                if node_synonyms_list is not None:
                    for syn_dict in node_synonyms_list:
                        syn_pred = syn_dict['pred']
                        if syn_pred == 'hasExactSynonym':
                            node_synonyms.add(syn_dict['val'])
                            syn_xrefs = syn_dict['xrefs']
                            if len(syn_xrefs) > 0:
                                for syn_xref in syn_xrefs:
                                    syn_xref_pub = xref_as_a_publication(syn_xref)
                                    if syn_xref_pub is not None:
                                        node_publications.add(syn_xref_pub)

                node_xrefs_list = node_meta.get('xrefs', None)
                if node_xrefs_list is not None:
                    for xref_dict in node_xrefs_list:
                        xref_curie = xref_dict['val']
                        if xref_curie.startswith('MESH:'):
                            xref_curie = xref_curie.replace('MESH:', 'MSH:')
                        elif xref_curie.startswith('UMLS:C'):
                            xref_curie = CUI_PREFIX + ':' + xref_curie.split('UMLS:')[1]
                        node_xrefs.add(xref_curie)

                basic_property_values = node_meta.get('basicPropertyValues', None)
                if basic_property_values is not None:
                    node_tui_list = []
                    for basic_property_value_dict in basic_property_values:
                        bpv_pred = basic_property_value_dict['pred']
                        bpv_pred_curie = uri_to_curie_shortener(bpv_pred)
                        if bpv_pred_curie is None:
                            bpv_pred_curie = bpv_pred
                        bpv_val = basic_property_value_dict['val']
                        if bpv_pred_curie in ['OIO:creation_date',
                                              'dcterms:issued',
                                              'HGNC:DATE_CREATED']:
                            node_creation_date = bpv_val
                        elif bpv_pred_curie == 'HGNC:DATE_LAST_MODIFIED':
                            node_update_date = bpv_val
                        elif bpv_pred_curie == 'IAL:0100001':
                            assert node_deprecated
                            node_replaced_by_uri = bpv_val
                            node_replaced_by_curie = uri_to_curie_shortener(node_replaced_by_uri)
                        elif bpv_pred_curie == 'UMLS:STY':  # STY_BASE_IRI:
                            node_tui_list.append(bpv_val)
                        elif bpv_pred_curie == 'skos:prefLabel':
                            if not node_curie_id.startswith('HGNC:'):
                                node_name = bpv_val
                            else:
                                node_full_name = bpv_val
                                if node_name is None:
                                    node_name = node_full_name
                        elif bpv_pred_curie == 'skos:altLabel':
                            node_synonyms.add(bpv_val)
                        elif bpv_pred_curie == 'skos:definition':
                            node_description = kg2_util.strip_html(bpv_val)
                        elif bpv_pred_curie == 'HGNC:GENESYMBOL':
                            node_name = bpv_val
                            node_synonyms.add(bpv_val)
                        elif bpv_pred_curie == 'UMLS:cui':
                            node_has_cui = True
                    if len(node_tui_list) == 1:
                        node_tui = node_tui_list[0]
                        node_tui_uri = posixpath.join('https://identifiers.org/umls/STY',
                                                      node_tui)
                        node_tui_curie = uri_to_curie_shortener(node_tui_uri)
                        assert node_tui_curie is not None
                        [node_tui_category_label, _] = get_biolink_category_for_node(node_tui_uri,
                                                                                     node_tui_curie,
                                                                                     ontology,
                                                                                     curies_to_categories,
                                                                                     uri_to_curie_shortener,
                                                                                     set(),
                                                                                     True)

                node_comments = node_meta.get('comments', None)
                if node_comments is not None:
                    comments_str = 'COMMENTS: ' + (' // '.join(node_comments))
                    if node_description is not None:
                        node_description += ' // ' + comments_str
                    else:
                        node_description = comments_str

            if node_category_label is None:
                node_type = onto_node_dict.get('type', None)
                if node_type is not None and node_type == 'PROPERTY':
                    node_category_label = 'property'

            if node_category_label is None:
                if not node_deprecated:
                    kg2_util.log_message("Node does not have a category",
                                         ontology.id,
                                         node_curie_id,
                                         output_stream=sys.stderr)
                    node_category_label = 'unknown category'
                else:
                    node_category_label = 'deprecated node'

            if node_has_cui:
                assert node_tui is not None or len(node_tui_list) > 0

                if node_tui_category_label is None:
                    node_tui_category_label = 'unknown category'
                    if node_tui is not None:
                        kg2_util.log_message(message='Node ' + ontology_node_id +
                                             ' has CUI whose TUI cannot be mapped to category: ' + node_tui)
                    else:
                        kg2_util.log_message(message='Node ' + ontology_node_id +
                                             ' has CUI with multiple associated TUIs: ' + ', '.join(node_tui_list))
                else:
                    if node_category_label is None:
                        # override the node category label if we have a TUI
                        node_category_label = node_tui_category_label
                node_tui_category_iri = category_label_to_iri_mapper(node_tui_category_label)

            ontology_curie_id = ontologies_iris_to_curies[iri_of_ontology]
            source_ontology_information = ret_dict.get(ontology_curie_id, None)
            if source_ontology_information is None:
                kg2_util.log_message(message="ontology IRI has no information dictionary available",
                                     ontology_name=iri_of_ontology,
                                     output_stream=sys.stderr)
                assert False
            source_ontology_update_date = source_ontology_information['update date']
            if node_update_date is None:
                node_update_date = source_ontology_update_date

            if node_description is not None:
                node_description_xrefs_match = REGEX_XREF_END_DESCRIP.match(node_description)
                if node_description_xrefs_match is not None:
                    node_description_xrefs_str = node_description_xrefs_match[1]
                    node_description_xrefs_list = node_description_xrefs_str.split(',')
                    for node_description_xref_str in node_description_xrefs_list:
                        node_description_xref_str = node_description_xref_str.strip()
                        if ':' in node_description_xref_str:
                            node_xrefs.add(node_description_xref_str)
                node_description_pubs = REGEX_PUBLICATIONS.findall(node_description)
                for pub_curie in node_description_pubs:
                    node_publications.add(pub_curie)

            # deal with node names that are ALLCAPS
            if node_name is not None and node_name.isupper():
                node_name = kg2_util.allcaps_to_only_first_letter_capitalized(node_name)

            node_dict = kg2_util.make_node(node_curie_id,
                                           iri,
                                           node_name,
                                           node_category_label,
                                           node_update_date,
                                           iri_of_ontology)
            node_dict['full name'] = node_full_name
            node_dict['description'] = node_description
            node_dict['creation date'] = node_creation_date        # slot name is not biolink standard
            node_dict['deprecated'] = node_deprecated              # slot name is not biolink standard
            node_dict['replaced by'] = node_replaced_by_curie      # slot name is not biolink standard
            node_dict['ontology node ids'] = [ontology_node_id]    # slot name is not biolink standard
            node_dict['xrefs'] = list(node_xrefs)                  # slot name is not biolink standard
            node_dict['synonym'] = list(node_synonyms)             # slot name is not biolink standard
            node_dict['publications'] = list(node_publications)

            # check if we need to make a CUI node
            if node_meta is not None and basic_property_values is not None:
                for basic_property_value_dict in basic_property_values:
                    bpv_pred = basic_property_value_dict['pred']
                    bpv_pred_curie = uri_to_curie_shortener(bpv_pred)
                    bpv_val = basic_property_value_dict['val']
                    if bpv_pred_curie == 'UMLS:cui':  # CUI_BASE_IRI:
                        cui_node_dict = dict(node_dict)
                        cui_uri = bpv_pred + '/' + bpv_val
                        cui_curie = uri_to_curie_shortener(cui_uri)
                        assert cui_curie is not None
                        assert not cui_curie.startswith('UMLS:C')  # :DEBUG:
                        # Skip this CUI if it's identical to the ontology node itself (happens with files created
                        # using 'load_on_cuis' - part of fix for issue #565)
                        if get_local_id_from_curie_id(cui_curie) == get_local_id_from_curie_id(node_curie_id):
                            continue
                        cui_node_dict['id'] = cui_curie
                        cui_node_dict['iri'] = cui_uri
                        cui_node_dict['synonym'] = []
                        cui_node_dict['category'] = node_tui_category_iri
                        cui_node_dict['category label'] = node_tui_category_label.replace(' ', '_')
                        cui_node_dict['ontology node ids'] = []
                        cui_node_dict['provided by'] = CUI_BASE_IRI
                        cui_node_dict['xrefs'] = []  # blanking the "xrefs" here is *vital* in order to avoid issue #395
                        cui_node_dict_existing = ret_dict.get(cui_curie, None)
                        if cui_node_dict_existing is not None:
                            cui_node_dict = kg2_util.merge_two_dicts(cui_node_dict,
                                                                     cui_node_dict_existing)
                        ret_dict[cui_curie] = cui_node_dict
                        node_dict_xrefs = node_dict['xrefs']
                        node_dict_xrefs.append(cui_curie)
                        node_dict['xrefs'] = list(set(node_dict_xrefs))
                    elif bpv_pred_curie == 'HGNC:ENTREZGENE_ID':
                        entrez_gene_id = bpv_val
                        entrez_node_dict = dict(node_dict)
                        entrez_curie = 'NCBIGene:' + entrez_gene_id
                        entrez_node_dict['id'] = entrez_curie
                        entrez_node_dict['iri'] = 'https://identifiers.org/NCBIGene/' + entrez_gene_id
                        ret_dict[entrez_curie] = entrez_node_dict
                        node_dict_xrefs = node_dict['xrefs']
                        node_dict_xrefs.append(entrez_curie)
                        node_dict['xrefs'] = list(set(node_dict_xrefs))

            if node_curie_id in ret_dict:
                node_dict = kg2_util.merge_two_dicts(ret_dict[node_curie_id], node_dict)
            ret_dict[node_curie_id] = node_dict

    return ret_dict
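# A minimal sketch of how this function might be invoked (hypothetical caller code,
# not part of this module). The argument names mirror the function's parameters and
# assume the ontology-info records, the CURIE shortener, and the category-to-IRI
# mapper have already been constructed elsewhere in this script:
#
#     nodes_dict = make_nodes_dict_from_ontologies_list(ontology_info_list,
#                                                       curies_to_categories,
#                                                       uri_to_curie_shortener,
#                                                       category_label_to_iri_mapper)
#     # nodes_dict maps each node CURIE (including ontology-source nodes and any
#     # generated CUI / NCBIGene nodes) to its merged node record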