def expand_disont_disease(orangeboard, node):
    """Expand a Disease Ontology disease node on ``orangeboard``.

    Three groups of neighbours are added, each with a relationship to
    ``node``:

    * child diseases reported by ``QueryDisont`` (``node`` is_parent_of child),
    * proteins reported by ``QueryDisGeNet`` for every MeSH ID that
      ``QueryDisont`` maps the disease to (protein gene_assoc_with ``node``),
    * phenotypes reported by ``QueryBioLink`` (phenotype
      phenotype_assoc_with ``node``).

    Params:
        orangeboard - object providing ``add_node`` and ``add_rel``
        node - the disease node to expand; ``node.name`` is used as the
            Disease Ontology ID
    """
    doid = node.name

    # Child diseases: the expanded node is the parent of each one.
    child_descs = QueryDisont.query_disont_to_child_disonts_desc(doid)
    for child_id, child_desc in child_descs.items():
        child_node = orangeboard.add_node('disont_disease', child_id,
                                          desc=child_desc)
        orangeboard.add_rel('is_parent_of', 'DiseaseOntology',
                            node, child_node)

    # Proteins: go disease -> MeSH IDs -> UniProt IDs.
    for mesh_id in QueryDisont.query_disont_to_mesh_id(doid):
        protein_descs = QueryDisGeNet.query_mesh_id_to_uniprot_ids_desc(mesh_id)
        for protein_id, protein_desc in protein_descs.items():
            protein_node = orangeboard.add_node('uniprot_protein', protein_id,
                                                desc=protein_desc)
            orangeboard.add_rel('gene_assoc_with', 'DisGeNet',
                                protein_node, node)

    # Phenotypes associated with this disease.
    phenotype_descs = QueryBioLink.get_phenotypes_for_disease_desc(doid)
    for phenotype_id, phenotype_desc in phenotype_descs.items():
        pheno_node = orangeboard.add_node('phenont_phenotype', phenotype_id,
                                          desc=phenotype_desc)
        orangeboard.add_rel('phenotype_assoc_with', 'BioLink',
                            pheno_node, node)
def expand_disease(self, node):
    """Expand a node of type "disease" with related knowledge-graph nodes.

    For every disease, gene-ontology terms from ``QuerySciGraph`` are added
    with an "affects" relationship. After that the expansion depends on the
    identifier found in ``node.name``:

    * contains "OMIM:"  -> delegated to ``self.expand_genetic_condition``
    * contains "MONDO:" -> delegated to ``self.expand_mondo_disease``
    * anything else is treated as a Disease Ontology ID: child diseases,
      DisGeNet proteins (via MeSH IDs) and BioLink phenotypes are added.

    Nodes are created through ``self.add_node_smart``; a relationship is
    only added when that call returns something other than None.

    Params:
        node - the node to expand; its ``nodetype`` must be "disease"
    """
    assert node.nodetype == "disease"
    disease_name = node.name

    # Gene-ontology terms linked to the disease (applies to every ID scheme).
    go_terms = QuerySciGraph.get_gene_ontology_curie_ids_for_disease_curie_id(
        disease_name)
    for go_curie, go_term in go_terms.items():
        go_type = go_term["ontology"].replace(" ", "_")
        go_node = self.add_node_smart(go_type, go_curie, desc=go_term["name"])
        if go_node is None:
            continue
        predicate = go_term["predicate"].replace(" ", "_")
        self.orangeboard.add_rel("affects", "Monarch_SciGraph", node, go_node,
                                 extended_reltype=predicate)

    if "OMIM:" in disease_name:
        self.expand_genetic_condition(node)
        return

    if "MONDO:" in disease_name:
        self.expand_mondo_disease(node)
        return

    # If we get here, this is a Disease Ontology disease.
    doid = disease_name

    child_descs = QueryDisont.query_disont_to_child_disonts_desc(doid)
    for child_id, child_desc in child_descs.items():
        child_node = self.add_node_smart('disease', child_id, desc=child_desc)
        if child_node is None:
            continue
        self.orangeboard.add_rel('subclass_of', 'DiseaseOntology',
                                 child_node, node,
                                 extended_reltype="subclass_of")

    for mesh_id in QueryDisont.query_disont_to_mesh_id(doid):
        protein_descs = QueryDisGeNet.query_mesh_id_to_uniprot_ids_desc(mesh_id)
        for protein_id, protein_desc in protein_descs.items():
            # IDs containing '-' are rejected before any node is created.
            assert '-' not in protein_id
            protein_node = self.add_node_smart('protein', protein_id,
                                               desc=protein_desc)
            if protein_node is None:
                continue
            self.orangeboard.add_rel(
                "gene_associated_with_condition", "DisGeNet",
                protein_node, node,
                extended_reltype="gene_associated_with_condition")

    # Phenotypes associated with this disease.
    phenotype_descs = QueryBioLink.get_phenotypes_for_disease_desc(doid)
    for phenotype_id, phenotype_desc in phenotype_descs.items():
        pheno_node = self.add_node_smart("phenotypic_feature", phenotype_id,
                                         desc=phenotype_desc)
        if pheno_node is None:
            continue
        self.orangeboard.add_rel("has_phenotype", 'BioLink', node, pheno_node,
                                 extended_reltype="has_phenotype")
def test_query_disont_to_child_disonts_desc(self):
    """Child diseases of DOID:9352 are returned as an ID -> description dict."""
    expected = {
        'DOID:1837': 'diabetic ketoacidosis',
        'DOID:10182': 'diabetic peripheral angiopathy',
        'DOID:11712': 'lipoatrophic diabetes',
    }
    # DOID:9352 is type 2 diabetes mellitus
    actual = QD.query_disont_to_child_disonts_desc("DOID:9352")
    self.assertDictEqual(actual, expected)
def get_mesh_term_for_all(curie_id, description):
    """
    Takes a curie ID, detects the ontology from the curie id, and then finds
    the mesh term(s) for it.

    Params:
        curie_id - A string containing the curie id of the node.
            Formatted <source abbreviation>:<number>, e.g. DOID:8398
        description - The English name for the node; it is converted with
            str() when it is not already a str

    Lookup order:
        1. If QueryNCBIeUtils.is_mesh_term(description) is true, the
           description with '[MeSH Terms]' appended is returned.
        2. NormGoogleDistance.get_mesh_from_oxo(curie_id).
        3. Only when step 2 gives None, a lookup chosen by the curie prefix:
             "React..."  (case-insensitive) - reactome names, split on '|'
             "GO"        - no conversion is attempted
             "UniProt..." - uniprot names, split on '|'
             "HP"        - mesh terms for the HP id
             "UBERON"    - mesh ids from EBI OLS, resolved to mesh terms
             "NCBIGene"  - QueryNCBIeUtils.get_pubmed_from_ncbi_gene
             "DOID"      - mesh ids from Disease Ontology, resolved to terms
             "OMIM"      - mesh terms for the OMIM id
             "ChEMBL"    - mesh id from MyChem, resolved to mesh terms
           Any other prefix (e.g. "CL") has no branch here.
        4. If nothing was found, the description is returned with ';'
           replaced by '|'.

    Returns:
        Whatever the lookup produced. When that is a list containing an entry
        ending in '[MeSH Terms]', a one-element list with the first such entry
        is returned instead. The fallback is a one-element list.
    """
    if type(description) is not str:
        description = str(description)
    curie_list = curie_id.split(':')
    prefix = curie_list[0]

    if QueryNCBIeUtils.is_mesh_term(description):
        return [description + '[MeSH Terms]']

    # Added to the numeric part of a MeSH id (letter stripped) before calling
    # get_mesh_terms_for_mesh_uid.
    # NOTE(review): presumably converts a descriptor number to an NCBI UID - confirm.
    uid_offset = 68000000

    names = NormGoogleDistance.get_mesh_from_oxo(curie_id)
    if names is None:
        if prefix.lower().startswith("react"):
            reactome_names = QueryNCBIeUtils.get_reactome_names(curie_list[1])
            if reactome_names is not None:
                names = reactome_names.split('|')
        elif prefix == "GO":
            # No conversion is implemented for GO terms.
            pass
        elif prefix.startswith("UniProt"):
            uniprot_names = QueryNCBIeUtils.get_uniprot_names(curie_list[1])
            if uniprot_names is not None:
                names = uniprot_names.split('|')
        elif prefix == "HP":
            names = QueryNCBIeUtils.get_mesh_terms_for_hp_id(curie_id)
        elif prefix == "UBERON":
            if curie_id.endswith('PHENOTYPE'):
                # Drop the 9-character 'PHENOTYPE' suffix.
                curie_id = curie_id[:-9]
            mesh_entries = QueryEBIOLS.get_mesh_id_for_uberon_id(curie_id)
            names = []
            for entry in mesh_entries:
                if '.' in entry:
                    # Entries containing '.' are handed to the mesh-tree
                    # lookup, which yields the UIDs to resolve.
                    tree_uids = QueryNCBIeUtils.get_mesh_uids_for_mesh_tree(
                        entry.split(':')[1])
                    for tree_uid in tree_uids:
                        try:
                            numeric_uid = int(tree_uid.split(':')[1][1:]) + uid_offset
                            names.extend(
                                QueryNCBIeUtils.get_mesh_terms_for_mesh_uid(numeric_uid))
                        except IndexError:
                            # Fallback: use the whole uid as the number.
                            numeric_uid = int(tree_uid)
                            names.extend(
                                QueryNCBIeUtils.get_mesh_terms_for_mesh_uid(numeric_uid))
                else:
                    try:
                        local_id = entry.split(':')[1]
                        numeric_uid = int(local_id[1:]) + uid_offset
                        names.extend(
                            QueryNCBIeUtils.get_mesh_terms_for_mesh_uid(numeric_uid))
                    except IndexError:
                        # Fallback: use the whole entry as the number.
                        numeric_uid = int(entry)
                        names.extend(
                            QueryNCBIeUtils.get_mesh_terms_for_mesh_uid(numeric_uid))
            if len(names) == 0:
                names = None
            else:
                names[0] = names[0] + '[MeSH Terms]'
        elif prefix == "NCBIGene":
            gene_id = curie_id.split(':')[1]
            names = QueryNCBIeUtils.get_pubmed_from_ncbi_gene(gene_id)
        elif prefix == "DOID":
            doid_mesh_ids = QueryDisont.query_disont_to_mesh_id(curie_id)
            names = []
            for mesh_uid in doid_mesh_ids:
                numeric_uid = int(mesh_uid[1:]) + uid_offset
                terms = QueryNCBIeUtils.get_mesh_terms_for_mesh_uid(numeric_uid)
                if terms is not None:
                    names.extend(terms)
            if len(names) == 0:
                names = None
            else:
                names[0] = names[0] + '[MeSH Terms]'
        elif prefix == "OMIM":
            names = QueryNCBIeUtils.get_mesh_terms_for_omim_id(curie_list[1])
        elif prefix == "ChEMBL":
            chembl_id = curie_id.replace(':', '').upper()
            chembl_mesh_id = QueryMyChem.get_mesh_id(chembl_id)
            if chembl_mesh_id is not None:
                numeric_uid = int(chembl_mesh_id[1:]) + uid_offset
                names = QueryNCBIeUtils.get_mesh_terms_for_mesh_uid(numeric_uid)

    if names is None:
        # Nothing found: fall back to the description itself.
        return [description.replace(';', '|')]

    if type(names) is list:
        # Prefer the first entry that is already tagged as a MeSH term.
        for candidate in names:
            if candidate.endswith('[MeSH Terms]'):
                return [candidate]
    return names
def seed_and_expand_kg_q2(num_expansions=3, seed_parts=None):
    """Seed the knowledge graph for Q2 and expand it.

    Reads the Q2 drug/condition table and, depending on ``seed_parts``:

    * 'conditions' - maps every distinct 'Condition' value to curie IDs via
      ``get_curie_ont_ids_for_mesh_term``, adds a node per curie ID, writes
      the table back out with an extra 'CURIE_ID' column, then expands.
    * 'drugs' - adds one 'pharos_drug' node per lower-cased drug name found
      in the Q2 table or in the drug-set file, then expands.

    In each part, only the first node added is flagged with
    ``seed_node_bool=True``.

    Params:
        num_expansions - how many times ``bne.expand_all_nodes()`` is called
            after each seeding part
        seed_parts - container of part names to run ('conditions', 'drugs');
            None runs both
    """
    drug_dis_df = pandas.read_csv('../../data/q2/q2-drugandcondition-list.txt',
                                  sep='\t')

    if seed_parts is None or 'conditions' in seed_parts:
        print('=====================> seeding disease nodes for Q2')
        is_first_node = True
        # Condition term -> last curie ID seeded for it (None if none was).
        curie_for_term = {}
        curie_column = []
        for _, row in drug_dis_df.iterrows():
            mesh_term = row['Condition']
            if mesh_term not in curie_for_term:
                curie_for_term[mesh_term] = None
                curie_ids = get_curie_ont_ids_for_mesh_term(mesh_term)
                if len(curie_ids) > 0:
                    assert type(curie_ids) is list
                    for curie_id in curie_ids:
                        if 'DOID:' in curie_id:
                            label = QueryDisont.query_disont_to_label(curie_id)
                            ob.add_node('disont_disease', curie_id,
                                        desc=label,
                                        seed_node_bool=is_first_node)
                            curie_for_term[mesh_term] = curie_id
                            is_first_node = False
                        elif 'HP:' in curie_id:
                            ob.add_node('phenont_phenotype', curie_id,
                                        desc=mesh_term,
                                        seed_node_bool=is_first_node)
                            curie_for_term[mesh_term] = curie_id
                            is_first_node = False
                        else:
                            assert False  # should never get here
            # One entry per row so the new column lines up with the table.
            curie_column.append(curie_for_term[mesh_term])

        drug_dis_df['CURIE_ID'] = pandas.Series(curie_column,
                                                index=drug_dis_df.index)
        drug_dis_df.to_csv(
            '../../data/q2/q2-drugandcondition-list-mapped-output.txt',
            sep='\t')

        # Expand the knowledge graph num_expansions times.
        for _ in range(num_expansions):
            bne.expand_all_nodes()

    if seed_parts is None or 'drugs' in seed_parts:
        print('=====================> seeding drug nodes for Q2')
        all_drugs = set()
        all_drugs.update(name.lower() for name in drug_dis_df['Drug'])
        fda_drug_df = pandas.read_csv('../../data/q2/drugset2017_filt.txt',
                                      sep='\t')
        all_drugs.update(name.lower() for name in fda_drug_df['NAME'])

        for position, drug_name in enumerate(all_drugs):
            print(drug_name)
            chembl_ids = QueryChEMBL.get_chembl_ids_for_drug(drug_name)
            has_ids = chembl_ids is not None and len(chembl_ids) > 0
            # Use the first ChEMBL ID as the description, or '' when none.
            chembl_desc = next(iter(chembl_ids)) if has_ids else ''
            ob.add_node('pharos_drug', drug_name, desc=chembl_desc,
                        seed_node_bool=(position == 0))

        # Expand the knowledge graph num_expansions times.
        for _ in range(num_expansions):
            bne.expand_all_nodes()
def setUpClass(cls):
    """Create one QueryDisont instance and store it on the class as ``disont``."""
    disont_client = QueryDisont()
    cls.disont = disont_client
def test_query_disont_to_child_disonts(self):
    """Child diseases of DOID:9352 are returned as a set of integer IDs."""
    expected_ids = {1837, 10182, 11712}
    actual_ids = QD.query_disont_to_child_disonts('DOID:9352')
    self.assertSetEqual(actual_ids, expected_ids)
def test_query_disont_to_label(self):
    """DOID:0050741 resolves to the label "alcohol dependence"."""
    label = QD.query_disont_to_label("DOID:0050741")
    self.assertEqual(label, "alcohol dependence")