def test_simple():
    """Check that glom merges chained pairs into shared groups."""
    groups = {}
    pairs = [(1, 2), (2, 3), (4, 5)]
    glom(groups, pairs)

    # One dictionary entry per distinct identifier seen in the pairs.
    assert len(groups) == 5
    # 1-2 and 2-3 share the member 2, so all three land in one group.
    for member in (1, 2, 3):
        assert groups[member] == {1, 2, 3}
    # 4-5 is unconnected to the rest and stays on its own.
    for member in (4, 5):
        assert groups[member] == {4, 5}
def test_sets():
    """Check glom with set inputs, including a second call that bridges groups."""
    groups = {}
    first_batch = [{1, 2}, {2, 3}, {4, 5}, {6, 7}]
    bridge = [{5, 7}]
    glom(groups, first_batch)
    glom(groups, bridge)

    for member in (1, 2, 3):
        assert groups[member] == {1, 2, 3}
    # The second call links {4, 5} and {6, 7} through the pair 5-7.
    for member in (4, 5, 6, 7):
        assert groups[member] == {4, 5, 6, 7}
def test_two_calls():
    """Check that a second glom call on the same dict merges existing groups."""
    groups = {}
    first_batch = [(1, 2), (2, 3), (4, 5), (6, 7)]
    bridge = [(5, 7)]
    glom(groups, first_batch)
    glom(groups, bridge)

    for member in (1, 2, 3):
        assert groups[member] == {1, 2, 3}
    # 5-7 connects the previously separate 4-5 and 6-7 groups.
    for member in (4, 5, 6, 7):
        assert groups[member] == {4, 5, 6, 7}
def test_bigger_sets():
    """Check glom with input groups of more than two members."""
    groups = {}

    glom(groups, [{1, 2, 3}, {4, 5, 6}])
    for member in (1, 2, 3):
        assert groups[member] == {1, 2, 3}
    for member in (4, 5, 6):
        assert groups[member] == {4, 5, 6}

    # A single set overlapping both groups (and adding 7) fuses everything.
    glom(groups, [{3, 4, 6, 7}])
    for member in range(1, 8):
        assert groups[member] == {1, 2, 3, 4, 5, 6, 7}
def uni_glom(unichem_data, prefix1, prefix2, chemdict):
    """Glom the identifier pairs from a tab-separated text into ``chemdict``.

    Args:
        unichem_data: Text with one record per line. The first line is
            skipped (NOTE(review): presumably a header row -- confirm against
            the caller). Every remaining line must contain at least two
            tab-separated fields.
        prefix1: Prefix prepended (as ``prefix1:value``) to the first field.
        prefix2: Prefix prepended (as ``prefix2:value``) to the second field.
        chemdict: Dictionary passed to ``glom`` and updated by it.
    """
    print(f'{prefix1}/{prefix2}')
    rows = unichem_data.split('\n')[1:]
    # A trailing newline leaves one empty final row; drop it. The ``rows``
    # guard keeps a text with nothing after its first line from raising
    # IndexError.
    if rows and not rows[-1]:
        rows = rows[:-1]
    pairs = [row.split('\t') for row in rows]
    for p in pairs:
        # Report identifiers that begin with a quote character.
        if p[0].startswith("'") or p[1].startswith("'"):
            # This must be an f-string, otherwise the placeholders are
            # printed literally.
            print(f'UNI_GLOM {prefix1} {prefix2} {p}')
    curiepairs = [(f'{prefix1}:{p[0]}', f'{prefix2}:{p[1]}') for p in pairs]
    glom(chemdict, curiepairs)
def test_load_diseases_and_phenotypes(rosetta):
    """Check that three MONDO identifiers end up in three different groups.

    Builds sets from ``rosetta.core.mondo`` for the three identifiers, gloms
    them into one dictionary, prints each resulting entry and asserts that
    the entries are pairwise different.
    """
    first, second, third = 'MONDO:0004979', 'MONDO:0004784', 'MONDO:0004765'
    mondo_sets = build_sets(rosetta.core.mondo, [first, second, third])
    # HPO sets were once built here as well but are currently disabled:
    # build_sets(rosetta.core.hpo, ['HP:0002099'])
    dicts = {}
    glom(dicts, mondo_sets)
    for mondo_id in (first, second, third):
        print("*", dicts[mondo_id])
    # A chained ``a != b != c`` only compares adjacent operands and never
    # checks a against c, so every pair is compared explicitly.
    assert dicts[first] != dicts[second]
    assert dicts[second] != dicts[third]
    assert dicts[first] != dicts[third]
def load_chemicals(rosetta, refresh=False):
    """Build the chemical concordance, write it to disk and cache it.

    Args:
        rosetta: Passed to ``refresh_mesh_pubchem`` and ``dump_cache``.
        refresh: When true, ``refresh_mesh_pubchem`` runs before anything
            else.
    """
    # Build if need be.
    if refresh:
        refresh_mesh_pubchem(rosetta)

    # Start from the UniChem concordance.
    concord = load_unichem()

    # Fold in the MESH pair files that sit next to this module, UNII first
    # and then PUBCHEM.
    mesh_sources = (
        ('mesh_to_unii.txt', 'UNII'),
        ('mesh_to_pubchem.txt', 'PUBCHEM'),
    )
    for filename, prefix in mesh_sources:
        pair_path = os.path.join(os.path.dirname(__file__), filename)
        glom(concord, load_pairs(pair_path, prefix))

    # Dump one tab-separated "key<TAB>value" line per concordance entry.
    with open('chemconc.txt', 'w') as outf:
        outf.writelines(f'{key}\t{concord[key]}\n' for key in concord)
    dump_cache(concord, rosetta)
def load_diseases_and_phenotypes(rosetta):
    """Glom the MONDO, HPO and MedDRA sets into one dictionary and cache it.

    Args:
        rosetta: Supplies ``core.mondo`` and ``core.hpo`` for ``build_sets``
            and is passed on to ``dump_cache``.
    """
    # Gather every source first, in the same order they are merged below.
    sources = [
        build_sets(rosetta.core.mondo),
        build_sets(rosetta.core.hpo),
        read_meddra(),
    ]
    dicts = {}
    for equivalences in sources:
        glom(dicts, equivalences)
    dump_cache(dicts, rosetta)
def load_diseases_and_phenotypes(rosetta):
    """Build, record and cache the disease/phenotype concordance.

    The MONDO, HPO and MedDRA/UMLS sets are each written to their own text
    file, then glommed one after another into a single dictionary. A
    snapshot of the dictionary is written after each merge, and the final
    result is passed to ``dump_cache`` together with an open ``disease.txt``
    file.

    Args:
        rosetta: Supplies ``core.mondo``, ``core.uberongraph`` and
            ``core.hpo``, and is passed on to ``dump_cache``.
    """
    print('disease/phenotype')

    print('get and write mondo sets')
    mondo_sets = build_exact_sets(rosetta.core.mondo, rosetta.core.uberongraph)
    write_sets(mondo_sets, 'mondo_sets.txt')

    print('get and write hp sets')
    hpo_sets = build_sets(rosetta.core.hpo, ignore_list=['ICD', 'NCIT'])
    hpo_sets = filter_out_non_unique_ids(hpo_sets)
    write_sets(hpo_sets, 'hpo_sets.txt')

    print('get and write umls sets')
    meddra_umls = read_meddra()
    # Write the MedDRA/UMLS sets here; this call previously wrote hpo_sets
    # into the MedDRA file by mistake.
    write_sets(meddra_umls, 'meddra_umls_sets.txt')

    dicts = {}
    print('put it all together')
    glom(dicts, mondo_sets)
    write_dicts(dicts, 'mondo_dicts.txt')
    glom(dicts, hpo_sets)
    write_dicts(dicts, 'mondo_hpo_dicts.txt')
    glom(dicts, meddra_umls)
    write_dicts(dicts, 'mondo_hpo_meddra_dicts.txt')

    print('dump it')
    with open('disease.txt', 'w') as outf:
        dump_cache(dicts, rosetta, outf)
def load_chemicals(rosetta, refresh=True):
    """Build the chemical concordance from all sources, label it and cache it.

    Args:
        rosetta: Passed to ``refresh_mesh_pubchem`` and ``dump_cache``; also
            supplies ``core.kegg.pull_sequences``.
        refresh: When true, ``refresh_mesh_pubchem`` runs first. The flag is
            also forwarded to ``pull_uniprot`` and ``label_chembls``.
    """
    # Build if need be.
    if refresh:
        refresh_mesh_pubchem(rosetta)

    # 1. Handle all the stuff that has an InchiKey using unichem.
    print('UNICHEM')
    concord = load_unichem()

    # 2. Mesh is all "no structure". We try to use a variety of sources to
    #    hook mesh id's to anything else.
    print('MESH/UNII')
    mesh_unii_file = os.path.join(os.path.dirname(__file__), 'mesh_to_unii.txt')
    mesh_unii_pairs = load_pairs(mesh_unii_file, 'UNII')
    glom(concord, mesh_unii_pairs)

    print('MESH/PUBCHEM')
    mesh_pc_file = os.path.join(os.path.dirname(__file__), 'mesh_to_pubchem.txt')
    mesh_pc_pairs = load_pairs(mesh_pc_file, 'PUBCHEM')
    glom(concord, mesh_pc_pairs)

    print('MESH/CHEBI')
    mesh_chebi = pull_mesh_chebi()
    # Merging CHEBIs can be ok because of primary/secondary chebis; really we
    # only want to avoid merging INCHIs. No uniqueness restriction is applied
    # here (an earlier variant passed ['INCHI'] to glom).
    glom(concord, mesh_chebi)

    # 3. Pull from chebi the sdf and db files, use them to link to things
    #    (KEGG) in the no inchi/no smiles cases.
    pubchem_chebi_pairs, kegg_chebi_pairs = pull_chebi()
    glom(concord, pubchem_chebi_pairs)
    glom(concord, kegg_chebi_pairs)

    # 4. Go to KEGG, and get sequences for peptides.
    sequence_concord = rosetta.core.kegg.pull_sequences()

    # 5. Pull UniProt (swissprot) XML.
    #    Calculate sequences for the sub-sequences (Uniprot_PRO).
    sequence_to_uniprot = pull_uniprot(refresh)

    # 6. Use the sequences to merge UniProt with KEGG.
    # NOTE(review): indexing assumes every sequence is already a key of
    # sequence_concord, or that it is a defaultdict -- confirm.
    for s, v in sequence_to_uniprot.items():
        sequence_concord[s].update(v)

    # 7. Read IUPHAR, discard things with INCHI, use things with sequence to
    #    match UniProt_PRO/KEGG. Use the hand-curated version of IUPHAR to
    #    match the un-sequenced stuff left over.
    sequence_to_iuphar, iuphar_glom = pull_iuphar()
    for s, v in sequence_to_iuphar.items():
        sequence_concord[s].update(v)
    glom(concord, iuphar_glom)

    # 8. Use wikidata to get links between CHEBI and UniProt_PRO.
    unichebi = pull_uniprot_chebi()
    glom(concord, unichebi)

    # 9. glom across sequence and chemical stuff.
    new_groups = sequence_concord.values()
    glom(concord, new_groups, unique_prefixes=['GTOPDB', 'INCHI'])

    # 10. Drop PRO only sequences.
    # Assumes concord is the identifier -> equivalence-set mapping that glom
    # maintains. Keys are collected first and deleted afterwards so the
    # dictionary is never modified while it is being iterated.
    to_remove = [
        key
        for key, eq_id_set in concord.items()
        if len(eq_id_set) == 1 and '#PRO_' in next(iter(eq_id_set))
    ]
    for key in to_remove:
        del concord[key]

    # Add labels to CHEBIs, CHEMBLs, MESHes.
    print('LABEL')
    label_chebis(concord)
    label_chembls(concord, refresh=refresh)
    label_meshes(concord)

    print('dumping')
    with open('chemconc.txt', 'w') as outf:
        dump_cache(concord, rosetta, outf)
    print('done')