def test_simple():
    d = {}
    eqs = [(1, 2), (2, 3), (4, 5)]
    glom(d, eqs)
    assert len(d) == 5
    assert d[1] == d[2] == d[3] == {1, 2, 3}
    assert d[4] == d[5] == {4, 5}
def test_sets():
    d = {}
    eqs = [{1, 2}, {2, 3}, {4, 5}, {6, 7}]
    oeqs = [{5, 7}]
    glom(d, eqs)
    glom(d, oeqs)
    assert d[1] == d[2] == d[3] == {1, 2, 3}
    assert d[4] == d[5] == d[6] == d[7] == {4, 5, 6, 7}
def test_two_calls():
    d = {}
    eqs = [(1, 2), (2, 3), (4, 5), (6, 7)]
    oeqs = [(5, 7)]
    glom(d, eqs)
    glom(d, oeqs)
    assert d[1] == d[2] == d[3] == {1, 2, 3}
    assert d[4] == d[5] == d[6] == d[7] == {4, 5, 6, 7}
def test_bigger_sets():
    d = {}
    eqs = [{1, 2, 3}, {4, 5, 6}]
    glom(d, eqs)
    assert d[1] == d[2] == d[3] == {1, 2, 3}
    assert d[4] == d[5] == d[6] == {4, 5, 6}
    eqs = [{3, 4, 6, 7}]
    glom(d, eqs)
    assert d[1] == d[2] == d[3] == d[4] == d[5] == d[6] == d[7] == {
        1, 2, 3, 4, 5, 6, 7
    }
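glom itself is not shown in these examples. A minimal sketch that is consistent with the tests above (assuming plain dict/set semantics and ignoring optional arguments such as unique_prefixes) could look like the following; it is not the project's implementation, only a reading of the asserted behavior:

# Minimal sketch of a glom-style merger, inferred from the tests above.
# Every identifier in an equivalence group ends up mapped to one shared
# set containing the whole (transitively merged) group.
def glom(d, eqs):
    for group in eqs:
        # Start from the new group, then fold in any sets already known
        # for its members so previously merged ids stay connected.
        merged = set(group)
        for identifier in group:
            if identifier in d:
                merged |= d[identifier]
        # Point every member, old and new, at the same merged set.
        for identifier in merged:
            d[identifier] = merged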
Example #5
def uni_glom(unichem_data, prefix1, prefix2, chemdict):
    print(f'{prefix1}/{prefix2}')
    n = unichem_data.split('\n')[1:]
    if len(n[-1]) == 0:
        n = n[:-1]
    pairs = [ni.split('\t') for ni in n]
    for p in pairs:
        if p[0].startswith("'") or p[1].startswith("'"):
            print(f'UNI_GLOM {prefix1} {prefix2} {p}')
    curiepairs = [(f'{prefix1}:{p[0]}', f'{prefix2}:{p[1]}') for p in pairs]
    glom(chemdict, curiepairs)
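A small usage example, assuming glom behaves as in the tests above; the TSV payload and the CHEMBL/CHEBI prefixes are made up for illustration:

# Hypothetical input: a UniChem-style TSV with a header row (illustrative data).
payload = 'from_src\tto_src\n123\t456\n789\t1011\n'
chemdict = {}
uni_glom(payload, 'CHEMBL', 'CHEBI', chemdict)
# Each data row becomes a pair of CURIEs that glom merges into one equivalence set.
assert chemdict['CHEMBL:123'] == {'CHEMBL:123', 'CHEBI:456'}
assert chemdict['CHEMBL:789'] == {'CHEMBL:789', 'CHEBI:1011'}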
def test_load_diseases_and_phenotypes(rosetta):
    mondo_sets = build_sets(
        rosetta.core.mondo,
        ['MONDO:0004979', 'MONDO:0004784', 'MONDO:0004765'])
    #hpo_sets = build_sets(rosetta.core.hpo,['HP:0002099'])
    dicts = {}
    glom(dicts, mondo_sets)
    print("*", dicts['MONDO:0004979'])
    print("*", dicts['MONDO:0004784'])
    print("*", dicts['MONDO:0004765'])
    assert dicts['MONDO:0004979'] != dicts['MONDO:0004784'] != dicts[
        'MONDO:0004765']
Example #7
def load_chemicals(rosetta, refresh=False):
    #Build if need be
    if refresh:
        refresh_mesh_pubchem(rosetta)
    #Get all the simple stuff
    concord = load_unichem()
    #DO MESH/UNII
    mesh_unii_file = os.path.join(os.path.dirname(__file__),
                                  'mesh_to_unii.txt')
    mesh_unii_pairs = load_pairs(mesh_unii_file, 'UNII')
    glom(concord, mesh_unii_pairs)
    #DO MESH/PUBCHEM
    mesh_pc_file = os.path.join(os.path.dirname(__file__),
                                'mesh_to_pubchem.txt')
    mesh_pc_pairs = load_pairs(mesh_pc_file, 'PUBCHEM')
    glom(concord, mesh_pc_pairs)
    #Dump
    with open('chemconc.txt', 'w') as outf:
        for key in concord:
            outf.write(f'{key}\t{concord[key]}\n')
    dump_cache(concord, rosetta)
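load_pairs is not shown in these examples. A hypothetical sketch, inferred from the file names and the CURIE-pair pattern used by uni_glom above (the real helper may differ):

# Hypothetical sketch of load_pairs: read a two-column TSV of MESH ids and
# ids from another source, and return them as CURIE pairs for glom.
def load_pairs(fname, prefix):
    pairs = []
    with open(fname, 'r') as inf:
        for line in inf:
            parts = line.strip().split('\t')
            if len(parts) != 2:
                continue
            mesh_id, other_id = parts
            pairs.append((f'MESH:{mesh_id}', f'{prefix}:{other_id}'))
    return pairs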
Example #8
def load_diseases_and_phenotypes(rosetta):
    mondo_sets = build_sets(rosetta.core.mondo)
    hpo_sets = build_sets(rosetta.core.hpo)
    meddra_umls = read_meddra()
    dicts = {}
    glom(dicts, mondo_sets)
    glom(dicts, hpo_sets)
    glom(dicts, meddra_umls)
    dump_cache(dicts, rosetta)
Example #9
def load_diseases_and_phenotypes(rosetta):
    print('disease/phenotype')
    print('get and write mondo sets')
    mondo_sets = build_exact_sets(rosetta.core.mondo, rosetta.core.uberongraph)
    write_sets(mondo_sets, 'mondo_sets.txt')
    print('get and write hp sets')
    hpo_sets = build_sets(rosetta.core.hpo, ignore_list=['ICD', 'NCIT'])
    hpo_sets = filter_out_non_unique_ids(hpo_sets)
    write_sets(hpo_sets, 'hpo_sets.txt')
    print('get and write umls sets')
    meddra_umls = read_meddra()
    write_sets(meddra_umls, 'meddra_umls_sets.txt')
    dicts = {}
    print('put it all together')
    glom(dicts, mondo_sets)
    write_dicts(dicts, 'mondo_dicts.txt')
    glom(dicts, hpo_sets)
    write_dicts(dicts, 'mondo_hpo_dicts.txt')
    glom(dicts, meddra_umls)
    write_dicts(dicts, 'mondo_hpo_meddra_dicts.txt')
    print('dump it')
    with open('disease.txt', 'w') as outf:
        dump_cache(dicts, rosetta, outf)
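filter_out_non_unique_ids is only referenced here. A hedged sketch of the likely intent, dropping any identifier that appears in more than one input set so an ambiguous id cannot glue unrelated sets together (the real helper may differ):

from collections import Counter

# Hypothetical sketch of filter_out_non_unique_ids: keep only identifiers
# that occur in exactly one of the input sets.
def filter_out_non_unique_ids(sets):
    counts = Counter(identifier for s in sets for identifier in s)
    return [{identifier for identifier in s if counts[identifier] == 1}
            for s in sets]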
Example #10
def load_chemicals(rosetta, refresh=True):
    # Build if need be
    if refresh:
        refresh_mesh_pubchem(rosetta)
    #Get all the simple stuff
    # 1. Handle all the stuff that has an InchiKey using unichem
    # 2. Mesh is all "no structure".  We try to use a variety of sources to hook mesh id's to anything else
    print('UNICHEM')
    concord = load_unichem()
    #DO MESH/UNII
    print('MESH/UNII')
    mesh_unii_file = os.path.join(os.path.dirname(__file__),
                                  'mesh_to_unii.txt')
    mesh_unii_pairs = load_pairs(mesh_unii_file, 'UNII')
    glom(concord, mesh_unii_pairs)
    # DO MESH/PUBCHEM
    print('MESH/PUBCHEM')
    mesh_pc_file = os.path.join(os.path.dirname(__file__),
                                'mesh_to_pubchem.txt')
    mesh_pc_pairs = load_pairs(mesh_pc_file, 'PUBCHEM')
    glom(concord, mesh_pc_pairs)
    # DO MESH/CHEBI, but don't combine any chebi's into a set with it
    print('MESH/CHEBI')
    mesh_chebi = pull_mesh_chebi()
    #Merging CHEBIS can be ok because of primary/secondary chebis.  Really we
    # don't want to merge INCHIs
    #glom(concord, mesh_chebi,['INCHI'])
    glom(concord, mesh_chebi)
    # 3. Pull from chebi the sdf and db files, use them to link to things (KEGG) in the no inchi/no smiles cases
    pubchem_chebi_pairs, kegg_chebi_pairs = pull_chebi()
    glom(concord, pubchem_chebi_pairs)
    glom(concord, kegg_chebi_pairs)
    # 4. Go to KEGG, and get sequences for peptides.
    sequence_concord = rosetta.core.kegg.pull_sequences()
    # 5. Pull UniProt (swissprot) XML.
    # Calculate sequences for the sub-sequences (Uniprot_PRO)
    sequence_to_uniprot = pull_uniprot(refresh)
    # 6. Use the sequences to merge UniProt with KEGG
    for s, v in sequence_to_uniprot.items():
        sequence_concord[s].update(v)
    # 7. Read IUPHAR, discard things with INCHI, use things with sequence to match UniProt_PRO/KEGG
    #     Use the hand-curated version of IUPHAR to match the un-sequenced stuff left over
    sequence_to_iuphar, iuphar_glom = pull_iuphar()
    for s, v in sequence_to_iuphar.items():
        sequence_concord[s].update(v)
    glom(concord, iuphar_glom)
    #  8. Use wikidata to get links between CHEBI and UniProt_PRO
    unichebi = pull_uniprot_chebi()
    glom(concord, unichebi)
    #  9. glom across sequence and chemical stuff
    new_groups = sequence_concord.values()
    glom(concord, new_groups, unique_prefixes=['GTOPDB', 'INCHI'])
    # 10. Drop PRO only sequences.
    to_remove = []
    for key, eq_id_set in concord.items():
        if len(eq_id_set) > 1:
            continue
        print(eq_id_set)
        item = next(iter(eq_id_set))
        if '#PRO_' in item:
            to_remove.append(key)
    for key in to_remove:
        del concord[key]
    #Add labels to CHEBIs, CHEMBLs, MESHes
    print('LABEL')
    label_chebis(concord)
    label_chembls(concord, refresh=refresh)
    label_meshes(concord)
    print('dumping')
    #Dump
    with open('chemconc.txt', 'w') as outf:
        dump_cache(concord, rosetta, outf)
    #dump_cache(concord,rosetta)
    print('done')
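The unique_prefixes argument used above is not defined in these examples. A hypothetical extension of the minimal glom sketch earlier, guessing at its intent (refuse a merge that would put two different identifiers with the same guarded prefix, e.g. two INCHIs, into one set):

# Hypothetical extension of the minimal glom sketch; the real glom's
# unique_prefixes semantics may differ.
def glom(d, eqs, unique_prefixes=None):
    unique_prefixes = unique_prefixes or []
    for group in eqs:
        merged = set(group)
        for identifier in group:
            if identifier in d:
                merged |= d[identifier]
        # Skip the merge if it would collect two ids under a guarded prefix.
        clashes = False
        for prefix in unique_prefixes:
            hits = [i for i in merged if str(i).startswith(prefix)]
            if len(hits) > 1:
                clashes = True
                break
        if clashes:
            continue
        for identifier in merged:
            d[identifier] = merged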