Esempio n. 1
0
def update_pubchem_mesh_map():
    """Rebuild the PubChem CID to MeSH ID mapping file.

    Downloads PubChem's CID-MeSH table, keeps only the one-to-one
    (CID, MeSH name) pairs, resolves each MeSH name to a MeSH ID through
    ``bio_ontology`` and writes the resulting (CID, MeSH ID) pairs as a
    tab-separated file named ``pubchem_mesh_map.tsv`` inside ``path``.

    Raises
    ------
    requests.HTTPError
        If the server answers the download with an error status.
    """
    url = 'https://ftp.ncbi.nlm.nih.gov/pubchem/Compound/Extras/CID-MeSH'
    res = requests.get(url)
    # Fail loudly on an HTTP error instead of parsing the error page as if
    # it were the table and overwriting the mapping file with the result.
    res.raise_for_status()

    # We first get mapping pairs from the table: the first column is the
    # PubChem CID and every further tab-separated column is a MeSH term.
    # Lines without a tab (e.g. the trailing empty line) yield no pairs.
    mappings = []
    for line in res.text.split('\n'):
        pcid, *mesh_names = line.split('\t')
        mappings.extend((pcid, mesh_name) for mesh_name in mesh_names)

    # The table has (1) rows with multiple MeSH terms separated by tabs,
    # (2) multiple rows with the same PubChem CID and (3) multiple rows
    # with the same MeSH term. We retain only one-to-one mappings here.
    pc_count = Counter(pcid for pcid, _ in mappings)
    mesh_count = Counter(mesh_name for _, mesh_name in mappings)
    unique_mappings = [
        (pcid, mesh_name) for pcid, mesh_name in mappings
        if pc_count[pcid] == 1 and mesh_count[mesh_name] == 1
    ]

    # The mappings table is given using MeSH term names so we need
    # to convert these to IDs. Lookups can fail for several reasons:
    # some entries are simply not valid MeSH names, others are not
    # yet included in the INDRA MeSH resources/ontology. Failed lookups
    # are dropped silently.
    unique_with_id = []
    for pcid, meshname in unique_mappings:
        mesh_ns_id_tuple = bio_ontology.get_id_from_name('MESH', meshname)
        if mesh_ns_id_tuple:
            unique_with_id.append((pcid, mesh_ns_id_tuple[1]))

    fname = os.path.join(path, 'pubchem_mesh_map.tsv')
    logger.info('Saving into %s', fname)
    write_unicode_csv(fname, unique_with_id, delimiter='\t')
Esempio n. 2
0
def get_ligands():
    """Return the set of gene symbols considered to be ligands.

    Candidates are collected from the surface protein spreadsheet, from
    genes annotated with the ligand-related GO terms (and their child
    terms) and from ``get_cpdb_ligands()``. They are mapped to current
    HGNC symbols, and everything returned by ``get_cpdb_receptors()`` or
    ``get_ion_channels()`` is then removed.
    """
    def to_current_symbols(symbols):
        # Map each symbol to its current HGNC name; failed lookups come
        # back as None and are dropped from the result.
        current = set()
        for symbol in symbols:
            hgnc_id = hgnc_client.get_current_hgnc_id(symbol)
            current.add(hgnc_client.get_hgnc_name(hgnc_id))
        current.discard(None)
        return current

    # Read and extract cell surface proteins from CSPA DB: keep the fifth
    # cell of every row whose seventh cell is 'yes'.
    workbook = openpyxl.load_workbook(SURFACE_PROTEINS_WB)
    surface_protein_set = set()
    for row in workbook['Sheet 1']:
        if row[6].value == 'yes':
            surface_protein_set.add(row[4].value)
    updated_surface_protein_set = to_current_symbols(surface_protein_set)
    # The count logged here is the raw spreadsheet count, taken before
    # the HGNC normalisation above.
    logger.info('Got %d surface proteins from spreadsheet' %
                len(surface_protein_set))

    # Resolve the ligand GO term names to GO IDs and include child terms
    ligand_terms = [
        'cytokine activity', 'hormone activity', 'growth factor activity',
        'extracellular matrix structural constituent'
    ]
    go_ids = []
    for term in ligand_terms:
        go_ids.append(bio_ontology.get_id_from_name('GO', term)[1])
    go_ids = expand_with_child_go_terms(go_ids)

    # Convert the GO IDs to gene symbols and pool all ligand sources
    go_genes = get_genes_for_go_ids(go_ids)
    manual_ligands = set()
    candidates = (updated_surface_protein_set | go_genes | manual_ligands
                  | get_cpdb_ligands())
    candidates = to_current_symbols(candidates)

    # Anything known to be a receptor or an ion channel is not a ligand
    excluded = get_cpdb_receptors() | get_ion_channels()
    return candidates - excluded
def get_receptors():
    """Return the set of receptor gene symbols, including ion channels.

    Starts from the GO term 'signaling receptor activity' and its child
    terms, keeps the terms whose name contains 'receptor', removes
    GO:0004879 and its child terms, converts the remaining GO IDs to gene
    symbols, removes two manually excluded genes and finally adds the
    entries listed (one per line) in the ``ION_CHANNELS`` file.
    """
    receptor_terms = ['signaling receptor activity']
    receptor_go_ids = [
        bio_ontology.get_id_from_name('GO', term)[1] for term in receptor_terms
    ]
    receptor_go_ids = expand_with_child_go_terms(receptor_go_ids)
    # Keep only GO terms that mention 'receptor' in their name, then
    # filter out the nuclear receptors (GO:0004879 and its child terms)
    receptor_go_ids = {r for r in receptor_go_ids if 'receptor' in
                       bio_ontology.get_name('GO', r)} - \
                      expand_with_child_go_terms(['GO:0004879'])
    receptor_genes_go = get_genes_for_go_ids(receptor_go_ids)
    receptor_genes_go -= {'NR2C2', 'EGF'}
    # Add ION channels to the receptor list
    ion_channels = set()
    with open(ION_CHANNELS, 'r') as fh:
        for line in fh:
            symbol = line.strip()
            # A blank line would otherwise put '' into the result as a
            # bogus gene symbol
            if symbol:
                ion_channels.add(symbol)
    receptor_genes_go |= ion_channels
    return receptor_genes_go
Esempio n. 4
0
def get_go_receptors():
    """Return receptor gene symbols derived from GO annotations.

    Starts from the GO term 'signaling receptor activity' and its child
    terms, keeps the terms whose name contains 'receptor', 'sensor' or
    'channel', converts them to gene symbols and removes the genes
    annotated with GO:0004879 or one of its child terms, plus two
    manually excluded genes.
    """
    receptor_terms = ['signaling receptor activity']
    receptor_go_ids = [
        bio_ontology.get_id_from_name('GO', term)[1] for term in receptor_terms
    ]
    receptor_go_ids = expand_with_child_go_terms(receptor_go_ids)

    # Keep only GO terms whose name mentions one of the keywords. The name
    # is looked up once per GO ID rather than once per keyword.
    name_keywords = ('receptor', 'sensor', 'channel')
    filtered_go_ids = set()
    for go_id in receptor_go_ids:
        go_name = bio_ontology.get_name('GO', go_id)
        if any(keyword in go_name for keyword in name_keywords):
            filtered_go_ids.add(go_id)

    # Filtering out the nuclear receptors from the receptor list; this is
    # done at the gene level, not at the GO term level.
    nuclear_receptor_go_ids = expand_with_child_go_terms(['GO:0004879'])
    receptor_genes_go = get_genes_for_go_ids(filtered_go_ids) - \
        get_genes_for_go_ids(nuclear_receptor_go_ids)
    # Manually excluded genes
    receptor_genes_go -= {'NR2C2', 'EGF'}
    return receptor_genes_go
def get_ligands():
    """Return the set of gene symbols considered to be ligands.

    The result is the union of the surface proteins read from the
    spreadsheet, the genes annotated with the ligand-related GO terms
    (including their child terms) and a currently empty set of manually
    added ligands.
    """
    # Read and extract cell surface proteins from CSPA DB: keep the fifth
    # cell of every row whose seventh cell is 'yes'.
    workbook = openpyxl.load_workbook(SURFACE_PROTEINS_WB)
    surface_protein_set = set()
    for row in workbook['Sheet 1']:
        if row[6].value == 'yes':
            surface_protein_set.add(row[4].value)
    logger.info('Got %d surface proteins from spreadsheet' %
                len(surface_protein_set))

    # Resolve the ligand GO term names to GO IDs and include child terms
    ligand_terms = [
        'cytokine activity', 'hormone activity', 'growth factor activity',
        'extracellular matrix structural constituent'
    ]
    go_ids = []
    for term in ligand_terms:
        go_ids.append(bio_ontology.get_id_from_name('GO', term)[1])
    go_ids = expand_with_child_go_terms(go_ids)

    # Convert the GO IDs to gene symbols
    ligand_genes_go = get_genes_for_go_ids(go_ids)

    # Placeholder for manually curated additions; currently empty
    manual_ligands = set()
    return surface_protein_set | ligand_genes_go | manual_ligands
Esempio n. 6
0
def test_name_lookup_obsolete():
    """Check that looking up 'ALDH3A2' in HGNC resolves to HGNC:403.

    Regression test: the lookup must not return another node with the
    same name but which is obsolete (HGNC:11093).
    """
    expected = ('HGNC', '403')
    found = bio_ontology.get_id_from_name('HGNC', 'ALDH3A2')
    assert found == expected
Esempio n. 7
0
 def get_identifier(namespace: str, name: str) -> str:
     """Return the identifier that ``bio_ontology`` gives for a name.

     The lookup result is unpacked as a (namespace, identifier) pair and
     only the identifier is returned.

     NOTE(review): presumably the lookup returns None for an unknown
     name, in which case the unpacking below raises TypeError. Confirm
     that callers expect this.
     """
     lookup = bio_ontology.get_id_from_name(namespace, name)
     _found_namespace, found_identifier = lookup
     return found_identifier
Esempio n. 8
0
    # Read and extract cell surface proteins from CSPA DB
    wb = openpyxl.load_workbook(SURFACE_PROTEINS_WB)
    surface_protein_set = set(row[4].value for row in wb['Sheet 1']
                              if row[6].value == 'yes')

    logger.info('Got %d surface proteins from spreadsheet' %
                len(surface_protein_set))
    ligand_terms = [
        'cytokine activity', 'hormone activity', 'growth factor activity'
    ]
    receptor_terms = ['signaling receptor activity']

    # Getting GO id's for ligands and receptors by using
    # GO terms
    ligand_go_ids = [
        bio_ontology.get_id_from_name('GO', term)[1] for term in ligand_terms
    ]
    receptor_go_ids = [
        bio_ontology.get_id_from_name('GO', term)[1] for term in receptor_terms
    ]

    # Converting GO id's to gene symbols
    ligand_genes_go = get_genes_for_go_ids(ligand_go_ids, GOA)
    receptor_genes_go = get_genes_for_go_ids(receptor_go_ids, GOA)
    manual_ligands = {'THBS1'}

    # remove all the receptors from the surface_protein_set
    full_ligand_set = \
        (surface_protein_set - receptor_genes_go) | ligand_genes_go | \
        manual_ligands