Esempio n. 1
0
def get_ligands():
    """Assemble the set of gene symbols that are treated as ligands.

    Candidates are collected from the surface protein spreadsheet, from genes
    annotated with ligand-like GO terms (and their child terms) and from the
    CPDB ligand list. All candidates are normalized to current HGNC names,
    after which CPDB receptors and ion channels are removed.

    Returns
    -------
    set
        The gene names remaining after normalization and filtering.
    """
    def current_names(symbols):
        # Map every symbol to its current HGNC name and drop failed lookups
        names = set()
        for symbol in symbols:
            names.add(hgnc_client.get_hgnc_name(
                hgnc_client.get_current_hgnc_id(symbol)))
        names.discard(None)
        return names

    # Surface proteins: keep rows whose seventh cell is 'yes' and take the
    # value of the fifth cell
    workbook = openpyxl.load_workbook(SURFACE_PROTEINS_WB)
    raw_surface_proteins = {
        row[4].value for row in workbook['Sheet 1'] if row[6].value == 'yes'
    }
    surface_proteins = current_names(raw_surface_proteins)
    # Note: the count reported here is that of the raw spreadsheet entries
    logger.info('Got %d surface proteins from spreadsheet' %
                len(raw_surface_proteins))

    # Resolve the GO terms describing ligand activity to GO IDs and include
    # all of their child terms
    go_ids = []
    for term in ('cytokine activity', 'hormone activity',
                 'growth factor activity',
                 'extracellular matrix structural constituent'):
        go_ids.append(bio_ontology.get_id_from_name('GO', term)[1])
    go_ids = expand_with_child_go_terms(go_ids)

    # Genes annotated with any of the GO IDs
    go_genes = get_genes_for_go_ids(go_ids)
    # Placeholder for manually curated additions (currently none)
    manual_ligands = set()
    candidates = (surface_proteins | go_genes | manual_ligands
                  | get_cpdb_ligands())

    # Normalize the merged set, then remove receptors and ion channels
    ligands = current_names(candidates)
    excluded = get_cpdb_receptors() | get_ion_channels()
    return ligands - excluded
Esempio n. 2
0
def _first_uniprot_id(hgnc_id):
    """Return the first available UniProt ID for one or several HGNC IDs.

    Parameters
    ----------
    hgnc_id : str or list or None
        A single HGNC ID, or a list of candidate HGNC IDs as returned by
        ``get_current_hgnc_id`` for an ambiguous symbol.

    Returns
    -------
    str or None
        For a list, the UniProt ID of the first candidate that has one, or
        None if no candidate does. Otherwise whatever ``get_uniprot_id``
        returns for the given value.
    """
    if not isinstance(hgnc_id, list):
        return get_uniprot_id(hgnc_id)
    for candidate in hgnc_id:
        up_id = get_uniprot_id(candidate)
        if up_id is not None:
            return up_id
    return None


def match_reactome(z_sc, reactome_dict):
    """Collect the pathways shared by each correlated pair of genes.

    Parameters
    ----------
    z_sc :
        The correlation data handed to ``corr_matrix_to_generator``, which
        is iterated as ``(a, b, corr)`` triples.
    reactome_dict : dict
        Mapping from a UniProt ID to an iterable of pathways (presumably
        Reactome pathway identifiers).

    Returns
    -------
    dict
        A dict of equally long lists keyed by 'agA_hgnc', 'agA_up',
        'agB_hgnc', 'agB_up', 'z_sc', 'has_pathways' and 'common_pathways'.
        Pairs for which either gene has no UniProt ID are skipped.
    """
    logger.info('Generating generator')
    corr_iterator = corr_matrix_to_generator(z_sc)
    res = {
        'agA_hgnc': [],
        'agA_up': [],
        'agB_hgnc': [],
        'agB_up': [],
        'z_sc': [],
        'has_pathways': [],
        'common_pathways': []
    }
    logger.info('Looping correlations')
    for a, b, corr in corr_iterator:
        # The previous hand-written ``while True`` lookup never terminated
        # once a candidate with a UniProt ID was found; the helper returns
        # as soon as it has a hit.
        a_up = _first_uniprot_id(get_current_hgnc_id(a))
        if a_up is None:
            continue

        b_up = _first_uniprot_id(get_current_hgnc_id(b))
        if b_up is None:
            continue

        common_reactome = set(reactome_dict.get(a_up, [])) & \
                          set(reactome_dict.get(b_up, []))
        res['agA_hgnc'].append(a)
        res['agA_up'].append(a_up)
        res['agB_hgnc'].append(b)
        res['agB_up'].append(b_up)
        res['z_sc'].append(corr)
        res['common_pathways'].append(common_reactome)
        res['has_pathways'].append(bool(common_reactome))
    logger.info('Returning results')
    return res
Esempio n. 3
0
def test_get_current_id():
    """Check ``get_current_hgnc_id`` on current, outdated and ambiguous
    symbols."""
    # A symbol that is still current resolves to a single ID
    braf_id = hgnc_client.get_current_hgnc_id('BRAF')
    assert braf_id == '1097'
    # An outdated symbol with exactly one current ID
    sept7_id = hgnc_client.get_current_hgnc_id('SEPT7')
    assert sept7_id == '1717'
    # An outdated symbol that maps to several current IDs yields a
    # collection of IDs
    hox1_ids = hgnc_client.get_current_hgnc_id('HOX1')
    assert len(hox1_ids) == 10
    assert '5101' in hox1_ids
Esempio n. 4
0
 def _replace_outdated_hgnc_symbols(self, pc_old, pc_current):
     """Rewrite a tab-separated file with outdated HGNC symbols updated.

     The file at `pc_old` is read without a header; its first three columns
     are treated as source, relation type and target. Every source/target
     entry not starting with 'CHEBI:' that resolves to exactly one current
     HGNC ID is replaced by the current name of that ID. The result is
     written to `pc_current` and `pc_old` is deleted.

     Parameters
     ----------
     pc_old : str
         Path of the input file; removed once the output is written.
     pc_current : str
         Path of the output file.
     """
     logger.info('Replacing outdated HGNC symbols in %s and save as %s' % \
                 (pc_old, pc_current))
     table = pandas.read_csv(pc_old, sep='\t', dtype=str, header=None)
     table = table.rename(
         mapper={0: 'source', 1: 'rel_type', 2: 'target'}, axis='columns')

     # Work out which symbols need to be renamed
     renames = {}
     for symbol in set(table['source']).union(table['target']):
         if symbol.startswith('CHEBI:'):
             continue
         hgnc_id = hgnc_client.get_current_hgnc_id(symbol)
         # Skip unknown symbols and outdated symbols that are ambiguous,
         # i.e., map to multiple genes
         if not hgnc_id or isinstance(hgnc_id, list):
             continue
         current_symbol = hgnc_client.get_hgnc_name(hgnc_id)
         if current_symbol != symbol:
             renames[symbol] = current_symbol

     if renames:
         table.replace(renames, inplace=True)
     table.to_csv(pc_current, sep='\t', header=False, index=False)
     os.remove(pc_old)
Esempio n. 5
0
def _get_genes(
    record: Mapping[str, Any],
    prefix: str,
    key: str,
) -> List[Tuple[str, str, str]]:
    rv = []
    #: A list of 2-tuples with the gene symbol then the expression value
    expressions = record[key]
    for symbol, _ in expressions:
        if prefix == "HGNC":
            current_id = hgnc_client.get_current_hgnc_id(symbol)
            # We may get no current IDs or more than one current IDs
            # in which case we skip this gene
            if not current_id or isinstance(current_id, list):
                identifier = None
            else:
                identifier = current_id
            _prefix = "HGNC"
        elif prefix == "MGI":
            _prefix, identifier = "UP", get_id_from_mgi_name(symbol)
        elif prefix == "RGD":
            _prefix, identifier = "UP", get_id_from_rgd_name(symbol)
        else:
            raise ValueError(f"Invalid prefix: {prefix} ! {symbol}")
        if identifier is None:
            if (prefix, symbol) not in MISSING_NAMES:
                logger.debug(
                    f"Could not look up {symbol} by name in {prefix}",
                )
                MISSING_NAMES.add((prefix, symbol))
            continue
        rv.append((_prefix, identifier, symbol))
    return rv
Esempio n. 6
0
def get_statements_for_kinase_db_api(kinase):
    """Return statements about a kinase, most evidence first.

    Parameters
    ----------
    kinase : str
        The gene symbol of the kinase.

    Returns
    -------
    list or None
        The statements left after MedScan filtering, sorted by decreasing
        number of evidences, or None if no HGNC ID was found for the symbol.
    """
    def evidence_count(stmt):
        return len(stmt.evidence)

    logger.info('Getting statements for %s' % kinase)
    hgnc_id = hgnc_client.get_current_hgnc_id(kinase)
    if hgnc_id is None:
        logger.warning('Could not get HGNC ID for %s' % kinase)
        return None
    processor = get_statements(agents=['%s@HGNC' % hgnc_id], ev_limit=10000)
    filtered = filter_out_medscan(processor.statements)
    return sorted(filtered, key=evidence_count, reverse=True)
def get_hgnc_ids(gene_names):
    """Return the current HGNC IDs for a collection of gene names.

    Names containing a '.' are rejected without a lookup, and names for
    which no ID is found are dropped; in both cases a message is printed.

    Parameters
    ----------
    gene_names : iterable of str
        The gene names to look up.

    Returns
    -------
    list
        The lookup results of the names that could be resolved, in input
        order.
    """
    resolved = []
    for name in gene_names:
        if '.' in name:
            print('%s is not an HGNC ID' % name)
        else:
            hgnc_id = hgnc_client.get_current_hgnc_id(name)
            if hgnc_id:
                resolved.append(hgnc_id)
            else:
                print('Invalid gene symbol: %s' % name)
    return resolved
Esempio n. 8
0
def align_identifiers_urls(indra_groundings, dm_urls):
    """Match identifiers.org URLs against a collection of groundings.

    Parameters
    ----------
    indra_groundings : collection of tuple
        ``(namespace, id)`` pairs against which the URLs are matched.
    dm_urls : iterable of str
        URLs of the form ``https://identifiers.org/<namespace>:<id>``.

    Returns
    -------
    list of tuple
        One ``(dm_url, matched_url)`` pair per URL that is not a literature
        reference, where ``matched_url`` is the identifiers URL of the first
        matching grounding or None if nothing matched.
    """
    matches = []
    identifiers_prefix = 'https://identifiers.org/'
    for dm_url in dm_urls:
        # We do it this way instead of splitting because of DOIs which have
        # extra slashes
        entity = dm_url[len(identifiers_prefix):]
        db_ns, db_id = entity.split(':', maxsplit=1)
        # Every branch produces a list of db_refs dicts; an entry that
        # cannot be grounded is represented by an empty dict
        if db_ns == 'CHEBI':
            db_refs = [
                standardize_db_refs({'CHEBI': '%s:%s' % (db_ns, db_id)})
            ]
        elif db_ns == 'hgnc':
            db_refs = [standardize_db_refs({'HGNC': db_id})]
        elif db_ns == 'hgnc.symbol':
            hgnc_ids = hgnc_client.get_current_hgnc_id(db_id)
            if not hgnc_ids:
                # Unknown symbol: nothing to standardize
                db_refs = [{}]
            else:
                # An outdated symbol can map to several current IDs, in
                # which case each of them is a candidate grounding
                if not isinstance(hgnc_ids, list):
                    hgnc_ids = [hgnc_ids]
                db_refs = [
                    standardize_db_refs({'HGNC': hgnc_id})
                    for hgnc_id in hgnc_ids
                ]
        elif db_ns == 'pubchem.compound':
            db_refs = [standardize_db_refs({'PUBCHEM': db_id})]
        elif db_ns == 'uniprot':
            db_refs = [standardize_db_refs({'UP': db_id})]
        elif db_ns == 'bigg.metabolite':
            chebi_ids = bigg_to_chebi.get(db_id)
            if chebi_ids:
                db_refs = [
                    standardize_db_refs({'CHEBI': chebi_id})
                    for chebi_id in chebi_ids
                ]
            else:
                db_refs = [{}]
        elif db_ns == 'ncbigene':
            hgnc_id = hgnc_client.get_hgnc_from_entrez(db_id)
            if hgnc_id:
                db_refs = [standardize_db_refs({'HGNC': hgnc_id})]
            else:
                db_refs = [{}]
        # Skip literature references that aren't entities
        elif db_ns in {'doi', 'pubmed'}:
            continue
        else:
            print('Unhandled namespace %s' % db_ns)
            db_refs = [{}]

        # Take the first grounding that matches and stop searching, so a
        # later candidate cannot override an earlier match
        matched = next(
            ((k, v)
             for db_ref in db_refs
             for k, v in db_ref.items()
             if (k, v) in indra_groundings),
            None)

        matches.append(
            (dm_url, get_identifiers_url(*matched) if matched else None))
    return matches
Esempio n. 9
0
 def add_famplex_hierarchy(self):
     """Add the relations from the FamPlex relations table as edges.

     Each row of ``famplex/relations.csv`` supplies a source namespace and
     ID, a relation type, and a target namespace and ID. Source IDs in the
     HGNC namespace are passed through ``get_current_hgnc_id`` before the
     node label is built. The relation type is stored under the 'type' key
     of the edge data.
     """
     from indra.databases import hgnc_client
     relations_path = get_resource_path(
         os.path.join('famplex', 'relations.csv'))
     edges = []
     for ns1, id1, rel, ns2, id2 in read_unicode_csv(relations_path,
                                                     delimiter=','):
         if ns1 == 'HGNC':
             id1 = hgnc_client.get_current_hgnc_id(id1)
         source = self.label(ns1, id1)
         target = self.label(ns2, id2)
         edges.append((source, target, {'type': rel}))
     self.add_edges_from(edges)
Esempio n. 10
0
def _hgncsym2up(hgnc_symb: str) -> str:
    """Return a UniProt ID for an HGNC symbol.

    If the symbol resolves to a list of HGNC IDs, the UniProt ID of the
    first entry that has one is returned, or None if no entry has one.
    Otherwise the result of the UniProt lookup for the resolved ID is
    returned as is.
    """
    hgnc_id = get_current_hgnc_id(hgnc_symb)
    if not isinstance(hgnc_id, list):
        return get_uniprot_id(hgnc_id)
    # Several candidates: stop at the first one with a UniProt mapping
    for candidate in hgnc_id:
        upid = get_uniprot_id(candidate)
        if upid is not None:
            return upid
    return None
def _get_upid_from_hgnc_symbol(hgnc_gene: str) -> Union[str, None]:
    """Return a UniProt ID for an HGNC symbol, or None if there is none.

    Parameters
    ----------
    hgnc_gene :
        The HGNC symbol to look up.

    Returns
    -------
    :
        If the symbol resolves to a list of HGNC IDs, the UniProt ID of the
        first entry that has one, or None if no entry does. Otherwise the
        result of the UniProt lookup for the resolved ID.
    """
    hgnc_id = get_current_hgnc_id(hgnc_gene)
    if not isinstance(hgnc_id, list):
        return get_uniprot_id(hgnc_id)
    # The former ``while True`` loop only advanced on a miss and only left
    # the loop on IndexError, so it never terminated once a candidate had
    # a UniProt ID. Return on the first hit instead.
    for candidate in hgnc_id:
        up_id = get_uniprot_id(candidate)
        if up_id is not None:
            return up_id
    return None
Esempio n. 12
0
def get_grounded_agent(gene_name):
    """Return an Agent for a gene name, grounded to HGNC and UniProt.

    The name as given is always kept as the TEXT grounding. The name is
    first replaced by its entry in ``hgnc_map``, if there is one, and then
    looked up as a symbol, falling back to the current-ID lookup if the
    direct lookup yields nothing.
    """
    db_refs = {'TEXT': gene_name}
    # Apply the manual name mapping before any lookup
    gene_name = hgnc_map.get(gene_name, gene_name)
    hgnc_id = (hgnc_client.get_hgnc_id(gene_name)
               or hgnc_client.get_current_hgnc_id(gene_name))
    if hgnc_id:
        db_refs['HGNC'] = hgnc_id
        up_id = hgnc_client.get_uniprot_id(hgnc_id)
        if up_id:
            db_refs['UP'] = up_id
    return Agent(gene_name, db_refs=db_refs)
Esempio n. 13
0
def get_all_kinase_hgnc_ids():
    """Return the HGNC IDs of all kinases listed in the IDG target table.

    Rows of ``IDG_target_final.csv`` whose second column is 'Kinase'
    contribute the gene name in their first column. Names for which no
    HGNC ID is found are reported with a printed message and left out.
    """
    with open('../../idg_ion_channels/data/IDG_target_final.csv', 'r') as fh:
        rows = csv.reader(fh, delimiter=',')
        # Skip the first row (presumably the header)
        next(rows)
        kinase_names = [row[0] for row in rows if row[1] == 'Kinase']

    kinase_ids = []
    for name in kinase_names:
        hgnc_id = hgnc_client.get_current_hgnc_id(name)
        if hgnc_id:
            kinase_ids.append(hgnc_id)
        else:
            print('Could not get HGNC ID for %s' % name)
    return kinase_ids
Esempio n. 14
0
def sanitize_hgnc_ids(raw_hgnc_ids):
    """Return a de-duplicated list of HGNC IDs for mixed IDs and symbols.

    Parameters
    ----------
    raw_hgnc_ids : iterable of str
        Entries that are either HGNC IDs (leading digits, optionally
        prefixed with a case-insensitive ``hgnc:``) or gene symbols.

    Returns
    -------
    list of str
        The unique IDs found, in no particular order. A symbol that
        resolves to several current IDs contributes all of them; a symbol
        that cannot be resolved contributes nothing.
    """
    collected = set()
    for raw in raw_hgnc_ids:
        # An entry counts as an ID if it starts with digits, with or
        # without the hgnc: prefix
        id_match = (re.match('([0-9]+)', raw)
                    or re.match('hgnc:([0-9]+)', raw.lower()))
        if id_match:
            collected.add(id_match.group(1))
            continue
        # Anything else is assumed to be a symbol
        current = hgnc_client.get_current_hgnc_id(raw)
        if isinstance(current, list):
            collected.update(current)
        elif current:
            collected.add(current)

    return list(collected)
Esempio n. 15
0
def map_hgnc_symbols(hgnc_symbols):
    """Return HGNC and UniProt references for a list of HGNC symbols.

    Each returned entry is a dict with the keys 'HGNC_SYMBOL', 'HGNC' and
    'UP'. Symbols with no current HGNC ID, with more than one current HGNC
    ID, or with no UniProt ID are logged with a warning and left out.
    """
    refs = []
    for symbol in hgnc_symbols:
        hgnc_id = hgnc_client.get_current_hgnc_id(symbol)
        if not hgnc_id:
            logger.warning('Could not get HGNC ID for symbol %s' % symbol)
            continue
        if isinstance(hgnc_id, list):
            logger.warning('More than one current HGNC ID for outdated '
                           'symbol %s' % symbol)
            continue
        up_id = hgnc_client.get_uniprot_id(hgnc_id)
        if not up_id:
            logger.warning('Could not get UniProt ID for symbol %s' %
                           symbol)
            continue
        # Only fully resolved symbols make it into the result
        refs.append({'HGNC_SYMBOL': symbol, 'HGNC': hgnc_id, 'UP': up_id})
    return refs
Esempio n. 16
0
def get_stmts_for_gene(gene: str, max_stmts: int = 100000) -> List[Statement]:
    """Fetch the Statements stored in the DB for one gene.

    Parameters
    ----------
    gene :
        HGNC symbol of the gene of interest.
    max_stmts:
        Upper limit on the number of statements to fetch.

    Returns
    -------
    :
        The INDRA Statements involving the gene, or an empty list if no
        current HGNC ID is found for the symbol.
    """
    current_id = hgnc_client.get_current_hgnc_id(gene)
    if current_id is None:
        return []
    # The gene is queried as a single agent in the HGNC namespace
    stmt_jsons = get_raw_stmt_jsons_from_agents(
        agents=[(None, current_id, "HGNC")],
        max_stmts=max_stmts,
    )
    return stmts_from_json(stmt_jsons.values())
def up_for_hgnc(gene):
    """Return the current HGNC symbol and a UniProt ID for a gene name.

    The gene name may be outdated. If it has no current HGNC ID, or maps to
    more than one, the name is returned unchanged together with None.
    If the UniProt lookup yields a comma-separated string, only its first
    entry is returned.
    """
    hgnc_id = hgnc_client.get_current_hgnc_id(gene)
    # Neither a missing ID nor an ambiguous list of IDs can be resolved
    if hgnc_id is None or isinstance(hgnc_id, list):
        return gene, None

    hgnc_name = hgnc_client.get_hgnc_name(hgnc_id)
    up_id = hgnc_client.get_uniprot_id(hgnc_id)
    if up_id is not None and ',' in up_id:
        # Several UniProt IDs: keep the first, without padding
        up_id = up_id.split(',')[0].strip()
    return hgnc_name, up_id
Esempio n. 18
0
def get_db_refs_by_name(ns, name, node_data):
    """Return standard name and grounding based on a namespace and a name.

    Each supported namespace has its own branch that looks up identifiers
    for the name and, where a standard name is available, replaces the
    name with it.

    Parameters
    ----------
    ns : str
        A name space in which the given name is interpreted.
    name : str
        The name in the given name space to get grounding for.
    node_data : dict
        Node data for logging purposes.

    Returns
    -------
    name : str
        The standardized name for the given entity. If no standardization
        took place, this is the name that was passed in.
    db_refs : dict or None
        The grounding for the given entity. None if the namespace is not
        handled or if the lookup needed for grounding failed.

    """
    db_refs = None
    if ns == 'HGNC':
        # Assumption: name is an HGNC symbol
        hgnc_id = hgnc_client.get_current_hgnc_id(name)
        if not hgnc_id:
            logger.info("Invalid HGNC name: %s (%s)" % (name, node_data))
            return name, None
        elif isinstance(hgnc_id, list):
            # An outdated symbol can map to several current IDs; the first
            # one is picked
            logger.info('More than one current HGNC ID for %s, choosing %s' %
                        (name, hgnc_id[0]))
            hgnc_id = hgnc_id[0]
        name = hgnc_client.get_hgnc_name(hgnc_id)
        db_refs = {'HGNC': hgnc_id}
        # Add UniProt and miRBase groundings when they are available
        up_id = _get_up_id(hgnc_id)
        if up_id:
            db_refs['UP'] = up_id
        mirbase_id = mirbase_client.get_mirbase_id_from_hgnc_id(hgnc_id)
        if mirbase_id:
            db_refs['MIRBASE'] = mirbase_id

    elif ns in ('UNIPROT', 'UP'):
        up_id = None
        # This is a simple test to see if name is a valid UniProt ID,
        # if we can't get a mnemonic, we assume it's not a UP ID
        if uniprot_client.get_mnemonic(name, web_fallback=False):
            up_id = name
        # We next check if it's a mnemonic
        else:
            up_id_from_mnem = uniprot_client.get_id_from_mnemonic(name)
            if up_id_from_mnem:
                up_id = up_id_from_mnem
        if not up_id:
            logger.info('Couldn\'t get UP ID from %s' % name)
            return name, None
        db_refs = {'UP': up_id}
        # Prefer the HGNC name if there is an HGNC ID, otherwise fall back
        # to the gene name from UniProt
        hgnc_id = uniprot_client.get_hgnc_id(up_id)
        if hgnc_id:
            db_refs['HGNC'] = hgnc_id
            name = hgnc_client.get_hgnc_name(hgnc_id)
        else:
            name = uniprot_client.get_gene_name(up_id)
    elif ns == 'FPLX':
        # The name is used directly as the FPLX identifier
        db_refs = {'FPLX': name}
    elif ns in ('GO', 'GOBP', 'GOCC'):
        # This label is replaced before the lookup
        if name == 'cell proliferation':
            name = 'cell population proliferation'
        go_id = go_client.get_go_id_from_label(name)
        if not go_id:
            logger.info('Could not find GO ID for %s' % name)
            return name, None
        db_refs = {'GO': go_id}
        name = go_client.get_go_label(go_id)
    elif ns in ('MESHPP', 'MESHD', 'MESH'):
        mesh_id, mesh_name = mesh_client.get_mesh_id_name(name)
        if not mesh_id:
            logger.info('Could not find MESH ID from %s' % name)
            return name, None
        name = mesh_name
        db_refs = {'MESH': mesh_id}
    # For MGI/RGD, the name is looked up in a lookup table and grounded to
    # UP if found; otherwise db_refs stays None and the name is unchanged
    # FIXME: Full implementation would look up MGI/RGD identifiers from
    # the names, and obtain corresponding Uniprot IDs
    elif ns == 'MGI':
        up_id = mouse_lookup.get(name)
        if up_id:
            db_refs = {'UP': up_id}
    elif ns == 'RGD':
        up_id = rat_lookup.get(name)
        if up_id:
            db_refs = {'UP': up_id}
    # Map Selventa families and complexes to FamPlex
    elif ns == 'SFAM':
        db_refs = {'SFAM': name}
        indra_name = bel_to_indra.get(name)
        if indra_name is None:
            logger.info('Could not find mapping for BEL/SFAM family: '
                        '%s (%s)' % (name, node_data))
        else:
            db_refs['FPLX'] = indra_name
            name = indra_name
    elif ns == 'SCOMP':
        db_refs = {'SCOMP': name}
        indra_name = bel_to_indra.get(name)
        if indra_name is None:
            logger.info('Could not find mapping for BEL/SCOMP complex: '
                        '%s (%s)' % (name, node_data))
        else:
            db_refs['FPLX'] = indra_name
            name = indra_name
    # Map Entrez genes to HGNC/UP
    elif ns in ('EGID', 'ENTREZ', 'NCBIGENE'):
        hgnc_id = hgnc_client.get_hgnc_from_entrez(name)
        # The Entrez ID itself is always kept in the grounding
        db_refs = {'EGID': name}
        if hgnc_id is not None:
            db_refs['HGNC'] = hgnc_id
            name = hgnc_client.get_hgnc_name(hgnc_id)
            up_id = hgnc_client.get_uniprot_id(hgnc_id)
            if up_id:
                db_refs['UP'] = up_id
            else:
                logger.info(
                    'HGNC entity %s with HGNC ID %s has no '
                    'corresponding Uniprot ID.', name, hgnc_id)
            mirbase_id = mirbase_client.get_mirbase_id_from_hgnc_id(hgnc_id)
            if mirbase_id:
                db_refs['MIRBASE'] = mirbase_id
        else:
            # Without an HGNC mapping, the name becomes the Entrez ID with
            # an 'E' prefix
            logger.debug('Could not map EGID%s to HGNC.' % name)
            name = 'E%s' % name
    elif ns == 'MIRBASE':
        mirbase_id = mirbase_client.get_mirbase_id_from_mirbase_name(name)
        if not mirbase_id:
            logger.info('Could not map miRBase name %s to ID', name)
            return name, None
        db_refs = {'MIRBASE': mirbase_id}
        # Add the HGNC grounding and use the HGNC name if there is one
        hgnc_id = mirbase_client.get_hgnc_id_from_mirbase_id(mirbase_id)
        if hgnc_id:
            db_refs['HGNC'] = hgnc_id
            name = hgnc_client.get_hgnc_name(hgnc_id)
    # CHEBI
    elif ns == 'CHEBI':
        # We first look up BEL's own namespace map for ChEBI names to IDs
        chebi_id = chebi_name_id.get(name)
        # If that fails, we look up INDRA's ChEBI name to ID mapping
        if not chebi_id:
            chebi_id = chebi_client.get_chebi_id_from_name(name)
        if chebi_id:
            db_refs = {'CHEBI': chebi_id}
        else:
            logger.info('CHEBI name %s not found in map.' % name)
    # These appear in the name slot but are actually IDs
    elif ns == 'CHEBIID':
        chebi_id = identifiers.ensure_chebi_prefix(name)
        db_refs = {'CHEBI': chebi_id}
        name = chebi_client.get_chebi_name_from_id(chebi_id)
    # SDIS, SCHEM, TEXT: Include the name as the ID for the namespace
    elif ns in ('SDIS', 'SCHEM', 'TEXT'):
        db_refs = {ns: name}
    elif ns == 'TAX':
        tid = taxonomy_client.get_taxonomy_id(name)
        if tid:
            db_refs = {'TAXONOMY': tid}
        else:
            logger.info('Could not get taxonomy ID for %s' % name)
    else:
        # Unknown namespaces are only logged; db_refs stays None
        logger.info("Unhandled namespace: %s: %s (%s)" % (ns, name, node_data))
    return name, db_refs