Exemple #1
0
def _first_uniprot_id_for_symbol(symbol):
    """Return the first UniProt ID that can be resolved for an HGNC symbol.

    ``get_current_hgnc_id`` may return either a single ID or a list of IDs.
    For a list, the entries are tried in order and the first non-None
    result of ``get_uniprot_id`` is returned; None if no entry resolves.
    """
    hgnc_id = get_current_hgnc_id(symbol)
    if not isinstance(hgnc_id, list):
        return get_uniprot_id(hgnc_id)
    for candidate in hgnc_id:
        up_id = get_uniprot_id(candidate)
        if up_id is not None:
            return up_id
    return None


def match_reactome(z_sc, reactome_dict):
    """Collect shared Reactome pathways for every correlated gene pair.

    Parameters
    ----------
    z_sc :
        Correlation data handed to ``corr_matrix_to_generator``, which is
        iterated as ``(a, b, corr)`` triples.
    reactome_dict : dict
        Looked up by UniProt ID; each value is converted to a set of
        pathways. Missing IDs are treated as having no pathways.

    Returns
    -------
    dict
        Column-oriented results with the keys ``agA_hgnc``, ``agA_up``,
        ``agB_hgnc``, ``agB_up``, ``z_sc``, ``has_pathways`` and
        ``common_pathways``. Pairs for which either gene has no UniProt ID
        are skipped.
    """
    logger.info('Generating generator')
    corr_iterator = corr_matrix_to_generator(z_sc)
    res = {
        'agA_hgnc': [],
        'agA_up': [],
        'agB_hgnc': [],
        'agB_up': [],
        'z_sc': [],
        'has_pathways': [],
        'common_pathways': []
    }
    logger.info('Looping correlations')
    for a, b, corr in corr_iterator:
        # The previous hand-rolled ``while True`` lookup never left the loop
        # once an ID was found; the helper returns on the first hit instead.
        a_up = _first_uniprot_id_for_symbol(a)
        if a_up is None:
            continue

        b_up = _first_uniprot_id_for_symbol(b)
        if b_up is None:
            continue

        common_reactome = set(reactome_dict.get(a_up, [])) & \
            set(reactome_dict.get(b_up, []))
        res['agA_hgnc'].append(a)
        res['agA_up'].append(a_up)
        res['agB_hgnc'].append(b)
        res['agB_up'].append(b_up)
        res['z_sc'].append(corr)
        res['common_pathways'].append(common_reactome)
        res['has_pathways'].append(bool(common_reactome))
    logger.info('Returning results')
    return res
Exemple #2
0
def read_phosphosite(fname):
    """Build phosphorylation statements and an antibody map from a CSV file.

    Parameters
    ----------
    fname : str or file-like
        CSV source handed to ``pandas.read_csv``. Each row is read through
        the columns ``SUB_ID``, ``SUB_GENE``, ``Actual_site``,
        ``phosphosite``, ``KIN_ID`` and ``KINASE_GENE_SYMBOL``.

    Returns
    -------
    statements : list
        One Phosphorylation statement per row whose kinase was not skipped
        as non-human.
    antibody_map : dict
        Maps each ``phosphosite`` value to the list of distinct modified
        substrate Agents (by name, residue and position) seen for it.
    """
    df = pandas.read_csv(fname, index_col=None)
    statements = []
    antibody_map = {}
    for _, row in df.iterrows():
        sub_upid = row['SUB_ID']
        if not pandas.isnull(sub_upid):
            sub_hgnc_symbol = uniprot_client.get_gene_name(sub_upid)
            sub_hgnc = hgnc_client.get_hgnc_id(sub_hgnc_symbol)
        else:
            # No UniProt ID in the row: ground through the gene symbol.
            # The HGNC ID must land in ``sub_hgnc`` because that is the
            # name used when the Agent is built below.
            sub_hgnc_symbol = row['SUB_GENE']
            sub_hgnc = hgnc_client.get_hgnc_id(sub_hgnc_symbol)
            sub_upid = hgnc_client.get_uniprot_id(sub_hgnc)
        sub = Agent(sub_hgnc_symbol,
                    db_refs={'UP': sub_upid, 'HGNC': sub_hgnc})

        # First character is the residue, the remainder (if any) the position
        site = row['Actual_site']
        residue = site[0]
        position = site[1:] if len(site) > 1 else None

        sub_readout = deepcopy(sub)
        sub_readout.mods = [ModCondition('phosphorylation', residue, position)]
        ps = row['phosphosite']
        readouts = antibody_map.setdefault(ps, [])
        already_known = any(
            p.name == sub.name and p.mods[0].residue == residue and
            p.mods[0].position == position
            for p in readouts
        )
        if not already_known:
            readouts.append(sub_readout)

        kin_upid = row['KIN_ID']
        if not pandas.isnull(kin_upid):
            if not uniprot_client.is_human(kin_upid):
                print('%s non human' % kin_upid)
                continue
            kin_hgnc_symbol = uniprot_client.get_gene_name(kin_upid)
            kin_hgnc = hgnc_client.get_hgnc_id(kin_hgnc_symbol)
        else:
            # Same fallback as for the substrate, stored in ``kin_hgnc``
            kin_hgnc_symbol = row['KINASE_GENE_SYMBOL']
            kin_hgnc = hgnc_client.get_hgnc_id(kin_hgnc_symbol)
            kin_upid = hgnc_client.get_uniprot_id(kin_hgnc)
        kin = Agent(kin_hgnc_symbol,
                    db_refs={'UP': kin_upid, 'HGNC': kin_hgnc})

        ev = Evidence(source_api='phosphosite')
        st = Phosphorylation(kin, sub, residue, position, evidence=[ev])
        statements.append(st)
    return statements, antibody_map
def _hgncsym2up(hgnc_symb: str) -> str:
    """Look up a UniProt ID for an HGNC symbol.

    When ``get_current_hgnc_id`` yields a list, its entries are tried in
    order and the first non-None result of ``get_uniprot_id`` is returned
    (None when no entry resolves). Otherwise the single ID is looked up
    directly.
    """
    current = get_current_hgnc_id(hgnc_symb)
    if not isinstance(current, list):
        return get_uniprot_id(current)
    upid = None
    for candidate in current:
        upid = get_uniprot_id(candidate)
        if upid is not None:
            break
    return upid
def _get_upid_from_hgnc_symbol(hgnc_gene: str) -> Union[str, None]:
    """Return a UniProt ID for an HGNC symbol, or None if none is found.

    ``get_current_hgnc_id`` may return a single ID or a list of IDs. For a
    list, the entries are tried in order and the first non-None result of
    ``get_uniprot_id`` wins.
    """
    hgnc_id = get_current_hgnc_id(hgnc_gene)
    if not isinstance(hgnc_id, list):
        return get_uniprot_id(hgnc_id)
    # A plain for-loop terminates on the first hit; the former ``while True``
    # only advanced on None and therefore spun forever once an ID was found.
    for candidate in hgnc_id:
        up_id = get_uniprot_id(candidate)
        if up_id is not None:
            return up_id
    return None
Exemple #5
0
 def _get_db_refs(bpe):
     """Return a db_refs dict for a BioPAX element.

     Proteins get HGNC/UP entries (each filled in from the other when only
     one is available), small molecules get a CHEBI entry, and anything
     else gets whichever of CHEBI, HGNC and UP can be found. Entries whose
     value is None are left out.
     """
     if _is_protein(bpe):
         hgnc = BiopaxProcessor._get_hgnc_id(bpe)
         up = BiopaxProcessor._get_uniprot_id(bpe)
         # Fill in whichever of the two identifiers is missing
         if hgnc and not up:
             up = hgnc_client.get_uniprot_id(hgnc)
         if up and not hgnc and uniprot_client.is_human(up):
             symbol = uniprot_client.get_gene_name(up, False)
             if symbol:
                 hgnc = hgnc_client.get_hgnc_id(symbol)
         candidates = (('HGNC', hgnc), ('UP', up))
     elif _is_small_molecule(bpe):
         candidates = (('CHEBI', BiopaxProcessor._get_chebi_id(bpe)),)
     else:
         candidates = (
             ('CHEBI', BiopaxProcessor._get_chebi_id(bpe)),
             ('HGNC', BiopaxProcessor._get_hgnc_id(bpe)),
             ('UP', BiopaxProcessor._get_uniprot_id(bpe)),
         )
     return {ns: ref for ns, ref in candidates if ref is not None}
Exemple #6
0
def agent_from_gene_name(name):
    """Return an Agent for a gene name, grounded to HGNC and UniProt.

    Both db_refs entries are always set, even when a lookup returned None.
    """
    grounded = Agent(name)
    hgnc_ref = hgnc_client.get_hgnc_id(name)
    grounded.db_refs = {
        'HGNC': hgnc_ref,
        'UP': hgnc_client.get_uniprot_id(hgnc_ref),
    }
    return grounded
def get_mappings() -> Iterable[PredictionTuple]:
    """Yield lexical MeSH-to-UniProt predictions for human proteins.

    Every MeSH name matching ``MESH_PROTEIN_RE`` contributes its first
    capture group as a gene name. A prediction is emitted only when that
    name resolves to an HGNC ID and to a UniProt ID string that contains
    no comma.
    """
    script_url = get_script_url(__file__)
    for mesh_name, mesh_id in mesh_client.mesh_name_to_id.items():
        hit = MESH_PROTEIN_RE.match(mesh_name)
        if not hit:
            continue
        gene_name = hit.group(1)
        hgnc_id = hgnc_client.get_hgnc_id(gene_name)
        if not hgnc_id:
            continue
        uniprot_id = hgnc_client.get_uniprot_id(hgnc_id)
        if uniprot_id and "," not in uniprot_id:
            yield PredictionTuple(
                "mesh",
                mesh_id,
                mesh_name,
                "skos:exactMatch",
                "uniprot",
                uniprot_id,
                gene_name,
                "lexical",
                0.999,
                script_url,
            )
Exemple #8
0
def agent_from_gene_name(gene_name):
    """Return an Agent whose db_refs carry the HGNC and UniProt lookups.

    Both keys are always present; their values are whatever the lookups
    returned.
    """
    hgnc_ref = hgnc_client.get_hgnc_id(gene_name)
    refs = {'HGNC': hgnc_ref}
    refs['UP'] = hgnc_client.get_uniprot_id(hgnc_ref)
    return Agent(gene_name, db_refs=refs)
Exemple #9
0
def update_kinases():
    """Download the UniProt kinase table and append extra kinases.

    The table is saved to ``kinases.tsv`` under ``path``, read back, and
    extended with one row per gene symbol in ``add_kinases`` before being
    written out again to the same file.
    """
    logger.info('--Updating kinase list------')
    url = 'http://www.uniprot.org/uniprot/?' + \
        'sort=entry_name&desc=no&compress=no&query=database:(type:' + \
        'interpro%20ipr011009)%20AND%20reviewed:yes%20AND%20organism:' + \
        '%22Homo%20sapiens%20(Human)%20[9606]%22&fil=&force=no' + \
        '&format=tab&columns=id,genes(PREFERRED),organism-id,entry%20name'
    fname = os.path.join(path, 'kinases.tsv')
    save_from_http(url, fname)

    from indra.databases import hgnc_client, uniprot_client
    add_kinases = [
        'PGK1', 'PKM', 'TAF1', 'NME1', 'BCKDK', 'PDK1', 'PDK2', 'PDK3', 'PDK4',
        'BCR', 'FAM20C', 'BAZ1B', 'PIKFYVE'
    ]
    df = pandas.read_csv(fname, sep='\t')
    extra_rows = []
    for kinase in add_kinases:
        hgnc_id = hgnc_client.get_hgnc_id(kinase)
        up_id = hgnc_client.get_uniprot_id(hgnc_id)
        up_mnemonic = uniprot_client.get_mnemonic(up_id)
        extra_rows.append(
            {
                'Entry': up_id,
                'Gene names  (primary )': kinase,
                'Organism ID': '9606',
                'Entry name': up_mnemonic
            })
    # DataFrame.append no longer exists in pandas >= 2.0; a single concat
    # also avoids re-copying the frame once per added row.
    df = pandas.concat([df, pandas.DataFrame(extra_rows)], ignore_index=True)
    df.to_csv(fname, sep='\t', index=False)
Exemple #10
0
 def standardize_agent_db_refs(agent, map_db_refs, do_rename=True):
     """Reconcile UP/HGNC entries of a grounding-map entry and apply it.

     ``map_db_refs`` is modified in place and then assigned to
     ``agent.db_refs``. Its 'HGNC' entry is read as a gene symbol and is
     replaced by the HGNC ID when one can be found.

     Raises
     ------
     ValueError
         If only a symbol is given and it has no HGNC ID, if both are given
         and no gene name is found for the UniProt ID, or if that gene name
         differs from the given symbol.
     """
     gene_name = None
     up_id = map_db_refs.get('UP')
     hgnc_sym = map_db_refs.get('HGNC')
     have_up = bool(up_id)
     have_sym = bool(hgnc_sym)

     if have_up and have_sym:
         # Both given: the symbol must agree with the one UniProt reports
         gene_name = uniprot_client.get_gene_name(up_id)
         if not gene_name:
             raise ValueError('No gene name found for Uniprot '
                              'ID %s (expected %s)' % (up_id, hgnc_sym))
         if gene_name != hgnc_sym:
             raise ValueError('Gene name %s for Uniprot ID '
                              '%s does not match HGNC '
                              'symbol %s given in grounding '
                              'map.' % (gene_name, up_id, hgnc_sym))
         hgnc_id = hgnc_client.get_hgnc_id(hgnc_sym)
         if hgnc_id:
             map_db_refs['HGNC'] = hgnc_id
         else:
             logger.error('No HGNC ID corresponding to gene '
                          'symbol %s in grounding map.' % hgnc_sym)
     elif have_up:
         # Only a UniProt ID: derive the HGNC ID through the gene name
         gene_name = uniprot_client.get_gene_name(up_id, False)
         if gene_name:
             hgnc_id = hgnc_client.get_hgnc_id(gene_name)
             if hgnc_id:
                 map_db_refs['HGNC'] = hgnc_id
     elif have_sym:
         # Only a symbol: swap it for the HGNC ID, then add the UniProt ID
         hgnc_id = hgnc_client.get_hgnc_id(hgnc_sym)
         if not hgnc_id:
             raise ValueError('No HGNC ID corresponding to gene '
                              'symbol %s in grounding map.' % hgnc_sym)
         map_db_refs['HGNC'] = hgnc_id
         up_id = hgnc_client.get_uniprot_id(hgnc_id)
         if up_id:
             map_db_refs['UP'] = up_id

     agent.db_refs = map_db_refs
     if not do_rename:
         return
     # Name precedence: FamPlex ID, then given symbol, then looked-up name
     if agent.db_refs.get('FPLX'):
         agent.name = agent.db_refs.get('FPLX')
     elif hgnc_sym is not None:
         agent.name = hgnc_sym
     elif gene_name is not None:
         agent.name = gene_name
     return
Exemple #11
0
    def _make_db_refs(self, entrez_id, text_id):
        """Build the grounding for a gene given by its Entrez ID.

        Parameters
        ----------
        entrez_id : str
            Entrez gene ID.
        text_id : str or None
            A plain text systematic name; None or '-' means that no
            TEXT entry is recorded.

        Returns
        -------
        hgnc_name : str
            Result of the HGNC name lookup for the gene.
        db_refs : dict
            db_refs grounding dictionary with TEXT, HGNC and UP entries
            where available.
        """
        db_refs = {}
        if text_id is not None and text_id != '-':
            db_refs['TEXT'] = text_id

        hgnc_id = hgnc_client.get_hgnc_from_entrez(entrez_id)
        hgnc_name = hgnc_client.get_hgnc_name(hgnc_id)
        if hgnc_id is None:
            return (hgnc_name, db_refs)

        db_refs['HGNC'] = hgnc_id
        up_id = hgnc_client.get_uniprot_id(hgnc_id)
        if up_id is not None:
            db_refs['UP'] = up_id
        return (hgnc_name, db_refs)
Exemple #12
0
    def _make_db_refs(self, entrez_id, text_id):
        """Build the grounding for a gene given by its Entrez ID.

        Parameters
        ----------
        entrez_id : str
            Entrez gene ID.
        text_id : str or None
            A plain text systematic name; None or '-' means that no
            TEXT entry is recorded.

        Returns
        -------
        hgnc_name : str
            Result of the HGNC name lookup for the gene.
        db_refs : dict
            db_refs grounding dictionary with TEXT, HGNC and UP entries
            where available.
        """
        db_refs = {}
        if text_id is not None and text_id != '-':
            db_refs['TEXT'] = text_id

        hgnc_id = hgnc_client.get_hgnc_from_entrez(entrez_id)
        hgnc_name = hgnc_client.get_hgnc_name(hgnc_id)
        if hgnc_id is None:
            return (hgnc_name, db_refs)

        db_refs['HGNC'] = hgnc_id
        up_id = hgnc_client.get_uniprot_id(hgnc_id)
        if up_id is not None:
            db_refs['UP'] = up_id
        return (hgnc_name, db_refs)
Exemple #13
0
 def _add_node(self, agent):
     """Register a node for an agent and return its ID.

     Nodes are keyed by agent name; a known name returns the existing ID
     without adding anything. New nodes also record the HGNC children
     reported by ``expander.get_children`` as members.
     """
     node_key = agent.name
     known_id = self._existing_nodes.get(node_key)
     if known_id is not None:
         return known_id

     db_refs = _get_db_refs(agent)
     node_id = self._get_new_id()
     self._existing_nodes[node_key] = node_id

     members = {}
     for member in expander.get_children(agent, ns_filter='HGNC'):
         symbol = member[1]
         hgnc_id = hgnc_client.get_hgnc_id(symbol)
         member_db_refs = {}
         if hgnc_id:
             member_agent = Agent(
                 symbol,
                 db_refs={'HGNC': hgnc_id,
                          'UP': hgnc_client.get_uniprot_id(hgnc_id)})
             member_db_refs = _get_db_refs(member_agent)
         members[symbol] = {
             'mutation': None,
             'expression': None,
             'db_refs': member_db_refs,
         }

     self._nodes.append({
         'data': {
             'id': node_id,
             'name': agent.name.replace('_', ' '),
             'db_refs': db_refs,
             'parent': '',
             'members': members,
         }
     })
     return node_id
Exemple #14
0
def _get_up_id(hgnc_id):
    """Return the single UniProt ID for an HGNC ID, or None.

    None is returned both when no ID is found (which is logged) and when
    the result contains a comma.
    """
    hgnc_key = str(hgnc_id)
    up_id = hgnc_client.get_uniprot_id(hgnc_key)
    if not up_id:
        logger.info("No Uniprot ID for HGNC ID %s" % hgnc_key)
        return None
    return None if ',' in up_id else up_id
Exemple #15
0
def get_target_agent(target):
    """Return an Agent named ``target`` with HGNC and UP db_refs.

    Both entries are always set to the lookup results as returned.
    """
    hgnc_ref = hgnc_client.get_hgnc_id(target)
    up_ref = hgnc_client.get_uniprot_id(hgnc_ref)
    refs = {'HGNC': hgnc_ref, 'UP': up_ref}
    return Agent(target, db_refs=refs)
Exemple #16
0
 def _get_agent_from_gene_name(gene_name):
     """Return an Agent for a gene name with any HGNC/UP grounding found.

     HGNC is added only when an ID is found, and UP only when the HGNC ID
     in turn resolves to a UniProt ID.
     """
     hgnc_id = hgnc_client.get_hgnc_id(gene_name)
     up_id = hgnc_client.get_uniprot_id(hgnc_id) if hgnc_id else None
     refs = {}
     if hgnc_id:
         refs['HGNC'] = hgnc_id
     if up_id:
         refs['UP'] = up_id
     return Agent(gene_name, db_refs=refs)
Exemple #17
0
def get_agent(raw_name, entrez_id):
    """Return an Agent for an Entrez gene, falling back to the raw name.

    The raw name is always kept as the TEXT grounding. When an HGNC ID is
    found, the UP entry is set from it and the HGNC name is used as the
    Agent name.
    """
    db_refs = {'TEXT': raw_name}
    logger.debug('Looking up grounding data for Entrez #%s' % entrez_id)
    hgnc_id = hgc.get_hgnc_from_entrez(entrez_id)
    if hgnc_id is None:
        return Agent(raw_name, db_refs=db_refs)
    db_refs['UP'] = hgc.get_uniprot_id(hgnc_id)
    return Agent(hgc.get_hgnc_name(hgnc_id), db_refs=db_refs)
Exemple #18
0
def _fix_agent(agent):
    if agent is None:
        return
    # First we fix some name spaces
    db_refs_tmp = copy(agent.db_refs)
    for db_ns, db_id in agent.db_refs.items():
        # Change FA name space
        if db_ns == 'FA':
            db_refs_tmp.pop('FA', None)
            db_refs_tmp['NXPFA'] = db_id
        # Change IPR name space
        elif db_ns == 'IPR':
            db_refs_tmp.pop('IPR', None)
            db_refs_tmp['IP'] = db_id
        # Change XFAM name space
        elif db_ns == 'XFAM':
            db_refs_tmp.pop('XFAM', None)
            db_refs_tmp['PF'] = db_id.split('.')[0]
    agent.db_refs = db_refs_tmp
    # Check if we have a BE entry
    be_id = agent.db_refs.get('BE')
    # Try to map to BE from NXP, IPR, PF, NCIT
    if not be_id:
        for db_ns, db_id in agent.db_refs.items():
            be_id = bioentities_map.get((db_ns, db_id))
            if be_id:
                break
    # Try mapping NCIT to specific genes if possible
    if not be_id and 'NCIT' in agent.db_refs:
        target = ncit_map.get(agent.db_refs['NCIT'])
        if target:
            agent.db_refs[target[0]] = target[1]
    # Check what entries we have
    up_id = agent.db_refs.get('UP')
    hgnc_id = agent.db_refs.get('HGNC')
    # BE takes precedence if we have it
    if be_id:
        agent.db_refs['BE'] = be_id
        agent.name = be_id
    elif hgnc_id:
        gene_name = hgnc_client.get_hgnc_name(hgnc_id)
        if gene_name:
            agent.name = gene_name
        if not up_id:
            up_id = hgnc_client.get_uniprot_id(hgnc_id)
            if up_id:
                agent.db_refs['UP'] = up_id
    elif up_id:
        gene_name = uniprot_client.get_gene_name(up_id)
        if gene_name:
            agent.name = gene_name
            hgnc_id = hgnc_client.get_hgnc_id(gene_name)
            if hgnc_id:
                agent.db_refs['HGNC'] = hgnc_id
Exemple #19
0
 def _extract_protein(self, name, gene_id):
     """Return an Agent for a gene given its name and Entrez gene ID.

     The gene ID is always stored under EGID. When an HGNC ID is found, it
     is added (plus UP if available) and the Agent is named by the HGNC
     name lookup instead of ``name``.
     """
     refs = {'EGID': gene_id}
     hgnc_id = hgnc_client.get_hgnc_from_entrez(gene_id)
     if hgnc_id is None:
         return Agent(name, db_refs=refs)
     refs['HGNC'] = hgnc_id
     up_id = hgnc_client.get_uniprot_id(hgnc_id)
     if up_id:
         refs['UP'] = up_id
     # With an HGNC ID available, the standardized gene name is used
     return Agent(hgnc_client.get_hgnc_name(hgnc_id), db_refs=refs)
Exemple #20
0
 def _get_db_refs(bpe):
     """Return a db_refs dict for a BioPAX element.

     Proteins and RNAs get HGNC/UP entries (each filled in from the other
     when only one is available), small molecules get a CHEBI entry, and
     anything else gets whichever of CHEBI, HGNC and UP can be found.
     Entries whose value is None are left out.
     """
     if _is_protein(bpe) or _is_rna(bpe):
         hgnc = BiopaxProcessor._get_hgnc_id(bpe)
         up = BiopaxProcessor._get_uniprot_id(bpe)
         if hgnc and up:
             # Both IDs given: a disagreement with the HGNC-derived UniProt
             # ID is only logged here; the given UniProt ID is kept.
             hgnc_up_id = hgnc_client.get_uniprot_id(hgnc)
             if hgnc_up_id != up:
                 logger.info('Uniprot ID %s does not match %s obtained '
                             'from HGNC ID %s' %
                             (up, hgnc_up_id, hgnc))
         elif hgnc:
             up = hgnc_client.get_uniprot_id(hgnc)
         elif up and uniprot_client.is_human(up):
             symbol = uniprot_client.get_gene_name(up, False)
             if symbol:
                 hgnc = hgnc_client.get_hgnc_id(symbol)
         candidates = (('HGNC', hgnc), ('UP', up))
     elif _is_small_molecule(bpe):
         candidates = (('CHEBI', BiopaxProcessor._get_chebi_id(bpe)),)
     else:
         candidates = (
             ('CHEBI', BiopaxProcessor._get_chebi_id(bpe)),
             ('HGNC', BiopaxProcessor._get_hgnc_id(bpe)),
             ('UP', BiopaxProcessor._get_uniprot_id(bpe)),
         )
     return {ns: ref for ns, ref in candidates if ref is not None}
Exemple #21
0
def run_msa(gene_dict, rs_data, problems):
    """Compare RefSeq sequences to the UniProt sequence of each gene.

    Parameters
    ----------
    gene_dict : dict
        Maps gene symbols to iterables of RefSeq IDs.
    rs_data : dict
        Maps RefSeq IDs to ``(fasta_header, sequence)`` pairs.
    problems : set
        Receives ``(rs_id, 'no sequence in Refseq')`` for every RefSeq ID
        without an entry in ``rs_data``.

    Returns
    -------
    dict
        Maps each RefSeq ID with a sequence to ``(gene_sym, True, None)``
        when it equals the UniProt sequence, else to
        ``(gene_sym, False, out_file)``.
    """
    aln_data = {}
    n_genes = len(gene_dict)
    for gene_no, (gene_sym, rs_ids) in enumerate(gene_dict.items(), start=1):
        print("%s: %d of %d genes" % (gene_sym, gene_no, n_genes))
        # The reference is the UniProt sequence found via the gene symbol
        hgnc_id = hgnc_client.get_hgnc_id(gene_sym)
        up_id_main = hgnc_client.get_uniprot_id(hgnc_id)
        up_sequence = uniprot_client.get_sequence(up_id_main)
        fasta_lines = ['>%s\n' % gene_sym, '%s\n' % up_sequence]

        # File names used if an alignment turns out to be needed
        in_file = 'aln/in/%s.fasta' % gene_sym
        out_file = 'aln/out/%s.fasta' % gene_sym

        seq_ids = []
        for rs_id in rs_ids:
            seq_info = rs_data.get(rs_id)
            if not seq_info:
                problems.add((rs_id, 'no sequence in Refseq'))
                continue
            seq_ids.append(rs_id)
            _, sequence = seq_info
            fasta_lines.append('>%s\n%s\n' % (rs_id, sequence))
            aln_data[rs_id] = ((gene_sym, True, None)
                               if sequence == up_sequence
                               else (gene_sym, False, out_file))

        if not seq_ids:
            continue

        # Only a single, identical sequence lets us skip the alignment
        if len(seq_ids) == 1 and sequence == up_sequence:
            print("\tAll sequences match, no alignment needed.")
            continue

        with open(in_file, 'wt') as f:
            f.writelines(fasta_lines)
        print("\tRunning sequence alignment.")
        subprocess.call(['./clustal-omega-1.2.3-macosx', '-i', in_file,
                         '-o', out_file, '--force'])
    return aln_data
Exemple #22
0
def get_grounded_agent(gene_name):
    """Return an Agent for a gene symbol with TEXT/HGNC/UP grounding.

    The given name is kept as TEXT. If it appears in ``hgnc_map``, the
    mapped symbol is used for the lookups and as the Agent name.
    """
    db_refs = {'TEXT': gene_name}
    symbol = hgnc_map[gene_name] if gene_name in hgnc_map else gene_name
    hgnc_id = hgnc_client.get_hgnc_id(symbol)
    up_id = hgnc_client.get_uniprot_id(hgnc_id) if hgnc_id else None
    if hgnc_id:
        db_refs['HGNC'] = hgnc_id
    if up_id:
        db_refs['UP'] = up_id
    return Agent(symbol, db_refs=db_refs)
Exemple #23
0
def get_grounded_agent(gene_name):
    """Return an Agent for a gene symbol with TEXT/HGNC/UP grounding.

    The given name is kept as TEXT. If it appears in ``hgnc_map``, the
    mapped symbol is used for the lookups and as the Agent name.
    """
    db_refs = {'TEXT': gene_name}
    symbol = hgnc_map[gene_name] if gene_name in hgnc_map else gene_name
    hgnc_id = hgnc_client.get_hgnc_id(symbol)
    up_id = hgnc_client.get_uniprot_id(hgnc_id) if hgnc_id else None
    if hgnc_id:
        db_refs['HGNC'] = hgnc_id
    if up_id:
        db_refs['UP'] = up_id
    return Agent(symbol, db_refs=db_refs)
Exemple #24
0
def _agent_from_ns_id(ag_ns, ag_id):
    """Return an Agent named ``ag_id`` and grounded in name space ``ag_ns``.

    For the HGNC name space, ``ag_id`` is passed to the HGNC ID lookup and
    the resulting HGNC (and, if available, UP) entries are added. For any
    other name space a non-None ``ag_id`` is stored as given.
    """
    db_refs = {'TEXT': ag_id}
    if ag_ns != 'HGNC':
        if ag_id is not None:
            db_refs[ag_ns] = ag_id
        return Agent(ag_id, db_refs=db_refs)

    hgnc_id = hgnc_client.get_hgnc_id(ag_id)
    if hgnc_id is not None:
        db_refs['HGNC'] = hgnc_id
        up_id = hgnc_client.get_uniprot_id(hgnc_id)
        if up_id is not None:
            db_refs['UP'] = up_id
    return Agent(ag_id, db_refs=db_refs)
def get_mappings():
    """Yield lexical MeSH-to-UniProt mapping tuples for human proteins.

    A tuple is produced for every MeSH name of the form
    '<gene> protein, human' whose gene part resolves to both an HGNC ID
    and a UniProt ID.
    """
    script_url = get_script_url()
    for mesh_name, mesh_id in mesh_client.mesh_name_to_id.items():
        hit = re.match(r'^(.+) protein, human$', mesh_name)
        if not hit:
            continue
        gene_name = hit.group(1)
        hgnc_id = hgnc_client.get_hgnc_id(gene_name)
        if not hgnc_id:
            continue
        uniprot_id = hgnc_client.get_uniprot_id(hgnc_id)
        if not uniprot_id:
            continue
        yield ('mesh', mesh_id, mesh_name, 'skos:exactMatch', 'uniprot',
               uniprot_id, gene_name, 'lexical', script_url)
Exemple #26
0
def _agent_from_ns_id(ag_ns, ag_id):
    """Return an Agent named ``ag_id`` and grounded in name space ``ag_ns``.

    For the HGNC name space, ``ag_id`` is passed to the HGNC ID lookup and
    the resulting HGNC (and, if available, UP) entries are added. For any
    other name space a non-None ``ag_id`` is stored as given.
    """
    db_refs = {'TEXT': ag_id}
    if ag_ns != 'HGNC':
        if ag_id is not None:
            db_refs[ag_ns] = ag_id
        return Agent(ag_id, db_refs=db_refs)

    hgnc_id = hgnc_client.get_hgnc_id(ag_id)
    if hgnc_id is not None:
        db_refs['HGNC'] = hgnc_id
        up_id = hgnc_client.get_uniprot_id(hgnc_id)
        if up_id is not None:
            db_refs['UP'] = up_id
    return Agent(ag_id, db_refs=db_refs)
def get_gene_agents(gene_names):
    """Return Agents for the gene names that resolve to an HGNC ID.

    Names without an HGNC ID are logged and skipped. A UP entry is added
    only when the HGNC ID resolves to a UniProt ID.
    """
    agents = []
    for symbol in gene_names:
        hgnc_id = hgnc_client.get_hgnc_id(symbol)
        if hgnc_id:
            refs = {'HGNC': hgnc_id}
            up_id = hgnc_client.get_uniprot_id(hgnc_id)
            if up_id:
                refs['UP'] = up_id
            agents.append(Agent(symbol, db_refs=refs))
        else:
            logger.warning('Invalid HGNC gene symbol: %s' % symbol)
    return agents
Exemple #28
0
def get_gene_agents(gene_names):
    """Return Agents for the gene names that resolve to an HGNC ID.

    Names without an HGNC ID are logged and skipped. A UP entry is added
    only when the HGNC ID resolves to a UniProt ID.
    """
    agents = []
    for symbol in gene_names:
        hgnc_id = hgnc_client.get_hgnc_id(symbol)
        if hgnc_id:
            refs = {'HGNC': hgnc_id}
            up_id = hgnc_client.get_uniprot_id(hgnc_id)
            if up_id:
                refs['UP'] = up_id
            agents.append(Agent(symbol, db_refs=refs))
        else:
            logger.warning('Invalid HGNC gene symbol: %s' % symbol)
    return agents
Exemple #29
0
def _refs_from_hgnc_id(hgnc_id):
    """Return HGNC_SYMBOL/HGNC/UP references for an HGNC ID.

    None is returned (after a warning) when either the HGNC name or the
    UniProt ID cannot be found.
    """
    symbol = hgnc_client.get_hgnc_name(hgnc_id)
    if not symbol:
        logger.warning('Could not get HGNC name for ID %s' %
                       hgnc_id)
        return None
    up_id = hgnc_client.get_uniprot_id(hgnc_id)
    if not up_id:
        logger.warning('Could not get UniProt ID for HGNC ID %s' %
                       hgnc_id)
        return None
    return {'HGNC_SYMBOL': symbol, 'HGNC': hgnc_id, 'UP': up_id}
Exemple #30
0
 def normalize_mutation_count(gene_name, num_muts):
     """Return the mutation count divided by the protein length.

     The length comes from the UniProt entry found for the gene symbol.
     If no UniProt ID or no length is available, a warning is logged and
     a default length of 500 is used.
     """
     hgnc_id = get_hgnc_id(gene_name)
     up_id = get_uniprot_id(hgnc_id)
     length = None
     if up_id:
         length = uniprot_client.get_length(up_id)
         if not length:
             logger.warning("Could not get length for Uniprot "
                            "ID %s" % up_id)
     else:
         logger.warning("Could not get Uniprot ID for HGNC symbol %s "
                        "with HGNC ID %s" % (gene_name, hgnc_id))
     if not length:
         length = 500  # a guess at a default
     return num_muts / float(length)
Exemple #31
0
 def get_db_refs(egid):
     """Return ``(hgnc_name, db_refs)`` for an Entrez gene ID.

     ``db_refs`` holds the HGNC and UP IDs. If the HGNC ID, the HGNC name
     or the UniProt ID cannot be found, the miss is logged and
     ``(None, {})`` is returned.
     """
     hgnc_id = hgnc_client.get_hgnc_from_entrez(egid)
     if hgnc_id:
         hgnc_name = hgnc_client.get_hgnc_name(hgnc_id)
         if hgnc_name:
             up_id = hgnc_client.get_uniprot_id(hgnc_id)
             if up_id:
                 return (hgnc_name, {'HGNC': hgnc_id, 'UP': up_id})
             logger.info("No Uniprot ID for EGID / HGNC ID / Symbol "
                         "%s / %s / %s" % (egid, hgnc_id, hgnc_name))
         else:
             logger.info("No HGNC name for HGNC ID: %s" % hgnc_id)
     else:
         logger.info("No HGNC ID for Entrez ID: %s" % egid)
     return (None, {})
Exemple #32
0
 def get_db_refs(egid):
     """Return ``(hgnc_name, db_refs)`` for an Entrez gene ID.

     ``db_refs`` holds the HGNC and UP IDs. If the HGNC ID, the HGNC name
     or the UniProt ID cannot be found, the miss is logged and
     ``(None, {})`` is returned.
     """
     hgnc_id = hgnc_client.get_hgnc_from_entrez(egid)
     if hgnc_id:
         hgnc_name = hgnc_client.get_hgnc_name(hgnc_id)
         if hgnc_name:
             up_id = hgnc_client.get_uniprot_id(hgnc_id)
             if up_id:
                 return (hgnc_name, {'HGNC': hgnc_id, 'UP': up_id})
             logger.info("No Uniprot ID for EGID / HGNC ID / Symbol "
                         "%s / %s / %s" % (egid, hgnc_id, hgnc_name))
         else:
             logger.info("No HGNC name for HGNC ID: %s" % hgnc_id)
     else:
         logger.info("No HGNC ID for Entrez ID: %s" % egid)
     return (None, {})
Exemple #33
0
    def get_mutated_genes(self):
        """Return gene mutation counts and their length-normalized values.

        Counts are loaded from ``self.mutation_cache`` (a JSON file) when
        it is set; otherwise they are tallied from the cBio profile data
        of each study under ``self.tcga_study_prefix``. Both results are
        also stored on the instance as ``mutations`` and
        ``norm_mutations``.
        """
        if not self.mutation_cache:
            logger.info('Getting mutations from cBio web service')
            counts = {}
            for tcga_study_name in tcga_studies[self.tcga_study_prefix]:
                batches = batch_iter(hgnc_ids.keys(), 200)
                for idx, hgnc_name_batch in enumerate(batches):
                    logger.info('Fetching mutations for %s and gene batch %s' %
                                (tcga_study_name, idx))
                    patient_mutations = cbio_client.get_profile_data(
                        tcga_study_name, hgnc_name_batch, 'mutation')
                    # Per patient, e.g. {'BRAF': None, 'KRAS': 'G12D'};
                    # every non-None entry counts as one mutation.
                    for gene_mut_dict in patient_mutations.values():
                        for gene, mutated in gene_mut_dict.items():
                            if mutated is not None:
                                counts[gene] = counts.get(gene, 0) + 1
            self.mutations = counts
        else:
            logger.info('Loading mutations from %s' % self.mutation_cache)
            with open(self.mutation_cache, 'r') as fh:
                self.mutations = json.load(fh)

        # Normalize each count by protein length (default 500 if unknown)
        self.norm_mutations = {}
        for gene_name, num_muts in self.mutations.items():
            hgnc_id = get_hgnc_id(gene_name)
            up_id = get_uniprot_id(hgnc_id)
            length = None
            if up_id:
                length = uniprot_client.get_length(up_id)
                if not length:
                    logger.warning("Could not get length for Uniprot "
                                   "ID %s" % up_id)
            else:
                logger.warning("Could not get Uniprot ID for HGNC symbol %s "
                               "with HGNC ID %s" % (gene_name, hgnc_id))
            if not length:
                length = 500  # a guess at a default
            self.norm_mutations[gene_name] = num_muts / float(length)

        return self.mutations, self.norm_mutations
Exemple #34
0
 def _add_node(self, agent, uuid=None):
     """Register a node for an agent, track ``uuid`` on it, return its ID.

     Nodes are keyed by agent name. For a known name no node is added;
     ``uuid`` is appended to the existing node's ``uuid_list`` unless it is
     already there. New nodes also record the HGNC children reported by
     ``expander.get_children`` as members.
     """
     node_key = agent.name
     known_id = self._existing_nodes.get(node_key)
     if known_id is not None:
         # Existing node: only its uuid list may need extending
         matching = [x for x in self._nodes if x['data']['id'] == known_id]
         uuid_list = matching[0]['data']['uuid_list']
         if uuid not in uuid_list:
             uuid_list.append(uuid)
         return known_id

     db_refs = _get_db_refs(agent)
     node_id = self._get_new_id()
     self._existing_nodes[node_key] = node_id

     members = {}
     for member in expander.get_children(agent, ns_filter='HGNC'):
         symbol = member[1]
         hgnc_id = hgnc_client.get_hgnc_id(symbol)
         member_db_refs = {}
         if hgnc_id:
             member_agent = Agent(
                 symbol,
                 db_refs={'HGNC': hgnc_id,
                          'UP': hgnc_client.get_uniprot_id(hgnc_id)})
             member_db_refs = _get_db_refs(member_agent)
         members[symbol] = {'db_refs': member_db_refs}

     self._nodes.append({
         'data': {
             'id': node_id,
             'name': agent.name.replace('_', ' '),
             'db_refs': db_refs,
             'parent': '',
             'members': members,
             'uuid_list': [uuid],
         }
     })
     return node_id
Exemple #35
0
def map_hgnc_symbols(hgnc_symbols):
    """Return HGNC/UniProt references for a list of HGNC symbols.

    Each returned entry is a dict with the keys 'HGNC_SYMBOL', 'HGNC' and
    'UP'. Symbols for which either the HGNC ID or the UniProt ID cannot be
    found are logged with a warning and left out of the result.
    """
    mapped = []
    for symbol in hgnc_symbols:
        hgnc_id = hgnc_client.get_hgnc_id(symbol)
        if not hgnc_id:
            logger.warning('Could not get HGNC ID for symbol %s' % symbol)
            continue
        up_id = hgnc_client.get_uniprot_id(hgnc_id)
        if not up_id:
            logger.warning('Could not get UniProt ID for symbol %s' %
                           symbol)
            continue
        # Only fully resolved symbols make it into the output
        mapped.append({'HGNC_SYMBOL': symbol, 'HGNC': hgnc_id, 'UP': up_id})
    return mapped
Exemple #36
0
 def _initialize_node_agents(self):
     """Fill the node-name and node-agent lookups from the 'nodes' in self.cx.

     Nodes with a UniProt alias are named after the HGNC name (when an HGNC
     ID can be found for the UniProt ID) or the UniProt gene name. Other
     nodes keep their own name and are grounded via an HGNC symbol lookup;
     names that are not valid symbols only get an Agent when grounding is
     not required, and are reported in a single log message at the end.
     """
     invalid_genes = []
     for node in _get_dict_from_list('nodes', self.cx):
         node_id = node['@id']
         aliases = self.get_aliases(node)
         node_name = node['n']
         up_id = aliases.get('UP')

         if up_id:
             db_refs = {'UP': up_id, 'TEXT': node_name}
             hgnc_id = uniprot_client.get_hgnc_id(up_id)
             if hgnc_id:
                 db_refs['HGNC'] = hgnc_id
                 gene_name = hgnc_client.get_hgnc_name(hgnc_id)
             else:
                 gene_name = uniprot_client.get_gene_name(up_id)
             agent = Agent(gene_name, db_refs=db_refs)
             self._node_names[node_id] = gene_name
             self._node_agents[node_id] = agent
             continue

         # No UniProt alias: treat the node name as a candidate gene symbol
         self._node_names[node_id] = node_name
         hgnc_id = hgnc_client.get_hgnc_id(node_name)
         db_refs = {'TEXT': node_name}
         if hgnc_id:
             db_refs['HGNC'] = hgnc_id
             # A valid HGNC ID does not necessarily have a UniProt ID, as
             # in the case of HOTAIR (HOX transcript antisense RNA,
             # HGNC:33510)
             up_id = hgnc_client.get_uniprot_id(hgnc_id)
             if up_id:
                 db_refs['UP'] = up_id
             self._node_agents[node_id] = Agent(node_name, db_refs=db_refs)
             continue

         if not self.require_grounding:
             self._node_agents[node_id] = Agent(node_name, db_refs=db_refs)
         invalid_genes.append(node_name)

     if invalid_genes:
         verb = 'Skipped' if self.require_grounding else 'Included'
         logger.info('%s invalid gene symbols: %s' %
                     (verb, ', '.join(invalid_genes)))
Exemple #37
0
def get_phospho_antibody_map(fname=antibody_map_file):
    """Return a dict mapping phosphosite labels to lists of substrate Agents.

    The CSV file is read row by row. The substrate is taken from the
    'SUB_ID' column (a UniProt ID, with any '-' suffix stripped) or, when
    that is null, looked up from the gene symbol in 'SUB_GENE'. Each Agent
    carries a single phosphorylation ModCondition whose residue is the
    first character of 'Actual_site' and whose position is the rest of
    that value (None if there is nothing after the residue).

    Parameters
    ----------
    fname : str
        Path to the comma-separated antibody map file.

    Returns
    -------
    dict
        Maps each value of the 'phosphosite' column to a list of Agents,
        without duplicates of the same name/residue/position.
    """
    # First gather the annotations for the phosphosites
    df = pandas.read_csv(fname, index_col=None, sep=',', encoding='utf8')
    antibody_map = {}

    for _, row in df.iterrows():
        ps = row['phosphosite']
        sub_upid = row['SUB_ID']
        if not pandas.isnull(sub_upid):
            # Drop the suffix after '-' so lookups use the base ID
            if '-' in sub_upid:
                sub_upid = sub_upid.split('-')[0]
            sub_hgnc_symbol = uniprot_client.get_gene_name(sub_upid)
            sub_hgnc_id = hgnc_client.get_hgnc_id(sub_hgnc_symbol)
        else:
            sub_hgnc_symbol = row['SUB_GENE']
            sub_hgnc_id = hgnc_client.get_hgnc_id(sub_hgnc_symbol)
            sub_upid = hgnc_client.get_uniprot_id(sub_hgnc_id)
            if sub_upid is None:
                continue
        # Both branches above now bind the same name (sub_hgnc_id); the
        # previous code read a variable that only one branch assigned,
        # which raised NameError or reused the HGNC ID of an earlier row.
        sub = Agent(sub_hgnc_symbol,
                    db_refs={
                        'UP': sub_upid,
                        'HGNC': sub_hgnc_id
                    })
        residue = row['Actual_site'][0]
        if len(row['Actual_site']) > 1:
            position = row['Actual_site'][1:]
        else:
            position = None
        mc = ModCondition('phosphorylation', residue, position)
        sub.mods = [mc]

        # Only add the substrate if an equivalent one isn't listed yet
        agents_for_site = antibody_map.setdefault(ps, [])
        already_listed = any(
            p.name == sub.name and p.mods[0].residue == residue and
            p.mods[0].position == position
            for p in agents_for_site)
        if not already_listed:
            agents_for_site.append(sub)
    return antibody_map
Exemple #38
0
def get_genes_to_refseq_ids(problems):
    """Collect the RefSeq IDs listed for each gene symbol in the peptide file.

    The first column of every row is expected to look like
    ``<gene symbol>.<refseq id>:<site info>``. A gene is only admitted to
    the result if its symbol has an HGNC ID and that ID maps to a single
    UniProt ID; otherwise a (refseq_id, reason) tuple is added to the
    `problems` set and the row is skipped.

    Returns a dict mapping gene symbols to sets of RefSeq IDs.
    """
    refseq_by_gene = {}
    for row in read_unicode_csv(peptide_file, delimiter='\t', skiprows=1):
        gene_sym, remainder = row[0].split('.', maxsplit=1)
        refseq_id, _site_info = remainder.split(':')

        # Genes that already passed validation just accumulate IDs
        if gene_sym in refseq_by_gene:
            refseq_by_gene[gene_sym].add(refseq_id)
            continue

        hgnc_id = hgnc_client.get_hgnc_id(gene_sym)
        if not hgnc_id:
            problems.add((refseq_id, 'invalid gene symbol'))
            continue
        up_id_main = hgnc_client.get_uniprot_id(hgnc_id)
        # A ', ' in the value means more than one UniProt ID came back
        if not up_id_main or ', ' in up_id_main:
            problems.add((refseq_id, 'could not get Uniprot ID from HGNC'))
            continue
        refseq_by_gene[gene_sym] = {refseq_id}
    return refseq_by_gene
Exemple #39
0
def _get_uniprot_id(agent):
    """Return a UniProt ID for an agent, falling back to an HGNC lookup.

    The 'UP' entry of the agent's db_refs is used when present; otherwise
    the 'HGNC' entry is mapped to UniProt. None is returned when neither
    route yields an ID. If the ID is a list of strings rather than a
    string, its first element is returned.
    """
    refs = agent.db_refs
    uniprot = refs.get('UP')
    hgnc = refs.get('HGNC')
    if uniprot is None:
        # With neither reference available no sequence check is possible,
        # so this is not reported as a failure.
        if hgnc is None:
            return None
        uniprot = hgnc_client.get_uniprot_id(hgnc)
        # The HGNC entry may have no UniProt mapping either
        if uniprot is None:
            return None
    if isinstance(uniprot, basestring):
        return uniprot
    # Not a plain string: if it is a sequence of strings take the first
    if isinstance(uniprot[0], basestring):
        return uniprot[0]
    return uniprot
Exemple #40
0
def update_kinases():
    """Download the UniProt kinase table and add a fixed list of extra kinases.

    The table is fetched as tab-separated text from the UniProt query URL
    below (reviewed human entries annotated with InterPro IPR011009) and
    saved as 'kinases.tsv' under `path`. A hard-coded list of additional
    gene symbols is then resolved to UniProt IDs and mnemonics, appended
    as extra rows, and the file is rewritten in place.
    """
    logger.info('--Updating kinase list------')
    url = 'http://www.uniprot.org/uniprot/?' + \
        'sort=entry_name&desc=no&compress=no&query=database:(type:' + \
        'interpro%20ipr011009)%20AND%20reviewed:yes%20AND%20organism:' + \
        '%22Homo%20sapiens%20(Human)%20[9606]%22&fil=&force=no' + \
        '&format=tab&columns=id,genes(PREFERRED),organism-id,entry%20name'
    fname = os.path.join(path, 'kinases.tsv')
    save_from_http(url, fname)

    from indra.databases import hgnc_client, uniprot_client
    add_kinases = ['PGK1', 'PKM', 'TAF1', 'NME1', 'BCKDK', 'PDK1', 'PDK2',
                   'PDK3', 'PDK4', 'BCR', 'FAM20C', 'BAZ1B', 'PIKFYVE']
    df = pandas.read_csv(fname, sep='\t')
    extra_rows = []
    for kinase in add_kinases:
        hgnc_id = hgnc_client.get_hgnc_id(kinase)
        up_id = hgnc_client.get_uniprot_id(hgnc_id)
        up_mnemonic = uniprot_client.get_mnemonic(up_id)
        extra_rows.append({'Entry': up_id,
                           'Gene names  (primary )': kinase,
                           'Organism ID': '9606',
                           'Entry name': up_mnemonic})
    # DataFrame.append was removed in pandas 2.0; concatenating once also
    # avoids copying the whole frame for every added row.
    df = pandas.concat([df, pandas.DataFrame(extra_rows)],
                       ignore_index=True)
    df.to_csv(fname, sep='\t', index=False)
Exemple #41
0
 def _initialize_node_agents(self):
     """Fill the node-name and node-agent lookups from the 'nodes' in self.cx.

     A node with a UniProt alias is named after the gene name reported for
     that UniProt ID. Any other node keeps its own name and is grounded by
     looking that name up as an HGNC symbol; names that cannot be resolved
     only get an Agent when grounding is not required, and are reported in
     one log message at the end.
     """
     invalid_genes = []
     for node in _get_dict_from_list('nodes', self.cx):
         node_id = node['@id']
         up_id = self.get_aliases(node).get('UP')

         if up_id:
             gene_name = uniprot_client.get_gene_name(up_id)
             hgnc_id = hgnc_client.get_hgnc_id(gene_name)
             refs = {'UP': up_id, 'HGNC': hgnc_id, 'TEXT': gene_name}
             agent = Agent(gene_name, db_refs=refs)
             self._node_names[node_id] = gene_name
             self._node_agents[node_id] = agent
             continue

         # No UniProt alias: fall back to the node's own name
         node_name = node['n']
         self._node_names[node_id] = node_name
         hgnc_id = hgnc_client.get_hgnc_id(node_name)
         refs = {'TEXT': node_name}

         if not hgnc_id:
             if not self.require_grounding:
                 self._node_agents[node_id] = Agent(node_name, db_refs=refs)
             invalid_genes.append(node_name)
             continue

         refs['HGNC'] = hgnc_id
         # A valid HGNC ID may lack a UniProt ID, as in the case of HOTAIR
         # (HOX transcript antisense RNA, HGNC:33510)
         up_id = hgnc_client.get_uniprot_id(hgnc_id)
         if up_id:
             refs['UP'] = up_id
         self._node_agents[node_id] = Agent(node_name, db_refs=refs)

     if invalid_genes:
         verb = 'Skipped' if self.require_grounding else 'Included'
         logger.info('%s invalid gene symbols: %s' %
                     (verb, ', '.join(invalid_genes)))
Exemple #42
0
 def _add_node(self, agent, uuid=None):
     """Register a node for the agent (if new) and return the node's ID.

     The agent name is the lookup key. For a name seen before, the uuid is
     added to the existing node's uuid list when missing and the stored ID
     is returned. For a new name, a node dict is assembled (with db_refs
     for every HGNC child the expander reports) and appended to the nodes.
     """
     name_key = agent.name
     node_id = self._existing_nodes.get(name_key)

     if node_id is None:
         agent_refs = _get_db_refs(agent)
         node_id = self._get_new_id()
         self._existing_nodes[name_key] = node_id
         label = agent.name.replace('_', ' ')
         children = expander.get_children(agent, ns_filter='HGNC')
         members = {}
         for child in children:
             symbol = child[1]
             hgnc_id = hgnc_client.get_hgnc_id(symbol)
             if hgnc_id:
                 up_id = hgnc_client.get_uniprot_id(hgnc_id)
                 refs = _get_db_refs(
                     Agent(symbol, db_refs={'HGNC': hgnc_id, 'UP': up_id}))
             else:
                 refs = {}
             members[symbol] = {'db_refs': refs}
         data = {'id': node_id, 'name': label, 'db_refs': agent_refs,
                 'parent': '', 'members': members, 'uuid_list': [uuid]}
         self._nodes.append({'data': data})
         return node_id

     # The node already exists, so it is not added again; its uuid list
     # must still learn about this uuid.
     found = [nd for nd in self._nodes if nd['data']['id'] == node_id]
     uuids = found[0]['data']['uuid_list']
     if uuid not in uuids:
         uuids.append(uuid)
     return node_id
Exemple #43
0
def test_get_uniprot_id():
    """Check that HGNC ID 6840 is mapped to UniProt ID Q02750."""
    expected = 'Q02750'
    found = hgnc_client.get_uniprot_id('6840')
    assert found == expected
Exemple #44
0
    def _get_agent_from_entity(self, entity_id):
        """Return an Agent for the entity frame with the given ID.

        The entity frame is looked up in self.tree. Its xrefs are turned
        into db_refs (adding HGNC/UniProt cross-mappings where they can be
        looked up), and its modifications are split into mutation
        conditions and modification conditions.

        Parameters
        ----------
        entity_id : str
            The frame_id of the entity frame to look up.

        Returns
        -------
        Agent or None
            None if the query yields no result or no matching frame.
        """
        qstr = "$.entities.frames[(@.frame_id is \'%s\')]" % entity_id
        res = self.tree.execute(qstr)
        if res is None:
            return None
        try:
            entity_term = next(res)
        except StopIteration:
            logger.debug(' %s is not an entity' % entity_id)
            return None
        # This is the default name, which can be overwritten
        # below for specific database entries
        agent_name = self._get_valid_name(entity_term['text'])
        db_refs = {}
        for xr in entity_term['xrefs']:
            ns = xr['namespace']
            if ns == 'uniprot':
                up_id = xr['id']
                db_refs['UP'] = up_id
                # Look up official names in UniProt
                gene_name = up_client.get_gene_name(up_id)
                if gene_name is not None:
                    agent_name = self._get_valid_name(gene_name)
                    # If the gene name corresponds to an HGNC ID, add it to the
                    # db_refs
                    hgnc_id = hgnc_client.get_hgnc_id(gene_name)
                    if hgnc_id:
                        db_refs['HGNC'] = hgnc_id
            elif ns == 'hgnc':
                hgnc_id = xr['id']
                db_refs['HGNC'] = hgnc_id
                # Look up the standard gene symbol and set as name
                hgnc_name = hgnc_client.get_hgnc_name(hgnc_id)
                if hgnc_name:
                    agent_name = hgnc_name
                # Look up the corresponding uniprot id
                up_id = hgnc_client.get_uniprot_id(hgnc_id)
                if up_id:
                    db_refs['UP'] = up_id
            elif ns == 'pfam':
                be_id = bioentities_map.get(('PF', xr['id']))
                if be_id:
                    db_refs['BE'] = be_id
                db_refs['PF'] = xr['id']
            elif ns == 'interpro':
                be_id = bioentities_map.get(('IP', xr['id']))
                if be_id:
                    db_refs['BE'] = be_id
                # Store under 'IP', the same key used for the map lookup
                # above. This was previously written to 'PF', which
                # mislabelled the ID and overwrote any Pfam reference.
                db_refs['IP'] = xr['id']
            elif ns == 'chebi':
                db_refs['CHEBI'] = xr['id']
            elif ns == 'pubchem':
                db_refs['PUBCHEM'] = 'PUBCHEM:%s' % xr['id']
            elif ns == 'go':
                db_refs['GO'] = xr['id']
            elif ns == 'mesh':
                db_refs['MESH'] = xr['id']
            elif ns == 'hmdb':
                db_refs['HMDB'] = xr['id']
            elif ns == 'simple_chemical':
                # Only IDs that look like HMDB IDs are kept here
                if xr['id'].startswith('HMDB'):
                    db_refs['HMDB'] = xr['id']
            elif ns == 'be':
                db_refs['BE'] = xr['id']
            # These name spaces are ignored
            elif ns in ['uaz']:
                pass
            else:
                logger.warning('Unhandled xref namespace: %s' % ns)
        db_refs['TEXT'] = entity_term['text']

        mod_terms = entity_term.get('modifications')
        mods = []
        muts = []
        if mod_terms is not None:
            for m in mod_terms:
                if m['type'].lower() == 'mutation':
                    # Evidence is usually something like "V600E"
                    # We could parse this to get the amino acid
                    # change that happened.
                    mutation_str = m.get('evidence')
                    # TODO: sometimes mutation_str is "mutant", "Mutant",
                    # "mutants" - this indicates that there is a mutation
                    # but not the specific type. We should encode this
                    # somehow as a "blank" mutation condition
                    mut = self._parse_mutation(mutation_str)
                    if mut is not None:
                        muts.append(mut)
                else:
                    mc = self._get_mod_condition(m)
                    if mc is not None:
                        mods.append(mc)

        agent = Agent(agent_name, db_refs=db_refs, mods=mods, mutations=muts)
        return agent
Exemple #45
0
def test_get_uniprot_id():
    """Check the UniProt ID returned for HGNC ID 6840 and its string type."""
    up_id = hgnc_client.get_uniprot_id('6840')
    assert up_id == 'Q02750'
    assert unicode_strs(up_id)
Exemple #46
0
def test_get_uniprot_id_none():
    """Check that an HGNC entry without a UniProt ID maps to None."""
    # HGNC ID 12027 is an entry that doesn't have a UniProt ID
    no_uniprot_hgnc_id = '12027'
    result = hgnc_client.get_uniprot_id(no_uniprot_hgnc_id)
    assert result is None
Exemple #47
0
 def standardize_agent_db_refs(agent, map_db_refs, do_rename=True):
     """Reconcile UP/HGNC entries of a grounding map entry and apply it.

     `map_db_refs` is updated in place and then assigned to the agent's
     db_refs. Its 'HGNC' entry is treated as a gene symbol on input and is
     replaced by the HGNC ID when one is found. If `do_rename` is set, the
     agent is renamed to the FPLX entry, else the HGNC symbol, else the
     gene name obtained from UniProt, whichever is available first.

     Raises ValueError if only a symbol is given and it has no HGNC ID, or
     if both a UniProt ID and a symbol are given and the UniProt gene name
     is missing or differs from the symbol.
     """
     up_id = map_db_refs.get('UP')
     hgnc_sym = map_db_refs.get('HGNC')
     gene_name = None
     have_up = bool(up_id)
     have_sym = bool(hgnc_sym)

     if have_up and not have_sym:
         # Only UniProt given: try to fill in the HGNC ID from the gene name
         gene_name = uniprot_client.get_gene_name(up_id, False)
         if gene_name:
             hgnc_id = hgnc_client.get_hgnc_id(gene_name)
             if hgnc_id:
                 map_db_refs['HGNC'] = hgnc_id
     elif have_sym and not have_up:
         # Only the symbol given: it must resolve to an HGNC ID, which then
         # overrides the symbol in the grounding map entry
         hgnc_id = hgnc_client.get_hgnc_id(hgnc_sym)
         if not hgnc_id:
             raise ValueError('No HGNC ID corresponding to gene '
                              'symbol %s in grounding map.' %
                              hgnc_sym)
         map_db_refs['HGNC'] = hgnc_id
         # Now get the Uniprot ID for the gene
         up_id = hgnc_client.get_uniprot_id(hgnc_id)
         if up_id:
             map_db_refs['UP'] = up_id
     elif have_up and have_sym:
         # Both given: the symbol must agree with the UniProt gene name
         gene_name = uniprot_client.get_gene_name(up_id)
         if not gene_name:
             raise ValueError('No gene name found for Uniprot '
                              'ID %s (expected %s)' %
                              (up_id, hgnc_sym))
         if gene_name != hgnc_sym:
             raise ValueError('Gene name %s for Uniprot ID '
                              '%s does not match HGNC '
                              'symbol %s given in grounding '
                              'map.' %
                              (gene_name, up_id, hgnc_sym))
         hgnc_id = hgnc_client.get_hgnc_id(hgnc_sym)
         if hgnc_id:
             map_db_refs['HGNC'] = hgnc_id
         else:
             logger.error('No HGNC ID corresponding to gene '
                          'symbol %s in grounding map.' % hgnc_sym)

     # Assign the DB refs from the grounding map to the agent
     agent.db_refs = map_db_refs
     if not do_rename:
         return

     # Naming preference: FamPlex ID, then HGNC symbol, then gene name
     fplx_id = agent.db_refs.get('FPLX')
     if fplx_id:
         agent.name = agent.db_refs.get('FPLX')
     elif hgnc_sym is not None:
         agent.name = hgnc_sym
     elif gene_name is not None:
         agent.name = gene_name
     return
Exemple #48
0
def _urn_to_db_refs(urn):
    """Convert a Medscan URN into INDRA grounding and an entity name.

    Parameters
    ----------
    urn : str
        A Medscan URN

    Returns
    -------
    db_refs : dict
        A dictionary with grounding information, mapping databases to
        database identifiers. This is an empty dictionary if `urn` is None
        or if its type is not one of the handled ones, and None if `urn`
        does not match the expected URN pattern.
    db_name : str
        The Famplex name if a Famplex grounding was found; otherwise the GO
        label if there is a GO grounding; otherwise the name looked up for
        the CHEBI, HGNC or MESH grounding, if any; otherwise None
    """
    if urn is None:
        return {}, None

    match = URN_PATT.match(urn)
    if match is None:
        return None, None
    urn_type, urn_id = match.groups()

    db_refs = {}
    db_name = None

    # TODO: support more types of URNs
    if urn_type == 'agi-cas':
        # CAS identifier: ground to CHEBI when a mapping exists
        chebi_id = get_chebi_id_from_cas(urn_id)
        if chebi_id:
            db_refs['CHEBI'] = 'CHEBI:%s' % chebi_id
            db_name = get_chebi_name_from_id(chebi_id)
    elif urn_type == 'agi-llid':
        # Entrez identifier: ground to HGNC and, where possible, UniProt
        hgnc_id = get_hgnc_from_entrez(urn_id)
        if hgnc_id is not None:
            db_refs['HGNC'] = hgnc_id
            uniprot_id = get_uniprot_id(hgnc_id)
            if uniprot_id is not None:
                db_refs['UP'] = uniprot_id
            # The HGNC name, if there is one, becomes the entity name
            db_name = get_hgnc_name(hgnc_id)
    elif urn_type in ['agi-meshdis', 'agi-ncimorgan', 'agi-ncimtissue',
                      'agi-ncimcelltype']:
        # A 'C' followed by digits only is probably a UMLS identifier
        looks_like_umls = urn_id.startswith('C') and urn_id[1:].isdigit()
        if looks_like_umls:
            db_refs['UMLS'] = urn_id
        else:
            # Otherwise the identifier is a (quoted) MESH name
            urn_mesh_name = unquote(urn_id)
            mesh_id, mesh_name = mesh_client.get_mesh_id_name(urn_mesh_name)
            if mesh_id:
                db_refs['MESH'] = mesh_id
                db_name = mesh_name
            else:
                db_name = urn_mesh_name
    elif urn_type in ('agi-gocomplex', 'agi-go'):
        # Both of these URN types carry a GO identifier
        db_refs['GO'] = 'GO:%s' % urn_id

    # A GO or MESH grounding may have a corresponding Famplex grounding
    for db in ('GO', 'MESH'):
        if db not in db_refs:
            continue
        lookup = (db, db_refs[db])
        if lookup in famplex_map:
            db_refs['FPLX'] = famplex_map[lookup]

    # If the urn corresponds to an eccode, ground to Famplex if that eccode
    # is in the Famplex equivalences table
    if urn.startswith('urn:agi-enz'):
        eccode = urn.split(':')[2]
        lookup = ('ECCODE', eccode)
        if lookup in famplex_map:
            db_refs['FPLX'] = famplex_map[lookup]

    # The Medscan URN itself may also map directly to a Famplex ID
    lookup = ('MEDSCAN', urn)
    if lookup in famplex_map:
        db_refs['FPLX'] = famplex_map[lookup]

    # Famplex takes priority for the entity name, then the GO label
    if 'FPLX' in db_refs:
        db_name = db_refs['FPLX']
    elif 'GO' in db_refs:
        db_name = go_client.get_go_label(db_refs['GO'])

    return db_refs, db_name
Exemple #49
0
def _fix_agent(agent):
    """Normalize an agent's db_refs name spaces and set its name, in place.

    Old name spaces are re-keyed (FA, IPR, XFAM, PCID, BE), GO IDs get a
    'GO:' prefix if missing, and a FamPlex mapping is looked up from the
    remaining entries when the agent has none. The name is then taken from
    FamPlex, else the HGNC name, else the UniProt gene name, else the
    agent's TEXT entry. Nothing is done if `agent` is None.
    """
    if agent is None:
        return

    # Name spaces that are only re-keyed; the ID itself is left untouched
    renamed_ns = {'FA': 'NXPFA', 'IPR': 'IP', 'PCID': 'PUBCHEM'}
    fixed_refs = copy(agent.db_refs)
    for ns, ident in agent.db_refs.items():
        if ns in renamed_ns:
            fixed_refs.pop(ns, None)
            fixed_refs[renamed_ns[ns]] = ident
        elif ns == 'XFAM':
            # Only the part before the first '.' is kept for PF
            fixed_refs.pop('XFAM', None)
            fixed_refs['PF'] = ident.split('.')[0]
        elif ns == 'GO':
            fixed_refs['GO'] = \
                ident if ident.startswith('GO:') else 'GO:' + ident
    agent.db_refs = fixed_refs

    # Old BE mappings are carried over as FPLX
    if 'BE' in agent.db_refs:
        agent.db_refs['FPLX'] = agent.db_refs.pop('BE')
    fplx_id = agent.db_refs.get('FPLX')
    if not fplx_id:
        # Try to find a FamPlex mapping from any of the other entries
        for ns, ident in agent.db_refs.items():
            fplx_id = famplex_map.get((ns, ident))
            if fplx_id:
                break
    # Without FamPlex, an NCIT entry may map to a more specific grounding
    if not fplx_id and 'NCIT' in agent.db_refs:
        target = ncit_map.get(agent.db_refs['NCIT'])
        if target:
            agent.db_refs[target[0]] = target[1]

    up_id = agent.db_refs.get('UP')
    hgnc_id = agent.db_refs.get('HGNC')

    # FPLX takes precedence if we have it
    if fplx_id:
        agent.db_refs['FPLX'] = fplx_id
        agent.name = fplx_id
        return

    if hgnc_id:
        gene_name = hgnc_client.get_hgnc_name(hgnc_id)
        if gene_name:
            agent.name = gene_name
        if not up_id:
            up_id = hgnc_client.get_uniprot_id(hgnc_id)
            if up_id:
                agent.db_refs['UP'] = up_id
        return

    if not up_id:
        return

    gene_name = uniprot_client.get_gene_name(up_id)
    if gene_name:
        agent.name = gene_name
        hgnc_id = hgnc_client.get_hgnc_id(gene_name)
        if hgnc_id:
            agent.db_refs['HGNC'] = hgnc_id
    else:
        # Without a gene name it is better to use the raw text, since the
        # name may otherwise be a UniProt ID or mnemonic set by Sparser
        agent.name = agent.db_refs.get('TEXT', agent.name)
Exemple #50
0
def get_agent(node_data, node_modifier_data=None):
    """Return an Agent built from node data, or None if it is not handled.

    Parameters
    ----------
    node_data : dict
        Node description read via the `pc` keys (function, name, namespace,
        identifier, members, fusion).
        NOTE(review): presumably a PyBEL node data dict - confirm.
    node_modifier_data : Optional
        Modifier data for the node, passed to the helpers that derive the
        activity condition and translocation target and that check for
        unhandled modifiers.

    Returns
    -------
    Agent or None
        For complexes, the Agent of the first member with the remaining
        members attached as bound conditions. None is returned if the node
        function is not handled, the node is a fusion, a complex has no
        members or a member cannot be converted, no db_refs could be
        determined, or the modifier data has unhandled modifiers.

    Raises
    ------
    AssertionError
        If the node has neither identifier nor name, or if no gene name is
        found for a UP identifier.
    ValueError
        If an identifier is given in the MGI or RGD name space.
    """
    # FIXME: Handle translocations on the agent for ActiveForms, turn into
    # location conditions
    # Check the node type/function
    node_func = node_data[pc.FUNCTION]
    if node_func not in (pc.PROTEIN, pc.RNA, pc.BIOPROCESS, pc.COMPLEX,
                         pc.PATHOLOGY, pc.ABUNDANCE, pc.MIRNA):
        mod_data = node_modifier_data or 'No node data'
        logger.info("Nodes of type %s not handled: %s",
                    node_func, mod_data)
        return None
    # Skip gene/protein fusions
    if pc.FUSION in node_data:
        logger.info("Gene and protein fusions not handled: %s" % str(node_data))
        return None
    # COMPLEXES ------------
    # First, handle complexes, which will consist recursively of other agents
    if node_func == pc.COMPLEX:
        # First, check for members: if there are no members, we assume this
        # is a named complex
        members = node_data.get(pc.MEMBERS)
        if members is None:
            return None
        # Otherwise, get the "main" agent, to which the other members will be
        # attached as bound conditions
        main_agent = get_agent(members[0])
        # If we can't get the main agent, return None
        if main_agent is None:
            return None
        bound_conditions = [BoundCondition(get_agent(m), True)
                            for m in members[1:]]
        # Check the bound_conditions for any None agents
        if any([bc.agent is None for bc in bound_conditions]):
            return None
        main_agent.bound_conditions = bound_conditions
        # Get activity of main agent
        ac = _get_activity_condition(node_modifier_data)
        main_agent.activity = ac
        return main_agent
    # OTHER NODE TYPES -----
    # Get node identifier information
    name = node_data.get(pc.NAME)
    ns = node_data[pc.NAMESPACE]
    ident = node_data.get(pc.IDENTIFIER)
    # No ID present, get identifier using the name, namespace
    # db_refs stays None when the name space is not handled (or a CHEBI name
    # is not found), which makes the function return None further below
    db_refs = None
    if not ident:
        assert name, "Node must have a name if lacking an identifier."
        if ns == 'HGNC':
            hgnc_id = hgnc_client.get_hgnc_id(name)
            if not hgnc_id:
                logger.info("Invalid HGNC name: %s (%s)" % (name, node_data))
                return None
            db_refs = {'HGNC': hgnc_id}
            up_id = _get_up_id(hgnc_id)
            if up_id:
                db_refs['UP'] = up_id
        # FIXME: Look up go ID in ontology lookup service
        # FIXME: Look up MESH IDs from name
        # FIXME: For now, just use node name
        elif ns in ('GOBP', 'MESHPP', 'MESHD'):
            db_refs = {}
        # For now, handle MGI/RGD but putting the name into the db_refs so
        # it's clear what namespace the name belongs to
        # FIXME: Full implementation would look up MGI/RGD identifiers from
        # the names, and obtain corresponding Uniprot IDs
        elif ns in ('MGI', 'RGD'):
            db_refs = {ns: name}
        # Map Selventa families to FamPlexes
        elif ns == 'SFAM':
            db_refs = {'SFAM': name}
            indra_name = bel_to_indra.get(name)
            if indra_name is None:
                logger.info('Could not find mapping for BEL/SFAM family: '
                            '%s (%s)' % (name, node_data))
            else:
                db_refs['FPLX'] = indra_name
                name = indra_name
        # Map Entrez genes to HGNC/UP
        elif ns == 'EGID':
            hgnc_id = hgnc_client.get_hgnc_from_entrez(name)
            db_refs = {'EGID': name}
            if hgnc_id is not None:
                db_refs['HGNC'] = hgnc_id
                name = hgnc_client.get_hgnc_name(hgnc_id)
                up_id = hgnc_client.get_uniprot_id(hgnc_id)
                if up_id:
                    db_refs['UP'] = up_id
                else:
                    logger.info('HGNC entity %s with HGNC ID %s has no '
                                'corresponding Uniprot ID.',
                                name, hgnc_id)
            else:
                logger.info('Could not map EGID%s to HGNC.' % name)
                # Unmapped Entrez IDs are given an 'E'-prefixed name
                name = 'E%s' % name
        # CHEBI
        elif ns == 'CHEBI':
            chebi_id = chebi_name_id.get(name)
            if chebi_id:
                db_refs = {'CHEBI': chebi_id}
            else:
                logger.info('CHEBI name %s not found in map.' % name)
        # SDIS, SCHEM: Include the name as the ID for the namespace
        elif ns in ('SDIS', 'SCHEM'):
            db_refs = {ns: name}
        else:
            print("Unhandled namespace: %s: %s (%s)" % (ns, name, node_data))
    # We've already got an identifier, look up other identifiers if necessary
    else:
        # Get the name, overwriting existing name if necessary
        if ns == 'HGNC':
            name = hgnc_client.get_hgnc_name(ident)
            db_refs = {'HGNC': ident}
            up_id = _get_up_id(ident)
            if up_id:
                db_refs['UP'] = up_id
        elif ns == 'UP':
            db_refs = {'UP': ident}
            name = uniprot_client.get_gene_name(ident)
            assert name
            # An HGNC ID is only looked up for human UniProt entries
            if uniprot_client.is_human(ident):
                hgnc_id = hgnc_client.get_hgnc_id(name)
                if not hgnc_id:
                    logger.info('Uniprot ID linked to invalid human gene '
                                'name %s' % name)
                else:
                    db_refs['HGNC'] = hgnc_id
        elif ns in ('MGI', 'RGD'):
            raise ValueError('Identifiers for MGI and RGD databases are not '
                             'currently handled: %s' % node_data)
        else:
            print("Unhandled namespace with identifier: %s: %s (%s)" %
                  (ns, name, node_data))
    if db_refs is None:
        logger.info('Unable to get identifier information for node: %s',
                    node_data)
        return None
    # Get modification conditions
    mods, muts = _get_all_pmods(node_data)
    # Get activity condition
    ac = _get_activity_condition(node_modifier_data)
    to_loc = _get_translocation_target(node_modifier_data)
    # Check for unhandled node modifiers, skip if so
    if _has_unhandled_modifiers(node_modifier_data):
        return None
    # Make the agent
    ag = Agent(name, db_refs=db_refs, mods=mods, mutations=muts, activity=ac,
               location=to_loc)
    return ag
Exemple #51
0
    def _get_agent_from_ref(self, ref):
        # TODO: handle collections
        if ref.attrib.get('category') == 'collection':
            #logger.warning('Skipping collection Agent.')
            return None

        # Find the name, uid and raw-text tags first and get their text
        # content if available
        uid_tag = ref.find("var/[@name='uid']")
        name_tag = ref.find("var/[@name='name']")
        text_tag = ref.find("var/[@name='raw-text']")
        if name_tag is not None and name_tag.text:
            name = name_tag.text
        else:
            name = None
        if uid_tag is not None and uid_tag.text:
            uid = uid_tag.text
        else:
            uid = None
        if text_tag is not None and text_tag.text:
            raw_text = text_tag.text
        else:
            raw_text = None

        # TODO: factor this out and reuse fix_agents
        db_refs = {}
        # Save raw text if available
        if raw_text:
            db_refs['TEXT'] = raw_text
        agent_name = raw_text
        # If we have a proper UID then we try to reconstruct an Agent from that
        if uid is not None and len(uid.split(':')) == 2:
            db_ns, db_id = uid.split(':')
            be_id = famplex_map.get((db_ns, db_id))
            if be_id:
                db_refs[db_ns] = db_id
                db_refs['FPLX'] = be_id
                agent_name = be_id
            elif db_ns in ['UP', 'Uniprot']:
                db_refs['UP'] = db_id
                gene_name = uniprot_client.get_gene_name(db_id)
                if gene_name:
                    agent_name = gene_name
                    hgnc_id = hgnc_client.get_hgnc_id(gene_name)
                    if hgnc_id:
                        db_refs['HGNC'] = hgnc_id
            elif db_ns == 'NCIT':
                db_refs['NCIT'] = db_id
                target = ncit_map.get(db_id)
                if target:
                    db_refs[target[0]] = target[1]
                    if target[0] == 'HGNC':
                        up_id = hgnc_client.get_uniprot_id(target[1])
                        agent_name = hgnc_client.get_hgnc_name(target[1])
                        if up_id:
                            db_refs['UP'] = up_id
                    elif target[0] == 'UP':
                        agent_name = uniprot_client.get_gene_name(target[1])
                        if agent_name:
                            hgnc_id = hgnc_client.get_hgnc_id(agent_name)
                            if hgnc_id:
                                db_refs['HGNC'] = hgnc_id
            elif db_ns == 'FA':
                db_refs['NXP'] = 'FA:' + db_id
            elif db_ns == 'XFAM':
                db_refs['PF'] = db_id.split('.')[0]
            elif db_ns == 'CHEBI':
                db_refs['CHEBI'] = 'CHEBI:' + db_id
            elif db_ns in ['GO', 'MESH', 'FPLX']:
                db_refs[db_ns] = db_id
            # Handle old BE mappings and add them as FPLX
            elif db_ns == 'BE':
                db_refs['FPLX'] = db_id
            elif db_ns in ['PR', 'CO', 'CVCL', 'EFO', 'ORPHANET']:
                db_refs[db_ns] = db_id
            else:
                logger.warning('Unknown database name space %s' % db_ns)
        if not agent_name:
            if raw_text is not None:
                agent_name = raw_text
            else:
                return None

        assert(agent_name)

        agent = Agent(agent_name, db_refs=db_refs)
        return agent
Exemple #52
0
 def _make_agent(self, hprd_id, refseq_id=None):
     if hprd_id is None or hprd_id is nan:
         return None
     # Get the basic info (HGNC name/symbol, Entrez ID) from the
     # ID mappings dataframe
     try:
         egid = self.id_df.loc[hprd_id].EGID
     except KeyError:
         logger.info('HPRD ID %s not found in mappings table.' % hprd_id)
         return None
     if not egid:
         logger.info('No Entrez ID for HPRD ID %s' % hprd_id)
         return None
     # Get the HGNC ID
     hgnc_id = hgnc_client.get_hgnc_from_entrez(egid)
     # If we couldn't get an HGNC ID for the Entrez ID, this means that
     # the Entrez ID has been discontinued or replaced.
     if not hgnc_id:
         self.no_hgnc_for_egid.append(egid)
         return None
     # Get the (possibly updated) HGNC Symbol
     hgnc_name = hgnc_client.get_hgnc_name(hgnc_id)
     assert hgnc_name is not None
     # See if we can get a Uniprot ID from the HGNC symbol--if there is
     # a RefSeq ID we wil also try to use it to get an isoform specific
     # UP ID, but we will have this one to fall back on. But if we can't
     # get one here, then we skip the Statement
     up_id_from_hgnc = hgnc_client.get_uniprot_id(hgnc_id)
     if not up_id_from_hgnc:
         self.no_up_for_hgnc.append((egid, hgnc_name, hgnc_id))
         return None
     # If we have provided the RefSeq ID, it's because we need to make
     # sure that we are getting the right isoform-specific ID (for sequence
     # positions of PTMs). Here we try to get the Uniprot ID from the
     # Refseq->UP mappings in the protmapper.uniprot_client.
     if refseq_id is not None:
         # Get the Uniprot IDs from the uniprot client
         up_ids = uniprot_client.get_ids_from_refseq(refseq_id,
                                                     reviewed_only=True)
         # Nothing for this RefSeq ID (quite likely because the RefSeq ID
         # is obsolete; take the UP ID from HGNC
         if len(up_ids) == 0:
             self.no_up_for_refseq.append(refseq_id)
             up_id = up_id_from_hgnc
         # More than one reviewed entry--no thanks, we'll take the one from
         # HGNC instead
         elif len(up_ids) > 1:
             self.many_ups_for_refseq.append(refseq_id)
             up_id = up_id_from_hgnc
         # We got a unique, reviewed UP entry for the RefSeq ID
         else:
             up_id = up_ids[0]
             # If it's the canonical isoform, strip off the '-1'
             if up_id.endswith('-1'):
                 up_id = up_id.split('-')[0]
     # For completeness, get the Refseq ID from the HPRD ID table
     else:
         refseq_id = self.id_df.loc[hprd_id].REFSEQ_PROTEIN
         up_id = up_id_from_hgnc
     # Make db_refs, return Agent
     db_refs = {'HGNC': hgnc_id, 'UP': up_id, 'EGID': egid,
                'REFSEQ_PROT': refseq_id}
     return Agent(hgnc_name, db_refs=db_refs)
Exemple #53
0
def get_participant(agent):
    """Return a participant dict describing an Agent.

    Parameters
    ----------
    agent : Agent or None
        The Agent to describe. None is represented as a generic protein
        obtained from ``get_generic('protein')``.

    Returns
    -------
    dict
        Always has 'entity_text' and 'entity_type'. Proteins and chemicals
        also get an 'identifier'; protein families get an 'entities' list
        with one protein entry per UniProt ID found in the 'PFAM-DEF'
        grounding. 'features' and 'not_features' are only present when
        they are non-empty.
    """
    # Handle missing Agent as generic protein
    if agent is None:
        return get_generic('protein')
    # The Agent is not missing
    text_name = agent.db_refs.get('TEXT')
    if text_name is None:
        text_name = agent.name
    participant = {}
    participant['entity_text'] = [text_name]
    hgnc_id = agent.db_refs.get('HGNC')
    uniprot_id = agent.db_refs.get('UP')
    chebi_id = agent.db_refs.get('CHEBI')
    pfam_def_ids = agent.db_refs.get('PFAM-DEF')
    # If HGNC grounding is available, that is the first choice
    if hgnc_id:
        uniprot_id = hgnc_client.get_uniprot_id(hgnc_id)
    if uniprot_id:
        uniprot_mnemonic = str(uniprot_client.get_mnemonic(uniprot_id))
        participant['identifier'] = 'UNIPROT:%s' % uniprot_mnemonic
        participant['entity_type'] = 'protein'
    elif chebi_id:
        pubchem_id = chebi_client.get_pubchem_id(chebi_id)
        participant['identifier'] = 'PUBCHEM:%s' % pubchem_id
        participant['entity_type'] = 'chemical'
    elif pfam_def_ids:
        participant['entity_type'] = 'protein_family'
        participant['entities'] = []
        for member in pfam_def_ids.split('|'):
            # Split on the first colon only, so that an entry without a
            # colon, or with a colon inside the ID, cannot break the
            # unpacking
            dbname, _, dbid = member.partition(':')
            # TODO: handle non-uniprot protein IDs here
            if dbname != 'UP' or not dbid:
                continue
            uniprot_mnemonic = str(uniprot_client.get_mnemonic(dbid))
            gene_name = uniprot_client.get_gene_name(dbid)
            if gene_name is None:
                gene_name = ""
            participant['entities'].append({
                'entity_text': [gene_name],
                'identifier': 'UNIPROT:%s' % uniprot_mnemonic,
                'entity_type': 'protein',
            })
    else:
        participant['identifier'] = ''
        participant['entity_type'] = 'protein'

    features = []
    not_features = []
    # Binding features
    for bc in agent.bound_conditions:
        feature = {
            'feature_type': 'binding_feature',
            'bound_to': {
                # NOTE: get type and identifier for bound to protein
                'entity_type': 'protein',
                'entity_text': [bc.agent.name],
                'identifier': ''
                }
            }
        if bc.is_bound:
            features.append(feature)
        else:
            not_features.append(feature)
    # Modification features
    for mc in agent.mods:
        feature = {
            'feature_type': 'modification_feature',
            'modification_type': mc.mod_type.lower(),
            }
        if mc.position is not None:
            feature['location'] = int(mc.position)
        if mc.residue is not None:
            feature['aa_code'] = mc.residue
        if mc.is_modified:
            features.append(feature)
        else:
            not_features.append(feature)
    # Mutation features; these are always recorded as positive features
    for mut in agent.mutations:
        feature = {'feature_type': 'mutation_feature'}
        if mut.residue_from is not None:
            feature['from_aa'] = mut.residue_from
        if mut.residue_to is not None:
            feature['to_aa'] = mut.residue_to
        if mut.position is not None:
            feature['location'] = int(mut.position)
        features.append(feature)
    if features:
        participant['features'] = features
    if not_features:
        participant['not_features'] = not_features
    return participant
Exemple #54
0
import os
from urllib import request
from pybel import BELGraph
from pybel.dsl import *
from pybel.language import Entity
from pybel.io import from_json_file
from pybel.examples import egf_graph
from indra.statements import *
from indra.sources import bel
from indra.sources.bel import processor as pb
from indra.sources.bel.api import process_cbn_jgif_file, process_pybel_graph
from indra.databases import hgnc_client

# Look up the HGNC ID for the gene symbol MAP2K1 and the UniProt ID that
# corresponds to it. Both lookups run once, when the module is imported.
mek_hgnc_id = hgnc_client.get_hgnc_id('MAP2K1')
mek_up_id = hgnc_client.get_uniprot_id(mek_hgnc_id)


def test_process_pybel():
    """Check that processing the example EGF graph gives Statements.

    The processor returned for ``egf_graph`` must have a truthy
    ``statements`` attribute.
    """
    processor = bel.process_pybel_graph(egf_graph)
    assert processor.statements


def test_process_jgif():
    """Download the Hox JGIF test file and run it through the processor.

    The file is written to the current working directory and is removed
    again even when the download or the processing raises, so that a
    failing run does not leave the file behind.
    """
    test_file_url = 'https://s3.amazonaws.com/bigmech/travis/Hox-2.0-Hs.jgf'
    test_file = 'Hox-2.0-Hs.jgf'
    try:
        request.urlretrieve(url=test_file_url, filename=test_file)
        process_cbn_jgif_file(test_file)
    finally:
        # Clean up; the file may not exist if the download itself failed
        if os.path.exists(test_file):
            os.remove(test_file)
Exemple #55
0
def _get_up_id(hgnc_id):
    """Return the UniProt ID that hgnc_client reports for an HGNC ID.

    When the lookup gives a falsy result, this is logged at info level
    and the falsy result is returned unchanged.
    """
    uniprot_id = hgnc_client.get_uniprot_id(hgnc_id)
    if uniprot_id:
        return uniprot_id
    logger.info("No Uniprot ID for HGNC ID %s" % hgnc_id)
    return uniprot_id
Exemple #56
0
 def get_agent(concept, entity):
     """Build an Agent from a concept and the entity giving its namespace.

     The Agent name is derived from ``concept`` via ``term_from_uri`` and
     the namespace from ``entity`` via ``namespace_from_uri`` (both
     presumably URIs -- confirm against the callers). The namespace
     decides which db_refs entries are added and whether the Agent is
     renamed. Unhandled namespaces produce a warning and an Agent with
     empty db_refs.
     """
     name = term_from_uri(concept)
     namespace = namespace_from_uri(entity)
     db_refs = {}
     # The term itself is the name unless a branch below overrides it
     agent_name = name

     def add_uniprot_ref(hgnc_id):
         # Store the UniProt ID for the HGNC ID, or warn if there is none
         uniprot_id = hgnc_client.get_uniprot_id(hgnc_id)
         if not uniprot_id:
             logger.warning('HGNC entity %s with HGNC ID %s has no '
                            'corresponding Uniprot ID.' %
                            (name, hgnc_id))
             return
         db_refs['UP'] = uniprot_id

     if namespace == 'HGNC':
         hgnc_id = hgnc_client.get_hgnc_id(name)
         if hgnc_id is None:
             logger.warning("Couldn't get HGNC ID for HGNC symbol %s" %
                            name)
         else:
             db_refs['HGNC'] = str(hgnc_id)
             add_uniprot_ref(hgnc_id)
     elif namespace in ('MGI', 'RGD'):
         db_refs[namespace] = name
     elif namespace in ('PFH', 'SFAM', 'NCH', 'SCOMP'):
         # Families (PFH, SFAM) and complexes (NCH, SCOMP) are mapped the
         # same way and only differ in the warning given when unmapped
         indra_name = bel_to_indra.get(name)
         db_refs[namespace] = name
         if indra_name is not None:
             db_refs['BE'] = indra_name
             db_refs['TEXT'] = name
             agent_name = indra_name
         elif namespace in ('PFH', 'SFAM'):
             msg = 'Could not find mapping for BEL family: %s' % name
             logger.warning(msg)
         else:
             msg = 'Could not find mapping for BEL complex: %s' % name
             logger.warning(msg)
     elif namespace == 'CHEBI':
         chebi_id = chebi_name_id.get(name)
         if not chebi_id:
             logger.warning('CHEBI name %s not found in map.' % name)
         else:
             db_refs['CHEBI'] = chebi_id
     elif namespace == 'EGID':
         hgnc_id = hgnc_client.get_hgnc_from_entrez(name)
         db_refs['EGID'] = name
         if hgnc_id is None:
             logger.warning('Could not map EGID%s to HGNC.' % name)
             agent_name = 'E%s' % name
         else:
             db_refs['HGNC'] = str(hgnc_id)
             agent_name = hgnc_client.get_hgnc_name(hgnc_id)
             add_uniprot_ref(hgnc_id)
     else:
         logger.warning('Unhandled entity namespace: %s' % namespace)
         print('%s, %s' % (concept, entity))
     return Agent(agent_name, db_refs=db_refs)
Exemple #57
0
 def map_agents(self, stmts, do_rename=True):
     """Return copies of the statements re-grounded from the grounding map.

     Each statement is deep-copied, and every agent whose 'TEXT' db_refs
     entry is a key of ``self.gm`` gets its db_refs replaced by a copy of
     the mapped entry. Gene symbols given under 'HGNC' in the map are
     replaced by HGNC IDs, and a missing 'UP' or 'HGNC' entry is filled
     in from the other one where possible.

     Parameters
     ----------
     stmts : iterable
         Statements exposing ``agent_list()``. They are not modified.
     do_rename : bool
         If True, mapped agents are renamed to their 'BE' entry, else to
         the HGNC symbol from the map, else to the gene name obtained
         from UniProt.

     Returns
     -------
     list
         The mapped copies, without the statements that have an agent
         whose text maps to None in the grounding map.

     Raises
     ------
     ValueError
         If a gene symbol in the map has no HGNC ID, or if a UniProt ID
         in the map has no gene name or a gene name that differs from
         the symbol given alongside it.
     """
     # Make a copy of the stmts
     mapped_stmts = []
     num_skipped = 0
     # Iterate over the statements
     for stmt in stmts:
         mapped_stmt = deepcopy(stmt)
         # Iterate over the agents
         skip_stmt = False
         for agent in mapped_stmt.agent_list():
             if agent is None or agent.db_refs.get('TEXT') is None:
                 continue
             agent_text = agent.db_refs.get('TEXT')
             # Look this string up in the grounding map
             # If not in the map, leave agent alone and continue
             try:
                 map_db_refs = self.gm[agent_text]
             except KeyError:
                 continue
             # If it's in the map but it maps to None, then filter out
             # this statement by skipping it
             if map_db_refs is None:
                 # Increase counter if this statement has not already
                 # been skipped via another agent
                 if not skip_stmt:
                     num_skipped += 1
                 logger.debug("Skipping %s" % agent_text)
                 skip_stmt = True
             # If it has a value that's not None, map it and add it
             else:
                 # Work on a copy so the grounding map itself is never
                 # modified by the updates below
                 gene_name = None
                 map_db_refs = deepcopy(map_db_refs)
                 up_id = map_db_refs.get('UP')
                 hgnc_sym = map_db_refs.get('HGNC')
                 if up_id and not hgnc_sym:
                     gene_name = uniprot_client.get_gene_name(up_id, False)
                     if gene_name:
                         hgnc_id = hgnc_client.get_hgnc_id(gene_name)
                         if hgnc_id:
                             map_db_refs['HGNC'] = hgnc_id
                 elif hgnc_sym and not up_id:
                     # Override the HGNC symbol entry from the grounding
                     # map with an HGNC ID
                     hgnc_id = hgnc_client.get_hgnc_id(hgnc_sym)
                     if hgnc_id:
                         map_db_refs['HGNC'] = hgnc_id
                         # Now get the Uniprot ID for the gene
                         up_id = hgnc_client.get_uniprot_id(hgnc_id)
                         if up_id:
                             map_db_refs['UP'] = up_id
                     # If there's no HGNC ID for this symbol, raise an
                     # Exception
                     else:
                         raise ValueError('No HGNC ID corresponding to gene '
                                          'symbol %s in grounding map.' %
                                          hgnc_sym)
                 # If we have both, check the gene symbol ID against the
                 # mapping from Uniprot
                 elif up_id and hgnc_sym:
                     # Get HGNC Symbol from Uniprot
                     gene_name = uniprot_client.get_gene_name(up_id)
                     if not gene_name:
                         raise ValueError('No gene name found for Uniprot '
                                          'ID %s (expected %s)' %
                                          (up_id, hgnc_sym))
                     # We got gene name, compare it to the HGNC name
                     if gene_name != hgnc_sym:
                         raise ValueError('Gene name %s for Uniprot ID '
                                          '%s does not match HGNC '
                                          'symbol %s given in grounding '
                                          'map.' %
                                          (gene_name, up_id, hgnc_sym))
                     hgnc_id = hgnc_client.get_hgnc_id(hgnc_sym)
                     if not hgnc_id:
                         raise ValueError('No HGNC ID '
                                          'corresponding to gene '
                                          'symbol %s in grounding '
                                          'map.' % hgnc_sym)
                     # As in the symbol-only case, store the HGNC ID
                     # rather than leaving the symbol in db_refs
                     map_db_refs['HGNC'] = hgnc_id
                 # Assign the DB refs from the grounding map to the agent
                 agent.db_refs = map_db_refs
                 # Are we renaming right now?
                 if do_rename:
                     # If there's a Bioentities ID, prefer that for the name
                     if agent.db_refs.get('BE'):
                         agent.name = agent.db_refs.get('BE')
                     # Get the HGNC symbol or gene name (retrieved above)
                     elif hgnc_sym is not None:
                         agent.name = hgnc_sym
                     elif gene_name is not None:
                         agent.name = gene_name
         # Check if we should skip the statement
         if not skip_stmt:
             mapped_stmts.append(mapped_stmt)
     logger.info('%s statements filtered out' % num_skipped)
     return mapped_stmts