コード例 #1
0
ファイル: fplx_evaluation.py プロジェクト: steppi/gilda
def process_fplx_groundings(df):
    """Extract curated grounding entries from a curation table.

    Parameters
    ----------
    df : pandas.DataFrame
        The curation table. The columns read here are 'Text', 'EntityType',
        'Grounding', 'Sentence', and 'DB_Ns1'..'DB_Ns3' with the matching
        'DB_Id1'..'DB_Id3'.

    Returns
    -------
    list of dict
        One dict per curated row with the keys 'text', 'entity_type',
        'db_refs', 'correct' and 'context'. Rows are processed in order and
        processing stops at the first row whose 'Grounding' value is null.
    """
    groundings = []
    # Iterate over the rows of the curation table and extract groundings
    for _, row in df.iterrows():
        # The first row without a curation ends the extraction
        if pandas.isnull(row['Grounding']):
            break
        # Here we get the original entity text, its type, and the
        # correct/incorrect curation
        db_refs = {}
        grounding = {
            'text': row['Text'],
            'entity_type': row['EntityType'],
            'db_refs': db_refs,
            'correct': bool(int(row['Grounding'])),
            'context': row['Sentence']
        }
        # We then extract the groundings (up to 3) that were considered
        # for the curation
        for i in (1, 2, 3):
            db_ns = row['DB_Ns%d' % i]
            if not pandas.isnull(db_ns):
                db_refs[db_ns] = row['DB_Id%d' % i]
        # We standardize some of the grounding entries. We iterate over a
        # snapshot of the items since a CHEBI entry may be added to db_refs
        # inside the loop.
        for db_ns, db_id in list(db_refs.items()):
            # Strip off extra GO prefixes
            if db_id.startswith('GO:GO'):
                db_refs[db_ns] = db_id[3:]
            # Get CHEBI IDs from PUBCHEM
            if db_ns == 'PUBCHEM':
                chebi_id = chebi_client.get_chebi_id_from_pubchem(db_id)
                if chebi_id:
                    # Add the prefix only when it is missing so that an
                    # already prefixed ID does not become CHEBI:CHEBI:...
                    if not chebi_id.startswith('CHEBI:'):
                        chebi_id = 'CHEBI:%s' % chebi_id
                    db_refs['CHEBI'] = chebi_id
        groundings.append(grounding)
    return groundings
コード例 #2
0
ファイル: processor.py プロジェクト: johnbachman/indra
 def _extract_drug(self, hms_id):
     """Return an Agent for the small molecule with the given ID.

     Parameters
     ----------
     hms_id : str
         The small molecule ID passed on to the `self._lc` client to look
         up the molecule's name and references.

     Returns
     -------
     Agent
         An Agent with the looked up name and db_refs. If the references
         contain a PUBCHEM ID that maps to ChEBI, a CHEBI entry is added.
     """
     refs = self._lc.get_small_molecule_refs(hms_id)
     name = self._lc.get_small_molecule_name(hms_id)
     if 'PUBCHEM' in refs:
         chebi_id = chebi_client.get_chebi_id_from_pubchem(refs['PUBCHEM'])
         if chebi_id:
             # Add the prefix only when it is missing so that an already
             # prefixed ID does not become CHEBI:CHEBI:...
             if not chebi_id.startswith('CHEBI:'):
                 chebi_id = 'CHEBI:%s' % chebi_id
             refs['CHEBI'] = chebi_id
     return Agent(name, db_refs=refs)
コード例 #3
0
ファイル: processor.py プロジェクト: maparent/indra
    def _extract_drug(self, line):
        """Return an Agent for the small molecule described by a data row.

        Parameters
        ----------
        line : dict
            A mapping from which the 'Small Molecule Name' and
            'Small Molecule HMS LINCS ID' entries are read.

        Returns
        -------
        Agent
            An Agent named after the small molecule, with db_refs looked up
            via the `self._lc` client. If the references contain a PUBCHEM
            ID that maps to ChEBI, a CHEBI entry is added.
        """
        drug_name = line['Small Molecule Name']
        lincs_id = line['Small Molecule HMS LINCS ID']
        refs = self._lc.get_small_molecule_refs(lincs_id)
        if 'PUBCHEM' in refs:
            chebi_id = chebi_client.get_chebi_id_from_pubchem(refs['PUBCHEM'])
            if chebi_id:
                # Store the ID with exactly one CHEBI: prefix, whether or
                # not the mapped ID already carries it.
                if not chebi_id.startswith('CHEBI:'):
                    chebi_id = 'CHEBI:%s' % chebi_id
                refs['CHEBI'] = chebi_id

        return Agent(drug_name, db_refs=refs)
コード例 #4
0
ファイル: processor.py プロジェクト: johnbachman/indra
    def _extract_drug(self, line):
        """Return an Agent for the small molecule described by a data row.

        Parameters
        ----------
        line : dict
            A mapping from which the 'Small Molecule Name' and
            'Small Molecule HMS LINCS ID' entries are read.

        Returns
        -------
        Agent
            An Agent named after the small molecule, with db_refs looked up
            via the `self._lc` client. If the references contain a PUBCHEM
            ID that maps to ChEBI, a CHEBI entry is added.
        """
        drug_name = line['Small Molecule Name']
        lincs_id = line['Small Molecule HMS LINCS ID']
        refs = self._lc.get_small_molecule_refs(lincs_id)
        if 'PUBCHEM' in refs:
            chebi_id = chebi_client.get_chebi_id_from_pubchem(refs['PUBCHEM'])
            if chebi_id:
                # Add the prefix only when it is missing so that an already
                # prefixed ID does not become CHEBI:CHEBI:...
                if not chebi_id.startswith('CHEBI:'):
                    chebi_id = 'CHEBI:%s' % chebi_id
                refs['CHEBI'] = chebi_id

        return Agent(drug_name, db_refs=refs)
コード例 #5
0
ファイル: processor.py プロジェクト: johnbachman/indra
    def _get_agent(self, participant):
        dbid = participant.get('identifier')
        text = participant.get('entity_text')[0]

        if dbid == 'GENERIC':
            if not text:
                return None
            else:
                return Agent(text)

        db_refs = {}
        entity_type = participant.get('entity_type')
        if entity_type in ['protein', 'chemical', 'gene']:
            # TODO: standardize name here
            name = participant.get('entity_text')[0]
            db_refs['TEXT'] = text
            if dbid:
                db_name, db_id = dbid.split(':')
                if db_name.lower() == 'uniprot':
                    uniprot_id = uniprot_client.get_id_from_mnemonic(db_id)
                    db_refs['UP'] = uniprot_id
                elif db_name.lower() == 'pubchem':
                    chebi_id = chebi_client.get_chebi_id_from_pubchem(db_id)
                    db_refs['CHEBI'] = chebi_id
                elif db_name.lower() == 'hgnc':
                    db_refs['HGNC'] = db_id
        elif entity_type == 'protein_family':
            name = text
        else:
            return None
        # TODO: handle other participant types
        agent = Agent(name, db_refs=db_refs)

        features = participant.get('features')
        if features:
            for feature in features:
                feature_type = feature.get('feature_type')
                if feature_type == 'modification_feature':
                    mc = self._get_mod_condition(feature)
                    agent.mods.append(mc)
                elif feature_type == 'binding_feature':
                    bc = self._get_bound_condition(feature)
                    agent.bound_conditions.append(bc)
                elif feature_type == 'mutation_feature':
                    mc = self._get_mut_condition(feature)
                    agent.mutations.append(mc)
                elif feature_type == 'location_feature':
                    agent.location = feature.get('location')
        not_features = participant.get('features')
        if not_features:
            for feature in not_features:
                feature_type = feature.get('feature_type')
                if feature_type == 'modification_feature':
                    mc = self._get_mod_condition(feature)
                    mc.is_modified = False
                    agent.mods.append(mc)
                elif feature_type == 'binding_feature':
                    bc = self._get_bound_condition(feature)
                    bc.is_bound = False
                    agent.bound_conditions.append(bc)
        return agent
コード例 #6
0
    def _get_agent(self, participant):
        dbid = participant.get('identifier')
        text = participant.get('entity_text')[0]

        if dbid == 'GENERIC':
            if not text:
                return None
            else:
                return Agent(text)

        db_refs = {}
        entity_type = participant.get('entity_type')
        if entity_type in ['protein', 'chemical', 'gene']:
            # TODO: standardize name here
            name = participant.get('entity_text')[0]
            db_refs['TEXT'] = text
            if dbid:
                db_name, db_id = dbid.split(':')
                if db_name.lower() == 'uniprot':
                    uniprot_id = uniprot_client.get_id_from_mnemonic(db_id)
                    db_refs['UP'] = uniprot_id
                elif db_name.lower() == 'pubchem':
                    chebi_id = chebi_client.get_chebi_id_from_pubchem(db_id)
                    db_refs['CHEBI'] = chebi_id
                elif db_name.lower() == 'hgnc':
                    db_refs['HGNC'] = db_id
        elif entity_type == 'protein_family':
            name = text
        else:
            return None
        # TODO: handle other participant types
        agent = Agent(name, db_refs=db_refs)

        features = participant.get('features')
        if features:
            for feature in features:
                feature_type = feature.get('feature_type')
                if feature_type == 'modification_feature':
                    mc = self._get_mod_condition(feature)
                    agent.mods.append(mc)
                elif feature_type == 'binding_feature':
                    bc = self._get_bound_condition(feature)
                    agent.bound_conditions.append(bc)
                elif feature_type == 'mutation_feature':
                    mc = self._get_mut_condition(feature)
                    agent.mutations.append(mc)
                elif feature_type == 'location_feature':
                    agent.location = feature.get('location')
        not_features = participant.get('features')
        if not_features:
            for feature in not_features:
                feature_type = feature.get('feature_type')
                if feature_type == 'modification_feature':
                    mc = self._get_mod_condition(feature)
                    mc.is_modified = False
                    agent.mods.append(mc)
                elif feature_type == 'binding_feature':
                    bc = self._get_bound_condition(feature)
                    bc.is_bound = False
                    agent.bound_conditions.append(bc)
        return agent
コード例 #7
0
def test_chebi_pubchem_mapping():
    """Check that a PubChem ID and a ChEBI ID map onto each other."""
    # This is a non-trivial mapping since there are multiple mappings
    # reported by ChEBI and the right one needs to be chosen based on
    # InChIKey matches.
    pubchem_id = '5287993'
    chebi_id = 'CHEBI:3528'
    mapped_chebi_id = chebi_client.get_chebi_id_from_pubchem(pubchem_id)
    assert mapped_chebi_id == chebi_id
    mapped_pubchem_id = chebi_client.get_pubchem_id(chebi_id)
    assert mapped_pubchem_id == pubchem_id
コード例 #8
0
"""This script helps identify entries in PubChem.tsv that systematically
lead to incorrect groundings and should therefore be removed."""

import os
import re
from indra.databases import chebi_client

if __name__ == '__main__':
    # Locate PubChem.tsv relative to the directory containing this script
    script_dir = os.path.dirname(os.path.abspath(__file__))
    kb_path = os.path.join(script_dir, os.pardir, 'src', 'main', 'resources',
                           'org', 'clulab', 'reach', 'kb')
    tsv_path = os.path.join(kb_path, 'PubChem.tsv')

    # Matches texts consisting of exactly two capital letters
    two_capitals = re.compile(r'^[A-Z][A-Z]$')

    retained = []
    with open(tsv_path, 'r') as infile:
        for line in infile:
            # Lines without a tab separator are not carried over
            if '\t' not in line:
                continue
            text, pubchem_id = (part.strip() for part in line.split('\t'))
            if two_capitals.match(text):
                chebi_id = chebi_client.get_chebi_id_from_pubchem(pubchem_id)
                chebi_name = chebi_client.get_chebi_name_from_id(chebi_id)
                # Drop two-capital-letter entries whose ChEBI name is 7
                # characters long and contains a hyphen
                if chebi_name and '-' in chebi_name and len(chebi_name) == 7:
                    continue
            retained.append(line)

    # Overwrite the resource file with only the retained lines
    with open(tsv_path, 'w') as outfile:
        outfile.writelines(retained)
コード例 #9
0
    def standardize_db_refs(db_refs):
        """Return a standardized db refs dict for a given db refs dict.

        The given dict is modified in place and the same object is returned.
        Protein groundings (UP / HGNC) and chemical groundings
        (PUBCHEM / CHEBI / HMDB) are cross-mapped where a mapping is
        available.

        Parameters
        ----------
        db_refs : dict
            A dict of db refs that may not be standardized, i.e., may be
            missing an available UP ID corresponding to an existing HGNC ID.

        Returns
        -------
        dict
            The db_refs dict with standardized entries.
        """
        up_id = db_refs.get('UP')
        hgnc_id = db_refs.get('HGNC')
        # If we have a (human) UP ID and no HGNC ID, we try to get a gene
        # name, and if possible, a HGNC ID from that
        if up_id and not hgnc_id and uniprot_client.is_human(up_id):
            gene_name = uniprot_client.get_gene_name(up_id, False)
            if gene_name:
                hgnc_id = hgnc_client.get_hgnc_id(gene_name)
                if hgnc_id:
                    db_refs['HGNC'] = hgnc_id
        # Otherwise, if we have an HGNC ID (with or without a UP ID), we try
        # to get the UP ID mapped from it
        elif hgnc_id:
            # Now get the Uniprot ID for the gene
            mapped_up_id = hgnc_client.get_uniprot_id(hgnc_id)
            if mapped_up_id:
                # If we find an inconsistency, we explain it in a debug
                # message and fall back on the mapped ID
                if up_id and up_id != mapped_up_id:
                    # We handle a special case here in which mapped_up_id is
                    # actually a list of UP IDs (a ', ' separated string)
                    # that we skip and just keep the original up_id
                    if ', ' not in mapped_up_id:
                        # If we got a proper single protein mapping, we use
                        # the mapped_up_id to standardize to.
                        msg = ('Inconsistent groundings UP:%s not equal to '
                               'UP:%s mapped from HGNC:%s, standardizing to '
                               'UP:%s' %
                               (up_id, mapped_up_id, hgnc_id, mapped_up_id))
                        logger.debug(msg)
                        db_refs['UP'] = mapped_up_id
                # If there is no conflict, we can update the UP entry
                else:
                    db_refs['UP'] = mapped_up_id

        # Now try to improve chemical groundings
        pc_id = db_refs.get('PUBCHEM')
        chebi_id = db_refs.get('CHEBI')
        hmdb_id = db_refs.get('HMDB')
        mapped_chebi_id = None
        mapped_pc_id = None
        hmdb_mapped_chebi_id = None
        # Look up the cross-mappings for whichever chemical IDs are present.
        # Mapped CHEBI IDs are normalized to carry the CHEBI: prefix.
        if pc_id:
            mapped_chebi_id = chebi_client.get_chebi_id_from_pubchem(pc_id)
            if mapped_chebi_id and not mapped_chebi_id.startswith('CHEBI:'):
                mapped_chebi_id = 'CHEBI:%s' % mapped_chebi_id
        if chebi_id:
            mapped_pc_id = chebi_client.get_pubchem_id(chebi_id)
        if hmdb_id:
            hmdb_mapped_chebi_id = chebi_client.get_chebi_id_from_hmdb(hmdb_id)
            if hmdb_mapped_chebi_id and \
                    not hmdb_mapped_chebi_id.startswith('CHEBI:'):
                hmdb_mapped_chebi_id = 'CHEBI:%s' % hmdb_mapped_chebi_id
        # NOTE: the branches below form a single if/elif chain, so only the
        # first matching case is acted upon in a given call.
        # We always keep originals if both are present but log debug messages
        # if there are inconsistencies
        if pc_id and chebi_id and mapped_pc_id and pc_id != mapped_pc_id:
            msg = ('Inconsistent groundings PUBCHEM:%s not equal to '
                   'PUBCHEM:%s mapped from %s, standardizing to '
                   'PUBCHEM:%s.' % (pc_id, mapped_pc_id, chebi_id, pc_id))
            logger.debug(msg)
        elif pc_id and chebi_id and mapped_chebi_id and chebi_id != \
                mapped_chebi_id:
            msg = ('Inconsistent groundings %s not equal to '
                   '%s mapped from PUBCHEM:%s, standardizing to '
                   '%s.' % (chebi_id, mapped_chebi_id, pc_id, chebi_id))
            logger.debug(msg)
        # If we have PC and not CHEBI but can map to CHEBI, we do that
        elif pc_id and not chebi_id and mapped_chebi_id:
            db_refs['CHEBI'] = mapped_chebi_id
        # If the CHEBI ID mapped from HMDB conflicts with the original CHEBI
        # ID, we keep the original and only log the inconsistency
        elif hmdb_id and chebi_id and hmdb_mapped_chebi_id and \
                hmdb_mapped_chebi_id != chebi_id:
            msg = ('Inconsistent groundings %s not equal to '
                   '%s mapped from %s, standardizing to '
                   '%s.' % (chebi_id, hmdb_mapped_chebi_id, hmdb_id, chebi_id))
            logger.debug(msg)
        # If we have HMDB and not CHEBI but can map to CHEBI, we do that
        elif hmdb_id and not chebi_id and hmdb_mapped_chebi_id:
            db_refs['CHEBI'] = hmdb_mapped_chebi_id
        # If we have CHEBI and not PC but can map to PC, we do that
        elif chebi_id and not pc_id and mapped_pc_id:
            db_refs['PUBCHEM'] = mapped_pc_id
        # Otherwise there is no useful mapping that we can add and no
        # further conflict to resolve.
        return db_refs