コード例 #1
0
def parse_context_entry(entry, grounder, sentence=None):
    """Turn a ``<type>: <text>`` entry into a grounded context object.

    Parameters
    ----------
    entry : str
        Raw entry; the pattern is greedy, so the split happens at the last
        ``': '`` found in the string.
    grounder : callable
        Called as ``grounder(text, context=sentence)``. Its first result, if
        any, must expose ``.term.db`` and ``.term.id``.
    sentence : str, optional
        Passed through to the grounder as context.

    Returns
    -------
    dict or None
        A one-item dict mapping ``allowed_contexts[context_type]`` to a
        RefContext, or None when the entry does not match the expected
        pattern or its type is not in ``allowed_contexts``.
    """
    parsed = re.match(r'(.*): (.*)', entry)
    if parsed is None:
        return None
    context_type, context_txt = parsed.groups()
    if context_type not in allowed_contexts:
        logger.warning('Unknown context type %s' % context_type)
        return None

    terms = grounder(context_txt, context=sentence)
    if terms:
        # Only the first grounding result is used
        top_term = terms[0].term
        db_refs = standardize_db_refs({top_term.db: top_term.id})
        standard_name = bio_ontology.get_name(top_term.db, top_term.id)
    else:
        logger.warning('Could not ground %s context: %s'
                       % (context_type, context_txt))
        db_refs = {}
        standard_name = None
    db_refs['TEXT'] = context_txt

    # Fall back to the raw text when the ontology has no name for the term
    name = standard_name if standard_name else context_txt
    return {allowed_contexts[context_type]: RefContext(name=name,
                                                       db_refs=db_refs)}
コード例 #2
0
ファイル: test_ontology.py プロジェクト: steppi/indra
def test_lspci():
    """Check the name and a sample of the children of LSPCI:18."""
    lspci_ns, lspci_id = 'LSPCI', '18'
    assert bio_ontology.get_name(lspci_ns, lspci_id) == 'Pentane-1,5-Diamine'
    actual_members = set(bio_ontology.get_children(lspci_ns, lspci_id))
    # Only a sample of the members is listed here, hence the strict-subset
    # comparison rather than equality
    sample_members = {
        ('CAS', '462-94-2'),
        ('CHEBI', 'CHEBI:18127'),
        ('CHEMBL', 'CHEMBL119296'),
        ('PUBCHEM', '273'),
    }
    assert sample_members < actual_members
コード例 #3
0
def get_categories(fplx_node):
    """Return the set of category labels found among a family's HGNC children.

    The node is resolved to a namespace/ID pair, its children are restricted
    to the HGNC namespace, and each child's name is looked up in the
    module-level ``categories`` mapping. Children whose name is not a key of
    ``categories`` contribute nothing.
    """
    ns, node_id = bio_ontology.get_ns_id(fplx_node)
    hgnc_children = bio_ontology.get_children(ns, node_id, ns_filter='HGNC')
    labels = set()
    for child in hgnc_children:
        child_name = bio_ontology.get_name(*child)
        if child_name in categories:
            labels.add(categories[child_name])
    return labels
コード例 #4
0
def get_category(node):
    """Return the category node for a specific protein ontology node.

    The node's name is looked up in ``categories``; a truthy label is then
    translated through ``category_map``. None is returned when the name has
    no (truthy) category label.
    """
    node_name = bio_ontology.get_name(*bio_ontology.get_ns_id(node))
    label = categories.get(node_name)
    return category_map[label] if label else None
コード例 #5
0
def test_mesh_replacements():
    """Check that a replaced MESH ID resolves to its replacement."""
    old_id = 'C000657245'
    new_id = 'D000086382'
    assert bio_ontology.get_name('MESH', new_id) == 'COVID-19'
    assert bio_ontology.isrel('MESH', old_id, 'MESH', new_id,
                              {'replaced_by'})
    replacement = bio_ontology.get_replacement('MESH', old_id)
    assert replacement == ('MESH', new_id)
    # Standardization is expected to apply the replacement as well
    standardized = standardize_db_refs({'MESH': old_id})
    assert standardized.get('MESH') == new_id
コード例 #6
0
def get_go_receptors():
    """Return the set of genes annotated with receptor-like GO terms.

    Starting from the GO term named 'signaling receptor activity', the term
    list is expanded with ``expand_with_child_go_terms`` and then restricted
    to terms whose name contains 'receptor', 'sensor' or 'channel'. Genes
    annotated with nuclear receptor terms (expansion of GO:0004879) and the
    genes NR2C2 and EGF are removed from the result.

    Returns
    -------
    set
        The genes returned by ``get_genes_for_go_ids`` for the selected
        terms, after the exclusions above.
    """
    name_keywords = ('receptor', 'sensor', 'channel')

    def _has_receptor_like_name(go_id):
        # Look the name up once per term; a term without a name cannot
        # match any keyword and is therefore excluded instead of raising
        go_name = bio_ontology.get_name('GO', go_id) or ''
        return any(keyword in go_name for keyword in name_keywords)

    receptor_terms = ['signaling receptor activity']
    seed_go_ids = [
        bio_ontology.get_id_from_name('GO', term)[1] for term in receptor_terms
    ]
    receptor_go_ids = {
        go_id
        for go_id in expand_with_child_go_terms(seed_go_ids)
        if _has_receptor_like_name(go_id)
    }
    # Filtering out the nuclear receptors from the receptor list
    nuclear_receptor_go_ids = expand_with_child_go_terms(['GO:0004879'])
    receptor_genes_go = get_genes_for_go_ids(receptor_go_ids) - \
        get_genes_for_go_ids(nuclear_receptor_go_ids)
    receptor_genes_go -= {'NR2C2', 'EGF'}
    return receptor_genes_go
コード例 #7
0
def get_binding_site_name(agent):
    """Return a lower-cased binding site name for the given agent.

    The name of the agent's first top-level ontology parent is preferred;
    the agent's own name is used when the agent is ungrounded, has no
    top-level parents, or the parent has no name.
    """
    grounding = agent.get_grounding()
    if grounding == (None, None):
        return _n(agent.name).lower()

    # Try to construct a binding site name based on parent
    top_parents = bio_ontology.get_top_level_parents(*grounding)
    parent_name = None
    if top_parents:
        parent_name = bio_ontology.get_name(*top_parents[0])
    site_source = parent_name if parent_name else agent.name
    return _n(site_source).lower()
コード例 #8
0
def rename_chemical(agent):
    """Rename a chemical agent in place using the best available name.

    A truthy entry for the agent's CHEBI ID in ``chebi_to_selleck_dict``
    takes precedence. Otherwise the remaining db_refs (excluding TEXT,
    TEXT_NORM and CHEBI) are tried in sorted namespace order and the first
    name the bio ontology returns is used. The agent's name is left
    unchanged if nothing yields a name.

    Parameters
    ----------
    agent : indra.statements.Agent
        The agent to rename; its ``name`` attribute is modified in place.
        Agents without a CHEBI entry go straight to the ontology fallback
        instead of raising a KeyError.
    """
    from indra.ontology.bio import bio_ontology
    selleck_name = chebi_to_selleck_dict.get(agent.db_refs.get('CHEBI'))
    if selleck_name:
        agent.name = selleck_name
        return
    for db_ns, db_id in sorted(agent.db_refs.items()):
        if db_ns in {'TEXT', 'TEXT_NORM', 'CHEBI'}:
            continue
        name = bio_ontology.get_name(db_ns, db_id)
        if name:
            agent.name = name
            return
コード例 #9
0
def get_binding_site_name(agent):
    """Return a lower-cased binding site name for the given agent.

    For agents grounded to HGNC or FPLX, the name of the first top-level
    ontology parent is preferred. In every other case (ungrounded, other
    namespace, no parents, unnamed parent) the agent's own name is used.
    """
    own_name = agent.name
    grounding = agent.get_grounding()
    # We don't want to accidentally deal with very deep ontological
    # cases here such as CHEBI (e.g., GTP) which requires thousands
    # of lookups to resolve
    if grounding == (None, None) or grounding[0] not in {'HGNC', 'FPLX'}:
        return _n(own_name).lower()

    # Try to construct a binding site name based on parent
    for parent in bio_ontology.get_top_level_parents(*grounding)[:1]:
        parent_name = bio_ontology.get_name(*parent)
        if parent_name:
            return _n(parent_name).lower()
    return _n(own_name).lower()
コード例 #10
0
    def _require_agent(self, ag, ns, num=None):
        """Record a required agent name when running in strict mode.

        Nothing happens unless ``self.strict`` is truthy. For the NAME and
        FPLX namespaces ``ag`` is used as the name directly; for any other
        namespace except TEXT the name is looked up in the bio ontology.
        The name is added to ``self.agent_set`` and, if ``num`` is given,
        stored under that key in ``self.agent_dict``.
        """
        # If the namespace is TEXT, what do we do? For now, nothing.
        if not self.strict or ns == 'TEXT':
            return

        if ns in ('NAME', 'FPLX'):
            name = ag
        else:
            name = bio_ontology.get_name(ns, ag)

        self.agent_set.add(name)
        if num is not None:
            self.agent_dict[num] = name
コード例 #11
0
 def applicable(model, test):
     """Return True if all test entities are in the set of model entities.

     Each test entity is expanded into a group holding the entity itself
     plus one Agent per ontology child of its grounding; the decision is
     delegated to ``RefinementTestConnector._overlap``.
     """
     model_entities = model.entities
     entity_groups = []
     for entity in test.get_entities():
         entity_ns, entity_id = entity.get_grounding()
         child_agents = [
             Agent(bio_ontology.get_name(child_ns, child_id),
                   db_refs={child_ns: child_id})
             for child_ns, child_id
             in bio_ontology.get_children(entity_ns, entity_id)
         ]
         entity_groups.append([entity] + child_agents)
     return RefinementTestConnector._overlap(model_entities, entity_groups)
コード例 #12
0
ファイル: processor.py プロジェクト: sorgerlab/indra
def _make_famplex_lookup():
    """Create a famplex lookup dictionary.

    Keys are sorted tuples of the names of each FamPlex node's HGNC
    children and values are the corresponding FamPlex ID. FamPlex nodes
    without HGNC children all share the empty-tuple key, so only the last
    such node visited is kept under it.
    """
    bio_ontology.initialize()
    fplx_lookup = {}
    for node in bio_ontology.nodes:
        ns, fplx_id = bio_ontology.get_ns_id(node)
        if ns != 'FPLX':
            continue
        gene_names = sorted(
            bio_ontology.get_name(*child)
            for child in bio_ontology.get_children(ns, fplx_id)
            if child[0] == 'HGNC'
        )
        fplx_lookup[tuple(gene_names)] = fplx_id
    return fplx_lookup
コード例 #13
0
 def _add_node(self, agent, uuid=None):
     """Return the node ID for an agent, creating the node if necessary.

     Nodes are keyed by agent name. If a node with that name already
     exists, ``uuid`` is appended to its uuid list (when not yet present)
     and the existing ID is returned. Otherwise a new node is built,
     including HGNC members with identifier URLs when the agent has an
     FPLX reference, appended to ``self._nodes`` and its new ID returned.
     """
     node_key = agent.name
     existing_id = self._existing_nodes.get(node_key)
     if existing_id is not None:
         # The node is already there: do not add it again, just make sure
         # the uuid is recorded on it
         matching = [x for x in self._nodes
                     if x['data']['id'] == existing_id]
         known_uuids = matching[0]['data']['uuid_list']
         if uuid not in known_uuids:
             known_uuids.append(uuid)
         return existing_id

     db_refs = _get_db_refs(agent)
     node_id = self._get_new_id()
     self._existing_nodes[node_key] = node_id
     node_name = agent.name.replace('_', ' ')

     # Only families are expanded into their HGNC members
     if 'FPLX' in db_refs:
         family_members = bio_ontology.get_children(
             *agent.get_grounding(), ns_filter={'HGNC'})
     else:
         family_members = []

     members = {}
     for member in family_members:
         member_refs = standardize_db_refs({member[0]: member[1]})
         gene_name = bio_ontology.get_name(*member)
         member_urls = {}
         for dbns, dbid in member_refs.items():
             url = get_identifiers_url(dbns, dbid)
             if url:
                 member_urls[dbns] = url
         members[gene_name] = {'db_refs': member_urls}

     self._nodes.append({
         'data': {
             'id': node_id,
             'name': node_name,
             'db_refs': db_refs,
             'parent': '',
             'members': members,
             'uuid_list': [uuid]
         }
     })
     return node_id
コード例 #14
0
def get_pain_mol():
    """Return the names of the CHEBI children of each pain signal molecule.

    Returns
    -------
    dict
        Maps each compound label to the list of names the bio ontology
        returns for the CHEBI children of that compound's CHEBI ID.
    """
    pain_signal_mol = {
        "Prostaglandins": "CHEBI:26333",
        "Brandykinin": "CHEBI:3165"
    }

    names_by_compound = {}
    for compound, chebi_id in pain_signal_mol.items():
        # Children are (namespace, id) pairs; only the ID is needed here
        child_ids = [
            child[1]
            for child in bio_ontology.get_children('CHEBI', chebi_id)
        ]
        names_by_compound[compound] = [
            bio_ontology.get_name('CHEBI', child_id)
            for child_id in child_ids
        ]
    return names_by_compound
コード例 #15
0
def get_receptors():
    """Return the set of receptor genes, including ion channels.

    GO terms under 'signaling receptor activity' whose name contains
    'receptor' are collected, nuclear receptor terms (expansion of
    GO:0004879) are removed, and the genes for the remaining terms are
    fetched. NR2C2 and EGF are dropped, and every stripped line of the
    ``ION_CHANNELS`` file is added.
    """
    seed_terms = ['signaling receptor activity']
    seed_ids = [
        bio_ontology.get_id_from_name('GO', term)[1] for term in seed_terms
    ]
    expanded_ids = expand_with_child_go_terms(seed_ids)

    named_receptor_ids = set()
    for go_id in expanded_ids:
        if 'receptor' in bio_ontology.get_name('GO', go_id):
            named_receptor_ids.add(go_id)

    # Filtering out the nuclear receptors from the receptor list
    nuclear_ids = expand_with_child_go_terms(['GO:0004879'])
    receptor_genes = get_genes_for_go_ids(named_receptor_ids - nuclear_ids)
    receptor_genes -= {'NR2C2', 'EGF'}

    # Add ION channels to the receptor list
    with open(ION_CHANNELS, 'r') as fh:
        ion_channels = {line.strip() for line in fh}
    receptor_genes |= ion_channels
    return receptor_genes
コード例 #16
0
def get_enzyme_products(de_enzymes):
    """Return the products controlled by differentially expressed enzymes.

    The SIF table at ``PC_SIF_URL`` is read (tab-separated, no header) and
    reduced to rows whose first column is one of the given enzymes and whose
    second column starts with 'controls-production-of'. Products that are
    CHEBI IDs are replaced by their ontology name when one is available.

    Parameters
    ----------
    de_enzymes : dict
        Maps a logFC value (key) to an enzyme name (value). If an enzyme
        appears under several keys, the first key in iteration order is used.

    Returns
    -------
    pandas.DataFrame
        Columns 'Enzyme', 'Interaction', 'product' and 'logFC', sorted by
        descending logFC. The columns are present even when no row matches.
    """
    df = pd.read_csv(PC_SIF_URL, sep='\t', header=None)

    # Invert the mapping once so each row is an O(1) lookup; setdefault
    # keeps the first logFC seen for an enzyme, like list.index() did
    logfc_by_enzyme = {}
    for logfc, enzyme in de_enzymes.items():
        logfc_by_enzyme.setdefault(enzyme, logfc)

    records = [
        {
            'Enzyme': enzyme,
            'Interaction': interaction,
            'product': product,
            'logFC': logfc_by_enzyme[enzyme]
        }
        for enzyme, interaction, product in zip(df[0], df[1], df[2])
        if enzyme in logfc_by_enzyme
        and re.match('controls-production-of', interaction)
    ]

    # Explicit columns keep the frame well-formed (and sortable) when empty
    filtered_df = pd.DataFrame(
        records, columns=['Enzyme', 'Interaction', 'product', 'logFC'])

    # If there are any CHEBI ID's, then convert their ID's to names.
    # The column is addressed by label: positional access on a
    # string-labelled row is deprecated in pandas.
    for row_label, product in list(filtered_df['product'].items()):
        if isinstance(product, str) and product.startswith("CHEBI"):
            chebi_name = bio_ontology.get_name('CHEBI', product)
            # Keep the ID when the ontology has no name for it
            if chebi_name:
                filtered_df.at[row_label, 'product'] = chebi_name
    return filtered_df.sort_values(by='logFC', ascending=False)
コード例 #17
0
def unify_lspci(stmts):
    """Unify chemical agents by LSPCI and return preassembled statements.

    Agents that carry an LSPCI reference, or whose grounding has exactly one
    LSPCI parent in the bio ontology, are grouped by LSPCI ID. Every agent in
    a group gets that LSPCI ID in its db_refs and a common name (the LSPCI
    name if the ontology has one, otherwise the first agent's name). The
    statements are then run through preassembly without refinement.

    While this function runs, ``indra.statements.agent.default_ns_order`` is
    replaced by a copy with 'LSPCI' prepended; the previous order is
    restored on exit, including when an exception is raised.

    Parameters
    ----------
    stmts : list
        INDRA statements; their agents are modified in place.

    Returns
    -------
    list
        The statements returned by ``ac.run_preassembly``.
    """
    from indra.statements.agent import default_ns_order
    from indra.ontology.bio import bio_ontology
    logger.info('Unifying by LSPCI with %d statements' % len(stmts))
    orig_ns_order = indra.statements.agent.default_ns_order[:]
    indra.statements.agent.default_ns_order = ['LSPCI'] + \
        indra.statements.agent.default_ns_order
    try:
        agents_by_lspci = defaultdict(list)
        # default_ns_order here is the list bound at import time above,
        # i.e. the order without the LSPCI prefix
        ns_order = default_ns_order + ['CHEMBL', 'DRUGBANK', 'HMS-LINCS',
                                       'CAS']
        for stmt in stmts:
            for agent in stmt.real_agent_list():
                if 'LSPCI' in agent.db_refs:
                    agents_by_lspci[agent.db_refs['LSPCI']].append(agent)
                    continue
                agent_gr = agent.get_grounding(ns_order=ns_order)
                if agent_gr[0] is None:
                    continue
                parents = bio_ontology.get_parents(*agent_gr)
                lspci_parents = [p[1] for p in parents if p[0] == 'LSPCI']
                # Skip agents with no LSPCI parent or an ambiguous one
                if len(lspci_parents) != 1:
                    continue
                agents_by_lspci[lspci_parents[0]].append(agent)

        for lspci, agents in agents_by_lspci.items():
            lspci_name = bio_ontology.get_name('LSPCI', lspci)
            standard_name = lspci_name if lspci_name else agents[0].name
            for agent in agents:
                agent.db_refs['LSPCI'] = lspci
                agent.name = standard_name

        unique_stmts = ac.run_preassembly(stmts, run_refinement=False)
    finally:
        # Always undo the global namespace-order override
        indra.statements.agent.default_ns_order = orig_ns_order
    logger.info('Finished unification with %d statements' % len(unique_stmts))
    return unique_stmts
コード例 #18
0
ファイル: dump_sif.py プロジェクト: pagreene/indra_db
def normalize_sif_names(sif_df: DataFrame):
    """Try to normalize names in the sif dump dataframe

    This function tries to normalize the names of the entities in the sif
    dump. The 'bio_ontology' is the arbiter of what constitutes a normalized
    name. If no name exists, no further attempt to change the name is made.

    The dataframe is modified in place (the agA_name and agB_name columns)
    and nothing is returned. After renaming, several sanity checks are
    performed with ``assert`` statements.

    Parameters
    ----------
    sif_df :
        The sif dataframe. The columns agA_ns, agA_id, agA_name, agB_ns,
        agB_id and agB_name are read.
    """
    from indra.ontology.bio import bio_ontology
    bio_ontology.initialize()
    logger.info('Getting ns, id, name tuples')

    # Get the set of grounded entities
    ns_id_name_tups = set(
        zip(sif_df.agA_ns, sif_df.agA_id, sif_df.agA_name)).union(
        set(zip(sif_df.agB_ns, sif_df.agB_id, sif_df.agB_name))
    )

    # Get the ontology name, if it exists, and check if the name in the
    # dataframe needs update
    logger.info('Checking which names need updating')
    inserted_set = set()
    for ns_, id_, cur_name in tqdm(ns_id_name_tups):
        oname = bio_ontology.get_name(ns_, id_)
        # If there is a name in the ontology and it is different than the
        # original, insert it
        # (the membership test is redundant for a set; add() is idempotent)
        if oname and oname != cur_name and (ns_, id_, oname) not in inserted_set:
            inserted_set.add((ns_, id_, oname))

    if len(inserted_set) > 0:
        logger.info(f'Found {len(inserted_set)} names in dataframe that need '
                    f'renaming')

        # Make dataframe of rename dict
        logger.info('Making rename dataframe')
        df_dict = defaultdict(list)
        for ns_, id_, name in inserted_set:
            df_dict['ns'].append(ns_)
            df_dict['id'].append(id_)
            df_dict['name'].append(name)

        rename_df = pd.DataFrame(df_dict)

        # Do merge on with relevant columns from sif for both A and B
        logger.info('Getting temporary dataframes for renaming')

        # Get dataframe with ns, id, new name column
        # A left merge keeps one row per sif row as long as each (ns, id)
        # occurs once in rename_df; rows without a new name get NaN
        rename_a = sif_df[['agA_ns', 'agA_id']].merge(
            right=rename_df,
            left_on=['agA_ns', 'agA_id'],
            right_on=['ns', 'id'], how='left'
        ).drop('ns', axis=1).drop('id', axis=1)

        # Check which rows have name entries
        truthy_a = pd.notna(rename_a.name)

        # Rename in sif_df from new names
        # NOTE(review): the mask and values carry the merge result's index,
        # so this presumably relies on sif_df having a default 0..n-1 index
        # - confirm against callers
        sif_df.loc[truthy_a, 'agA_name'] = rename_a.name[truthy_a]

        # Repeat for agB_name
        rename_b = sif_df[['agB_ns', 'agB_id']].merge(
            right=rename_df,
            left_on=['agB_ns', 'agB_id'],
            right_on=['ns', 'id'], how='left'
        ).drop('ns', axis=1).drop('id', axis=1)
        truthy_b = pd.notna(rename_b.name)
        sif_df.loc[truthy_b, 'agB_name'] = rename_b.name[truthy_b]

        # Check that there are no missing names
        logger.info('Performing sanity checks')
        assert sum(pd.isna(sif_df.agA_name)) == 0
        assert sum(pd.isna(sif_df.agB_name)) == 0

        # Get the set of ns, id, name tuples and check diff
        ns_id_name_tups_after = set(
            zip(sif_df.agA_ns, sif_df.agA_id, sif_df.agA_name)).union(
            set(zip(sif_df.agB_ns, sif_df.agB_id, sif_df.agB_name))
        )
        # Check that rename took place
        assert ns_id_name_tups_after != ns_id_name_tups
        # Check that all new names are used
        assert set(rename_df.name).issubset({n for _, _, n in ns_id_name_tups_after})
        logger.info('Sif dataframe renamed successfully')
    else:
        logger.info('No names need renaming')
コード例 #19
0
ファイル: sbgn_colorizer.py プロジェクト: kolusask/bioagents
    def set_style_expression_mutation(self, model, cell_line='A375_SKIN'):
        """Sets the fill color of each node based on its expression level
        on the given cell line, and the stroke color based on whether it is
        a mutation.

        Agents with neither an HGNC nor an FPLX reference are treated as
        non-genes: they get expression level 0 and a fixed blue fill. For
        FPLX agents the expression level is the mean over all ontology
        children of the family. Levels are rescaled to [0, 1] over all
        agents and mapped onto a green color scale.

        Parameters
        ----------
        model: list<indra.statements.Statement>
            A list of INDRA statements
        cell_line: str
            A cell line for which we're interested in protein expression level
        """
        # Map each glyph label to an agent with that (normalized) name; if
        # several agents in the model match, the last one wins
        label_to_agent = {}
        for label in self.label_to_glyph_ids.keys():
            for statement in model:
                for agent in statement.agent_list():
                    if agent is not None and _n(agent.name) == label:
                        label_to_agent[label] = agent

        agent_to_expression_level = {}
        for agent in label_to_agent.values():
            if 'HGNC' not in agent.db_refs and 'FPLX' not in agent.db_refs:
                # This is not a gene
                agent_to_expression_level[agent] = 0
                continue

            if 'FPLX' not in agent.db_refs:
                gene_names = [agent.name]
            else:
                children = bio_ontology.get_children('FPLX',
                                                     agent.db_refs['FPLX'])
                gene_names = [bio_ontology.get_name(*child) for child
                              in children]

            # Compute mean expression level, ignoring missing values
            logger.info('Getting expression status of proteins: %s' %
                        str(gene_names))
            expression = self.get_expression(gene_names, cell_line)
            expression_levels = []
            for line in expression:
                for element in expression[line]:
                    level = expression[line][element]
                    if level is not None:
                        expression_levels.append(level)
            if expression_levels:
                mean_level = sum(expression_levels) / len(expression_levels)
            else:
                mean_level = None

            agent_to_expression_level[agent] = mean_level

        # Create a normalized expression score between 0 and 1
        known_levels = [level for level in agent_to_expression_level.values()
                        if level is not None]
        if known_levels:
            min_level = min(known_levels)
            level_span = max(known_levels) - min_level
        else:
            # No usable level at all: every agent gets score 0 below
            min_level = None
            level_span = 0

        agent_to_score = {}
        for agent, level in agent_to_expression_level.items():
            if level is None or level_span == 0:
                agent_to_score[agent] = 0
            else:
                agent_to_score[agent] = (level - min_level) / level_span

        # Map scores to colors and apply the style to each agent's node
        for agent, score in agent_to_score.items():
            if 'HGNC' not in agent.db_refs and 'FPLX' not in agent.db_refs:
                color = cm.Blues(0.3)
            else:
                # Use the 0.2-0.8 range of the colormap only
                color = cm.Greens(0.6*score + 0.2)
            color_str = colors.to_hex(color[:3])
            assert len(color_str) == 7
            stroke_color = \
                self._choose_stroke_color_from_mutation_status(agent.name,
                                                               cell_line)
            self.set_style(agent.name, stroke_color, color_str)
コード例 #20
0
 def get_name(namespace: str, identifier: str) -> str:
     """Return the bio ontology's name for a namespace/identifier pair."""
     return bio_ontology.get_name(namespace, identifier)
コード例 #21
0
                                 for compound, names in PAIN_MOL_NAMES.items()
                                 if rows[2] in names]
    df = pd.DataFrame(celltype_pain_interaction)
    return df


if __name__ == "__main__":
    # Read the SIF table and keep only the production-control interactions
    df = pd.read_csv(PC_SIF_URL, sep='\t', header=None)
    df = df[df[1] == 'controls-production-of']

    pain_signal_mol = {
        "Prostaglandins": "CHEBI:26333",
        "Brandykinin": "CHEBI:3165"
    }

    # CHEBI IDs of the ontology children of each pain signal molecule
    chebi_list = {}
    for compounds, chebi_id in pain_signal_mol.items():
        chebi_list[compounds] = [
            children[1]
            for children in bio_ontology.get_children('CHEBI', chebi_id)
        ]

    # isin() on the dict itself would test against its keys (the compound
    # labels), which never match a CHEBI ID; flatten the ID lists instead
    pain_chebi_ids = {
        child_id for child_ids in chebi_list.values() for child_id in child_ids
    }
    df = df[df[2].isin(pain_chebi_ids)]
    chebi_stmts = [{
        'Enzyme': row[0],
        'Statement': row[1],
        'CHEBI_ID': row[2],
        'CHEBI_Name': bio_ontology.get_name('CHEBI', row[2])
    } for _, row in df.iterrows()]
    # Explicit columns so the header is written even when nothing matched
    df = pd.DataFrame(
        chebi_stmts,
        columns=['Enzyme', 'Statement', 'CHEBI_ID', 'CHEBI_Name'])
    df.to_csv("enzyme_interactions.tsv", sep="\t", header=True, index=False)