Ejemplo n.º 1
0
def test_activating_substitution_refinement():
    """Check ActiveForm refinement over every ordered pair of statements.

    Five ActiveForm statements are built that differ in agent, mutation,
    activity type or is_active flag. Only the NRAS G12D 'gtpbound' active
    statement is expected to refine the RAS G12D 'gtpbound' active one;
    every other ordered pair must not be a refinement.
    """
    g12d = MutCondition('12', 'G', 'D')
    q61l = MutCondition('61', 'Q', 'L')
    nras_g12d = Agent('NRAS', mutations=[g12d], db_refs={'HGNC': '7989'})
    nras_q61l = Agent('NRAS', mutations=[q61l], db_refs={'HGNC': '7989'})
    ras_family = Agent('RAS', mutations=[g12d], db_refs={'FPLX': 'RAS'})

    family_stmt = ActiveForm(ras_family, 'gtpbound', True,
                             evidence=Evidence(text='bar'))
    gene_stmt = ActiveForm(nras_g12d, 'gtpbound', True,
                           evidence=Evidence(text='foo'))
    other_mutation_stmt = ActiveForm(nras_q61l, 'gtpbound', True,
                                     evidence=Evidence(text='bar'))
    other_activity_stmt = ActiveForm(nras_g12d, 'phosphatase', True,
                                     evidence=Evidence(text='bar'))
    inactive_stmt = ActiveForm(nras_g12d, 'gtpbound', False,
                               evidence=Evidence(text='bar'))

    all_stmts = [family_stmt, gene_stmt, other_mutation_stmt,
                 other_activity_stmt, inactive_stmt]

    # Walk every ordered pair (candidate, reference) of distinct statements.
    for reference in all_stmts:
        for candidate in all_stmts:
            if candidate is reference:
                continue
            refines = candidate.refinement_of(reference, hierarchies)
            if candidate is gene_stmt and reference is family_stmt:
                # The one pair in which a refinement is expected.
                assert refines
            else:
                assert not refines
Ejemplo n.º 2
0
    def get_mutation_statistics(self, disease_name, mutation_type):
        """Return mutation statistics for the top mutated gene of each study.

        For every study found for the given disease, the mutations of the
        genes in the gene list are fetched, the gene with the most parsed
        mutations in that study is selected, and its mutations are tallied
        by effect. Counts for the same gene are accumulated across studies.

        Parameters
        ----------
        disease_name :
            Passed to self._get_studies_from_disease_name to find studies.
        mutation_type :
            Passed through to cbio_client.get_mutations.

        Returns
        -------
        dict
            Maps each top gene name to a dict with the keys 'count' (number
            of tallied mutations), 'effects' (counts per effect plus a
            '<effect>_percent' entry holding the count divided by 'count'),
            'total_effects' (initialized to 0 and not updated here),
            'agents' (the Agents tallied) and 'fraction' ('count' divided by
            the total number of sequenced cases, or 0.0 if that total is 0).

        Raises
        ------
        DiseaseNotFoundException
            If no studies are found for the disease name.
        """
        study_ids = self._get_studies_from_disease_name(disease_name)
        if not study_ids:
            raise DiseaseNotFoundException
        gene_list = self._get_gene_list()
        mutation_dict = {}
        num_case = 0
        logger.info("Found %d studies and a gene_list of %d elements."
                    % (len(study_ids), len(gene_list)))
        # Raw string so that \d is a regex digit class rather than an
        # (invalid) string escape sequence.
        mut_patt = re.compile(r"([A-Z]+)(\d+)([A-Z]+)")
        for study_id in study_ids:
            try:
                num_case += cbio_client.get_num_sequenced(study_id)
            except Exception as e:
                # Best effort: a study whose case count cannot be obtained
                # is skipped, but the failure is no longer silent.
                logger.warning("Could not get the number of sequenced cases "
                               "for %s, skipping: %s" % (study_id, e))
                continue

            mutations = cbio_client.get_mutations(study_id, gene_list,
                                                  mutation_type)

            if not mutations['gene_symbol']:
                logger.info("Found no genes for %s." % study_id)
                continue

            # Create agents from the results of the search, grouped by gene.
            agent_dict = {}
            for g, a in zip(mutations['gene_symbol'],
                            mutations['amino_acid_change']):
                m = mut_patt.match(a)
                if m is None:
                    logger.warning("Unrecognized residue: %s" % a)
                    continue
                res_from, pos, res_to = m.groups()
                try:
                    mut = MutCondition(pos, res_from, res_to)
                except InvalidResidueError:
                    logger.warning("Invalid residue: %s or %s."
                                   % (res_from, res_to))
                    continue
                ag = Agent(g, db_refs={'HGNC': hgnc_client.get_hgnc_id(g)},
                           mutations=[mut])
                agent_dict.setdefault(g, []).append(ag)
            if not agent_dict:
                # Skip only this study; returning here would discard the
                # statistics already collected from earlier studies.
                logger.info("Found no usable mutations for %s." % study_id)
                continue

            # Get the most mutated gene.
            top_gene = max(agent_dict, key=lambda k: len(agent_dict[k]))
            logger.info("Found %d genes, with top hit %s for %s."
                        % (len(agent_dict), top_gene, study_id))

            if top_gene not in mutation_dict:
                effect_dict = {'activate': 0, 'deactivate': 0,
                               'other': 0}
                mutation_dict[top_gene] = {'count': 0, 'effects': effect_dict,
                                           'total_effects': 0, 'agents': []}
            gene_stats = mutation_dict[top_gene]
            for agent in agent_dict[top_gene]:
                # Get the mutations effects for that gene; an unknown effect
                # is counted under 'other'.
                mutation_effect = self.find_mutation_effect(agent)
                if mutation_effect is None:
                    mutation_effect_key = 'other'
                else:
                    mutation_effect_key = mutation_effect
                gene_stats['count'] += 1
                gene_stats['effects'][mutation_effect_key] += 1
                gene_stats['agents'].append(agent)

        # Calculate normalized entries
        for stats in mutation_dict.values():
            # Guard against a zero total of sequenced cases.
            stats['fraction'] = \
                stats['count'] / num_case if num_case else 0.0
            # Iterate over a snapshot of the keys since entries are added.
            for eff in list(stats['effects']):
                stats['effects'][eff + '_percent'] = \
                    stats['effects'][eff] / stats['count']

        return mutation_dict
Ejemplo n.º 3
0
def get_agent_from_entity_info(entity_info):
    """Return an INDRA Agent by processing an entity_info dict.

    Parameters
    ----------
    entity_info : dict
        Entity annotation with the keys 'entityText', 'entityId' (an
        iterable of dicts, each with a 'source' and an 'idString' entry),
        'charStart' and 'charEnd'.

    Returns
    -------
    tuple
        The Agent and a (charStart, charEnd) tuple of its coordinates, or
        (None, None) if the entity has more than one Entrez or more than
        one UniProt entry.
    """
    # This will be the default name. If we get a gene name, it will
    # override this rawtext name.
    raw_text = entity_info['entityText']
    name = raw_text

    # Get the db refs.
    refs = {'TEXT': raw_text}

    ref_counts = Counter(entry['source'] for entry in entity_info['entityId'])
    for source, count in ref_counts.items():
        if source in ('Entrez', 'UniProt') and count > 1:
            logger.info('%s has %d entries for %s, skipping' %
                        (raw_text, count, source))
            return None, None
    muts = []
    for id_dict in entity_info['entityId']:
        source = id_dict['source']
        id_string = id_dict['idString']
        if source == 'Entrez':
            refs['EGID'] = id_string
            hgnc_id = hgnc_client.get_hgnc_from_entrez(id_string)
            if hgnc_id is not None:
                # Check against what we may have already inferred from
                # UniProt. If it disagrees with this, let it be. Inference
                # from Entrez isn't as reliable.
                if 'HGNC' in refs:
                    if refs['HGNC'] != hgnc_id:
                        msg = ('HGNC:%s previously set does not'
                               ' match HGNC:%s from EGID:%s') % \
                               (refs['HGNC'], hgnc_id, refs['EGID'])
                        logger.info(msg)
                else:
                    refs['HGNC'] = hgnc_id
        elif source == 'UniProt':
            refs['UP'] = id_string
            gene_name = uniprot_client.get_gene_name(id_string)
            if gene_name is not None:
                name = gene_name
                hgnc_id = hgnc_client.get_hgnc_id(gene_name)
                if hgnc_id is not None:
                    # Check to see if we have a conflict with an HGNC id
                    # found from the Entrez id. If so, overwrite with this
                    # one, in which we have greater faith.
                    if 'HGNC' in refs and refs['HGNC'] != hgnc_id:
                        # hgnc_id is the one inferred from UniProt, while
                        # refs['HGNC'] is the one set earlier from Entrez.
                        msg = ('Inferred HGNC:%s from UP:%s does not'
                               ' match HGNC:%s from EGID:%s') % \
                               (hgnc_id, refs['UP'], refs['HGNC'],
                                refs['EGID'])
                        logger.info(msg)
                    refs['HGNC'] = hgnc_id
        elif source in ('Tax', 'NCBI'):
            refs['TAX'] = id_string
        elif source == 'CHEBI':
            refs['CHEBI'] = 'CHEBI:%s' % id_string
        # These we take as is
        elif source in ('MESH', 'OMIM', 'CTD'):
            refs[source] = id_string
        # Handle mutations. An entry without an 'entityType' key falls
        # through to the unhandled case instead of raising a KeyError.
        elif source == 'Unk' and \
                id_dict.get('entityType') == 'ProteinMutation':
            # {'idString': 'p|SUB|Y|268|A', 'source': 'Unk',
            #  'tool': 'PubTator', 'entityType': 'ProteinMutation'}
            # Mpk1(Y268A)'
            if id_string.startswith('p|SUB|'):
                try:
                    # Handle special cases like p|SUB|A|30|P;RS#:104893878
                    parts = id_string.split(';')[0].split('|')
                    residue_from, pos, residue_to = parts[2:5]
                    mut = MutCondition(pos, residue_from, residue_to)
                    muts.append(mut)
                except Exception:
                    # Best effort: a malformed mutation is logged and
                    # skipped without failing the whole entity.
                    logger.info('Could not process mutation %s' %
                                id_string)
            else:
                logger.info('Unhandled mutation: %s' % id_string)
        else:
            logger.warning(
                "Unhandled id type: {source}={idString}".format(**id_dict))

    raw_coords = (entity_info['charStart'], entity_info['charEnd'])
    return Agent(name, db_refs=refs, mutations=muts), raw_coords