def test_activating_substitution_refinement():
    """Check ActiveForm refinement over entity, mutation, activity and polarity.

    Five ActiveForm statements are built that differ in agent, mutation,
    activity type or is_active flag. Every ordered pair of distinct
    statements is checked; the only pair expected to be a refinement is the
    NRAS G12D / gtpbound / True statement refining the RAS G12D / gtpbound /
    True statement.
    """
    g12d = MutCondition('12', 'G', 'D')
    q61l = MutCondition('61', 'Q', 'L')
    nras_g12d = Agent('NRAS', mutations=[g12d], db_refs={'HGNC': '7989'})
    nras_q61l = Agent('NRAS', mutations=[q61l], db_refs={'HGNC': '7989'})
    ras_g12d = Agent('RAS', mutations=[g12d], db_refs={'FPLX': 'RAS'})

    family_stmt = ActiveForm(ras_g12d, 'gtpbound', True,
                             evidence=Evidence(text='bar'))
    member_stmt = ActiveForm(nras_g12d, 'gtpbound', True,
                             evidence=Evidence(text='foo'))
    other_mutation_stmt = ActiveForm(nras_q61l, 'gtpbound', True,
                                     evidence=Evidence(text='bar'))
    other_activity_stmt = ActiveForm(nras_g12d, 'phosphatase', True,
                                     evidence=Evidence(text='bar'))
    inactive_stmt = ActiveForm(nras_g12d, 'gtpbound', False,
                               evidence=Evidence(text='bar'))

    all_stmts = [family_stmt, member_stmt, other_mutation_stmt,
                 other_activity_stmt, inactive_stmt]

    # Walk every ordered (candidate, target) pair of distinct statements,
    # grouped by target, and compare against the single expected refinement.
    for target in all_stmts:
        for candidate in all_stmts:
            if candidate is target:
                continue
            outcome = candidate.refinement_of(target, hierarchies)
            if candidate is member_stmt and target is family_stmt:
                assert outcome
            else:
                assert not outcome
def get_mutation_statistics(self, disease_name, mutation_type):
    """Return statistics on the most mutated gene of each study of a disease.

    For every study found for the disease, mutations for the gene list are
    fetched, one Agent is built per recognized amino acid substitution, and
    the gene with the most such Agents in that study is tallied.

    Parameters
    ----------
    disease_name : str
        Name passed to self._get_studies_from_disease_name to find studies.
    mutation_type : str
        Mutation type passed through to cbio_client.get_mutations.

    Returns
    -------
    dict
        Maps a gene name to a dict with the keys 'count' (number of
        mutation Agents tallied), 'effects' (counts for 'activate',
        'deactivate' and 'other', plus '<effect>_percent' entries holding
        the count divided by 'count'), 'total_effects' (initialized to 0
        and not updated here), 'agents' (the tallied Agents) and
        'fraction' ('count' divided by the total number of sequenced
        cases). Empty if no study yielded a usable mutation.

    Raises
    ------
    DiseaseNotFoundException
        If no studies are found for the given disease name.
    """
    study_ids = self._get_studies_from_disease_name(disease_name)
    if not study_ids:
        raise DiseaseNotFoundException
    gene_list = self._get_gene_list()
    mutation_dict = {}
    num_case = 0
    logger.info("Found %d studies and a gene_list of %d elements."
                % (len(study_ids), len(gene_list)))
    # Raw string so that \d is a regex digit class rather than an invalid
    # string escape sequence.
    mut_patt = re.compile(r"([A-Z]+)(\d+)([A-Z]+)")
    for study_id in study_ids:
        try:
            num_case += cbio_client.get_num_sequenced(study_id)
        except Exception as e:
            # Best effort: a study whose case count cannot be retrieved is
            # skipped, but the failure is logged rather than hidden.
            logger.warning("Could not get number of sequenced cases "
                           "for %s: %s" % (study_id, e))
            continue

        mutations = cbio_client.get_mutations(study_id, gene_list,
                                              mutation_type)

        if not mutations['gene_symbol']:
            logger.info("Found no genes for %s." % study_id)
            continue

        # Create agents from the results of the search.
        agent_dict = {}
        for g, a in zip(mutations['gene_symbol'],
                        mutations['amino_acid_change']):
            m = mut_patt.match(a)
            if m is None:
                logger.warning("Unrecognized residue: %s" % a)
                continue
            res_from, pos, res_to = m.groups()
            try:
                mut = MutCondition(pos, res_from, res_to)
            except InvalidResidueError:
                logger.warning("Invalid residue: %s or %s."
                               % (res_from, res_to))
                continue
            ag = Agent(g, db_refs={'HGNC': hgnc_client.get_hgnc_id(g)},
                       mutations=[mut])
            agent_dict.setdefault(g, []).append(ag)

        # A study without any usable mutation must not discard the results
        # already accumulated from other studies, so move on to the next
        # study instead of returning an empty dict.
        if not agent_dict:
            continue

        # Get the most mutated gene.
        top_gene = max(agent_dict, key=lambda k: len(agent_dict[k]))
        logger.info("Found %d genes, with top hit %s for %s."
                    % (len(agent_dict), top_gene, study_id))

        if top_gene not in mutation_dict:
            effect_dict = {'activate': 0, 'deactivate': 0, 'other': 0}
            mutation_dict[top_gene] = {'count': 0, 'effects': effect_dict,
                                       'total_effects': 0, 'agents': []}
        gene_stats = mutation_dict[top_gene]
        for agent in agent_dict[top_gene]:
            # Get the mutation effect for that gene; None is counted as
            # 'other'. Presumably the only other values returned are
            # 'activate' and 'deactivate' -- verify against
            # find_mutation_effect.
            mutation_effect = self.find_mutation_effect(agent)
            if mutation_effect is None:
                mutation_effect_key = 'other'
            else:
                mutation_effect_key = mutation_effect
            gene_stats['count'] += 1
            gene_stats['effects'][mutation_effect_key] += 1
            gene_stats['agents'].append(agent)

    # Calculate normalized entries
    for gene_stats in mutation_dict.values():
        gene_stats['fraction'] = gene_stats['count'] / num_case
        effects = gene_stats['effects']
        # Iterate over a snapshot of the keys since entries are added.
        for eff in list(effects):
            effects[eff + '_percent'] = effects[eff] / gene_stats['count']

    return mutation_dict
def get_agent_from_entity_info(entity_info):
    """Return an INDRA Agent and its text coordinates from an entity_info dict.

    Parameters
    ----------
    entity_info : dict
        Entity description. The keys read here are 'entityText',
        'charStart', 'charEnd' and 'entityId', the latter a list of dicts
        each with 'source' and 'idString' keys (and 'entityType' for
        mutation entries).

    Returns
    -------
    tuple
        (Agent, (charStart, charEnd)) on success, or (None, None) if the
        entity has more than one Entrez or more than one UniProt entry.
    """
    # This will be the default name. If we get a gene name, it will
    # override this rawtext name.
    raw_text = entity_info['entityText']
    name = raw_text

    # Get the db refs.
    refs = {'TEXT': raw_text}

    ref_counts = Counter(entry['source'] for entry in entity_info['entityId'])
    for source, count in ref_counts.items():
        if source in ('Entrez', 'UniProt') and count > 1:
            logger.info('%s has %d entries for %s, skipping'
                        % (raw_text, count, source))
            return None, None

    muts = []
    for id_dict in entity_info['entityId']:
        source = id_dict['source']
        id_string = id_dict['idString']
        if source == 'Entrez':
            refs['EGID'] = id_string
            hgnc_id = hgnc_client.get_hgnc_from_entrez(id_string)
            if hgnc_id is not None:
                # Check against what we may have already inferred from
                # UniProt. If it disagrees with this, let it be. Inference
                # from Entrez isn't as reliable.
                if 'HGNC' in refs:
                    if refs['HGNC'] != hgnc_id:
                        msg = ('HGNC:%s previously set does not'
                               ' match HGNC:%s from EGID:%s') % \
                               (refs['HGNC'], hgnc_id, refs['EGID'])
                        logger.info(msg)
                else:
                    refs['HGNC'] = hgnc_id
        elif source == 'UniProt':
            refs['UP'] = id_string
            gene_name = uniprot_client.get_gene_name(id_string)
            if gene_name is not None:
                name = gene_name
                hgnc_id = hgnc_client.get_hgnc_id(gene_name)
                if hgnc_id is not None:
                    # Check to see if we have a conflict with an HGNC id
                    # found from the Entrez id. If so, overwrite with this
                    # one, in which we have greater faith.
                    if 'HGNC' in refs and refs['HGNC'] != hgnc_id:
                        # hgnc_id is the one inferred from UniProt, while
                        # refs['HGNC'] still holds the earlier value; the
                        # arguments are ordered to match the message.
                        msg = ('Inferred HGNC:%s from UP:%s does not'
                               ' match HGNC:%s from EGID:%s') % \
                               (hgnc_id, refs['UP'], refs['HGNC'],
                                refs.get('EGID'))
                        logger.info(msg)
                    refs['HGNC'] = hgnc_id
        elif source in ('Tax', 'NCBI'):
            refs['TAX'] = id_string
        elif source == 'CHEBI':
            refs['CHEBI'] = 'CHEBI:%s' % id_string
        # These we take as is
        elif source in ('MESH', 'OMIM', 'CTD'):
            refs[source] = id_string
        # Handle mutations. An 'Unk' entry without an 'entityType' key falls
        # through to the unhandled-id warning instead of raising KeyError.
        elif source == 'Unk' and \
                id_dict.get('entityType') == 'ProteinMutation':
            # {'idString': 'p|SUB|Y|268|A', 'source': 'Unk',
            #  'tool': 'PubTator', 'entityType': 'ProteinMutation'}
            # Mpk1(Y268A)'
            if id_string.startswith('p|SUB|'):
                try:
                    # Handle special cases like p|SUB|A|30|P;RS#:104893878
                    parts = id_string.split(';')[0].split('|')
                    residue_from, pos, residue_to = parts[2:5]
                    mut = MutCondition(pos, residue_from, residue_to)
                except Exception:
                    # Best effort: a malformed mutation is logged and
                    # skipped rather than failing the whole entity.
                    logger.info('Could not process mutation %s' % id_string)
                else:
                    muts.append(mut)
            else:
                logger.info('Unhandled mutation: %s' % id_string)
        else:
            logger.warning(
                "Unhandled id type: {source}={idString}".format(**id_dict))

    raw_coords = (entity_info['charStart'], entity_info['charEnd'])
    return Agent(name, db_refs=refs, mutations=muts), raw_coords