def create_test_kinase(name, refseq):
    """Build a Kinase fixture wired to a freshly created protein and gene.

    The gene is named 'Gene of <kinase name>' and the protein carries
    the given `refseq`. Returns the kinase; nothing is added to any
    session here.
    """
    kinase = Kinase(name=name)
    kinase.protein = Protein(
        refseq=refseq,
        gene=Gene(name='Gene of ' + kinase.name)
    )
    return kinase
def get_or_create_kinases(
    chosen_kinases_names, known_kinases, known_kinase_groups
) -> [Set[Kinase], Set[KinaseGroup]]:
    """Select (creating when missing) kinases and kinase groups by name.

    Names ending with '_GROUP' denote kinase groups (the suffix is
    stripped before use); every other name denotes a kinase.
    Both registries are keyed by lower-cased name and are updated
    in place whenever a new object has to be created.

    Returns a tuple of sets:
        kinases, groups
    """
    group_suffix = '_GROUP'
    selected_kinases = set()
    selected_groups = set()

    # duplicates in the input are collapsed before processing
    for raw_name in set(chosen_kinases_names):

        if not raw_name.endswith(group_suffix):
            # anything that is not a group is treated as a kinase
            lookup = raw_name.lower()
            if lookup not in known_kinases:
                known_kinases[lookup] = Kinase(
                    name=raw_name,
                    protein=get_preferred_gene_isoform(raw_name)
                )
            selected_kinases.add(known_kinases[lookup])
            continue

        # kinase group: drop the suffix to get the actual group name
        group_name = raw_name[:-len(group_suffix)]
        lookup = group_name.lower()
        if lookup not in known_kinase_groups:
            known_kinase_groups[lookup] = KinaseGroup(name=group_name)
        selected_groups.add(known_kinase_groups[lookup])

    return selected_kinases, selected_groups
def parser(line):
    """Register the kinase described by one row of the classification file.

    Reads columns 2-4 (group, family, subfamily), column 6 (kinase name)
    and column 8 (the combined 'group.clean' value) of `line`.
    Missing kinases and groups are created and stored in the enclosing
    `known_kinases` / `known_groups` registries (keyed by lower-cased
    name); newly created groups are also appended to `new_groups`.
    """
    # the subfamily is often absent; the first column of the slice is unused
    _, family, subfamily = line[2:5]

    # 'gene.clean' [6] matches the kinase names used in the other data files
    kinase_name = line[6]

    # 'group.clean' [8] is redundant with family and subfamily; this check
    # lets the maintainer spot an inconsistency if the file format changes
    if subfamily:
        expected_clean = family + '_' + subfamily
    else:
        expected_clean = family
    assert line[8] == expected_clean

    kinase_key = kinase_name.lower()
    if kinase_key not in known_kinases:
        known_kinases[kinase_key] = Kinase(
            name=kinase_name,
            protein=get_preferred_gene_isoform(kinase_name)
        )

    # the 'family' column corresponds to 'group' in all the other files
    group_key = family.lower()
    if group_key not in known_groups:
        created_group = KinaseGroup(name=family)
        known_groups[group_key] = created_group
        new_groups.append(created_group)

    known_groups[group_key].kinases.append(known_kinases[kinase_key])
def create_test_models():
    """Create a small graph of related model instances for tests.

    Returns the function's `locals()`, i.e. a dict with the keys
    'protein', 'mutation', 'protein_kinase', 'kinase' and 'site'.
    Because of that, the local variable names below are part of the
    returned data - do not rename them or add temporaries.
    """
    protein = Protein(refseq='NM_0001', gene=Gene(name='SOMEGENE'), sequence='ABCD')
    mutation = Mutation(protein=protein, position=1, alt='E')
    # make the protein the preferred isoform of its own gene
    protein.gene.preferred_isoform = protein

    # The mutation details below are created without being bound to a name,
    # so they do not show up in the returned dict.
    # NOTE(review): presumably they stay reachable through `mutation`
    # thanks to the `mutation=` argument - confirm against the models.
    MC3Mutation(mutation=mutation, cancer=Cancer(code='CAN'), samples='Sample A,Sample B', count=2)
    InheritedMutation(mutation=mutation, clin_data=[
        ClinicalData(disease=Disease(name='Some disease'), sig_code=5),
        ClinicalData(disease=Disease(name='Other disease'), sig_code=2)
    ])

    # a second protein (with its own gene) backing the kinase
    protein_kinase = Protein(refseq='NM_0002', gene=Gene(name='OTHERGENE'), sequence='ABCD')
    kinase = Kinase(name='Kinase name', protein=protein_kinase)
    # a single site on the first protein, with a kinase, two pmids and a type
    site = Site(protein=protein, position=1, residue='A', kinases={kinase}, pmid={1, 2}, types={SiteType(name='glycosylation')})
    protein.sites = [site]

    # expose every local defined above by name
    return locals()
def test_train_model(self):
    """Train a model on a minimal dataset and check the basics of the result.

    Two single-isoform genes are stored: one without any sites and one
    with two serine sites assigned to the CDK1 kinase. The trained model
    is expected to hold exactly one entry ('CDK1') with a truthy 'pwm',
    in which 'S' has the maximal weight in column 8.
    """
    site_type = SiteType(name='phosphorylation')

    def add_gene_with_isoform(refseq, sequence):
        # store a gene having the new protein as its only, preferred isoform
        isoform = Protein(refseq=refseq, sequence=sequence)
        gene = Gene(isoforms=[isoform], preferred_isoform=isoform)
        db.session.add(gene)
        return isoform

    # non-phosphorylated serine residues are needed to generate negative sites
    add_gene_with_isoform(
        'NM_007', '--------SLPA-----------SVIT-------'
    )

    # phosphorylated, with sites
    substrate = add_gene_with_isoform(
        'NM_001', '--------SPAK-----------SPAR-------'
    )

    cdk1 = Kinase(name='CDK1', is_involved_in={site_type})

    for position in (9, 24):
        db.session.add(
            Site(
                position=position,
                types={site_type},
                residue='S',
                protein=substrate,
                kinases={cdk1}
            )
        )

    db.session.commit()

    with TemporaryDirectory() as sequences_dir:
        trained = train_model(
            site_type,
            sequences_dir=sequences_dir,
            sampling_n=2,
            threshold=2
        )

    # only one set of params is expected - the one for the CDK1 kinase
    assert len(trained) == 1

    matrix = trained.rx2('CDK1').rx2('pwm')

    # the position-specific weight matrix should have been created
    assert matrix

    # detailed testing belongs to rMIMP itself; this is only a sanity check.
    # Column 8 is presumably the central residue of the motif window.
    central_weights = dict(zip(matrix.rownames, matrix.rx(True, 8)))
    assert central_weights['S'] == max(central_weights.values())
def test_interactions(self):
    """Compare interaction statistics with values counted by hand.

    Two proteins are added to the session: one with a bare site and
    a site having a kinase and a kinase group, the other with a site
    having a single kinase. The expected numbers are then computed by
    iterating over all stored proteins and compared with `Statistics`.
    """
    from models import Protein, Site, Kinase, KinaseGroup

    bare_site = Site()
    annotated_site = Site(kinases=[Kinase()], kinase_groups=[KinaseGroup()])
    db.session.add(Protein(sites=[bare_site, annotated_site]))

    db.session.add(Protein(sites=[Site(kinases=[Kinase()])]))

    # reference values, computed independently of the Statistics class
    expected_interactions = 0
    expected_kinases = set()
    expected_groups = set()
    expected_proteins = set()

    for protein in models.Protein.query.all():
        for site in protein.sites:
            site_kinases = site.kinases
            site_groups = site.kinase_groups

            expected_interactions += len(site_kinases) + len(site_groups)
            expected_kinases.update(site_kinases)
            expected_groups.update(site_groups)

            if not (site_kinases or site_groups):
                continue
            # a protein is covered once any of its sites has an interactor
            expected_proteins.add(protein)

    from stats import Statistics

    stats = Statistics()
    # collect all the measured values first, then verify them
    measured_interactions = stats.interactions()
    measured_kinases = stats.kinases_covered()
    measured_groups = stats.kinase_groups_covered()
    measured_proteins = stats.proteins_covered()

    assert measured_interactions == expected_interactions
    assert measured_kinases == len(expected_kinases)
    assert measured_groups == len(expected_groups)
    assert measured_proteins == len(expected_proteins)
def parser(line):
    """Associate the kinase from one (kinase name, gene name) row with a protein.

    The protein is the preferred isoform of the named gene. Rows without
    such an isoform are reported and skipped. A kinase already present in
    the enclosing `known_kinases` gets its protein (re)assigned, with
    a message printed when a different protein is being replaced;
    an unknown kinase is created and appended to `new_kinases`.
    """
    kinase_name, gene_name = line
    protein = get_preferred_gene_isoform(gene_name)

    if not protein:
        message = 'No isoform for %s kinase mapped to %s gene!' % (
            kinase_name, gene_name
        )
        print(message)
        return

    if kinase_name not in known_kinases:
        new_kinases.append(Kinase(name=kinase_name, protein=protein))
        return

    existing = known_kinases[kinase_name]

    # report only real replacements, not the first assignment of a protein
    if existing.protein and existing.protein != protein:
        message = (
            'Overriding kinase-protein association for '
            '%s kinase. Old isoform: %s; new isoform: %s.'
            % (kinase_name, existing.protein.refseq, protein.refseq)
        )
        print(message)

    existing.protein = protein
def create_test_models():
    """Create a minimal set of related model instances for tests.

    Returns the function's `locals()`, i.e. a dict with the keys
    'protein', 'mutation', 'protein_kinase', 'kinase' and 'site'.
    Because of that, the local variable names below are part of the
    returned data - do not rename them or add temporaries.
    """
    protein = Protein(refseq='NM_0001', gene=Gene(name='SOMEGENE'), sequence='ABCD')
    mutation = Mutation(protein=protein, position=1, alt='E')

    # The mutation details below are created without being bound to a name,
    # so they do not show up in the returned dict.
    # NOTE(review): presumably they stay reachable through `mutation`
    # thanks to the `mutation=` argument - confirm against the models.
    MC3Mutation(mutation=mutation, cancer=Cancer(code='CAN'), samples='Some sample')
    InheritedMutation(
        mutation=mutation,
        clin_data=[ClinicalData(disease=Disease(name='Some disease'))])

    # a second protein (with its own gene) backing the kinase
    protein_kinase = Protein(refseq='NM_0002', gene=Gene(name='OTHERGENE'), sequence='ABCD')
    kinase = Kinase(name='Kinase name', protein=protein_kinase)
    # a single site on the first protein, bound to the kinase
    site = Site(protein=protein, position=1, residue='A', kinases=[kinase])
    protein.sites = [site]

    # expose every local defined above by name
    return locals()
def test_classification(self):
    """Load the kinase classification on top of pre-existing kinases and groups.

    The following assumptions about the data file hold:
        - 'family' fits our 'group' better than any other column
        - 'gene.clean', not 'Kinase', is used as the kinase name,
          as it fits much better.
    """
    existing_kinases = {}
    for kinase_name in ('AKT1', 'Akt2', 'CIT'):
        existing_kinases[kinase_name] = Kinase(name=kinase_name)

    existing_groups = {}
    for group_name in ('Akt', ):
        existing_groups[group_name] = KinaseGroup(name=group_name)

    def add_to_session():
        # (re)attach all the pre-existing objects: kinases first, then groups
        for registry in (existing_kinases, existing_groups):
            db.session.add_all(registry.values())

    filename = make_named_temp_file(raw_gene_list)

    add_to_session()

    with self.app.app_context():
        created_groups = load_kinase_classification(filename)

    # 'Akt' was known beforehand, so only 'DMPK' should be reported as new
    assert len(created_groups) == 1
    dmpk = created_groups[0]
    assert dmpk.name == 'DMPK'

    add_to_session()

    assert len(dmpk.kinases) == 2
    assert existing_kinases['CIT'] in dmpk.kinases

    akt = existing_groups['Akt']
    assert len(akt.kinases) == 3
    for member_name in ('AKT1', 'Akt2'):
        assert existing_kinases[member_name] in akt.kinases