def source_specific_proteins_with_ptm_mutations():
    """Count proteins, kinases and kinase groups having PTM mutations.

    The counts are computed separately for every mutation source returned by
    ``mutation_sources()`` and additionally for a ``'merged'`` entry, for
    which ``None`` is passed as the dataset model.

    Returns:
        A dict with three entries; each maps a source name to the number of
        distinct matching records.
    """
    sources = mutation_sources()
    sources['merged'] = None

    def count_distinct(id_column, model, *joined):
        # count distinct ids, restricted to proteins with PTM mutations
        # in the dataset described by `model`
        query = db.session.query(distinct(id_column))
        for entity in joined:
            query = query.join(entity)
        has_ptm_mutations = (
            Protein.has_ptm_mutations_in_dataset(model) == True  # noqa: E712
        )
        return query.filter(has_ptm_mutations).count()

    protein_counts = {}
    kinase_counts = {}
    kinase_group_counts = {}

    for source_name, source_model in tqdm(sources.items()):
        protein_counts[source_name] = count_distinct(Protein.id, source_model)
        kinase_counts[source_name] = count_distinct(
            models.Kinase.id, source_model, Protein
        )
        kinase_group_counts[source_name] = count_distinct(
            models.KinaseGroup.id, source_model, models.Kinase, Protein
        )

    return {
        'Proteins with PTM muts': protein_counts,
        'Kinases with PTM muts': kinase_counts,
        'Kinase groups with PTM muts': kinase_group_counts
    }
def test_select_preferred_isoform(self):
    """The preferred isoform is UniProt-canonical, then longest, then oldest."""
    expected_refseq = 'NM_003'

    # (refseq, sequence, has a reviewed UniProt entry attached)
    isoform_specs = (
        ('NM_001', 'MA', False),
        ('NM_002', 'MAA', True),
        # NM_003 is the one we want: canonical according to uniprot,
        # then longest, then oldest in refseq
        ('NM_003', 'MAAA', True),
        ('NM_004', 'MAAA', True),
        ('NM_005', 'MAAAA', False),
    )

    gene = Gene(name='Gene X')

    for refseq, sequence, canonical in isoform_specs:
        candidate = Protein(refseq=refseq, sequence=sequence, gene=gene)
        if not canonical:
            continue
        candidate.external_references = ProteinReferences(
            uniprot_entries=[UniprotEntry(isoform=1, reviewed=True)]
        )

    db.session.add(gene)

    chosen = select_preferred_isoform(gene)

    assert chosen
    assert chosen.refseq == expected_refseq
def test_edge_cases_mapping(self):
    """Sites at both termini of NM_01 are mapped by sequence; 4 sites result."""
    full_isoform = Protein(refseq='NM_01', sequence='AXAXAYAYA')
    # positions in NM_01:                             123456789
    without_c_terminus = Protein(refseq='NM_02', sequence='AXAXA')
    without_n_terminus = Protein(refseq='NM_03', sequence='AYAYA')

    gene_t = Gene(
        name='T',
        isoforms=[full_isoform, without_c_terminus, without_n_terminus]
    )
    db.session.add(gene_t)
    db.session.commit()

    mapper = SiteMapper(
        create_key_model_dict(Protein, 'refseq'),
        lambda s: f'{s.position}{s.residue}'
    )

    # both input sites are defined on NM_01, one at each edge of the sequence
    edge_sites = {
        'site at N-terminus edge': ('T', 'NM_01', 1, '^AX', 'A', 2),
        'site at C-terminus edge': ('T', 'NM_01', 9, 'YA$', 'A', 2),
    }
    sites = DataFrame.from_dict(data=edge_sites, orient='index')
    sites.columns = [
        'gene', 'refseq', 'position', 'sequence', 'residue',
        'left_sequence_offset'
    ]

    mapped_sites = mapper.map_sites_by_sequence(sites)

    assert len(mapped_sites) == 4
def test_sites(self):
    """The sites endpoint returns JSON describing both sites of the protein."""
    protein = Protein(**test_protein_data())
    db.session.add(protein)
    protein.sites = [
        Site(position=3, residue='R', type='phosphorylation'),
        Site(position=4, residue='T', type='methylation'),
    ]

    response = self.client.get('/protein/sites/NM_000123')

    assert response.status_code == 200
    assert response.content_type == 'application/json'
    assert len(response.json) == 2

    # keep the last representation that describes a phosphorylation site
    phosphorylation_reprs = [
        site_repr
        for site_repr in response.json
        if site_repr['type'] == 'phosphorylation'
    ]
    phosphorylation_repr = (
        phosphorylation_reprs[-1] if phosphorylation_reprs else None
    )

    assert phosphorylation_repr
def test_has_sites_in_range(self):
    """`has_sites_in_range` reports sites within +/- 7 of position 100."""
    mutation_position = 100
    range_start = mutation_position - 7
    range_end = mutation_position + 7

    # (positions of the sites of a protein, expected answer)
    cases = [
        ((93, 107), True),
        ((92, 108), False),
        ((100,), True),
        ((), False),
        ((90,), False),
        ((110,), False),
        ((94, 95, 96, 97, 98, 99), True),
        ((101, 102, 103, 104, 105, 106), True),
        ((93,), True),
        ((107,), True),
    ]

    for positions, expected in cases:
        protein = Protein(sites=[Site(position=pos) for pos in positions])

        found = protein.has_sites_in_range(range_start, range_end)

        assert found == expected
def test_mutation(self):
    """The mutation endpoint honours filters and serializes the mutation."""
    protein = Protein(**test_protein_data())
    protein.mutations = create_test_mutations()
    db.session.add(protein)

    # (url, expected number of returned items)
    expected_counts = [
        ('/protein/mutation/NM_000123/1/K', 1),
        ('/protein/mutation/NM_000123/1/K?filters=Mutation.sources:in:MC3', 1),
        ('/protein/mutation/NM_000123/2/K', 1),
    ]
    for url, expected_count in expected_counts:
        assert len(self.client.get(url).json) == expected_count

    # a mutation exists at 2/K, but it is rejected by the MC3 filter
    filtered_out = self.client.get(
        '/protein/mutation/NM_000123/2/K?filters=Mutation.sources:in:MC3'
    )
    warning = 'Warning: There is a mutation, but it does not satisfy given filters'
    assert warning in filtered_out.json

    unfiltered = self.client.get('/protein/mutation/NM_000123/2/K')
    mutation_repr = unfiltered.json.pop()

    assert mutation_repr['ref'] == 'A'
    assert mutation_repr['pos'] == 2
    assert mutation_repr['alt'] == 'K'
    assert mutation_repr['protein'] == 'NM_000123'
def create_test_models():
    """Create a small, interconnected set of model instances.

    A protein with one mutation (carrying MC3 and inherited mutation details)
    and one site is built, together with a second protein acting as a kinase
    of that site.

    Returns:
        A dict with the keys ``'protein'``, ``'mutation'``,
        ``'protein_kinase'``, ``'kinase'`` and ``'site'``.
    """
    protein = Protein(
        refseq='NM_0001', gene=Gene(name='SOMEGENE'), sequence='ABCD'
    )
    mutation = Mutation(protein=protein, position=1, alt='E')
    protein.gene.preferred_isoform = protein

    # the details objects are linked through their `mutation` argument,
    # so there is no need to keep references to them
    MC3Mutation(
        mutation=mutation,
        cancer=Cancer(code='CAN'),
        samples='Sample A,Sample B',
        count=2
    )
    InheritedMutation(
        mutation=mutation,
        clin_data=[
            ClinicalData(disease=Disease(name='Some disease'), sig_code=5),
            ClinicalData(disease=Disease(name='Other disease'), sig_code=2)
        ]
    )

    protein_kinase = Protein(
        refseq='NM_0002', gene=Gene(name='OTHERGENE'), sequence='ABCD'
    )
    kinase = Kinase(name='Kinase name', protein=protein_kinase)
    site = Site(
        protein=protein,
        position=1,
        residue='A',
        kinases={kinase},
        pmid={1, 2},
        types={SiteType(name='glycosylation')}
    )
    protein.sites = [site]

    # same keys (and order) as the local variables of this function
    return {
        'protein': protein,
        'mutation': mutation,
        'protein_kinase': protein_kinase,
        'kinase': kinase,
        'site': site,
    }
def test_mapping(self):
    """Sites are mapped by sequence onto other isoforms of the same gene."""
    site_columns = [
        'gene', 'refseq', 'position', 'sequence', 'residue',
        'left_sequence_offset'
    ]

    gene_a = Gene(
        name='A',
        isoforms=[
            # the full isoform of gene A
            Protein(refseq='NM_01', sequence='AAAAAAAAAXAA'),
            # a trimmed isoform of gene A
            Protein(refseq='NM_02', sequence='AAAXAA'),
        ]
    )
    gene_b = Gene(
        name='B',
        isoforms=[
            Protein(refseq='NM_03', sequence='BBBBBBBBBYBB'),
            Protein(refseq='NM_04', sequence='BBBYBB'),
        ]
    )
    db.session.add_all([gene_a, gene_b])

    # whoops, NM_03 has been accidentally removed (!)
    lost_isoform = Protein.query.filter_by(refseq='NM_03').one()
    db.session.delete(lost_isoform)
    db.session.commit()

    mapper = SiteMapper(
        create_key_model_dict(Protein, 'refseq'),
        lambda s: f'{s.position}{s.residue}'
    )

    sites = DataFrame.from_dict(
        data={
            'good site A': ('A', 'NM_01', 10, 'AXA', 'X', 1),
            'lost isoform': ('B', 'NM_03', 10, 'BYB', 'Y', 1)
        },
        orient='index'
    )
    sites.columns = site_columns

    mapped_sites = mapper.map_sites_by_sequence(sites)
    by_isoform = group_by_isoform(mapped_sites)

    # one from NM_01 (defined), from NM_02 (mapped), from NM_04 (mapped)
    assert len(mapped_sites) == 3
    assert set(by_isoform) == {'NM_01', 'NM_02', 'NM_04'}

    assert by_isoform['NM_01'].residue == by_isoform['NM_02'].residue == 'X'
    assert by_isoform['NM_01'].position == 10
    assert by_isoform['NM_02'].position == 4

    assert by_isoform['NM_04'].residue == 'Y'
    assert by_isoform['NM_04'].position == 4

    # will the mapping to NM_02 still work if we remove 'gene' column?
    sites.drop(columns=['gene'], inplace=True)

    mapped_sites = mapper.map_sites_by_sequence(sites)
    by_isoform = group_by_isoform(mapped_sites)

    assert len(mapped_sites) == 2
    assert set(by_isoform) == {'NM_01', 'NM_02'}
def test_known_mutations(self):
    """The known mutations endpoint lists four mutations of the test protein."""
    protein = Protein(**test_protein_data())
    protein.mutations = create_test_mutations()
    db.session.add(protein)

    known_mutations = self.client.get(
        '/protein/known_mutations/NM_000123'
    ).json

    assert len(known_mutations) == 4
def test_train_model(self):
    """Training on two CDK1 sites gives one model whose PWM peaks at serine."""
    phosphorylation = SiteType(name='phosphorylation')

    # non-phosphorylated serine residues are needed to generate negative sites
    background = Protein(
        refseq='NM_007', sequence='--------SLPA-----------SVIT-------'
    )
    db.session.add(Gene(isoforms=[background], preferred_isoform=background))

    # phosphorylated, with sites
    substrate = Protein(
        refseq='NM_001', sequence='--------SPAK-----------SPAR-------'
    )
    db.session.add(Gene(isoforms=[substrate], preferred_isoform=substrate))

    cdk1 = Kinase(name='CDK1', is_involved_in={phosphorylation})

    for position in (9, 24):
        phosphosite = Site(
            position=position,
            types={phosphorylation},
            residue='S',
            protein=substrate,
            kinases={cdk1}
        )
        db.session.add(phosphosite)

    db.session.commit()

    with TemporaryDirectory() as sequences_dir:
        model = train_model(
            phosphorylation,
            sequences_dir=sequences_dir,
            sampling_n=2,
            threshold=2
        )

        # the model should have one set of params - for CDK1 kinase
        assert len(model) == 1

        pwm = model.rx2('CDK1').rx2('pwm')

        # and the position-specific weight matrix should be created
        assert pwm

        # the very detailed testing should be performed by rMIMP,
        # but why not test the basics?
        central_weights = dict(zip(pwm.rownames, pwm.rx(True, 8)))

        assert central_weights['S'] == max(central_weights.values())
def test_impact_on_ptm(self):
    """A mutation located exactly at a site position has 'direct' impact."""
    mutation = Mutation(position=61)
    protein = Protein(refseq='NM_00001', mutations=[mutation])
    db.session.add(protein)

    protein.sites = [Site(position=position) for position in (61, 54, 51)]

    assert mutation.impact_on_ptm() == 'direct'
def save_protein(request, strain_id):
    """Store proteins from an uploaded FASTA file and redirect to the strains.

    Each record of the uploaded ``protein_file`` becomes a ``Protein`` whose
    name is the record id and whose ``cds`` is the ``CDS`` with the same name.

    Args:
        request: the HTTP request; ``request.FILES['protein_file']`` must
            hold the FASTA file.
        strain_id: primary key of the strain; a missing strain results in
            a 404 response (via ``get_object_or_404``).

    Returns:
        A redirect to ``/strains/``.
    """
    # the strain itself is not used below; the lookup only guards against
    # requests made for a non-existing strain
    get_object_or_404(Strain, pk=strain_id)

    protein_file = request.FILES['protein_file']

    for seq_record in SeqIO.parse(protein_file, "fasta"):
        # a single timestamp, so that a freshly created record has
        # identical creation and modification dates
        timestamp = datetime.datetime.now()

        protein = Protein()
        protein.name = seq_record.id
        # str() is the supported way of getting the sequence text;
        # Seq.tostring() is no longer available in current Biopython
        protein.seq = str(seq_record.seq)
        protein.cds = CDS.objects.get(name=protein.name)
        protein.createdDate = timestamp
        protein.modifiedDate = timestamp
        protein.save()

    return HttpResponseRedirect('/strains/')
def test_browse(self):
    """The protein browser responds with 200 after following redirects."""
    protein = Protein(**test_protein_data())
    db.session.add(protein)

    browse_response = self.client.get('/protein/browse', follow_redirects=True)

    assert browse_response.status_code == 200
def create_test_data():
    """Prepare a gzipped mappings file and three MAPK7 proteins.

    Returns:
        A tuple of: the name of the file made from ``raw_mappings``, the
        created gene, and a dict mapping refseq ids to the created proteins.
    """
    mappings_filename = make_named_gz_file(raw_mappings)

    # create proteins from first three data rows
    protein_data = [
        (
            'NM_002749',
            'MAEPLKEEDGEDGSAEPPGPVKAEPAHTAASVAAKNLALLKARSFDVTFDVGDEYEIIETIGNGAYGVVSSARRRLTGQQVAIKKIPNAFDVVTNAKRTLRELKILKHFKHDNIIAIKDILRPTVPYGEFKSVYVVLDLMESDLHQIIHSSQPLTLEHVRYFLYQLLRGLKYMHSAQVIHRDLKPSNLLVNENCELKIGDFGMARGLCTSPAEHQYFMTEYVATRWYRAPELMLSLHEYTQAIDLWSVGCIFGEMLARRQLFPGKNYVHQLQLIMMVLGTPSPAVIQAVGAERVRAYIQSLPPRQPVPWETVYPGADRQALSLLGRMLRFEPSARISAAAALRHPFLAKYHDPDDEPDCAPPFDFAFDREALTRERIKEAIVAEIEDFHARREGIRQQIRFQPSLQPVASEPGCPDVEMPSPWAPSGDCAMESPPPAPPPCPGPAPDTIDLTLQPPPPVSEPAPPKKDGAISDNTKAALKAALLKSLRSRLRDGPSAPLEAPEPRKPVTAQERQREREEKRRRRQERAKEREKRRQERERKERGAGASGGPSTDPLAGLVLSDNDRSLLERWTRMARPAAPALTSVPAPAPAPTPTPTPVQPTSPPPGPVAQPTGPQPQSAGSTSGPVPQPACPPPGPAPHPTGPPGPIPVPAPPQIATSTSLLAAQSLVPPPGLPGSSTPGVLPYFPPGLPPPDAGGAPQSSMSESPDVNLVTQQLSKSQVEDPLPPVFSGTPKGSGAGYGVGFDLEEFLNQSFDMGVADGPQDGQADSASLSASLLADWLEGHGMNPADIESLQREIQMDSPMLLADLPDLQDP*'
        ),
        (
            'NM_139033',
            'MAEPLKEEDGEDGSAEPPGPVKAEPAHTAASVAAKNLALLKARSFDVTFDVGDEYEIIETIGNGAYGVVSSARRRLTGQQVAIKKIPNAFDVVTNAKRTLRELKILKHFKHDNIIAIKDILRPTVPYGEFKSVYVVLDLMESDLHQIIHSSQPLTLEHVRYFLYQLLRGLKYMHSAQVIHRDLKPSNLLVNENCELKIGDFGMARGLCTSPAEHQYFMTEYVATRWYRAPELMLSLHEYTQAIDLWSVGCIFGEMLARRQLFPGKNYVHQLQLIMMVLGTPSPAVIQAVGAERVRAYIQSLPPRQPVPWETVYPGADRQALSLLGRMLRFEPSARISAAAALRHPFLAKYHDPDDEPDCAPPFDFAFDREALTRERIKEAIVAEIEDFHARREGIRQQIRFQPSLQPVASEPGCPDVEMPSPWAPSGDCAMESPPPAPPPCPGPAPDTIDLTLQPPPPVSEPAPPKKDGAISDNTKAALKAALLKSLRSRLRDGPSAPLEAPEPRKPVTAQERQREREEKRRRRQERAKEREKRRQERERKERGAGASGGPSTDPLAGLVLSDNDRSLLERWTRMARPAAPALTSVPAPAPAPTPTPTPVQPTSPPPGPVAQPTGPQPQSAGSTSGPVPQPACPPPGPAPHPTGPPGPIPVPAPPQIATSTSLLAAQSLVPPPGLPGSSTPGVLPYFPPGLPPPDAGGAPQSSMSESPDVNLVTQQLSKSQVEDPLPPVFSGTPKGSGAGYGVGFDLEEFLNQSFDMGVADGPQDGQADSASLSASLLADWLEGHGMNPADIESLQREIQMDSPMLLADLPDLQDP*'
        ),
        (
            'NM_139034',
            'MAEPLKEEDGEDGSAEPPGPVKAEPAHTAASVAAKNLALLKARSFDVTFDVGDEYEIIETIGNGAYGVVSSARRRLTGQQVAIKKIPNAFDVVTNAKRTLRELKILKHFKHDNIIAIKDILRPTVPYGEFKSVYVVLDLMESDLHQIIHSSQPLTLEHVRYFLYQLLRGLKYMHSAQVIHRDLKPSNLLVNENCELKIGDFGMARGLCTSPAEHQYFMTEYVATRWYRAPELMLSLHEYTQAIDLWSVGCIFGEMLARRQLFPGKNYVHQLQLIMMVLGTPSPAVIQAVGAERVRAYIQSLPPRQPVPWETVYPGADRQALSLLGRMLRFEPSARISAAAALRHPFLAKYHDPDDEPDCAPPFDFAFDREALTRERIKEAIVAEIEDFHARREGIRQQIRFQPSLQPVASEPGCPDVEMPSPWAPSGDCAMESPPPAPPPCPGPAPDTIDLTLQPPPPVSEPAPPKKDGAISDNTKAALKAALLKSLRSRLRDGPSAPLEAPEPRKPVTAQERQREREEKRRRRQERAKEREKRRQERERKERGAGASGGPSTDPLAGLVLSDNDRSLLERWTRMARPAAPALTSVPAPAPAPTPTPTPVQPTSPPPGPVAQPTGPQPQSAGSTSGPVPQPACPPPGPAPHPTGPPGPIPVPAPPQIATSTSLLAAQSLVPPPGLPGSSTPGVLPYFPPGLPPPDAGGAPQSSMSESPDVNLVTQQLSKSQVEDPLPPVFSGTPKGSGAGYGVGFDLEEFLNQSFDMGVADGPQDGQADSASLSASLLADWLEGHGMNPADIESLQREIQMDSPMLLADLPDLQDP*'
        ),
    ]

    gene = Gene(name='MAPK7')

    proteins = {}
    for refseq_nm, sequence in protein_data:
        proteins[refseq_nm] = Protein(
            refseq=refseq_nm, sequence=sequence, gene=gene
        )

    # we need to have proteins with id in session - hence commit
    db.session.add(gene)
    db.session.commit()

    return mappings_filename, gene, proteins
def test_prepare_dataset(self):
    """Only the MC3 dataset reports the mutation as present."""
    from views.mutation import prepare_datasets

    protein = Protein(
        refseq='NM_000123', sequence='TRAN', gene=Gene(name='TP53')
    )
    mutation = Mutation(protein=protein, position=2, alt='K')
    mc3_details = MC3Mutation(mutation=mutation, count=2)
    db.session.add(mutation)

    datasets, user_datasets = prepare_datasets(mutation)

    # one entry per confirmed source; details are listed for MC3 only
    expected_datasets = [
        {
            'filter': 'Mutation.sources:in:' + source.name,
            'name': source.display_name,
            'mutation_present': (
                [mc3_details] if source is MC3Mutation else False
            )
        }
        for source in source_manager.confirmed
    ]

    assert datasets == expected_datasets
    assert not user_datasets
def test_import(self):
    """Two O-GalNAc threonine sites are imported and survive a reload."""
    calcitonin = Protein(
        refseq='NM_001741',
        sequence='MGFQKFSPFLALSILVLLQAGSLHAAPFRSALESSPADPATLSEDEARLLLAALVQNYVQMKASELEQEQEREGSSLDSPRSKRCGNLSTCMLGTYTQDFNKFHTFPQTAIGVGAPGKKRDMSSDLERDHRPHVSMPQNAN*'
    )
    db.session.add(calcitonin)

    with initialized_importer(SimpleCase, 'O-GalNAc') as importer:
        sites = importer.load_sites(site_datasets=['O-GalNAc'])

        assert len(sites) == 2

        by_position = {site.position: site for site in sites}

        assert by_position[105].residue == by_position[109].residue == 'T'
        assert by_position[105].types_names == {'O-glycosylation'}

    # check re-loading
    db.session.add_all(sites)
    db.session.commit()

    reloaded = Site.query.filter_by(position=105).one()
    assert reloaded.types_names == {'O-glycosylation'}
def test_search_proteins(self):
    """`search_proteins` honours limits, sorting and the selected features."""
    from views.search import search_proteins

    # create 15 genes and proteins
    mock_proteins_and_genes(15)

    # control: do we start with the mocked proteins not others?
    assert not search_proteins('TP53')

    # does respect limit? does symbol search work?
    results = search_proteins('Gene', 10)

    assert results
    assert len(results) == 10

    assert results[0].name.startswith('Gene')

    # are results sorted?
    db.session.add_all([
        Gene(
            name=name,
            # parentheses are required: `%` and `*` share precedence, so
            # the unparenthesized form repeats the formatted string i times
            # (giving '' for i == 0) instead of formatting the product
            preferred_isoform=Protein(refseq='NM_%s' % (20 * i))
        )
        for i, name in enumerate(['TPK', 'TPKK'])
    ])

    results = search_proteins('TPK', 2)

    assert results[0].name == 'TPK'
    assert results[0].best_score < results[1].best_score

    # does include both: refseq and symbol search?
    assert search_proteins('NM_0003', 1)

    # can we change subset of searched features?
    assert not search_proteins('NM_0003', 1, features=['gene_symbol'])
    assert not search_proteins('Gene', 1, features=['refseq'])
def test_sites(self):
    """Closest and affected PTM sites are found for each mutation."""
    mutation_positions = (0, 5, 12, 57)
    site_positions = (10, 14, 15, 57)

    mutations = [Mutation(position=pos) for pos in mutation_positions]
    protein = Protein(
        refseq='NM_00002',
        mutations=mutations,
        sites=[Site(position=pos) for pos in site_positions]
    )
    db.session.add(protein)
    db.session.commit()

    # ==test_find_closest_sites==
    # for mutation at position 0 there is no closest site;
    # for mutation at position 5 there should be 1 closest site
    closest_counts = dict(zip(mutations, [0, 1, 2, 1]))

    for mutation, expected_count in closest_counts.items():
        assert len(mutation.find_closest_sites()) == expected_count

    # ==test_get_affected_ptm_sites==
    affected_counts = dict(zip(mutations, [0, 1, 3, 1]))

    for mutation, expected_count in affected_counts.items():
        assert len(mutation.get_affected_ptm_sites()) == expected_count
def test_types(self):
    """Sites and proteins can be queried by (absence of) a site type."""
    methylation = SiteType(name='methylation')

    protein = Protein(refseq='NM_007', id=1, sequence='ABCD')
    db.session.add(protein)

    methylated_site = Site(
        position=2, types={methylation}, residue='B', protein=protein
    )
    db.session.add(methylated_site)
    db.session.commit()

    def proteins_having_site(condition):
        # proteins with at least one site satisfying the condition
        return Protein.query.filter(Protein.sites.any(condition))

    def sites_matching(condition):
        return Site.query.filter(condition)

    # the only site is methylated
    assert proteins_having_site(Site.types.contains(methylation)).one()
    assert not proteins_having_site(~Site.types.contains(methylation)).all()
    assert sites_matching(Site.types.contains(methylation)).count() == 1
    assert not sites_matching(~Site.types.contains(methylation)).all()

    # and there are no sites of a type which was never assigned
    phosphorylation = SiteType(name='phosphorylation')

    assert not proteins_having_site(
        Site.types.contains(phosphorylation)
    ).all()
    assert proteins_having_site(~Site.types.contains(phosphorylation)).one()
    assert sites_matching(Site.types.contains(phosphorylation)).count() == 0
def test_prepare_tracks():
    """The first element of the first subtrack has a non-negative start."""
    protein = Protein(
        refseq='NM_01',
        sequence='123456789',
        sites=[Site(position=4)]
    )

    track = SequenceTrack(protein)
    first_subtrack = track.subtracks[0]
    site_element = first_subtrack.elements[0]

    assert site_element.start >= 0
def create_test_proteins(refseqs) -> Dict[str, Protein]:
    """Register a new `Protein` for each refseq in a reloaded proteins cache.

    Args:
        refseqs: iterable of refseq identifiers.

    Returns:
        The mapping obtained from ``get_proteins(reload_cache=True)``, with
        an entry added (or replaced) for each of the given refseqs.
    """
    # reset cache
    cache = get_proteins(reload_cache=True)

    for accession in refseqs:
        cache[accession] = Protein(refseq=accession)

    return cache
def mock_proteins_and_genes(count):
    """Add `count` genes, each with one preferred isoform, to the session.

    The i-th gene is named ``Gene_<i>`` and its protein has the refseq
    ``NM_000<i>`` (numbering starts at 0). Nothing is committed here.
    """
    from database import db
    from models import Gene, Protein

    for index in range(count):
        gene = Gene(
            name='Gene_%s' % index,
            full_name='Full name of gene %s' % index
        )
        isoform = Protein(refseq='NM_000%s' % index, gene=gene)
        gene.preferred_isoform = isoform
        db.session.add(gene)
def test_refseq(self):
    """Refseq search matches isoforms and applies the limit per gene."""
    from search.gene import RefseqGeneSearch

    # create 10 genes and proteins
    mock_proteins_and_genes(10)

    search = RefseqGeneSearch().search

    # negative control
    for phrase in ('9999', 'NM_00000', 'Gene'):
        assert not search(phrase)

    # limiting
    limited = search('NM_', limit=5)

    assert len(limited) == 5
    assert limited[0].name.startswith('Gene')

    # test the search itself
    for refseq in ('NM_0003', 'nm_0003', '0003'):
        found = search(refseq)

        assert len(found) == 1
        gene_match = found[0]
        assert gene_match.name == 'Gene_3'

        isoforms = gene_match.matched_isoforms
        assert len(isoforms) == 1
        assert isoforms.pop().refseq == 'NM_0003'

    gene_x = Gene(
        name='Gene X',
        isoforms=[
            Protein(refseq='NM_000301'),
            Protein(refseq='NM_000302'),
        ]
    )
    gene_y = Gene(name='Gene Y', isoforms=[Protein(refseq='NM_000309')])
    db.session.add_all([gene_x, gene_y])

    # so there are three genes with isoforms starting with NM_0003
    # (those are Gene_3, Gene X, Gene Y). Let see if limiting work
    # well when applied per-gene.
    expected_counts = [
        ('NM_0003', 2),
        ('NM_00030', 2),
        ('NM_000301', 1),
        ('NM_000302', 1),
    ]

    for query, expected_count in expected_counts:
        assert len(search(query, limit=2)) == expected_count
def test_domains(self):
    """Loaded domains are attached to proteins; similar ones are merged."""
    first_protein = Protein(
        refseq='NM_018163',
        sequence='MAVTKELLQMDLYALLGIEEKAADKEVKKAYRQKALSCHPDKNPDNPRAAELFHQLSQALEVLTDAAARAAYDKVRKAKKQAAERTQKLDEKRKKVKLDLEARERQAQAQESEEEEESRSTRTLEQEIERLREEGSRQLEEQQRLIREQIRQERDQRLRGKAENTEGQGTPKLKLKWKCKKEDESKGGYSKDVLLRLLQKYGEVLNLVLSSKKPGTAVVEFATVKAAELAVQNEVGLVDNPLKISWLEGQPQDAVGRSHSGLSKGSVLSERDYESLVMMRMRQAAERQQLIARMQQEDQEGPPT*',
        gene=Gene(chrom='15')
    )
    second_protein = Protein(
        refseq='NM_004671',
        sequence='MADFEELRNMVSSFRVSELQVLLGFAGRNKSGRKHDLLMRALHLLKSGCSPAVQIKIRELYRRRYPRTLEGLSDLSTIKSSVFSLDGGSSPVEPDLAVAGIHSLPSTSVTPHSPSSPVGSVLLQDTKPTFEMQQPSPPIPPVHPDVQLKNLPFYDVLDVLIKPTSLVQSSIQRFQEKFFIFALTPQQVREICISRDFLPGGRRDYTVQVQLRLCLAETSCPQEDNYPNSLCIKVNGKLFPLPGYAPPPKNGIEQKRPGRPLNITSLVRLSSAVPNQISISWASEIGKNYSMSVYLVRQLTSAMLLQRLKMKGIRNPDHSRALIKEKLTADPDSEIATTSLRVSLMCPLGKMRLTIPCRAVTCTHLQCFDAALYLQMNEKKPTWICPVCDKKAAYESLILDGLFMEILNDCSDVDEIKFQEDGSWCPMRPKKEAMKVSSQPCTKIESSSVLSKPCSVTVASEASKKKVDVIDLTIESSSDEEEDPPAKRKCIFMSETQSSPTKGVLMYQPSSVRVPSVTSVDPAAIPPSLTDYSVPFHHTPISSMSSDLPGLDFLSLIPVDPQYCPPMFLDSLTSPLTASSTSVTTTSSHESSTHVSSSSSRSETGVITSSGSNIPDIISLD*',
        gene=Gene(chrom='18')
    )
    proteins = [first_protein, second_protein]

    db.session.add_all(proteins)

    filename = make_named_temp_file(domains_data)

    new_domains = load_domains(filename)

    assert len(new_domains) == 6
    assert len(proteins[0].domains) == 2

    # group domains of the second protein by the InterPro short description
    by_description = defaultdict(list)

    for domain in proteins[1].domains:
        by_description[domain.interpro.short_description].append(domain)

    def assert_ranges(domain, start, end):
        assert (domain.start, domain.end) == (start, end)

    # two SAP domains should be merged for representation purposes due to
    # similarity criteria (these two domain annotation overlap so the
    # smaller one is contained in the bigger)
    sap_domain = by_description['SAP_dom'][0]
    assert_ranges(sap_domain, 1, 65)

    sap_interpro = sap_domain.interpro
    assert sap_interpro.accession == 'IPR003034'
    assert sap_interpro.description == 'SAP domain'

    # here the two annotations overlap with more than 75% of common
    assert_ranges(by_description['PINIT'][0], 141, 299)

    # and here overlap was too small to merge those domains
    assert len(by_description['Znf_MIZ']) == 2
def make_test_shared_proteins():
    """Add two proteins (NM_004318 and NM_020164) and commit the session."""
    longer_protein = Protein(
        refseq='NM_004318',
        sequence='MAQRKNAKSSGNSSSSGSGSGSTSAGSSSPGARRETKHGGHKNGRKGGLSGTSFFTWFMVIALLGVWTSVAVVWFDLVDYEEVLGKLGIYDADGDGDFDVDDAKVLLGLKERSTSEPAVPPEEAEPHTEPEEQVPVEAEPQNIEDEAKEQIQSLLHEMVHAEHVEGEDLQQEDGPTGEPQQEDDEFLMATDVDDRFETLEPEVSHEETEHSYHVEETVSQDCNQDMEEMMSEQENPDSSEPVVEDERLHHDTDDVTYQVYEEQAVYEPLENEGIEITEVTAPPEDNPVEDSQVIVEEVSIFPVEEQQEVPPETNRKTDDPEQKAKVKKKKPKLLNKFDKTIKAELDAAEKLRKRGKIEEAVNAFKELVRKYPQSPRARYGKAQCEDDLAEKRRSNEVLRGAIETYQEVASLPDVPADLLKLSLKRRSDRQQFLGHMRGSLLTLQRLVQLFPNDTSLKNDLGVGYLLIGDNDNAKKVYEEVLSVTPNDGFAKVHYGFILKAQNKIAESIPYLKEGIESGDPGTDDGRFYFHLGDAMQRVGNKEAYKWYELGHKRGHFASVWQRSLYNVNGLKAQPWWTPKETGYTELVKSLERNWKLIRDEGLAVMDKAKGLFLPEDENLREKGDWSQFTLWQQGRRNENACKGAPKTCTLLEKFPETTGCRRGQIKYSIMHPGTHVWPHTGPTNCRLRMHLGLVIPKEGCKIRCANETKTWEEGKVLIFDDSFEHEVWQDASSFRLIFIVDVWHPELTPQQRRSLPAI*'
    )
    shorter_protein = Protein(
        refseq='NM_020164',
        sequence='MAEDKETKHGGHKNGRKGGLSGTSFFTWFMVIALLGVWTSVAVVWFDLVDYEEVLAKAKDFRYNLSEVLQGKLGIYDADGDGDFDVDDAKVLLEGPSGVAKRKTKAKVKELTKEELKKEKEKPESRKESKNEERKKGKKEDVRKDKKIADADLSRKESPKGKKDREKEKVDLEKSAKTKENRKKSTNMKDVSSKMASRDKDDRKESRSSTRYAHLTKGNTQKRNG*'
    )

    db.session.add_all([longer_protein, shorter_protein])
    db.session.commit()
def create_test_kinase(name, refseq):
    """Create a kinase linked to a new protein which belongs to a new gene.

    Args:
        name: name of the kinase; the gene is named ``'Gene of '`` followed
            by the name stored on the kinase.
        refseq: refseq identifier of the protein of the kinase.

    Returns:
        The created `Kinase`.
    """
    kinase = Kinase(name=name)
    gene = Gene(name='Gene of ' + kinase.name)
    kinase.protein = Protein(refseq=refseq, gene=gene)
    return kinase
def test_search_mutations(self):
    """Genomic queries (text and VCF) are resolved to PTM mutations."""
    methylation_site = Site(
        position=13, types={SiteType(name='methylation')}
    )
    protein = Protein(
        refseq='NM_007', id=7, sites=[methylation_site],
        sequence='XXXXXXXXXXXXV'
    )

    m_in_site = Mutation(protein=protein, position=13, alt='V')
    m_out_site = Mutation(protein=protein, position=50, alt='K')

    db.session.add(protein)

    # points to the same location as first record in VCF_FILE_CONTENT
    test_query = 'chr20 14370 G A'

    from database import bdb

    # map the first genomic mutation from VCF_FILE_CONTENT
    # to some (mocked) protein mutation
    bdb.add_genomic_mut('20', 14370, 'G', 'A', m_in_site, is_ptm=True)

    def alt_cell(mutation):
        # table cell with the alternative residue, as bytes
        return '<td>{0}</td>'.format(mutation.alt).encode()

    #
    # basic test - is appropriate mutation in results?
    #
    response = self.search_mutations(mutations=test_query)

    assert response.status_code == 200

    # this mutation is exactly at a PTM site and should be included in results
    assert alt_cell(m_in_site) in response.data
    # this mutation lies outside of a PTM site - by default should be filtered out
    assert alt_cell(m_out_site) not in response.data

    #
    # count test - is mutation for this query annotated as shown twice?
    #
    doubled_query = '{0}\n{0}'.format(test_query)
    response = self.search_mutations(mutations=doubled_query)

    assert response.status_code == 200
    assert b'<td>2</td>' in response.data

    #
    # VCF file test
    #
    vcf_upload = (BytesIO(VCF_FILE_CONTENT), 'exemplar_vcf.vcf')
    response = self.client.post(
        '/search/mutations',
        content_type='multipart/form-data',
        data={'vcf-file': vcf_upload}
    )

    assert response.status_code == 200
    assert b'NM_007' in response.data
def test_trim_ends():
    """`trim_ends` modifies the elements in place to fit the sequence."""
    # protein sequences are 1-based
    track = SequenceTrack(Protein(sequence='1234567890'))

    # each case: (element given to trim_ends, its expected state afterwards)
    cases = [
        ([-5, 10], [1, 4]),
        # 567890----, should include 0
        ([5, 10], [5, 6]),
    ]

    for element, expected in cases:
        track.trim_ends([element])
        assert element == expected
def test_redirect(self):
    """`/protein/show` redirects to `/sequence/show`, keeping the filters."""
    protein = Protein(**test_protein_data())
    db.session.add(protein)

    response = self.client.get(
        '/protein/show/NM_000123?filters=Mutation.sources:in:MC3'
    )
    target = '/sequence/show/NM_000123?filters=Mutation.sources:in:MC3'

    assert response.status_code == 302
    assert relative_location(response) == target
def test_map_site_to_isoform(self):
    """A site is located by its sequence; ambiguous matches raise a warning."""
    mapper = SiteMapper([], lambda s: f'{s.position}{s.sequence}')
    protein = Protein(sequence='LKIQYTKIFINNEWHDSVSG')

    unique_site = RawSite(sequence='FIN', position=6, left_sequence_offset=1)
    assert mapper.map_site_to_isoform(unique_site, protein) == [10]

    # 'KI' occurs twice in the sequence, so two positions are returned
    with warns(UserWarning, match='More than one match for: 2KI'):
        ambiguous_site = RawSite(
            sequence='KI', position=2, left_sequence_offset=0
        )
        assert mapper.map_site_to_isoform(ambiguous_site, protein) == [2, 7]
def loadProtein(request): f = open('C:/Users/anna/Desktop/Doktorat/typeii/src/typeii/sourceData/nonsystemalso_new.csv') proteins = [] headers = 2 '''#uncomment to use test mode (upload only 5 records) temp = 40#test''' # possibly needed to indicate protein affiliation #genomes = Genome.objects.all() #pieces = DNAPiece.objects.all() systems = System.objects.all() for line in f: data = line.split(',') gene_id = data[0].replace('"', '') if gene_id in proteins or headers != 0: headers -= 1 pass else: system = int(data[4].replace('"', '')) genome_location = data[8].replace('"', '') if data[9].replace('"', '') == '+': strand = '+' else: strand = '-' margin_left = int(data[10].replace('"', '')) system_part = data[11].replace('"', '') clans_cluster = '' #fixed value till data available hammer_cluster = data[12].replace('"', '') subunit_kind = data[13].replace('"', '') dna_length = int(data[15].replace('"', '')) aa_sequence = data[17].replace('"', '') hh_pfam_id = data[18].replace('"', '') hh_pfam_short_desc = data[19].replace('"', '') hh_probability_raw = data[20].replace('"','') if hh_probability_raw != '': hh_probability = decimal.Decimal(hh_probability_raw) else: hh_probability = decimal.Decimal(0.0) #default for records without hh value hh_probability.quantize(decimal.Decimal('.01')) hh_e_value = data[21].replace('"', '') hh_pfam_desc = data[22].replace('"', '') m_probability = 0 #fixed value till data from Vilno r_probability = 0 #fixed value till data from Vilno s_probability = 0 #fixed value till data from Vilno print (gene_id) proteins.append(gene_id) for record in systems: if record.id == system: #uncomment for verbose upload print(record.id, gene_id + ' ' + genome_location + ' ' + strand, margin_left, system_part + ' ' + clans_cluster + ' ' + hammer_cluster + ' ' + subunit_kind, dna_length, aa_sequence + ' ' + hh_pfam_id + ' ' + hh_pfam_short_desc, hh_probability, hh_e_value + ' ' + hh_pfam_desc, m_probability, r_probability, s_probability) p = 
Protein(system=record, gene_id=gene_id, genome_location=genome_location, strand=strand, margin_left=margin_left, system_part=system_part, clans_cluster=clans_cluster, hammer_cluster=hammer_cluster, subunit_kind=subunit_kind, dna_length=dna_length, aa_sequence=aa_sequence, hh_pfam_id=hh_pfam_id, hh_pfam_short_desc=hh_pfam_short_desc, hh_probability=hh_probability, hh_e_value=hh_e_value, hh_pfam_desc=hh_pfam_desc, m_probability=m_probability, r_probability=r_probability, s_probability=s_probability) p.save() print('Protein appended.') '''#uncomment to use test mode (upload only 5 records) temp -= 1#test if temp == 0:#test break#test''' return HttpResponse('Download complete.')