def test_mutations_export(self):
    """Export mutations of the test protein from two sources and verify the files.

    The 'export' action of `muts_import_manager` is run once for 'mc3' and
    once for 'clinvar'; each output is then read back with `gzip.open` and
    compared, line by line, with the expected header and data row.
    """
    # one temporary output file per mutation source (created in this order)
    paths = {
        'mc3': make_named_temp_file(),
        'clinvar': make_named_temp_file()
    }
    expected = {
        'mc3': [
            b'gene\tisoform\tposition\twt_residue\tmut_residue\tcancer_type\n',
            b'SOMEGENE\tNM_0001\t1\tA\tE\tCAN'
        ],
        'clinvar': [
            b'gene\tisoform\tposition\twt_residue\tmut_residue\tdisease\n',
            b'SOMEGENE\tNM_0001\t1\tA\tE\tSome disease'
        ]
    }

    with self.app.app_context():
        models = create_test_models()
        db.session.add_all(models.values())

        target = models['protein']

        for source in ('mc3', 'clinvar'):
            muts_import_manager.perform('export', [target], [source], {source: paths[source]})

    for source, lines in expected.items():
        with gzip.open(paths[source]) as exported:
            assert exported.readlines() == lines
def test_exceptions(self):
    """Check how the HPRD importer handles a site annotated at position 0.

    With `pos_zero_means_last_aa=False` the importer is expected to emit a
    `UserWarning` and return no sites; with `pos_zero_means_last_aa=True`
    it is expected to return a single site on the last residue without
    emitting any `UserWarning`.
    """
    # Imported locally: recording of arbitrary warnings is done with the
    # standard library, because `warns(None)` is deprecated in pytest 7 and
    # rejected by pytest 8 (assuming `warns` is `pytest.warns`).
    import warnings

    # NOTE(review): HPRD flat files are normally tab-delimited - confirm
    # that the separators inside the literals below are what the importer expects.

    # this odd case is real:
    sites_data = '02098 NONO 02098_1 NP_031389.3 0 Y - - Acetylation in vivo 19608861'
    # there are 9 such cases in HPRD at the time of this test creation

    mappings = '02098 NONO NM_007363.4 NP_031389.3 4841 300084 Q15233,B7Z4C2 Non pou domain containing octamer binding protein'

    sequences = (
        '>02098|02098_1|NP_031389.3|Non pou domain containing octamer binding protein\n'
        'MQSNKTFNLEKQNHTPRKHHQHHHQQQHHQQQQQQPPPPPIPANGQQASSQNEGLTIDLKNFRKPGEKTFTQRSRLFVG'
        # the main part of the sequence was cut out as it is not needed
        # but this is the important bit: it has 'Y' at the end:
        'GTLGLTPPTTERFGQAATMEGIGAIGGTPPAFNRAAPGAEFAPNKRRRY'
        # so having 1-based positioning system, after a naive conversion to 0-based:
        # site.pos = -1; furthermore, sequence[site.pos] == 'Y' (!); this is probably
        # why the pos = '0' had been saved in HPRD in the first place.
    )

    protein = Protein(
        refseq='NM_007363',
        sequence=(
            'MQSNKTFNLEKQNHTPRKHHQHHHQQQHHQQQQQQPPPPPIPANGQQASSQNEGLTIDLKNFRKPGEKTFTQRSRLFVG'
            'GTLGLTPPTTERFGQAATMEGIGAIGGTPPAFNRAAPGAEFAPNKRRRY'
        )
    )
    db.session.add(protein)

    importer = HPRDImporter(make_named_temp_file(sequences), make_named_temp_file(mappings), dir_path='')

    # without a fix, it should warn and reject the faulty site
    with warns(
        UserWarning,
        match='The site: 02098_1: 0Y is outside of the protein sequence'
    ):
        sites = importer.load_sites(path=make_named_temp_file(sites_data), pos_zero_means_last_aa=False)

    assert len(sites) == 0

    # and it should work when a workaround is applied
    with warnings.catch_warnings(record=True) as caught:
        # record every warning, regardless of filters installed elsewhere
        warnings.simplefilter('always')
        sites = importer.load_sites(path=make_named_temp_file(sites_data), pos_zero_means_last_aa=True)

    assert all(warning.category is not UserWarning for warning in caught)

    assert len(sites) == 1

    site = sites[0]
    assert site.position == 128
    assert site.residue == 'Y'
def test_export_paths(self):
    """Verify that the export command validates the number of `--paths` given.

    One exporter ('sites_ac') is requested, so two paths must be reported as
    an error and one path must be accepted and reported as exported.
    """
    paths_count_error = 'Export paths should be given for every exported file, no less, no more.'
    command = 'export protein_related -e sites_ac --paths %s'

    first_path = make_named_temp_file()
    second_path = make_named_temp_file()

    # user gave too many paths
    output, _ = self.run_command((command + ' %s') % (first_path, second_path))
    assert paths_count_error in output

    # user gave good number of paths
    output, _ = self.run_command(command % first_path)
    assert paths_count_error not in output
    assert ('Exported sites_ac to %s' % first_path) in output
def test_mutations_export(self):
    """Export mutations with several source/option combinations and verify the files.

    Each scenario is a (source, extra keyword arguments, expected lines)
    triple; the export is written to a fresh temporary file which is then
    read back with `gzip.open`.
    """
    scenarios = (
        (
            'mc3',
            {},
            [
                b'gene\tisoform\tposition\twt_residue\tmut_residue\tcancer_type\tcount\n',
                b'SOMEGENE\tNM_0001\t1\tA\tE\tCAN\t2'
            ]
        ),
        (
            'mc3',
            {'export_samples': True},
            [
                b'gene\tisoform\tposition\twt_residue\tmut_residue\tcancer_type\tsample_id\n',
                b'SOMEGENE\tNM_0001\t1\tA\tE\tCAN\tSample A\n',
                b'SOMEGENE\tNM_0001\t1\tA\tE\tCAN\tSample B'
            ]
        ),
        (
            'clinvar',
            {},
            [
                b'gene\tisoform\tposition\twt_residue\tmut_residue\tdisease\tsignificance\n',
                b'SOMEGENE\tNM_0001\t1\tA\tE\tSome disease\tPathogenic\n',
                b'SOMEGENE\tNM_0001\t1\tA\tE\tOther disease\tBenign'
            ]
        )
    )

    with self.app.app_context():
        models = create_test_models()
        db.session.add_all(models.values())

        target = models['protein']

        for source, options, expected in scenarios:
            destination = make_named_temp_file()

            muts_import_manager.perform(
                'export', [target], [source],
                paths={source: destination}, **options
            )

            with gzip.open(destination) as exported:
                written = exported.readlines()

            assert written == expected
def test_others_import(self):
    """Load sites with `OthersUniprotImporter` and check filtering by ECO code."""
    make_test_shared_proteins()

    importer = OthersUniprotImporter(*OtherSitesData.files_for_init())

    accepted = importer.load_sites(path=make_named_temp_file(OtherSitesData.sites))
    assert len(accepted) == 1

    # test ECO filtering:
    # any other code should cause the entry to be ignored
    with_other_code = OtherSitesData.sites.replace('ECO_0000269', 'ECO_0000200')
    rejected = importer.load_sites(path=make_named_temp_file(with_other_code))
    assert len(rejected) == 0
def test_gene_full_name(self):
    """Load full gene names from a gzipped file and check the name set on TP53."""
    tp53 = Gene(name='TP53', entrez_id=7157)
    db.session.add(tp53)

    names_path = make_named_temp_file(full_gene_names, mode='wt', opener=gzip.open)
    load_full_gene_names(names_path)

    assert tp53.full_name == 'tumor protein p53'
def test_disorder(self):
    """Load disorder data and compare the disorder maps of two test proteins."""
    expected_maps = {
        'NM_002749': '111111111111111111111111111111110000000000000000000000000000000000000000000000000000000000000000000000000000000000000000',
        'NM_000600': '11111111111101000000000000000011111111111111100000000000000000000000000000000000000000000000'
    }

    proteins = create_test_proteins(['NM_002749', 'NM_000600'])

    disorder_path = make_named_temp_file(disorder_data)

    with self.app.app_context():
        load_disorder(disorder_path)

    for refseq, disorder_map in expected_maps.items():
        assert proteins[refseq].disorder_map == disorder_map
def test_sequences(self):
    """Load FASTA sequences and check the sequence and length of NM_021806.

    The stored sequence is expected to end with '*', while the expected
    length is that of the same sequence without the trailing '*'.
    """
    residues = 'MRLAGPLRIVVLVVSVGVTWIVVSILLGGPGSGFPRIQQLFTSPESSVTAAPRARKYKCGLPQPCPEEHLAFRVVSGAANVIGPKICLEDKMLMSSVKDNVGRGLNIALVNGVSGELIEARAFDMWAGDVNDLLKFIRPLHEGTLVFVASYDDPATKMNEETRKLFSELGSRNAKELAFRDSWVFVGAKGVQNKSPFEQHVKNSKHSNKYEGWPEALEMEGCIPRRSTAS'

    proteins = create_test_proteins(['NM_002749', 'NM_021806', 'NM_001204289'])

    fasta_path = make_named_temp_file(fasta_sequences)

    with self.app.app_context():
        load_sequences(fasta_path)

    loaded = proteins['NM_021806']

    assert loaded.sequence == residues + '*'
    assert loaded.length == len(residues)
def test_splice_variants_handling(self):
    """Verify import of sites from a multi-splice variants entry (here: Q12797)"""
    make_test_shared_proteins()

    init_files = SpliceVariantData.files_for_init()
    importer = GlycosylationUniprotImporter(*init_files)

    sites_path = make_named_temp_file(SpliceVariantData.sites)
    loaded = importer.load_sites(path=sites_path)

    assert len(loaded) == 3
def test_protein_summaries(self):
    """Load protein summaries from a gzipped file and check both test proteins."""
    expected_summaries = {
        'NM_010410': 'This gene encodes a hypothalamic neuropeptide precursor [...]',
        'NM_182751': 'The protein encoded by this gene is one of the highly [...]'
    }

    proteins = create_test_proteins(['NM_010410', 'NM_182751'])

    summaries_path = make_named_temp_file(summaries_data, mode='wt', opener=gzip.open)

    with self.app.app_context():
        protein_summaries(path=summaries_path)

    for refseq, summary in expected_summaries.items():
        assert proteins[refseq].summary == summary
def test_proteins_and_genes(self):
    """Import proteins with their genes, then run a second import with update data.

    After the first import four proteins and four genes are expected; the
    coordinates of NM_002749 and the strands of MAPK7 and MUC1 are checked.
    The second import is expected to yield only NM_182962.
    """
    create_test_proteins([])

    initial_path = make_named_temp_file(protein_data)

    with self.app.app_context():
        imported = proteins_and_genes(path=initial_path)

    assert len(imported) == 4
    db.session.add_all(imported)

    mapk7_protein = Protein.query.filter_by(refseq='NM_002749').one()
    mapk7 = Gene.query.filter_by(name='MAPK7').one()

    assert mapk7_protein.gene == mapk7

    expected_coordinates = {
        'tx_start': 19281773,
        'tx_end': 19286857,
        'cds_start': 19282213,
        'cds_end': 19286544
    }
    for attribute, coordinate in expected_coordinates.items():
        assert getattr(mapk7_protein, attribute) == coordinate

    # test genes
    all_genes = Gene.query.all()
    assert len(all_genes) == 4

    # test strands:
    assert mapk7.strand is True
    muc1 = Gene.query.filter_by(name='MUC1').one()
    assert muc1.strand is False

    update_path = make_named_temp_file(update_data)

    with self.app.app_context():
        imported = proteins_and_genes(path=update_path)

    assert len(imported) == 1
    only_new = next(iter(list(imported)))
    assert only_new.refseq == 'NM_182962'
def test_disorder(self):
    """Load disorder data for proteins with sequences and compare the disorder maps."""
    expected_maps = {
        'NM_002749': '111111111111111111111111111111110000000000000000000000000000000000000000000000000000000000000000000000000000000000000000',
        'NM_000600': '11111111111101000000000000000011111111111111100000000000000000000000000000000000000000000000'
    }

    proteins = create_test_proteins(['NM_002749', 'NM_000600'])

    # sequences are assigned before the disorder data is loaded
    for refseq in proteins:
        proteins[refseq].sequence = sequences_for_proteins[refseq]

    disorder_path = make_named_temp_file(disorder_data)

    with self.app.app_context():
        load_disorder(disorder_path)

    for refseq, disorder_map in expected_maps.items():
        assert proteins[refseq].disorder_map == disorder_map
def test_sites(self):
    """Load sites for NM_003955 and check residue, type and kinases of selected sites."""
    proteins = create_test_proteins(['NM_003955'])

    # Sequence is needed for validation. Validation is tested on model level.
    socs3 = proteins['NM_003955']
    socs3.sequence = 'MVTHSKFPAAGMSRPLDTSLRLKTFSSKSEYQLVVNAVRKLQESGFYWSAVTGGEANLLLSAEPAGTFLIRDSSDQRHFFTLSVKTQSGTKNLRIQCEGGSFSLQSDPRSTQPVPRFDCVLKLVHHYMPPPGAPSFPSPPTEPSSEVPEQPSAQPLPGSPPRRAYYIYSGGEKIPLVLSRPLSSNVATLQHLCRKTVNGHLDSYEKVTQLPGPIREFLDQYDAPL*'

    sites_path = make_named_temp_file(sites_data)

    loaded = load_sites(sites_path)

    assert len(loaded) == 3

    # index the sites by position for direct access
    by_position = {}
    for site in loaded:
        by_position[site.position] = site

    ubiquitinated = by_position[6]
    assert ubiquitinated.residue == 'K'
    assert ubiquitinated.type == 'ubiquitination'

    kinase_names = {kinase.name for kinase in by_position[204].kinases}
    assert kinase_names == {'JAK2', 'LCK'}
def test_sites_export(self):
    """Run the 'sites_ac' exporter and compare the written file with the expected rows."""
    expected_rows = [
        'gene\tposition\tresidue\ttype\tkinase\tpmid\n',
        'SOMEGENE\t1\tA\tglycosylation\tKinase name\t1,2\n'
    ]

    destination = make_named_temp_file()

    with self.app.app_context():
        models = create_test_models()
        db.session.add_all(models.values())

        arguments = Namespace(exporters=['sites_ac'], paths=[destination])
        ProteinRelated().export(arguments)

    with open(destination) as exported:
        written = exported.readlines()

    assert written == expected_rows
def test_cancer(self):
    """Load cancers from a file and check their count and the first entry.

    The loaded objects are finally added to the database session; this
    step has no assertion of its own and passes as long as no exception
    is raised.
    """
    filename = make_named_temp_file(cancers_list)

    with self.app.app_context():
        cancers = load_cancers(path=filename)

    # two cancers should be returned
    assert len(cancers) == 2

    cancer = cancers[0]

    assert cancer.name == 'Bladder Urothelial Carcinoma'
    assert cancer.code == 'BLCA'

    # Adding to the session must not raise. The former trailing
    # `assert True` could never fail, so it was removed.
    db.session.add_all(cancers)
def test_ptm_mutations(self):
    """Run the 'mc3_muts_affecting_ptm_sites' exporter and verify the written file."""
    # NOTE(review): the header literal is space-separated while the data row
    # is tab-separated - confirm this matches what the exporter writes.
    expected_rows = [
        'gene refseq mutation position mutation alt mutation summary site position site residue\n',
        'SOMEGENE\tNM_0001\t1\tE\tCAN\t1\tA\n'
    ]

    destination = make_named_temp_file()

    with self.app.app_context():
        models = create_test_models()
        db.session.add_all(models.values())

        arguments = Namespace(exporters=['mc3_muts_affecting_ptm_sites'], paths=[destination])
        ProteinRelated.export(arguments)

    with open(destination) as exported:
        written = exported.readlines()

    assert written == expected_rows
def test_domains(self):
    """Load domain annotations for two proteins and check how overlapping ones are merged.

    Six new domains are expected in total, two of them on the first
    protein. For the second protein the start/end ranges of the 'SAP_dom'
    and 'PINIT' domains and the number of 'Znf_MIZ' domains are verified.
    """
    first_protein = Protein(
        refseq='NM_018163',
        sequence='MAVTKELLQMDLYALLGIEEKAADKEVKKAYRQKALSCHPDKNPDNPRAAELFHQLSQALEVLTDAAARAAYDKVRKAKKQAAERTQKLDEKRKKVKLDLEARERQAQAQESEEEEESRSTRTLEQEIERLREEGSRQLEEQQRLIREQIRQERDQRLRGKAENTEGQGTPKLKLKWKCKKEDESKGGYSKDVLLRLLQKYGEVLNLVLSSKKPGTAVVEFATVKAAELAVQNEVGLVDNPLKISWLEGQPQDAVGRSHSGLSKGSVLSERDYESLVMMRMRQAAERQQLIARMQQEDQEGPPT*',
        gene=Gene(chrom='15')
    )
    second_protein = Protein(
        refseq='NM_004671',
        sequence='MADFEELRNMVSSFRVSELQVLLGFAGRNKSGRKHDLLMRALHLLKSGCSPAVQIKIRELYRRRYPRTLEGLSDLSTIKSSVFSLDGGSSPVEPDLAVAGIHSLPSTSVTPHSPSSPVGSVLLQDTKPTFEMQQPSPPIPPVHPDVQLKNLPFYDVLDVLIKPTSLVQSSIQRFQEKFFIFALTPQQVREICISRDFLPGGRRDYTVQVQLRLCLAETSCPQEDNYPNSLCIKVNGKLFPLPGYAPPPKNGIEQKRPGRPLNITSLVRLSSAVPNQISISWASEIGKNYSMSVYLVRQLTSAMLLQRLKMKGIRNPDHSRALIKEKLTADPDSEIATTSLRVSLMCPLGKMRLTIPCRAVTCTHLQCFDAALYLQMNEKKPTWICPVCDKKAAYESLILDGLFMEILNDCSDVDEIKFQEDGSWCPMRPKKEAMKVSSQPCTKIESSSVLSKPCSVTVASEASKKKVDVIDLTIESSSDEEEDPPAKRKCIFMSETQSSPTKGVLMYQPSSVRVPSVTSVDPAAIPPSLTDYSVPFHHTPISSMSSDLPGLDFLSLIPVDPQYCPPMFLDSLTSPLTASSTSVTTTSSHESSTHVSSSSSRSETGVITSSGSNIPDIISLD*',
        gene=Gene(chrom='18')
    )
    proteins = [first_protein, second_protein]

    db.session.add_all(proteins)

    domains_path = make_named_temp_file(domains_data)

    new_domains = load_domains(domains_path)

    assert len(new_domains) == 6
    assert len(proteins[0].domains) == 2

    # group the domains of the second protein by InterPro short description
    by_description = defaultdict(list)
    for annotated in proteins[1].domains:
        by_description[annotated.interpro.short_description].append(annotated)

    def assert_ranges(domain, start, end):
        """Assert that `domain` spans exactly from `start` to `end`."""
        assert (domain.start, domain.end) == (start, end)

    # two SAP domains should be merged for representation purposes due to similarity criteria
    # (these two domain annotations overlap so the smaller one is contained in the bigger)
    sap = by_description['SAP_dom'][0]
    assert_ranges(sap, 1, 65)

    sap_interpro = sap.interpro
    assert sap_interpro.accession == 'IPR003034'
    assert sap_interpro.description == 'SAP domain'

    # here the two annotations overlap with more than 75% of common
    assert_ranges(by_description['PINIT'][0], 141, 299)

    # and here overlap was too small to merge those domains
    assert len(by_description['Znf_MIZ']) == 2
def test_missing_sequence(self):
    """Sometimes (though rarely) there is no sequence for given accession.

    This happens when UniProt fasta files are not in sync with SPARQL API.
    The importer is expected to emit a `UserWarning` in such a case.
    """
    header = 'primary_accession,sequence_accession,position,data,eco,source\n'
    row = '"B2RDS2","B2RDS2-1","79^","N-linked (GlcNAc...) asparagine","ECO_0000269",'
    site_with_missing_sequence = header + row

    # both sequence files are empty; only the mapping file has content
    canonical_path = make_named_gz_file('')
    alternative_path = make_named_gz_file('')
    mappings_path = make_named_gz_file('B2RDS2 RefSeq_NT NM_004823.1')

    importer = GlycosylationUniprotImporter(canonical_path, alternative_path, mappings_path)

    with warns(UserWarning, match='No sequence for .* found!'):
        importer.load_sites(make_named_temp_file(site_with_missing_sequence))
def test_glycosylation_import(self):
    """Import glycosylation sites from UniProt data, including cross-isoform mapping.

    Three sites are expected: two pre-defined ones and one mapped onto the
    NM_178559 isoform, which shares a gene with NM_001163941.
    """
    # P01891 is not mappable to refseq, we should be warned about that
    refseqs = ['NM_001163941', 'NM_005514', 'NM_178559']
    proteins = create_test_proteins(refseqs)

    # sequence is needed for validation. Validation is tested on model level.
    sequences = {
        'NM_001163941': 'MENSERAEEMQENYQRNGTAEEQPKLRKEAVGSIEIFRFADGLDITLMILGILASLVNGACLPLMPLVLGEMSDNLISGCLVQTNTTNYQNCTQSQEKLNEDMTLLTLYYVGIGVAALIFGYIQISLWIITAARQTKRIRKQFFHSVLAQDIGWFDSCDIGELNTRMTDDIDKISDGIGDKIALLFQNMSTFSIGLAVGLVKGWKLTLVTLSTSPLIMASAAACSRMVISLTSKELSAYSKAGAVAEEVLSSIRTVIAFRAQEKELQRYTQNLKDAKDFGIKRTIASKVSLGAVYFFMNGTYGLAFWYGTSLILNGEPGYTIGTVLAVFFSVIHSSYCIGAAVPHFETFAIARGAAFHIFQVIDKKPSIDNFSTAGYKPESIEGTVEFKNVSFNYPSRPSIKILKGLNLRIKSGETVALVGLNGSGKSTVVQLLQRLYDPDDGFIMVDENDIRALNVRHYRDHIGVVSQEPVLFGTTISNNIKYGRDDVTDEEMERAAREANAYDFIMEFPNKFNTLVGEKGAQMSGGQKQRIAIARALVRNPKILILDEATSALDSESKSAVQAALEKASKGRTTIVVAHRLSTIRSADLIVTLKDGMLAEKGAHAELMAKRGLYYSLVMSQDIKKADEQMESMTYSTERKTNSLPLHSVKSIKSDFIDKAEESTQSKEISLPEVSLLKILKLNKPEWPFVVLGTLASVLNGTVHPVFSIIFAKIITMFGNNDKTTLKHDAEIYSMIFVILGVICFVSYFMQGLFYGRAGEILTMRLRHLAFKAMLYQDIAWFDEKENSTGGLTTILAIDIAQIQGATGSRIGVLTQNATNMGLSVIISFIYGWEMTFLILSIAPVLAVTGMIETAAMTGFANKDKQELKHAGKIATEALENIRTIVSLTREKAFEQMYEEMLQTQHRNTSKKAQIIGSCYAFSHAFIYFAYAAGFRFGAYLIQAGRMTPEGMFIVFTAIAYGAMAIGETLVLAPEYSKAKSGAAHLFALLEKKPNIDSRSQEGKKPDTCEGNLEFREVSFFYPCRPDVFILRGLSLSIERGKTVAFVGSSGCGKSTSVQLLQRLYDPVQGQVLFDGVDAKELNVQWLRSQIAIVPQEPVLFNCSIAENIAYGDNSRVVPLDEIKEAANAANIHSFIEGLPEKYNTQVGLKGAQLSGGQKQRLAIARALLQKPKILLLDEATSALDNDSEKVVQHALDKARTGRTCLVVTHRLSAIQNADLIVVLHNGKIKEQGTHQELLRNRDIYFKLVNAQSVQ*',
        'NM_178559': 'MVDENDIRALNVRHYRDHIGVVSQEPVLFGTTISNNIKYGRDDVTDEEMERAAREANAYDFIMEFPNKFNTLVGEKGAQMSGGQKQRIAIARALVRNPKILILDEATSALDSESKSAVQAALEKASKGRTTIVVAHRLSTIRSADLIVTLKDGMLAEKGAHAELMAKRGLYYSLVMSQDIKKADEQMESMTYSTERKTNSLPLHSVKSIKSDFIDKAEESTQSKEISLPEVSLLKILKLNKPEWPFVVLGTLASVLNGTVHPVFSIIFAKIITMFGNNDKTTLKHDAEIYSMIFVILGVICFVSYFMQGLFYGRAGEILTMRLRHLAFKAMLYQDIAWFDEKENSTGGLTTILAIDIAQIQGATGSRIGVLTQNATNMGLSVIISFIYGWEMTFLILSIAPVLAVTGMIETAAMTGFANKDKQELKHAGKIATEALENIRTIVSLTREKAFEQMYEEMLQTQHRNTSKKAQIIGSCYAFSHAFIYFAYAAGFRFGAYLIQAGRMTPEGMFIVFTAIAYGAMAIGETLVLAPEYSKAKSGAAHLFALLEKKPNIDSRSQEGKKPDTCEGNLEFREVSFFYPCRPDVFILRGLSLSIERGKTVAFVGSSGCGKSTSVQLLQRLYDPVQGQVLFDGVDAKELNVQWLRSQIAIVPQEPVLFNCSIAENIAYGDNSRVVPLDEIKEAANAANIHSFIEGLPEKYNTQVGLKGAQLSGGQKQRLAIARALLQKPKILLLDEATSALDNDSEKVVQHALDKARTGRTCLVVTHRLSAIQNADLIVVLHNGKIKEQGTHQELLRNRDIYFKLVNAQSVQ*',
        'NM_005514': 'MLVMAPRTVLLLLSAALALTETWAGSHSMRYFYTSVSRPGRGEPRFISVGYVDDTQFVRFDSDAASPREEPRAPWIEQEGPEYWDRNTQIYKAQAQTDRESLRNLRGYYNQSEAGSHTLQSMYGCDVGPDGRLLRGHDQYAYDGKDYIALNEDLRSWTAADTAAQITQRKWEAAREAEQRRAYLEGECVEWLRRYLENGKDKLERADPPKTHVTHHPISDHEATLRCWALGFYPAEITLTWQRDGEDQTQDTELVETRPAGDRTFQKWAAVVVPSGEEQRYTCHVQHEGLPKPLTLRWEPSSQSTVPIVGIVAGLAVLAVVVIGAVVAAVMCRRKSSGGKGGSYSQAACSDSAQGSDVSLTA'
    }

    for refseq in sequences:
        proteins[refseq].sequence = sequences[refseq]

    db.session.add_all(proteins.values())

    # Add gene to test cross-isoform mapping
    abcb5 = gene_from_isoforms(proteins, ['NM_001163941', 'NM_178559'])
    db.session.add(abcb5)

    canonical_path = make_named_gz_file(GlycosylationData.canonical)
    alternative_path = make_named_gz_file(GlycosylationData.alternative)
    mappings_path = make_named_gz_file(GlycosylationData.mappings)

    importer = GlycosylationUniprotImporter(canonical_path, alternative_path, mappings_path)

    assert len(importer.mappings) == 3

    sites = importer.load_sites(path=make_named_temp_file(GlycosylationData.sites))

    # should have 2 pre-defined sites (3 but one without refseq equivalent) and one mapped (isoform NM_178559)
    assert len(sites) == 2 + 1

    db.session.add_all(sites)
    db.session.flush()

    sites_by_isoform = {}
    for site in sites:
        sites_by_isoform[site.protein.refseq] = site

    # the pre-defined site and the one mapped across isoforms share the residue
    for refseq in ('NM_001163941', 'NM_178559'):
        assert sites_by_isoform[refseq].residue == 'N'
def test_browse_list(self):
    """Request the gene list page and its data endpoint for a 'TCGA' gene list.

    The list is loaded from a temporary file; every gene gets a preferred
    isoform with one mutation. The static page must mention 'TCGA', and
    the data endpoint must return all entries ordered by ascending 'fdr'.
    """
    from miscellaneous import make_named_temp_file
    from test_imports.test_gene_list import raw_gene_list
    from imports.protein_data import active_driver_gene_lists as load_active_driver_gene_lists

    list_path = make_named_temp_file(raw_gene_list)

    # create gene list and genes
    with self.app.app_context():
        from imports.protein_data import ListData

        tcga_list = ListData(name='TCGA', path=list_path, mutations_source=TCGAMutation)
        gene_lists = load_active_driver_gene_lists(lists=(tcga_list, ))

    db.session.add_all(gene_lists)

    # create preferred isoforms for genes
    for index, gene in enumerate(Gene.query.all()):
        # at least one mutation is required for gene on a gene list to be displayed
        mutation = Mutation()
        MC3Mutation(mutation=mutation)

        isoform = Protein(refseq='NM_000%s' % index, mutations=[mutation])
        gene.isoforms = [isoform]
        gene.preferred_isoform = isoform

    # check the static template
    page = self.client.get('/gene/list/TCGA')
    assert page.status_code == 200
    assert b'TCGA' in page.data

    # check the dynamic data
    data = self.client.get('/gene/list_data/TCGA?order=asc')
    assert data.status_code == 200

    stored_list = GeneList.query.filter_by(name='TCGA').one()

    # all results retrieved
    assert data.json['total'] == len(stored_list.entries)

    # properly sorted by fdr
    fdr_values = []
    for row in data.json['rows']:
        fdr_values.append(row['fdr'])
    assert fdr_values == sorted(fdr_values)
def test_conservation(self):
    """Load conservation scores from a bigWig file and compare them with reference values.

    NM_002749 is expected to get a ';'-separated string of scores, while
    NM_000600 (no data in the test file) is expected to have `None` before
    the commit and an empty string when queried after the commit.
    """
    proteins = create_test_proteins(['NM_002749', 'NM_000600'])

    mapk7 = proteins['NM_002749']
    il6 = proteins['NM_000600']

    mapk7.gene = Gene(name='MAPK7', chrom='17')
    il6.gene = Gene(name='IL6', chrom='7')

    for refseq in proteins:
        proteins[refseq].sequence = sequences_for_proteins[refseq]

    # Generated with:
    """
    import pyBigWig
    test_data = test_data.loc[('chr17', 'NM_002749')].iloc[0]
    full_bw = pyBigWig.open('data/hg19.100way.phyloP100way.bw')
    test_bw = pyBigWig.open('chr17_NM_002749.test_data.bw', 'w')
    needed_chrom_length = max(max(test_data.exonStarts), max(test_data.exonEnds))
    test_bw.addHeader([('chr17', needed_chrom_length)])
    coordinates = sorted(zip(test_data.exonStarts, test_data.exonEnds), key=lambda x: x[0])
    for start, end in zip(test_data.exonStarts, test_data.exonEnds):
        values = full_bw.values('chr17', start, end)
        for i, value in enumerate(values):
            test_bw.addEntries(['chr17'], starts=[start + i], ends=[start + i + 1], values=[value])
    test_bw.close()
    """
    big_wig_path = 'tests/test_imports/chr17_NM_002749.test_data.bw'

    coordinates_path = make_named_temp_file(coordinates_data)

    load_conservation(big_wig_path, coordinates_path)

    # a single string value, written as two adjacent literals
    expected_scores = (
        '3.47;1.67;2.95;1.23;.9;1.64;4.08;1.72;1.25;1.15;2.26;1.69;1.03;1.05;2.39;1.52;2.88;.5;2.28;-.09;2.24;1.32;-.06;.59;1.2;-.13;.37;.76;.04;1.26;-.2;1.84;.97;2.95;5.67;3.98;2.91;4.5;2.33;3.17;4.66;3.73;4.24;4.01;2.49;4.35;5.29;2.95;4.67;5.66;5.36;5.25;4.23;4.87;5.31;5.18;3.52;4.53;5.59;3.55;5.05;6.65;2.01;5.75;5.06;4.88;5.9;4.9;5.63;3.32;3.52;5.09;4.04;4.24;2.24;1.84;2.13;6.12;6.54;2.75;6.08;5.79;5.7;8.26;6.81;5.82;3.83;4.47;3.08;5.11;5;3.4;2.52;4.67;3.97;3.58;4.87;3.72;5.33;4.48;3.92;6.79;3.7;7.52;5.46;4.05;3.07;4.02;5.47;5.31;5.12;7.11;6.59;6.09;5.37;4.73;5.6;6.3;6.52;5.96;3.32;2.84;4.06;.53;3.5;3.79;3.11;1.81;4.51;4.81;3.35;4.29;5.34;5.31;4.63;6.41;2.39;6.46;3.16;8.2;6.23;2.67;6.63;2.39;4.07;5;3.49;4.71;4.16;3.27;.7;6.43;1.1;3.25;2.31;1.57;6.39;5.62;2.68;4.33;6.16;6;3.28;6.16;6.66;3.66;3.77;4.11;6.52;3.69;6.55;5.97;5.28;5.9;3.63;6.16;5.36;5.92;4.24;6.23;4.27;7.07;5.64;6.62;5.32;1.79;6.59;3.26;4.38;3.92;6.2;3.59;5.22;5.76;6.77;4.58;7.52;4.24;3.91;5.75;5.28;6.51;7.43;5.49;2.98;5.48;2.01;2.62;2.19;1.95;1.01;2.03;5.55;1.47;2.2;2.38;5.52;5.93;3.39;6.62;5.51;6.67;5.67;3.9;6.03;8.99;6.67;4.17;4.7;4.01;6.27;3.7;5.06;3.66;2.06;2.64;3.76;2.56;5.94;3.82;2.89;4.05;5.17;6.75;3.05;8.99;4.25;5.57;6.66;6.22;6.06;3.65;4.13;6.66;8.76;2.77;2.34;1.65;1.84;3.91;3.38;5.3;3.49;6.22;3.21;3.91;5.43;2.8;4.28;6.44;3.25;3.89;2.56;5.75;4.58;1.02;5.09;3.53;5.75;5;3.97;3.24;2.08;.17;3.51;2.96;1.66;2.99;3.71;4.75;3.78;5.8;3.71;5.3;4.16;3.18;4.66;3.56;6.62;4.69;2.96;4.27;3.1;3.49;2.38;2.54;3.18;4.54;5.17;3.34;.46;4.2;3.77;2.53;1.27;3.6;4.46;.83;1.58;4.57;2.92;2.17;2.81;3.17;1.48;1.13;8.45;2.85;1.15;4.09;5.67;3.85;.82;1.15;3.3;3.95;.21;4.67;1.19;2.83;5.43;3.54;.76;5.35;3.81;3.85;3.23;2.46;5.65;5.76;3.97;6.32;3.35;2.86;5.95;6.63;4.5;2.89;5.45;3.95;4.59;4.34;5.09;4.99;5.24;4.02;4.91;5.83;1.59;2.45;1.71;2.51;2.87;2.93;6;3.07;4.58;5.09;5.26;4.44;4.06;3.28;2.1;6.69;4.55;2.58;5.57;4.79;3.75;.58;1.99;2.45;4.16;3.64;1.55;3.8;2.09;3.71;4.36;2.27;4.31;4.26;3.43;1.01;.44;1.57;1.7;2.68;1.46;1.5;1.7;.7;.92;1.94;'
        '1.42;2.39;.76;3.24;3.14;1.01;2.85;1.17;1.48;1.22;.67;.8;.75;3.02;1.12;1.34;4.82;3.73;1.26;.92;-.86;1.13;.65;1.18;.06;-.67;.8;.2;.41;.35;.05;.69;1.11;1.09;.06;1.29;2.34;.9;-.06;.36;1.24;-.18;.06;-.14;.87;.86;2.37;1.38;.52;1.97;.97;4.2;1.48;4.38;5.02;3.81;4.14;3.29;4.98;4.21;4.73;5.01;2.84;4.03;4.01;4.89;4.64;2.37;1.66;2.52;5.89;3.6;1.53;1.1;2.19;1.25;1;2.33;2.53;1.89;2.01;1;.84;-.21;.93;2.4;2.06;1.91;4.48;.3;3.03;2.21;2.16;4.76;4.08;4.42;4.07;5.72;3.13;3.52;2.69;5.1;3.98;3.99;5.24;5.81;2.02;1.75;3.18;2.96;4.36;4.26;3.2;3.71;1.85;4.88;2.93;4.94;4.22;2.17;1.67;2.44;4.68;1.22;3.37;1.68;1.73;2.69;1.65;.75;-.94;1.27;.11;-.79;3;1.26;.57;1.71;.45;2.93;1.48;1.67;2.62;3.54;.95;3.03;2.33;2.5;3.92;2.41;3.3;2.08;2.16;.87;1.26;3.18;3.02;4.04;1.48;.78;4.1;2.06;.59;1.89;.25;.44;-.23;-.57;-.45;.33;-.12;.22;-1.71;.1;.63;.58;.04;-.21;.5;-2.18;.06;.06;-.46;-.39;-.23;.26;.36;.42;.86;1.85;.56;.27;-.51;.65;.42;-.35;.43;.93;.05;-.66;.28;.09;.41;-.05;.46;.51;.02;.67;.11;-.55;.27;.09;.77;-.18;.11;2.34;.99;.1;.78;.52;1.23;.3;.04;.31;-.42;.13;.78;1.01;-.48;.98;.14;.65;.25;.97;-.67;.8;.49;1.15;.15;-1.05;.52;1.43;.04;.68;-.14;-.15;.24;.73;1.71;.89;-.05;.68;.53;.1;.52;.81;1.56;.28;.07;2.18;.55;1.24;2.33;.78;-.04;.77;-.09;.4;.61;.96;1.59;1.57;1.71;1.52;1.41;1.11;1.48;.52;1.61;.76;2.41;.05;.73;-.38;.4;1.78;.95;.46;2.29;1.05;1.94;1.74;.88;1.32;1.97;1.66;1.64;1.17;2.12;2.38;2.63;2.64;1.63;2.49;3.23;1.49;4.21;5.62;5.72;5.3;.78;3.01;4.23;3.16;3.53;5.02;3.73;4.38;4.53;4.4;6.09;6.37;5.1;5.65;4.64;5.8;4.94;5.24;4.63;6.08;4.74;4.27;4.1;5.34;5.96;3.64;2.03;3.29;5.31;2.6;2.34;2.25;4.26;2.1;2.08;2.48;.6;1.53;.31;1.96;4.53;3.8;3.27;.67;5.5;2.33;4.89;2.31;3.4;4.37;3.71;4.24;2.89;3.27;2.97;5.14;7.61;2.14;5.73;3.45;4.27;3.13;5.77;4.91;2.21;2.5;4.74;3.57;4.6;3.43;3.41;6.01;3.21;5.22;3.11;6.33;4.8;4.29;5.23;3.28;6.47;2.48;6.01;3.17;5.92;1.38;1.79;2.96;2.67;2.21;2.67;1.89'
    )

    # Protein.query.filter_by(refseq='NM_002749').one().conservation
    assert mapk7.conservation == expected_scores

    # no data for this one, lets see if the pipeline handles such cases well
    assert il6.conservation is None

    db.session.add_all(proteins.values())
    db.session.commit()

    stored_il6 = Protein.query.filter_by(refseq='NM_000600').one()
    assert stored_il6.conservation == ''
def test_network_export(self, do_export=None):
    """Export the site-specific network of kinases and targets and verify the file.

    Args:
        do_export: optional callable taking the output path; when it is
            falsy, the 'site_specific_network_of_kinases_and_targets'
            exporter of `ProteinRelated` is used instead.
    """
    expected_rows = [
        'kinase symbol\ttarget symbol\tkinase refseq\ttarget refseq\ttarget sequence position\ttarget amino acid\n',
        'Kinase name\tSOMEGENE\tNM_0002\tNM_0001\t1\tA\n'
    ]

    def export_with_command(path):
        """Run the default exporter, writing to `path`."""
        arguments = Namespace(
            exporters=['site_specific_network_of_kinases_and_targets'],
            paths=[path]
        )
        ProteinRelated.export(arguments)

    destination = make_named_temp_file()

    with self.app.app_context():
        models = create_test_models()
        db.session.add_all(models.values())

        exporter = do_export or export_with_command
        exporter(destination)

    with open(destination) as exported:
        written = exported.readlines()

    assert written == expected_rows
def test_domains_hierarchy(self):
    """Load the InterPro domains hierarchy on top of two pre-existing domains.

    Checks the number of newly created domains, the level and parent of
    the pre-existing domains, and the parents assigned to selected new
    domains.
    """
    top_level = InterproDomain(accession='IPR000008', description='C2 domain')
    nested = InterproDomain(accession='IPR033884', description='Calpain C2 domain')

    db.session.add_all([top_level, nested])

    hierarchy_path = make_named_temp_file(domains_hierarchy_data)

    created = domains_hierarchy(hierarchy_path)

    assert len(created) == 16 - 2

    assert top_level.level == 0
    assert top_level.parent is None
    assert nested.parent is top_level

    by_accession = {}
    for domain in created:
        by_accession[domain.accession] = domain

    # this domain was already in the database
    assert 'IPR033884' not in by_accession

    # does a new top level domain have a None parent?
    for accession in ['IPR000010', 'IPR000056']:
        assert by_accession[accession].parent is None

    child_to_parent = {
        # had first-level domain assigned correct parent?
        'IPR025760': 'IPR000010',
        # is parent assignment working when going one level back?
        'IPR018090': 'IPR000053',
        # does going multiple levels higher back work?
        'IPR026019': 'IPR000056'
    }

    for child in child_to_parent:
        parent = child_to_parent[child]
        assert by_accession[child].parent is by_accession[parent]
def test_classification(self):
    """Load the kinase classification on top of existing kinases and groups.

    The following assumptions about the data file hold:
        - 'family' fits better to our 'group' than any other column
        - 'gene.clean', not 'Kinase' is being used as kinase name as it fits much better.
    """
    kinases = {}
    for name in ('AKT1', 'Akt2', 'CIT'):
        kinases[name] = Kinase(name=name)

    groups = {'Akt': KinaseGroup(name='Akt')}

    def add_to_session():
        """Add all pre-existing kinases and groups to the session."""
        for models in (kinases, groups):
            db.session.add_all(models.values())

    classification_path = make_named_temp_file(raw_gene_list)

    add_to_session()

    with self.app.app_context():
        new_groups = load_kinase_classification(classification_path)

    assert len(new_groups) == 1
    dmpk = new_groups[0]

    assert dmpk.name == 'DMPK'

    add_to_session()

    assert len(dmpk.kinases) == 2
    assert kinases['CIT'] in dmpk.kinases

    akt = groups['Akt']
    assert len(akt.kinases) == 3

    for name in ('AKT1', 'Akt2'):
        assert kinases[name] in akt.kinases
def test_gene_lists(self):
    """Load an ActiveDriver gene list and make sure a second load creates no duplicate."""
    list_path = make_named_temp_file(raw_gene_list)

    def load_tcga_list():
        """Load the 'TCGA list' from `list_path` inside an application context."""
        with self.app.app_context():
            definition = ListData(
                name='TCGA list', path=list_path, mutations_source=TCGAMutation
            )
            return load_active_driver_gene_lists(lists=(definition, ))

    loaded = load_tcga_list()

    # one gene list returned (TCGA)
    assert len(loaded) == 1

    tcga = loaded[0]

    # correct name of gene list
    assert tcga.name == 'TCGA list'
    assert tcga.mutation_source_name == 'TCGA'

    # let's take some entry
    first_entry = tcga.entries[0]

    assert type(first_entry.p) is float
    assert type(first_entry.fdr) is float

    gene_names = []
    for entry in tcga.entries:
        gene_names.append(entry.gene.name)

    assert 'TMEM131' not in gene_names
    assert 'PNPT1' in gene_names

    db.session.add_all(loaded)

    reloaded = load_tcga_list()

    # no duplicate lists should be created
    assert not reloaded
def test_import(self):
    """Import sites with `PhosphoELMImporter` and check residues, types, PMIDs and kinases."""
    rps6 = Protein(
        refseq='NM_001010',
        sequence='MKLNISFPATGCQKLIEVDDERKLRTFYEKRMATEVAADALGEEWKGYVVRISGGNDKQGFPMKQGVLTHGRVRLLLSKGHSCYRPRRTGERKRKSVRGCIVDANLSVLNLVIVKKGEKDIPGLTDTTVPRRLGPKRASRIRKLFNLSKEDDVRQYVVRKPLNKEGKKPRTKAPKIQRLVTPRVLQHKRRRIALKKQRTKKNKEEAAEYAKLLAKRMKEAKEKRQEQIAKRRRLSSLRASTSKSESSQK*'
    )

    db.session.add(rps6)

    with TemporaryDirectory() as dir_path:

        # NOTE(review): nothing below refers to `dir_path` again, so this
        # file looks unused by the importer - confirm whether it is needed.
        with gzip.open(dir_path + '/O-GalNAc_site_dataset.gz', 'wt') as dataset:
            dataset.write(SITES)

        canonical_path = make_named_gz_file(CANONICAL)
        alternative_path = make_named_gz_file(ALTERNATIVE)
        mappings_path = make_named_gz_file(MAPPINGS)

        importer = PhosphoELMImporter(canonical_path, alternative_path, mappings_path)

        sites = importer.load_sites(make_named_temp_file(SITES))

        assert len(sites) == 2

        by_position = {}
        for site in sites:
            by_position[site.position] = site

        first, second = by_position[236], by_position[242]

        for site in (first, second):
            assert site.residue == 'S'

        for site in (first, second):
            assert site.types_names == {'phosphorylation'}

        assert first.pmid == {17360704}  # N.N. should be ignored
        assert second.pmid == {18669648}

        assert {kinase.name for kinase in first.kinases} == {'P70S6K'}
        assert {group.name for group in first.kinase_groups} == {'RSK'}

        assert len(second.kinases) == 0
def test_ptm_muts_of_gene(self):
    """Export MC3 mutations affecting glycosylation sites of SOMEGENE, one row per sample."""
    expected_rows = [
        'gene\tisoform\tposition\twt_residue\tmut_residue\tcancer_type\tsample_id\n',
        'SOMEGENE\tNM_0001\t1\tA\tE\tCAN\tSample A\n',
        'SOMEGENE\tNM_0001\t1\tA\tE\tCAN\tSample B\n'
    ]

    destination = make_named_temp_file()

    with self.app.app_context():
        from models import clear_cache
        clear_cache()

        models = create_test_models()
        db.session.add_all(models.values())

        from exports.protein_data import ptm_muts_of_gene

        export_options = {
            'path_template': destination,
            'mutation_source': 'mc3',
            'gene': 'SOMEGENE',
            'site_type': 'glycosylation',
            'export_samples': True
        }
        ptm_muts_of_gene(**export_options)

    with open(destination) as exported:
        written = exported.readlines()

    assert written == expected_rows
def test_import(self):
    """Import HPRD sites, including cross-isoform mapping, and check sites and kinases.

    Six sites are expected: five pre-defined ones and one mapped onto the
    NM_001204889 isoform. No `UserWarning` may be emitted while loading.
    """
    # Imported locally: recording of arbitrary warnings is done with the
    # standard library, because `warns(None)` is deprecated in pytest 7 and
    # rejected by pytest 8 (assuming `warns` is `pytest.warns`).
    import warnings

    proteins = create_test_proteins([
        'NM_000689', 'NM_001997', 'NM_000690', 'NM_001204889', 'NM_000546'
    ])

    # Sequence is needed for validation. Validation is tested on model level.
    sequences = {
        'NM_000689': 'MSSSGTPDLPVLLTDLKIQYTKIFINNEWHDSVSGKKFPVFNPATEEELCQVEEGDKEDVDKAVKAARQAFQIGSPWRTMDASERGRLLYKLADLIERDRLLLATMESMNGGKLYSNAYLNDLAGCIKTLRYCAGWADKIQGRTIPIDGNFFTYTRHEPIGVCGQIIPWNFPLVMLIWKIGPALSCGNTVVVKPAEQTPLTALHVASLIKEAGFPPGVVNIVPGYGPTAGAAISSHMDIDKVAFTGSTEVGKLIKEAAGKSNLKRVTLELGGKSPCIVLADADLDNAVEFAHHGVFYHQGQCCIAASRIFVEESIYDEFVRRSVERAKKYILGNPLTPGVTQGPQIDKEQYDKILDLIESGKKEGAKLECGGGPWGNKGYFVQPTVFSNVTDEMRIAKEEIFGPVQQIMKFKSLDDVIKRANNTFYGLSAGVFTKDIDKAITISSALQAGTVWVNCYGVVSAQCPFGGFKMSGNGRELGEYGFHEYTEVKTVTVKISQKNS*',
        'NM_001997': 'MQLFVRAQELHTFEVTGQETVAQIKAHVASLEGIAPEDQVVLLAGAPLEDEATLGQCGVEALTTLEVAGRMLGGKVHGSLARAGKVRGQTPKVAKQEKKKKKTGRAKRRMQYNRRFVNVVPTFGKKKGPNANS*',
        'NM_000690': 'MLRAAARFGPRLGRRLLSAAATQAVPAPNQQPEVFCNQIFINNEWHDAVSRKTFPTVNPSTGEVICQVAEGDKEDVDKAVKAARAAFQLGSPWRRMDASHRGRLLNRLADLIERDRTYLAALETLDNGKPYVISYLVDLDMVLKCLRYYAGWADKYHGKTIPIDGDFFSYTRHEPVGVCGQIIPWNFPLLMQAWKLGPALATGNVVVMKVAEQTPLTALYVANLIKEAGFPPGVVNIVPGFGPTAGAAIASHEDVDKVAFTGSTEIGRVIQVAAGSSNLKRVTLELGGKSPNIIMSDADMDWAVEQAHFALFFNQGQCCCAGSRTFVQEDIYDEFVERSVARAKSRVVGNPFDSKTEQGPQVDETQFKKILGYINTGKQEGAKLLCGGGIAADRGYFIQPTVFGDVQDGMTIAKEEIFGPVMQILKFKTIEEVVGRANNSTYGLAAAVFTKDLDKANYLSQALQAGTVWVNCYDVFGAQSPFGGYKMSGSGRELGEYGLQAYTEVKTVTVKVPQKNS*',
        'NM_001204889': 'MLRAAARFGPRLGRRLLSAAATQAVPAPNQQPEVFCNQIFINNEWHDAVSRKTFPTVNPSTGEVICQVAEGDKALETLDNGKPYVISYLVDLDMVLKCLRYYAGWADKYHGKTIPIDGDFFSYTRHEPVGVCGQIIPWNFPLLMQAWKLGPALATGNVVVMKVAEQTPLTALYVANLIKEAGFPPGVVNIVPGFGPTAGAAIASHEDVDKVAFTGSTEIGRVIQVAAGSSNLKRVTLELGGKSPNIIMSDADMDWAVEQAHFALFFNQGQCCCAGSRTFVQEDIYDEFVERSVARAKSRVVGNPFDSKTEQGPQVDETQFKKILGYINTGKQEGAKLLCGGGIAADRGYFIQPTVFGDVQDGMTIAKEEIFGPVMQILKFKTIEEVVGRANNSTYGLAAAVFTKDLDKANYLSQALQAGTVWVNCYDVFGAQSPFGGYKMSGSGRELGEYGLQAYTEVKTVTVKVPQKNS*',
        'NM_000546': 'MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLSPDDIEQWFTEDPGPDEAPRMPEAAPPVAPAPAAPTPAAPAPAPSWPLSSSVPSQKTYQGSYGFRLGFLHSGTAKSVTCTYSPALNKMFCQLAKTCPVQLWVDSTPPPGTRVRAMAIYKQSQHMTEVVRRCPHHERCSDSDGLAPPQHLIRVEGNLRVEYLDDRNTFRHSVVVPYEPPEVGSDCTTIHYNYMCNSSCMGGMNRRPILTIITLEDSSGNLLGRNSFEVRVCACPGRDRRTEEENLRKKGEPHHELPPGSTKRALPNNTSSSPQPKKKPLDGEYFTLQIRGRERFEMFRELNEALELKDAQAGKEPGGSRAHSSHLKSKKGQSTSRHKKLMFKTEGPDSD*'
    }

    for isoform, sequence in sequences.items():
        proteins[isoform].sequence = sequence

    db.session.add_all(proteins.values())

    # Add gene to test cross-isoform mapping
    aldh2 = gene_from_isoforms(proteins, ['NM_000690', 'NM_001204889'])
    db.session.add(aldh2)

    importer = HPRDImporter(make_named_temp_file(SEQUENCES), make_named_temp_file(MAPPINGS), dir_path='')

    assert len(importer.mappings) == 4

    with warnings.catch_warnings(record=True) as caught:
        # record every warning, regardless of filters installed elsewhere
        warnings.simplefilter('always')
        sites = importer.load_sites(path=make_named_temp_file(SITES))

    # re-emit what was recorded so that it remains visible in the test report
    for warning in caught:
        warn(warning.message)

    assert all(warning.category is not UserWarning for warning in caught)

    # should have 5 pre-defined sites and one mapped (isoform NM_001204889)
    assert len(sites) == 5 + 1

    db.session.add_all(sites)
    db.session.commit()

    sites_by_isoform = {site.protein.refseq: site for site in sites}

    assert sites_by_isoform['NM_001204889'].residue == sites_by_isoform['NM_000690'].residue == 'S'

    assert sites_by_isoform['NM_000689'].position == 2
    assert len(sites_by_isoform['NM_000689'].kinases) == 0

    tp_53_sites = {
        site.position: site
        for site in sites
        if site.protein.refseq == 'NM_000546'
    }

    tp53_6s = tp_53_sites[6]
    assert tp53_6s.residue == 'S'
    assert {kinase.name for kinase in tp53_6s.kinases} == {'CSNK1A1'}

    tp53_46s = tp_53_sites[46]
    assert {kinase.name for kinase in tp53_46s.kinases} == {'ATM', 'HIPK2'}

    phosphorylation = SiteType.query.filter_by(name='phosphorylation').one()
    assert list(tp53_6s.kinases)[0].is_involved_in == {phosphorylation}
def test_protein_references(self):
    """Load external references (UniProt, RefSeq, Ensembl) for proteins already in the database.

    Three references are expected to be created; UniProt entries, Ensembl
    peptides, the RefSeq protein accession and the Entrez id of TP53 are
    then verified on individual proteins.
    """
    uniprot_filename = make_named_temp_file(data=idmapping_dat, opener=gzip.open, mode='wt')
    reflink_filename = make_named_temp_file(data=reflink_data, opener=gzip.open, mode='wt', suffix='.gz')
    refseq_filename = make_named_temp_file(data=refseq_data)

    refseqs = [
        'NM_011739',     # present in reference mappings
        'NM_001131572',  # present
        'NM_201200',     # present
        'NM_0001'        # not present in reference mappings
    ]

    some_gene = Gene(name='Some gene')

    proteins_we_have = {}
    for refseq_nm in refseqs:
        proteins_we_have[refseq_nm] = Protein(refseq=refseq_nm, gene=some_gene)

    tp53 = Gene(name='TP53')
    tp53_protein = Protein(refseq='NM_000546', gene=tp53)

    with self.app.app_context():
        # let's pretend that we already have some proteins in our db
        db.session.add_all(proteins_we_have.values())
        db.session.add(tp53_protein)

        references = load_external_references(uniprot_filename, refseq_filename, reflink_filename)

    # there are 3 references we would like to have extracted
    assert len(references) == 3

    with_two_entries = proteins_we_have['NM_011739']

    assert len(with_two_entries.external_references.uniprot_entries) == 2

    reviewed_entry = with_two_entries.external_references.uniprot_entries[1]
    assert reviewed_entry.accession == 'P68254'
    assert reviewed_entry.isoform == 1
    assert reviewed_entry.reviewed is True

    unreviewed_entry = with_two_entries.external_references.uniprot_entries[0]
    assert unreviewed_entry.reviewed is False

    ensembl_peptides = with_two_entries.external_references.ensembl_peptides

    assert len(ensembl_peptides) == 2
    peptide_ids = set(ensembl.peptide_id for ensembl in ensembl_peptides)
    assert peptide_ids == {'ENSMUSP00000106602', 'ENSMUSP00000100067'}

    with_one_entry = proteins_we_have['NM_001131572']

    assert len(with_one_entry.external_references.uniprot_entries) == 1

    only_entry = with_one_entry.external_references.uniprot_entries[0]
    assert only_entry.accession == 'Q5RFJ2'
    assert only_entry.isoform == 1
    assert only_entry.reviewed is False

    # check if protein without references stays clear
    without_references = proteins_we_have['NM_0001']

    # it's needed to re-add the protein cause ORM will emit a query
    # (just in case, that's how flask-testing works - any object needs
    # to be re-added to session after its termination)
    db.session.add(without_references)

    assert without_references.external_references is None

    # check the protein with refseq references and gene with entrez id
    assert tp53_protein.external_references.refseq_np == 'NP_000537'
    assert tp53.entrez_id == 7157