Beispiel #1
0
    def test_mutations_export(self):
        """Export MC3 and ClinVar mutations of the test protein and compare the gzipped output."""

        sources = ('mc3', 'clinvar')

        # one (initially empty) temporary target file per mutation source
        targets = {source: make_named_temp_file() for source in sources}

        expected_lines = {
            'mc3': [
                b'gene\tisoform\tposition\twt_residue\tmut_residue\tcancer_type\n',
                b'SOMEGENE\tNM_0001\t1\tA\tE\tCAN'
            ],
            'clinvar': [
                b'gene\tisoform\tposition\twt_residue\tmut_residue\tdisease\n',
                b'SOMEGENE\tNM_0001\t1\tA\tE\tSome disease'
            ]
        }

        with self.app.app_context():
            models = create_test_models()
            db.session.add_all(models.values())

            exported_protein = models['protein']

            # all exports are performed first, verification follows afterwards
            for source in sources:
                muts_import_manager.perform('export', [exported_protein],
                                            [source],
                                            {source: targets[source]})

        for source in sources:
            with gzip.open(targets[source]) as exported:
                assert exported.readlines() == expected_lines[source]
Beispiel #2
0
    def test_exceptions(self):
        """Check how the HPRD importer treats a site saved with position '0'.

        With `pos_zero_means_last_aa=False` the site has to be rejected with
        a UserWarning; with `pos_zero_means_last_aa=True` it has to be loaded
        (without any UserWarning) at the last residue of the sequence.
        """
        # local import: only the standard-library recorder is needed here
        import warnings

        # this odd case is real:
        # there are 9 such cases in HPRD at the time of this test creation
        sites_data = (
            '02098\tNONO\t02098_1\tNP_031389.3\t0\tY\t-\t-\t'
            'Acetylation\tin vivo\t19608861'
        )
        mappings = (
            '02098\tNONO\tNM_007363.4\tNP_031389.3\t4841\t300084\t'
            'Q15233,B7Z4C2\tNon pou domain containing octamer binding protein'
        )
        sequence = (
            'MQSNKTFNLEKQNHTPRKHHQHHHQQQHHQQQQQQPPPPPIPANGQQASSQNEGLTIDLKNFRKPGEKTFTQRSRLFVG'
            # the main part of the sequence was cut out as it is not needed
            # but this is the important bit: it has 'Y' at the end:
            'GTLGLTPPTTERFGQAATMEGIGAIGGTPPAFNRAAPGAEFAPNKRRRY'
            # so having 1-based positioning system, after a naive conversion to 0-based:
            # site.pos = -1; furthermore, sequence[site.pos] == 'Y' (!); this is probably
            # why the pos = '0' had been saved in HPRD in the first place.
        )
        sequences = (
            '>02098|02098_1|NP_031389.3|Non pou domain containing octamer binding protein\n'
            + sequence
        )

        # the database protein carries exactly the sequence from the FASTA entry
        protein = Protein(refseq='NM_007363', sequence=sequence)
        db.session.add(protein)

        importer = HPRDImporter(make_named_temp_file(sequences),
                                make_named_temp_file(mappings),
                                dir_path='')

        # without a fix, it should warn and reject the faulty site
        with warns(
                UserWarning,
                match='The site: 02098_1: 0Y is outside of the protein sequence'
        ):
            sites = importer.load_sites(path=make_named_temp_file(sites_data),
                                        pos_zero_means_last_aa=False)

        assert len(sites) == 0

        # and it should work when a workaround is applied;
        # `warns(None)` is deprecated in pytest 7 and rejected by pytest 8,
        # so the warnings are recorded with the standard library instead
        with warnings.catch_warnings(record=True) as recorded:
            # make sure that nothing is hidden by previously installed filters
            warnings.simplefilter('always')
            sites = importer.load_sites(path=make_named_temp_file(sites_data),
                                        pos_zero_means_last_aa=True)

        assert all(warning.category is not UserWarning
                   for warning in recorded)
        assert len(sites) == 1

        site = sites[0]

        assert site.position == 128
        assert site.residue == 'Y'
    def test_export_paths(self):
        """Check that the export command validates the number of given paths."""

        mismatch_notice = 'Export paths should be given for every exported file, no less, no more.'

        first_path = make_named_temp_file()
        second_path = make_named_temp_file()

        # user gave too many paths
        command = 'export protein_related -e sites_ac --paths {} {}'.format(first_path, second_path)
        output, _ = self.run_command(command)
        assert mismatch_notice in output

        # user gave good number of paths
        command = 'export protein_related -e sites_ac --paths {}'.format(first_path)
        output, _ = self.run_command(command)
        assert mismatch_notice not in output
        assert 'Exported sites_ac to {}'.format(first_path) in output
    def test_mutations_export(self):
        """Export mutations with various options and compare the gzipped output.

        Each case is a triple: (mutation source, extra keyword arguments
        passed to the export, expected lines of the exported file).
        """

        mc3_with_counts = ('mc3', {}, [
            b'gene\tisoform\tposition\twt_residue\tmut_residue\tcancer_type\tcount\n',
            b'SOMEGENE\tNM_0001\t1\tA\tE\tCAN\t2'
        ])
        mc3_with_samples = ('mc3', {'export_samples': True}, [
            b'gene\tisoform\tposition\twt_residue\tmut_residue\tcancer_type\tsample_id\n',
            b'SOMEGENE\tNM_0001\t1\tA\tE\tCAN\tSample A\n',
            b'SOMEGENE\tNM_0001\t1\tA\tE\tCAN\tSample B'
        ])
        clinvar = ('clinvar', {}, [
            b'gene\tisoform\tposition\twt_residue\tmut_residue\tdisease\tsignificance\n',
            b'SOMEGENE\tNM_0001\t1\tA\tE\tSome disease\tPathogenic\n',
            b'SOMEGENE\tNM_0001\t1\tA\tE\tOther disease\tBenign'
        ])

        with self.app.app_context():
            models = create_test_models()
            db.session.add_all(models.values())

            exported_protein = models['protein']

            for source, options, expected in (mc3_with_counts, mc3_with_samples, clinvar):

                # a fresh target file for every case
                target = make_named_temp_file()

                muts_import_manager.perform(
                    'export', [exported_protein], [source],
                    paths={source: target}, **options
                )

                with gzip.open(target) as exported:
                    assert exported.readlines() == expected
    def test_others_import(self):
        """Load the 'other' UniProt sites and check filtering by the ECO code."""

        make_test_shared_proteins()

        init_files = OtherSitesData.files_for_init()
        importer = OthersUniprotImporter(*init_files)

        sites_path = make_named_temp_file(OtherSitesData.sites)
        accepted = importer.load_sites(path=sites_path)

        assert len(accepted) == 1

        # test ECO filtering:
        # any other code should cause the entry to be ignored
        sites_with_other_code = OtherSitesData.sites.replace('ECO_0000269', 'ECO_0000200')
        rejected = importer.load_sites(
            path=make_named_temp_file(sites_with_other_code))

        assert len(rejected) == 0
Beispiel #6
0
    def test_gene_full_name(self):
        """Load full gene names from a gzipped file and check the name assigned to TP53."""

        tp53 = Gene(name='TP53', entrez_id=7157)
        db.session.add(tp53)

        # the names are written in text mode through gzip
        names_path = make_named_temp_file(
            full_gene_names,
            mode='wt',
            opener=gzip.open
        )
        load_full_gene_names(names_path)

        assert tp53.full_name == 'tumor protein p53'
Beispiel #7
0
    def test_disorder(self):
        """Load the disorder data and compare the disorder maps of two test proteins."""

        expected_maps = (
            ('NM_002749', '111111111111111111111111111111110000000000000000000000000000000000000000000000000000000000000000000000000000000000000000'),
            ('NM_000600', '11111111111101000000000000000011111111111111100000000000000000000000000000000000000000000000'),
        )

        proteins = create_test_proteins(['NM_002749', 'NM_000600'])

        disorder_path = make_named_temp_file(disorder_data)

        with self.app.app_context():
            load_disorder(disorder_path)

        for refseq, disorder_map in expected_maps:
            assert proteins[refseq].disorder_map == disorder_map
Beispiel #8
0
    def test_sequences(self):
        """Load FASTA sequences and check the sequence and length of NM_021806."""

        # the trailing '*' is a part of the stored sequence,
        # but it is not counted in the expected length
        expected_sequence = 'MRLAGPLRIVVLVVSVGVTWIVVSILLGGPGSGFPRIQQLFTSPESSVTAAPRARKYKCGLPQPCPEEHLAFRVVSGAANVIGPKICLEDKMLMSSVKDNVGRGLNIALVNGVSGELIEARAFDMWAGDVNDLLKFIRPLHEGTLVFVASYDDPATKMNEETRKLFSELGSRNAKELAFRDSWVFVGAKGVQNKSPFEQHVKNSKHSNKYEGWPEALEMEGCIPRRSTAS*'
        expected_length = len(expected_sequence.rstrip('*'))

        refseqs = ['NM_002749', 'NM_021806', 'NM_001204289']
        proteins = create_test_proteins(refseqs)

        fasta_path = make_named_temp_file(fasta_sequences)

        with self.app.app_context():
            load_sequences(fasta_path)

        checked = proteins['NM_021806']
        assert checked.sequence == expected_sequence
        assert checked.length == expected_length
    def test_splice_variants_handling(self):
        """Check the import of sites from an entry with multiple splice variants (here: Q12797)."""

        make_test_shared_proteins()

        init_files = SpliceVariantData.files_for_init()
        importer = GlycosylationUniprotImporter(*init_files)

        sites_path = make_named_temp_file(SpliceVariantData.sites)
        loaded_sites = importer.load_sites(path=sites_path)

        assert len(loaded_sites) == 3
Beispiel #10
0
    def test_protein_summaries(self):
        """Load protein summaries from a gzipped file and check the text assigned to each protein."""

        expected_summaries = (
            ('NM_010410', 'This gene encodes a hypothalamic neuropeptide precursor [...]'),
            ('NM_182751', 'The protein encoded by this gene is one of the highly [...]'),
        )

        proteins = create_test_proteins(['NM_010410', 'NM_182751'])

        # the summaries are written in text mode through gzip
        summaries_path = make_named_temp_file(
            summaries_data, mode='wt', opener=gzip.open
        )

        with self.app.app_context():
            protein_summaries(path=summaries_path)

        for refseq, summary in expected_summaries:
            assert proteins[refseq].summary == summary
    def test_proteins_and_genes(self):
        """Import proteins with genes, verify the details, then import an update file."""

        create_test_proteins([])

        initial_path = make_named_temp_file(protein_data)

        with self.app.app_context():
            imported = proteins_and_genes(path=initial_path)

        assert len(imported) == 4
        db.session.add_all(imported)

        isoform = Protein.query.filter_by(refseq='NM_002749').one()
        mapk7 = Gene.query.filter_by(name='MAPK7').one()

        assert isoform.gene == mapk7

        # transcript and coding sequence coordinates
        expected_coordinates = (
            ('tx_start', 19281773),
            ('tx_end', 19286857),
            ('cds_start', 19282213),
            ('cds_end', 19286544),
        )
        for attribute, coordinate in expected_coordinates:
            assert getattr(isoform, attribute) == coordinate

        # test genes
        all_genes = Gene.query.all()
        assert len(all_genes) == 4

        # test strands:
        assert mapk7.strand is True
        muc1 = Gene.query.filter_by(name='MUC1').one()
        assert muc1.strand is False

        update_path = make_named_temp_file(update_data)

        with self.app.app_context():
            imported = proteins_and_genes(path=update_path)

        assert len(imported) == 1

        # exactly one protein was returned (asserted above), take it
        added = next(iter(imported))
        assert added.refseq == 'NM_182962'
    def test_disorder(self):
        """Load the disorder data for proteins with sequences and compare the disorder maps."""

        expected_maps = (
            ('NM_002749', '111111111111111111111111111111110000000000000000000000000000000000000000000000000000000000000000000000000000000000000000'),
            ('NM_000600', '11111111111101000000000000000011111111111111100000000000000000000000000000000000000000000000'),
        )

        proteins = create_test_proteins(['NM_002749', 'NM_000600'])

        # every test protein gets its sequence assigned before the import
        for refseq in proteins:
            proteins[refseq].sequence = sequences_for_proteins[refseq]

        disorder_path = make_named_temp_file(disorder_data)

        with self.app.app_context():
            load_disorder(disorder_path)

        for refseq, disorder_map in expected_maps:
            assert proteins[refseq].disorder_map == disorder_map
Beispiel #13
0
    def test_sites(self):
        """Load sites for NM_003955 and check their count, residue, type and kinases."""

        refseq = 'NM_003955'
        protein = create_test_proteins([refseq])[refseq]
        # Sequence is needed for validation. Validation is tested on model level.
        protein.sequence = 'MVTHSKFPAAGMSRPLDTSLRLKTFSSKSEYQLVVNAVRKLQESGFYWSAVTGGEANLLLSAEPAGTFLIRDSSDQRHFFTLSVKTQSGTKNLRIQCEGGSFSLQSDPRSTQPVPRFDCVLKLVHHYMPPPGAPSFPSPPTEPSSEVPEQPSAQPLPGSPPRRAYYIYSGGEKIPLVLSRPLSSNVATLQHLCRKTVNGHLDSYEKVTQLPGPIREFLDQYDAPL*'

        loaded = load_sites(make_named_temp_file(sites_data))

        assert len(loaded) == 3

        # index the sites by position for convenient access
        by_position = {}
        for site in loaded:
            by_position[site.position] = site

        ubiquitinated = by_position[6]
        assert ubiquitinated.residue == 'K'
        assert ubiquitinated.type == 'ubiquitination'

        kinase_names = {kinase.name for kinase in by_position[204].kinases}
        assert kinase_names == {'JAK2', 'LCK'}
    def test_sites_export(self):
        """Run the 'sites_ac' exporter on the test models and compare the written file."""

        expected_lines = [
            'gene\tposition\tresidue\ttype\tkinase\tpmid\n',
            'SOMEGENE\t1\tA\tglycosylation\tKinase name\t1,2\n'
        ]

        target = make_named_temp_file()

        with self.app.app_context():
            models = create_test_models()
            db.session.add_all(models.values())

            arguments = Namespace(exporters=['sites_ac'], paths=[target])
            manager = ProteinRelated()
            manager.export(arguments)

        with open(target) as exported:
            assert exported.readlines() == expected_lines
    def test_cancer(self):
        """Load cancers from the list file and check the first of the returned objects."""

        filename = make_named_temp_file(cancers_list)

        with self.app.app_context():
            cancers = load_cancers(path=filename)

        # two cancers should be returned
        assert len(cancers) == 2

        cancer = cancers[0]

        assert cancer.name == 'Bladder Urothelial Carcinoma'
        assert cancer.code == 'BLCA'

        # smoke check only: adding the loaded cancers to the session must not
        # raise (the former trailing `assert True` verified nothing)
        db.session.add_all(cancers)
Beispiel #16
0
    def test_ptm_mutations(self):
        """Run the 'mc3_muts_affecting_ptm_sites' exporter and compare the written file."""

        # tab-separated header and a single data row
        expected_lines = [
            'gene\trefseq\tmutation position\tmutation alt\tmutation summary\tsite position\tsite residue\n',
            'SOMEGENE\tNM_0001\t1\tE\tCAN\t1\tA\n'
        ]

        target = make_named_temp_file()

        with self.app.app_context():
            models = create_test_models()
            db.session.add_all(models.values())

            arguments = Namespace(
                exporters=['mc3_muts_affecting_ptm_sites'],
                paths=[target]
            )
            ProteinRelated.export(arguments)

        with open(target) as exported:
            assert exported.readlines() == expected_lines
Beispiel #17
0
    def test_domains(self):
        """Load domains for two proteins and check how overlapping annotations are merged."""

        first_protein = Protein(
            refseq='NM_018163',
            sequence='MAVTKELLQMDLYALLGIEEKAADKEVKKAYRQKALSCHPDKNPDNPRAAELFHQLSQALEVLTDAAARAAYDKVRKAKKQAAERTQKLDEKRKKVKLDLEARERQAQAQESEEEEESRSTRTLEQEIERLREEGSRQLEEQQRLIREQIRQERDQRLRGKAENTEGQGTPKLKLKWKCKKEDESKGGYSKDVLLRLLQKYGEVLNLVLSSKKPGTAVVEFATVKAAELAVQNEVGLVDNPLKISWLEGQPQDAVGRSHSGLSKGSVLSERDYESLVMMRMRQAAERQQLIARMQQEDQEGPPT*',
            gene=Gene(chrom='15')
        )
        second_protein = Protein(
            refseq='NM_004671',
            sequence='MADFEELRNMVSSFRVSELQVLLGFAGRNKSGRKHDLLMRALHLLKSGCSPAVQIKIRELYRRRYPRTLEGLSDLSTIKSSVFSLDGGSSPVEPDLAVAGIHSLPSTSVTPHSPSSPVGSVLLQDTKPTFEMQQPSPPIPPVHPDVQLKNLPFYDVLDVLIKPTSLVQSSIQRFQEKFFIFALTPQQVREICISRDFLPGGRRDYTVQVQLRLCLAETSCPQEDNYPNSLCIKVNGKLFPLPGYAPPPKNGIEQKRPGRPLNITSLVRLSSAVPNQISISWASEIGKNYSMSVYLVRQLTSAMLLQRLKMKGIRNPDHSRALIKEKLTADPDSEIATTSLRVSLMCPLGKMRLTIPCRAVTCTHLQCFDAALYLQMNEKKPTWICPVCDKKAAYESLILDGLFMEILNDCSDVDEIKFQEDGSWCPMRPKKEAMKVSSQPCTKIESSSVLSKPCSVTVASEASKKKVDVIDLTIESSSDEEEDPPAKRKCIFMSETQSSPTKGVLMYQPSSVRVPSVTSVDPAAIPPSLTDYSVPFHHTPISSMSSDLPGLDFLSLIPVDPQYCPPMFLDSLTSPLTASSTSVTTTSSHESSTHVSSSSSRSETGVITSSGSNIPDIISLD*',
            gene=Gene(chrom='18')
        )

        db.session.add_all([first_protein, second_protein])

        loaded_domains = load_domains(make_named_temp_file(domains_data))

        assert len(loaded_domains) == 6
        assert len(first_protein.domains) == 2

        # group domains of the second protein by the InterPro short description
        by_description = defaultdict(list)
        for annotation in second_protein.domains:
            by_description[annotation.interpro.short_description].append(annotation)

        # two SAP domains should be merged for representation purposes due to similarity criteria
        # (these two domain annotation overlap so the smaller one is contained in the bigger)
        sap = by_description['SAP_dom'][0]
        assert sap.start == 1 and sap.end == 65

        interpro = sap.interpro
        assert interpro.accession == 'IPR003034'
        assert interpro.description == 'SAP domain'

        # here the two annotations overlap with more than 75% of common
        pinit = by_description['PINIT'][0]
        assert pinit.start == 141 and pinit.end == 299

        # and here overlap was too small to merge those domains
        assert len(by_description['Znf_MIZ']) == 2
    def test_missing_sequence(self):
        """Sometimes (though rarely) there is no sequence for a given accession.

        This happens when UniProt fasta files are not in sync with the SPARQL API.
        """

        header = 'primary_accession,sequence_accession,position,data,eco,source\n'
        row = '"B2RDS2","B2RDS2-1","79^","N-linked (GlcNAc...) asparagine","ECO_0000269",'
        site_with_missing_sequence = header + row

        # both sequence files are empty, only the mapping is provided
        canonical = make_named_gz_file('')
        alternative = make_named_gz_file('')
        mappings = make_named_gz_file('B2RDS2\tRefSeq_NT\tNM_004823.1')

        importer = GlycosylationUniprotImporter(canonical, alternative, mappings)

        sites_path = make_named_temp_file(site_with_missing_sequence)

        with warns(UserWarning, match='No sequence for .* found!'):
            importer.load_sites(sites_path)
    def test_glycosylation_import(self):
        """Import UniProt glycosylation sites and check mapping between isoforms of a gene."""

        # sequence is needed for validation. Validation is tested on model level.
        sequence_by_refseq = {
            'NM_001163941':
            'MENSERAEEMQENYQRNGTAEEQPKLRKEAVGSIEIFRFADGLDITLMILGILASLVNGACLPLMPLVLGEMSDNLISGCLVQTNTTNYQNCTQSQEKLNEDMTLLTLYYVGIGVAALIFGYIQISLWIITAARQTKRIRKQFFHSVLAQDIGWFDSCDIGELNTRMTDDIDKISDGIGDKIALLFQNMSTFSIGLAVGLVKGWKLTLVTLSTSPLIMASAAACSRMVISLTSKELSAYSKAGAVAEEVLSSIRTVIAFRAQEKELQRYTQNLKDAKDFGIKRTIASKVSLGAVYFFMNGTYGLAFWYGTSLILNGEPGYTIGTVLAVFFSVIHSSYCIGAAVPHFETFAIARGAAFHIFQVIDKKPSIDNFSTAGYKPESIEGTVEFKNVSFNYPSRPSIKILKGLNLRIKSGETVALVGLNGSGKSTVVQLLQRLYDPDDGFIMVDENDIRALNVRHYRDHIGVVSQEPVLFGTTISNNIKYGRDDVTDEEMERAAREANAYDFIMEFPNKFNTLVGEKGAQMSGGQKQRIAIARALVRNPKILILDEATSALDSESKSAVQAALEKASKGRTTIVVAHRLSTIRSADLIVTLKDGMLAEKGAHAELMAKRGLYYSLVMSQDIKKADEQMESMTYSTERKTNSLPLHSVKSIKSDFIDKAEESTQSKEISLPEVSLLKILKLNKPEWPFVVLGTLASVLNGTVHPVFSIIFAKIITMFGNNDKTTLKHDAEIYSMIFVILGVICFVSYFMQGLFYGRAGEILTMRLRHLAFKAMLYQDIAWFDEKENSTGGLTTILAIDIAQIQGATGSRIGVLTQNATNMGLSVIISFIYGWEMTFLILSIAPVLAVTGMIETAAMTGFANKDKQELKHAGKIATEALENIRTIVSLTREKAFEQMYEEMLQTQHRNTSKKAQIIGSCYAFSHAFIYFAYAAGFRFGAYLIQAGRMTPEGMFIVFTAIAYGAMAIGETLVLAPEYSKAKSGAAHLFALLEKKPNIDSRSQEGKKPDTCEGNLEFREVSFFYPCRPDVFILRGLSLSIERGKTVAFVGSSGCGKSTSVQLLQRLYDPVQGQVLFDGVDAKELNVQWLRSQIAIVPQEPVLFNCSIAENIAYGDNSRVVPLDEIKEAANAANIHSFIEGLPEKYNTQVGLKGAQLSGGQKQRLAIARALLQKPKILLLDEATSALDNDSEKVVQHALDKARTGRTCLVVTHRLSAIQNADLIVVLHNGKIKEQGTHQELLRNRDIYFKLVNAQSVQ*',
            'NM_178559':
            'MVDENDIRALNVRHYRDHIGVVSQEPVLFGTTISNNIKYGRDDVTDEEMERAAREANAYDFIMEFPNKFNTLVGEKGAQMSGGQKQRIAIARALVRNPKILILDEATSALDSESKSAVQAALEKASKGRTTIVVAHRLSTIRSADLIVTLKDGMLAEKGAHAELMAKRGLYYSLVMSQDIKKADEQMESMTYSTERKTNSLPLHSVKSIKSDFIDKAEESTQSKEISLPEVSLLKILKLNKPEWPFVVLGTLASVLNGTVHPVFSIIFAKIITMFGNNDKTTLKHDAEIYSMIFVILGVICFVSYFMQGLFYGRAGEILTMRLRHLAFKAMLYQDIAWFDEKENSTGGLTTILAIDIAQIQGATGSRIGVLTQNATNMGLSVIISFIYGWEMTFLILSIAPVLAVTGMIETAAMTGFANKDKQELKHAGKIATEALENIRTIVSLTREKAFEQMYEEMLQTQHRNTSKKAQIIGSCYAFSHAFIYFAYAAGFRFGAYLIQAGRMTPEGMFIVFTAIAYGAMAIGETLVLAPEYSKAKSGAAHLFALLEKKPNIDSRSQEGKKPDTCEGNLEFREVSFFYPCRPDVFILRGLSLSIERGKTVAFVGSSGCGKSTSVQLLQRLYDPVQGQVLFDGVDAKELNVQWLRSQIAIVPQEPVLFNCSIAENIAYGDNSRVVPLDEIKEAANAANIHSFIEGLPEKYNTQVGLKGAQLSGGQKQRLAIARALLQKPKILLLDEATSALDNDSEKVVQHALDKARTGRTCLVVTHRLSAIQNADLIVVLHNGKIKEQGTHQELLRNRDIYFKLVNAQSVQ*',
            'NM_005514':
            'MLVMAPRTVLLLLSAALALTETWAGSHSMRYFYTSVSRPGRGEPRFISVGYVDDTQFVRFDSDAASPREEPRAPWIEQEGPEYWDRNTQIYKAQAQTDRESLRNLRGYYNQSEAGSHTLQSMYGCDVGPDGRLLRGHDQYAYDGKDYIALNEDLRSWTAADTAAQITQRKWEAAREAEQRRAYLEGECVEWLRRYLENGKDKLERADPPKTHVTHHPISDHEATLRCWALGFYPAEITLTWQRDGEDQTQDTELVETRPAGDRTFQKWAAVVVPSGEEQRYTCHVQHEGLPKPLTLRWEPSSQSTVPIVGIVAGLAVLAVVVIGAVVAAVMCRRKSSGGKGGSYSQAACSDSAQGSDVSLTA'
        }

        # P01891 is not mappable to refseq, we should be warned about that
        proteins = create_test_proteins(
            ['NM_001163941', 'NM_005514', 'NM_178559'])

        for refseq in sequence_by_refseq:
            proteins[refseq].sequence = sequence_by_refseq[refseq]

        db.session.add_all(proteins.values())

        # Add gene to test cross-isoform mapping
        gene_with_two_isoforms = gene_from_isoforms(
            proteins, ['NM_001163941', 'NM_178559'])
        db.session.add(gene_with_two_isoforms)

        canonical = make_named_gz_file(GlycosylationData.canonical)
        alternative = make_named_gz_file(GlycosylationData.alternative)
        mappings = make_named_gz_file(GlycosylationData.mappings)
        importer = GlycosylationUniprotImporter(canonical, alternative, mappings)

        assert len(importer.mappings) == 3

        sites_path = make_named_temp_file(GlycosylationData.sites)
        loaded = importer.load_sites(path=sites_path)

        # should have 2 pre-defined sites (3 but one without refseq equivalent) and one mapped (isoform NM_178559)
        assert len(loaded) == 2 + 1

        db.session.add_all(loaded)
        db.session.flush()

        site_by_refseq = {}
        for site in loaded:
            site_by_refseq[site.protein.refseq] = site

        # the pre-defined and the mapped site have to share the residue
        predefined = site_by_refseq['NM_001163941']
        mapped = site_by_refseq['NM_178559']
        assert predefined.residue == mapped.residue == 'N'
Beispiel #20
0
    def test_browse_list(self):
        """Create the 'TCGA' gene list and check both the list page and its data endpoint."""

        from miscellaneous import make_named_temp_file
        from test_imports.test_gene_list import raw_gene_list
        from imports.protein_data import active_driver_gene_lists as load_active_driver_gene_lists

        list_path = make_named_temp_file(raw_gene_list)

        # create gene list and genes
        with self.app.app_context():
            from imports.protein_data import ListData
            tcga = ListData(name='TCGA', path=list_path, mutations_source=TCGAMutation)
            loaded_lists = load_active_driver_gene_lists(lists=(tcga, ))
        db.session.add_all(loaded_lists)

        # create preferred isoforms for genes
        for number, gene in enumerate(Gene.query.all()):
            # at least one mutation is required for gene on a gene list to be displayed
            mutation = Mutation()
            MC3Mutation(mutation=mutation)

            isoform = Protein(refseq='NM_000{}'.format(number), mutations=[mutation])
            gene.isoforms = [isoform]
            gene.preferred_isoform = isoform

        # check the static template
        page = self.client.get('/gene/list/TCGA')
        assert page.status_code == 200
        assert b'TCGA' in page.data

        # check the dynamic data
        data = self.client.get('/gene/list_data/TCGA?order=asc')
        assert data.status_code == 200

        stored_list = GeneList.query.filter_by(name='TCGA').one()

        # all results retrieved
        assert data.json['total'] == len(stored_list.entries)

        # properly sorted by fdr
        returned_fdrs = []
        for row in data.json['rows']:
            returned_fdrs.append(row['fdr'])
        assert returned_fdrs == sorted(returned_fdrs)
    def test_conservation(self):
        """Load conservation scores from a bigWig file and check the per-protein strings.

        NM_002749 has data in the test bigWig file; NM_000600 has none: its
        conservation is None after the import and '' when queried after the
        commit.
        """

        proteins = create_test_proteins(['NM_002749', 'NM_000600'])
        proteins['NM_002749'].gene = Gene(name='MAPK7', chrom='17')
        proteins['NM_000600'].gene = Gene(name='IL6', chrom='7')

        for refseq, protein in proteins.items():
            protein.sequence = sequences_for_proteins[refseq]

        # Generated with:
        #
        #     import pyBigWig
        #     test_data = test_data.loc[('chr17', 'NM_002749')].iloc[0]
        #     full_bw = pyBigWig.open('data/hg19.100way.phyloP100way.bw')
        #     test_bw = pyBigWig.open('chr17_NM_002749.test_data.bw', 'w')
        #     needed_chrom_length = max(max(test_data.exonStarts), max(test_data.exonEnds))
        #     test_bw.addHeader([('chr17', needed_chrom_length)])
        #     coordinates = sorted(zip(test_data.exonStarts, test_data.exonEnds), key=lambda x: x[0])
        #     for start, end in zip(test_data.exonStarts, test_data.exonEnds):
        #         values = full_bw.values('chr17', start, end)
        #         for i, value in enumerate(values):
        #             test_bw.addEntries(['chr17'], starts=[start + i], ends=[start + i + 1], values=[value])
        #     test_bw.close()
        conservation_big_wig = 'tests/test_imports/chr17_NM_002749.test_data.bw'

        gene_coordinates = make_named_temp_file(coordinates_data)
        load_conservation(conservation_big_wig, gene_coordinates)

        # Semicolon-separated scores; the literal is split into adjacent
        # string literals (implicit concatenation), so that no physical
        # line break can end up inside of the string.
        expected_conservation = (
            '3.47;1.67;2.95;1.23;.9;1.64;4.08;1.72;1.25;1.15;2.26;1.69;1.03;1.05;2.39;1.52;2.88;.5;2.28;-.09;'
            '2.24;1.32;-.06;.59;1.2;-.13;.37;.76;.04;1.26;-.2;1.84;.97;2.95;5.67;3.98;2.91;4.5;2.33;3.17;'
            '4.66;3.73;4.24;4.01;2.49;4.35;5.29;2.95;4.67;5.66;5.36;5.25;4.23;4.87;5.31;5.18;3.52;4.53;5.59;3.55;'
            '5.05;6.65;2.01;5.75;5.06;4.88;5.9;4.9;5.63;3.32;3.52;5.09;4.04;4.24;2.24;1.84;2.13;6.12;6.54;2.75;'
            '6.08;5.79;5.7;8.26;6.81;5.82;3.83;4.47;3.08;5.11;5;3.4;2.52;4.67;3.97;3.58;4.87;3.72;5.33;4.48;'
            '3.92;6.79;3.7;7.52;5.46;4.05;3.07;4.02;5.47;5.31;5.12;7.11;6.59;6.09;5.37;4.73;5.6;6.3;6.52;5.96;'
            '3.32;2.84;4.06;.53;3.5;3.79;3.11;1.81;4.51;4.81;3.35;4.29;5.34;5.31;4.63;6.41;2.39;6.46;3.16;8.2;'
            '6.23;2.67;6.63;2.39;4.07;5;3.49;4.71;4.16;3.27;.7;6.43;1.1;3.25;2.31;1.57;6.39;5.62;2.68;4.33;'
            '6.16;6;3.28;6.16;6.66;3.66;3.77;4.11;6.52;3.69;6.55;5.97;5.28;5.9;3.63;6.16;5.36;5.92;4.24;6.23;'
            '4.27;7.07;5.64;6.62;5.32;1.79;6.59;3.26;4.38;3.92;6.2;3.59;5.22;5.76;6.77;4.58;7.52;4.24;3.91;5.75;'
            '5.28;6.51;7.43;5.49;2.98;5.48;2.01;2.62;2.19;1.95;1.01;2.03;5.55;1.47;2.2;2.38;5.52;5.93;3.39;6.62;'
            '5.51;6.67;5.67;3.9;6.03;8.99;6.67;4.17;4.7;4.01;6.27;3.7;5.06;3.66;2.06;2.64;3.76;2.56;5.94;3.82;'
            '2.89;4.05;5.17;6.75;3.05;8.99;4.25;5.57;6.66;6.22;6.06;3.65;4.13;6.66;8.76;2.77;2.34;1.65;1.84;3.91;'
            '3.38;5.3;3.49;6.22;3.21;3.91;5.43;2.8;4.28;6.44;3.25;3.89;2.56;5.75;4.58;1.02;5.09;3.53;5.75;5;'
            '3.97;3.24;2.08;.17;3.51;2.96;1.66;2.99;3.71;4.75;3.78;5.8;3.71;5.3;4.16;3.18;4.66;3.56;6.62;4.69;'
            '2.96;4.27;3.1;3.49;2.38;2.54;3.18;4.54;5.17;3.34;.46;4.2;3.77;2.53;1.27;3.6;4.46;.83;1.58;4.57;'
            '2.92;2.17;2.81;3.17;1.48;1.13;8.45;2.85;1.15;4.09;5.67;3.85;.82;1.15;3.3;3.95;.21;4.67;1.19;2.83;'
            '5.43;3.54;.76;5.35;3.81;3.85;3.23;2.46;5.65;5.76;3.97;6.32;3.35;2.86;5.95;6.63;4.5;2.89;5.45;3.95;'
            '4.59;4.34;5.09;4.99;5.24;4.02;4.91;5.83;1.59;2.45;1.71;2.51;2.87;2.93;6;3.07;4.58;5.09;5.26;4.44;'
            '4.06;3.28;2.1;6.69;4.55;2.58;5.57;4.79;3.75;.58;1.99;2.45;4.16;3.64;1.55;3.8;2.09;3.71;4.36;2.27;'
            '4.31;4.26;3.43;1.01;.44;1.57;1.7;2.68;1.46;1.5;1.7;.7;.92;1.94;1.42;2.39;.76;3.24;3.14;1.01;'
            '2.85;1.17;1.48;1.22;.67;.8;.75;3.02;1.12;1.34;4.82;3.73;1.26;.92;-.86;1.13;.65;1.18;.06;-.67;'
            '.8;.2;.41;.35;.05;.69;1.11;1.09;.06;1.29;2.34;.9;-.06;.36;1.24;-.18;.06;-.14;.87;.86;'
            '2.37;1.38;.52;1.97;.97;4.2;1.48;4.38;5.02;3.81;4.14;3.29;4.98;4.21;4.73;5.01;2.84;4.03;4.01;4.89;'
            '4.64;2.37;1.66;2.52;5.89;3.6;1.53;1.1;2.19;1.25;1;2.33;2.53;1.89;2.01;1;.84;-.21;.93;2.4;'
            '2.06;1.91;4.48;.3;3.03;2.21;2.16;4.76;4.08;4.42;4.07;5.72;3.13;3.52;2.69;5.1;3.98;3.99;5.24;5.81;'
            '2.02;1.75;3.18;2.96;4.36;4.26;3.2;3.71;1.85;4.88;2.93;4.94;4.22;2.17;1.67;2.44;4.68;1.22;3.37;1.68;'
            '1.73;2.69;1.65;.75;-.94;1.27;.11;-.79;3;1.26;.57;1.71;.45;2.93;1.48;1.67;2.62;3.54;.95;3.03;'
            '2.33;2.5;3.92;2.41;3.3;2.08;2.16;.87;1.26;3.18;3.02;4.04;1.48;.78;4.1;2.06;.59;1.89;.25;.44;'
            '-.23;-.57;-.45;.33;-.12;.22;-1.71;.1;.63;.58;.04;-.21;.5;-2.18;.06;.06;-.46;-.39;-.23;.26;'
            '.36;.42;.86;1.85;.56;.27;-.51;.65;.42;-.35;.43;.93;.05;-.66;.28;.09;.41;-.05;.46;.51;'
            '.02;.67;.11;-.55;.27;.09;.77;-.18;.11;2.34;.99;.1;.78;.52;1.23;.3;.04;.31;-.42;.13;'
            '.78;1.01;-.48;.98;.14;.65;.25;.97;-.67;.8;.49;1.15;.15;-1.05;.52;1.43;.04;.68;-.14;-.15;'
            '.24;.73;1.71;.89;-.05;.68;.53;.1;.52;.81;1.56;.28;.07;2.18;.55;1.24;2.33;.78;-.04;.77;'
            '-.09;.4;.61;.96;1.59;1.57;1.71;1.52;1.41;1.11;1.48;.52;1.61;.76;2.41;.05;.73;-.38;.4;1.78;'
            '.95;.46;2.29;1.05;1.94;1.74;.88;1.32;1.97;1.66;1.64;1.17;2.12;2.38;2.63;2.64;1.63;2.49;3.23;1.49;'
            '4.21;5.62;5.72;5.3;.78;3.01;4.23;3.16;3.53;5.02;3.73;4.38;4.53;4.4;6.09;6.37;5.1;5.65;4.64;5.8;'
            '4.94;5.24;4.63;6.08;4.74;4.27;4.1;5.34;5.96;3.64;2.03;3.29;5.31;2.6;2.34;2.25;4.26;2.1;2.08;2.48;'
            '.6;1.53;.31;1.96;4.53;3.8;3.27;.67;5.5;2.33;4.89;2.31;3.4;4.37;3.71;4.24;2.89;3.27;2.97;5.14;'
            '7.61;2.14;5.73;3.45;4.27;3.13;5.77;4.91;2.21;2.5;4.74;3.57;4.6;3.43;3.41;6.01;3.21;5.22;3.11;6.33;'
            '4.8;4.29;5.23;3.28;6.47;2.48;6.01;3.17;5.92;1.38;1.79;2.96;2.67;2.21;2.67;1.89'
        )

        # Protein.query.filter_by(refseq='NM_002749').one().conservation
        assert proteins['NM_002749'].conservation == expected_conservation

        # no data for this one, lets see if the pipeline handles such cases well
        assert proteins['NM_000600'].conservation is None

        db.session.add_all(proteins.values())
        db.session.commit()

        assert Protein.query.filter_by(
            refseq='NM_000600').one().conservation == ''
Beispiel #22
0
    def test_network_export(self, do_export=None):
        """Export the site-specific network of kinases and targets and compare the file.

        :param do_export: optional callable which receives the output path
            and performs the export; when not given (or falsy), the
            'site_specific_network_of_kinases_and_targets' exporter
            of ProteinRelated is used.
        """

        expected_lines = [
            'kinase symbol\ttarget symbol\tkinase refseq\ttarget refseq\ttarget sequence position\ttarget amino acid\n',
            'Kinase name\tSOMEGENE\tNM_0002\tNM_0001\t1\tA\n'
        ]

        def export_with_manager(path):
            arguments = Namespace(
                exporters=['site_specific_network_of_kinases_and_targets'],
                paths=[path])
            ProteinRelated.export(arguments)

        export = do_export or export_with_manager

        target = make_named_temp_file()

        with self.app.app_context():
            models = create_test_models()
            db.session.add_all(models.values())

            export(target)

        with open(target) as exported:
            assert exported.readlines() == expected_lines
Beispiel #23
0
    def test_domains_hierarchy(self):
        """Load the InterPro hierarchy and check levels and parents of old and new domains."""

        top_level = InterproDomain(accession='IPR000008',
                                   description='C2 domain')
        nested = InterproDomain(accession='IPR033884',
                                description='Calpain C2 domain')

        db.session.add_all([top_level, nested])

        hierarchy_path = make_named_temp_file(domains_hierarchy_data)
        created = domains_hierarchy(hierarchy_path)

        # the two domains added above are not returned as new ones
        assert len(created) == 16 - 2

        assert top_level.level == 0
        assert top_level.parent is None

        assert nested.parent is top_level

        by_accession = {}
        for domain in created:
            by_accession[domain.accession] = domain

        # this domain was already in the database
        assert 'IPR033884' not in by_accession

        # does a new top level domain have a None parent?
        for accession in ('IPR000010', 'IPR000056'):
            assert by_accession[accession].parent is None

        # pairs of: (child, expected parent)
        expected_parents = (
            # had first-level domain assigned correct parent?
            ('IPR025760', 'IPR000010'),
            # is parent assignment working when going one level back?
            ('IPR018090', 'IPR000053'),
            # does going multiple levels higher back work?
            ('IPR026019', 'IPR000056'),
        )

        for child, parent in expected_parents:
            assert by_accession[child].parent is by_accession[parent]
    def test_classification(self):
        """Load the kinase classification and check the resulting groups.

        The following assumptions about the data file hold:
            - 'family' fits better to our 'group' than any other column
            - 'gene.clean', not 'Kinase', is being used as kinase name as it fits much better.
        """

        kinases = {}
        for kinase_name in ('AKT1', 'Akt2', 'CIT'):
            kinases[kinase_name] = Kinase(name=kinase_name)

        groups = {'Akt': KinaseGroup(name='Akt')}

        def register_existing():
            # kinases first, groups afterwards
            for collection in (kinases, groups):
                db.session.add_all(collection.values())

        classification_path = make_named_temp_file(raw_gene_list)

        register_existing()

        with self.app.app_context():
            created_groups = load_kinase_classification(classification_path)

        assert len(created_groups) == 1
        dmpk = created_groups[0]

        assert dmpk.name == 'DMPK'

        register_existing()

        assert len(dmpk.kinases) == 2
        assert kinases['CIT'] in dmpk.kinases

        akt = groups['Akt']
        assert len(akt.kinases) == 3

        for member in ('AKT1', 'Akt2'):
            assert kinases[member] in akt.kinases
Beispiel #25
0
    def test_gene_lists(self):
        """Load a gene list, check its content and make sure that reloading creates no duplicate."""

        list_path = make_named_temp_file(raw_gene_list)

        def load_tcga_list():
            # the list definition is created inside of the application context
            with self.app.app_context():
                definition = ListData(
                    name='TCGA list', path=list_path,
                    mutations_source=TCGAMutation)
                return load_active_driver_gene_lists(lists=(definition, ))

        loaded = load_tcga_list()

        # one gene list returned (TCGA)
        assert len(loaded) == 1

        tcga_list = loaded[0]

        # correct name of gene list
        assert tcga_list.name == 'TCGA list'

        assert tcga_list.mutation_source_name == 'TCGA'

        # let's take some entry
        first_entry = tcga_list.entries[0]

        for value in (first_entry.p, first_entry.fdr):
            assert type(value) is float

        gene_names = []
        for entry in tcga_list.entries:
            gene_names.append(entry.gene.name)

        assert 'TMEM131' not in gene_names
        assert 'PNPT1' in gene_names

        db.session.add_all(loaded)

        reloaded = load_tcga_list()

        # no duplicate lists should be created
        assert not reloaded
Beispiel #26
0
    def test_import(self):
        """Load PhosphoELM sites for a single protein and verify their details.

        Two serine sites (positions 236 and 242) are expected for NM_001010;
        residues, site types, PubMed identifiers and kinase/kinase group
        assignments are checked.

        The importer only receives the canonical/alternative/mappings files
        and the sites file, so no additional on-disk fixtures are created.
        """
        protein = Protein(
            refseq='NM_001010',
            sequence='MKLNISFPATGCQKLIEVDDERKLRTFYEKRMATEVAADALGEEWKGYVVRISGGNDKQGFPMKQGVLTHGRVRLLLSKGHSCYRPRRTGERKRKSVRGCIVDANLSVLNLVIVKKGEKDIPGLTDTTVPRRLGPKRASRIRKLFNLSKEDDVRQYVVRKPLNKEGKKPRTKAPKIQRLVTPRVLQHKRRRIALKKQRTKKNKEEAAEYAKLLAKRMKEAKEKRQEQIAKRRRLSSLRASTSKSESSQK*'
        )

        db.session.add(protein)

        importer = PhosphoELMImporter(
            make_named_gz_file(CANONICAL),
            make_named_gz_file(ALTERNATIVE),
            make_named_gz_file(MAPPINGS)
        )

        sites = importer.load_sites(make_named_temp_file(SITES))

        assert len(sites) == 2

        sites_by_pos = {site.position: site for site in sites}

        assert sites_by_pos[236].residue == sites_by_pos[242].residue == 'S'
        assert sites_by_pos[236].types_names == sites_by_pos[242].types_names == {'phosphorylation'}

        assert sites_by_pos[236].pmid == {17360704}

        # N.N. should be ignored
        assert sites_by_pos[242].pmid == {18669648}

        # only the site at position 236 has a kinase and a kinase group assigned
        assert {kinase.name for kinase in sites_by_pos[236].kinases} == {'P70S6K'}
        assert {group.name for group in sites_by_pos[236].kinase_groups} == {'RSK'}

        assert len(sites_by_pos[242].kinases) == 0
    def test_ptm_muts_of_gene(self):
        """Export MC3 PTM mutations of SOMEGENE with samples and check the file.

        The export is restricted to glycosylation sites and is expected to
        produce a header followed by one row per sample.
        """
        export_path = make_named_temp_file()

        expected_lines = [
            'gene\tisoform\tposition\twt_residue\tmut_residue\tcancer_type\tsample_id\n',
            'SOMEGENE\tNM_0001\t1\tA\tE\tCAN\tSample A\n',
            'SOMEGENE\tNM_0001\t1\tA\tE\tCAN\tSample B\n'
        ]

        with self.app.app_context():
            # the cache is cleared before any test model is created
            from models import clear_cache
            clear_cache()

            models_by_name = create_test_models()
            db.session.add_all(models_by_name.values())

            from exports.protein_data import ptm_muts_of_gene
            ptm_muts_of_gene(
                path_template=export_path,
                mutation_source='mc3',
                gene='SOMEGENE',
                site_type='glycosylation',
                export_samples=True
            )

        with open(export_path) as exported:
            exported_lines = exported.readlines()

        assert exported_lines == expected_lines
Beispiel #28
0
    def test_import(self):
        """Import HPRD sites and verify isoform mapping and kinase assignment.

        Five isoforms are created with sequences (needed for validation of
        sites); two of them (NM_000690 and NM_001204889) share a gene, so that
        a site defined for one of them is expected to be mapped to the other.
        The import must not raise any UserWarning.
        """
        # imported locally: the recorder below needs the module itself,
        # not only the `warn` function used for re-emitting
        import warnings

        proteins = create_test_proteins([
            'NM_000689', 'NM_001997', 'NM_000690', 'NM_001204889', 'NM_000546'
        ])

        # Sequence is needed for validation. Validation is tested on model level.
        sequences = {
            'NM_000689':
            'MSSSGTPDLPVLLTDLKIQYTKIFINNEWHDSVSGKKFPVFNPATEEELCQVEEGDKEDVDKAVKAARQAFQIGSPWRTMDASERGRLLYKLADLIERDRLLLATMESMNGGKLYSNAYLNDLAGCIKTLRYCAGWADKIQGRTIPIDGNFFTYTRHEPIGVCGQIIPWNFPLVMLIWKIGPALSCGNTVVVKPAEQTPLTALHVASLIKEAGFPPGVVNIVPGYGPTAGAAISSHMDIDKVAFTGSTEVGKLIKEAAGKSNLKRVTLELGGKSPCIVLADADLDNAVEFAHHGVFYHQGQCCIAASRIFVEESIYDEFVRRSVERAKKYILGNPLTPGVTQGPQIDKEQYDKILDLIESGKKEGAKLECGGGPWGNKGYFVQPTVFSNVTDEMRIAKEEIFGPVQQIMKFKSLDDVIKRANNTFYGLSAGVFTKDIDKAITISSALQAGTVWVNCYGVVSAQCPFGGFKMSGNGRELGEYGFHEYTEVKTVTVKISQKNS*',
            'NM_001997':
            'MQLFVRAQELHTFEVTGQETVAQIKAHVASLEGIAPEDQVVLLAGAPLEDEATLGQCGVEALTTLEVAGRMLGGKVHGSLARAGKVRGQTPKVAKQEKKKKKTGRAKRRMQYNRRFVNVVPTFGKKKGPNANS*',
            'NM_000690':
            'MLRAAARFGPRLGRRLLSAAATQAVPAPNQQPEVFCNQIFINNEWHDAVSRKTFPTVNPSTGEVICQVAEGDKEDVDKAVKAARAAFQLGSPWRRMDASHRGRLLNRLADLIERDRTYLAALETLDNGKPYVISYLVDLDMVLKCLRYYAGWADKYHGKTIPIDGDFFSYTRHEPVGVCGQIIPWNFPLLMQAWKLGPALATGNVVVMKVAEQTPLTALYVANLIKEAGFPPGVVNIVPGFGPTAGAAIASHEDVDKVAFTGSTEIGRVIQVAAGSSNLKRVTLELGGKSPNIIMSDADMDWAVEQAHFALFFNQGQCCCAGSRTFVQEDIYDEFVERSVARAKSRVVGNPFDSKTEQGPQVDETQFKKILGYINTGKQEGAKLLCGGGIAADRGYFIQPTVFGDVQDGMTIAKEEIFGPVMQILKFKTIEEVVGRANNSTYGLAAAVFTKDLDKANYLSQALQAGTVWVNCYDVFGAQSPFGGYKMSGSGRELGEYGLQAYTEVKTVTVKVPQKNS*',
            'NM_001204889':
            'MLRAAARFGPRLGRRLLSAAATQAVPAPNQQPEVFCNQIFINNEWHDAVSRKTFPTVNPSTGEVICQVAEGDKALETLDNGKPYVISYLVDLDMVLKCLRYYAGWADKYHGKTIPIDGDFFSYTRHEPVGVCGQIIPWNFPLLMQAWKLGPALATGNVVVMKVAEQTPLTALYVANLIKEAGFPPGVVNIVPGFGPTAGAAIASHEDVDKVAFTGSTEIGRVIQVAAGSSNLKRVTLELGGKSPNIIMSDADMDWAVEQAHFALFFNQGQCCCAGSRTFVQEDIYDEFVERSVARAKSRVVGNPFDSKTEQGPQVDETQFKKILGYINTGKQEGAKLLCGGGIAADRGYFIQPTVFGDVQDGMTIAKEEIFGPVMQILKFKTIEEVVGRANNSTYGLAAAVFTKDLDKANYLSQALQAGTVWVNCYDVFGAQSPFGGYKMSGSGRELGEYGLQAYTEVKTVTVKVPQKNS*',
            'NM_000546':
            'MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLSPDDIEQWFTEDPGPDEAPRMPEAAPPVAPAPAAPTPAAPAPAPSWPLSSSVPSQKTYQGSYGFRLGFLHSGTAKSVTCTYSPALNKMFCQLAKTCPVQLWVDSTPPPGTRVRAMAIYKQSQHMTEVVRRCPHHERCSDSDGLAPPQHLIRVEGNLRVEYLDDRNTFRHSVVVPYEPPEVGSDCTTIHYNYMCNSSCMGGMNRRPILTIITLEDSSGNLLGRNSFEVRVCACPGRDRRTEEENLRKKGEPHHELPPGSTKRALPNNTSSSPQPKKKPLDGEYFTLQIRGRERFEMFRELNEALELKDAQAGKEPGGSRAHSSHLKSKKGQSTSRHKKLMFKTEGPDSD*'
        }

        for isoform, sequence in sequences.items():
            proteins[isoform].sequence = sequence

        db.session.add_all(proteins.values())

        # Add gene to test cross-isoform mapping
        aldh2 = gene_from_isoforms(proteins, ['NM_000690', 'NM_001204889'])
        db.session.add(aldh2)

        importer = HPRDImporter(make_named_temp_file(SEQUENCES),
                                make_named_temp_file(MAPPINGS),
                                dir_path='')

        assert len(importer.mappings) == 4

        # Record every warning without requiring that any is emitted;
        # `pytest.warns(None)` used to do that, but it is deprecated since
        # pytest 7 and no longer accepted by pytest 8.
        with warnings.catch_warnings(record=True) as caught_warnings:
            # make sure that no filter hides a warning from the recorder
            warnings.simplefilter('always')
            sites = importer.load_sites(path=make_named_temp_file(SITES))

        # re-emit the recorded warnings so they stay visible in the test report
        for warning in caught_warnings:
            warn(warning.message)
        assert all(warning.category is not UserWarning
                   for warning in caught_warnings)

        # should have 5 pre-defined sites and one mapped (isoform NM_001204889)
        assert len(sites) == 5 + 1

        db.session.add_all(sites)
        db.session.commit()

        # NOTE: for isoforms having more than one site only the last site is
        # kept here; this mapping is used only for the isoforms checked below
        sites_by_isoform = {site.protein.refseq: site for site in sites}

        assert sites_by_isoform['NM_001204889'].residue == sites_by_isoform[
            'NM_000690'].residue == 'S'
        assert sites_by_isoform['NM_000689'].position == 2
        assert len(sites_by_isoform['NM_000689'].kinases) == 0

        tp_53_sites = {
            site.position: site
            for site in sites if site.protein.refseq == 'NM_000546'
        }
        tp53_6s = tp_53_sites[6]

        assert tp53_6s.residue == 'S'
        assert {kinase.name for kinase in tp53_6s.kinases} == {'CSNK1A1'}

        tp53_46s = tp_53_sites[46]
        assert {kinase.name for kinase in tp53_46s.kinases} == {'ATM', 'HIPK2'}

        # the kinase of an imported site should be linked to the site type
        phosphorylation = SiteType.query.filter_by(
            name='phosphorylation').one()
        assert list(tp53_6s.kinases)[0].is_involved_in == {phosphorylation}
Beispiel #29
0
    def test_protein_references(self):
        """Load external references and check what gets attached to proteins.

        Three input files (UniProt id mapping, RefSeq data and reflink data)
        are created, a few proteins are added to the session and
        `load_external_references` is run; the resulting UniProt entries,
        Ensembl peptides, RefSeq protein id and Entrez gene id are verified.
        """

        # the UniProt and reflink fixtures are written through gzip.open
        # in text mode; the RefSeq fixture uses the helper's defaults
        uniprot_filename = make_named_temp_file(data=idmapping_dat,
                                                opener=gzip.open,
                                                mode='wt')
        reflink_filename = make_named_temp_file(data=reflink_data,
                                                opener=gzip.open,
                                                mode='wt',
                                                suffix='.gz')
        refseq_filename = make_named_temp_file(data=refseq_data)

        refseqs = [
            'NM_011739',  # present in reference mappings
            'NM_001131572',  # present
            'NM_201200',  # present
            'NM_0001'  # not present in reference mappings
        ]

        # all the proteins listed above share a single gene
        g = Gene(name='Some gene')
        proteins_we_have = {
            refseq_nm: Protein(refseq=refseq_nm, gene=g)
            for refseq_nm in refseqs
        }

        # a separate gene/protein pair, used for RefSeq NP and Entrez id checks
        tp53 = Gene(name='TP53')
        tp53_protein = Protein(refseq='NM_000546', gene=tp53)

        with self.app.app_context():
            # let's pretend that we already have some proteins in our db
            db.session.add_all(proteins_we_have.values())
            db.session.add(tp53_protein)

            references = load_external_references(uniprot_filename,
                                                  refseq_filename,
                                                  reflink_filename)

            # there are 3 references we would like to have extracted
            assert len(references) == 3

            protein = proteins_we_have['NM_011739']

            # this protein has two UniProt entries: the entry at index 1
            # is expected to be reviewed, the one at index 0 unreviewed
            assert len(protein.external_references.uniprot_entries) == 2
            uniprot_entry = protein.external_references.uniprot_entries[1]
            assert uniprot_entry.accession == 'P68254'
            assert uniprot_entry.isoform == 1
            assert uniprot_entry.reviewed is True

            uniprot_entry = protein.external_references.uniprot_entries[0]
            assert uniprot_entry.reviewed is False

            ensembl_peptides = protein.external_references.ensembl_peptides

            # peptides are compared as a set, so their order does not matter
            assert len(ensembl_peptides) == 2
            assert (set(ensembl.peptide_id
                        for ensembl in ensembl_peptides) == {
                            'ENSMUSP00000106602', 'ENSMUSP00000100067'
                        })

            protein = proteins_we_have['NM_001131572']

            # a single, unreviewed UniProt entry is expected for this protein
            assert len(protein.external_references.uniprot_entries) == 1
            uniprot_entry = protein.external_references.uniprot_entries[0]
            assert uniprot_entry.accession == 'Q5RFJ2'
            assert uniprot_entry.isoform == 1
            assert uniprot_entry.reviewed is False

            # check if protein without references stays clear
            protein = proteins_we_have['NM_0001']

            # it's needed to re-add the protein cause ORM will emit a query
            # (just in case, that's how flask-testing works - any object needs
            # to be re-added to session after its termination)
            db.session.add(protein)

            assert protein.external_references is None

            # check the protein with refseq references and gene with entrez id
            assert tp53_protein.external_references.refseq_np == 'NP_000537'
            assert tp53.entrez_id == 7157