def test_Gene_format_list_of_ids(self):
     """Formatting a list of Gene ids gives one efetch URL with the ids comma-joined.

     The expected URL embeds ``self.email``, read from the test instance.
     """
     gene_ids = ['7347', '50933']
     joined_ids = ','.join(gene_ids)
     expected = [(
         'Gene',
         f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=Gene&id={joined_ids}&retmode=xml&email={self.email}&tool=ccd.rhpc.nki.nl'
     )]
     url_formatter = dba.UrlFormatter()
     actual = url_formatter.format('Gene', gene_ids)
     self.assertEqual(actual, expected)
Ejemplo n.º 2
0
    def test_EMBL_CCDS_RefSeq(self):
        """Run format -> fetch -> split -> parse for EMBL, RefSeq and CCDS ids.

        Queries are built with ``dba.UrlFormatter``, fetched with
        ``dba.Entry_fetcher.fetch_all``, split with ``dba.EntrySplitter`` and
        parsed with ``dba.DnaParser``. Every parsed item must be one of the
        expected ``CodingSequence`` objects, and the number of parsed items
        must equal the number of expected ones.
        """
        # Each expected CodingSequence is built from an identifier, a database
        # name and two Seq objects (presumably the coding DNA and its
        # translation, judging by the alphabets -- TODO confirm against the
        # CodingSequence definition).
        exp = [
            CodingSequence(
                'CR456855', 'EMBL',
                Seq(
                    'ATGGAGGGTCAACGCTGGCTGCCGCTGGAGGCCAATCCCGAGGTCACCAACCAGTTTCTTAAACAATTAGGTCTACATCCTAACTGGCAATTCGTTGATGTATATGGAATGGATCCTGAACTCCTTAGCATGGTACCAAGACCAGTCTGTGCAGTCTTACTTCTCTTTCCTATTACAGAAAAGTATGAAGTATTCAGAACAGAAGAGGAAGAAAAAATAAAATCTCAGGGACAAGATGTTACATCATCAGTATATTTCATGAAGCAAACAATCAGCAATGCCTGTGGAACAATTGGACTGATTCATGCTATTGCAAACAATAAAGACAAGATGCACTTTGAATCTGGATCAACCTTGAAAAAATTCCTGGAGGAATCTGTGTCAATGAGCCCTGAAGAACGAGCCAGATACCTGGAGAACTATGATGCCATCCGAGTTACTCATGAGACCAGTGCCCATGAAGGTCAGACTGAGGCACCAAGTATAGATGAGAAAGTAGATCTTCATTTTATTGCATTAGTTCATGTAGATGGGCATCTCTATGAATTAGATGGGCGGAAGCCATTTCCAATTAACCATGGTGAAACTAGTGATGAAACTTTATTAGAGGATGCCATAGAAGTTTGCAAGAAGTTTATGGAGCGCGACCCTGATGAACTAAGATTTAATGCGATTGCTCTTTCTGCAGCTTAA',
                    IUPACUnambiguousDNA()),
                Seq(
                    'MEGQRWLPLEANPEVTNQFLKQLGLHPNWQFVDVYGMDPELLSMVPRPVCAVLLLFPITEKYEVFRTEEEEKIKSQGQDVTSSVYFMKQTISNACGTIGLIHAIANNKDKMHFESGSTLKKFLEESVSMSPEERARYLENYDAIRVTHETSAHEGQTEAPSIDEKVDLHFIALVHVDGHLYELDGRKPFPINHGETSDETLLEDAIEVCKKFMERDPDELRFNAIALSAA',
                    ExtendedIUPACProtein())),
            CodingSequence(
                'DQ917642', 'EMBL',
                Seq(
                    'ATGACGGGCAATGCCGGGGAGTGGTGCCTCATGGAAAGCGACCCCGGAGTCTTCACCGAGCTCATTAAAGGATTCGGTTGCCGAGGAGCCCAAGTAGAAGAAATATGGAGTTTAGAGCCTGAGAATTTTGAAAAATTAAAGCCAGTTCATGGGCTGATTTTTCTTTTCAAGTGGCAGCCCGGAGAAGAACCAGCAGGCTCTGTGGTTCAGGACTCCCGACTTGACACGATATTTTTTGCCAAGCAGGTAATTAATAATGCTTGTGCTACTCAAGCCATAGTAAGTGTGTTATTGAACTGTACCCATCAGGATGTCCATTTAGGAGAGACATTGTCAGAGTTTAAGGAATTCTCACAAAGTTTTGATGCAGCTATGAAAGGTTTGGCCCTGAGTAATTCGGATGTGATTCGCCAAGTACACAACAGTTTCGCCAGACAGCAAATGTTTGAATTTGATGCAAAGACATCAGCAAAAGAAGAAGATGCTTTTCACTTTGTCAGTTACGTTCCTGTGAATGGAAGACTGTACGAATTAGATGGATTAAGAGAAGGACCGATCGATTTAGGTGCATGCAATCAAGATGACTGGATCAGCGCAGTGAGGCCAGTCATAGAAAAAAGGATACAAAAGTACAGTGAAGGTGAAATTCGATTTAACTTAATGGCCATTGTGTCTGACAGGAAAATGATATATGAACAGAAGATAGCAGAGTTACAAAGACAGCTTGCTGAGGAGGAACCCATGGATACAGATCAGGGTAGTAACATGTTAAGTGCTATTCAGTCAGAAGTTGCCAAAAATCAGATGCTTATTGAAGAAGAAGTACAGAAATTAAAAAGATATAAGATTGAAAACATCAGAAGGAAGCATAATTATCTGCCTTTCATTATGGAACTGTTAAAGACTTTAGCAGAACACCAGCAGTTAATACCTCTCGTAGAAAAGGCAAAAGAAAAACAGAATGCGAAGAAGGCACAGGAAACCAAATGA',
                    IUPACUnambiguousDNA()),
                Seq(
                    'MTGNAGEWCLMESDPGVFTELIKGFGCRGAQVEEIWSLEPENFEKLKPVHGLIFLFKWQPGEEPAGSVVQDSRLDTIFFAKQVINNACATQAIVSVLLNCTHQDVHLGETLSEFKEFSQSFDAAMKGLALSNSDVIRQVHNSFARQQMFEFDAKTSAKEEDAFHFVSYVPVNGRLYELDGLREGPIDLGACNQDDWISAVRPVIEKRIQKYSEGEIRFNLMAIVSDRKMIYEQKIAELQRQLAEEEPMDTDQGSNMLSAIQSEVAKNQMLIEEEVQKLKRYKIENIRRKHNYLPFIMELLKTLAEHQQLIPLVEKAKEKQNAKKAQETK',
                    ExtendedIUPACProtein())),
            CodingSequence(
                'NM_001270952', 'RefSeq',
                Seq(
                    'ATGGATCCTGAACTCCTTAGCATGGTACCAAGACCAGTCTGTGCAGTCTTACTTCTCTTTCCTATTACAGAAAAGTATGAAGTATTCAGAACAGAAGAGGAAGAAAAAATAAAATCTCAGGGACAAGATGTTACATCATCAGTATATTTCATGAAGCAAACAATCAGCAATGCCTGTGGAACAATTGGACTGATTCATGCTATTGCAAACAATAAAGACAAGATGCACTTTGAATCTGGATCAACCTTGAAAAAATTCCTGGAGGAATCTGTGTCAATGAGCCCTGAAGAACGAGCCAGATACCTGGAGAACTATGATGCCATCCGAGTTACTCATGAGACCAGTGCCCATGAAGGTCAGACTGAGGCACCAAGTATAGATGAGAAAGTAGATCTTCATTTTATTGCATTAGTTCATGTAGATGGGCATCTCTATGAATTAGATGGGCGGAAGCCATTTCCAATTAACCATGGTGAAACTAGTGATGAAACTTTATTAGAGGATGCCATAGAAGTTTGCAAGAAGTTTATGGAGCGCGACCCTGATGAACTAAGATTTAATGCGATTGCTCTTTCTGCAGCATAG',
                    IUPACUnambiguousDNA()),
                Seq(
                    'MDPELLSMVPRPVCAVLLLFPITEKYEVFRTEEEEKIKSQGQDVTSSVYFMKQTISNACGTIGLIHAIANNKDKMHFESGSTLKKFLEESVSMSPEERARYLENYDAIRVTHETSAHEGQTEAPSIDEKVDLHFIALVHVDGHLYELDGRKPFPINHGETSDETLLEDAIEVCKKFMERDPDELRFNAIALSAA',
                    ExtendedIUPACProtein())),
            # The two CCDS entries below are built without an explicit alphabet.
            CodingSequence(
                'CCDS73586.1', 'CCDS',
                Seq('ATGGATCCTGAACTCCTTAGCATGGTACCAAGACCAGTCTGTGCAGTCTTACTTCTCTTTCCTATTACAGAAAAGTATGAAGTATTCAGAACAGAAGAGGAAGAAAAAATAAAATCTCAGGGACAAGATGTTACATCATCAGTATATTTCATGAAGCAAACAATCAGCAATGCCTGTGGAACAATTGGACTGATTCATGCTATTGCAAACAATAAAGACAAGATGCACTTTGAATCTGGATCAACCTTGAAAAAATTCCTGGAGGAATCTGTGTCAATGAGCCCTGAAGAACGAGCCAGATACCTGGAGAACTATGATGCCATCCGAGTTACTCATGAGACCAGTGCCCATGAAGGTCAGACTGAGGCACCAAGTATAGATGAGAAAGTAGATCTTCATTTTATTGCATTAGTTCATGTAGATGGGCATCTCTATGAATTAGATGGGCGGAAGCCATTTCCAATTAACCATGGTGAAACTAGTGATGAAACTTTATTAGAGGATGCCATAGAAGTTTGCAAGAAGTTTATGGAGCGCGACCCTGATGAACTAAGATTTAATGCGATTGCTCTTTCTGCAGCATAG'
                    ),
                Seq('MDPELLSMVPRPVCAVLLLFPITEKYEVFRTEEEEKIKSQGQDVTSSVYFMKQTISNACGTIGLIHAIANNKDKMHFESGSTLKKFLEESVSMSPEERARYLENYDAIRVTHETSAHEGQTEAPSIDEKVDLHFIALVHVDGHLYELDGRKPFPINHGETSDETLLEDAIEVCKKFMERDPDELRFNAIALSAA'
                    )),
            CodingSequence(
                'CCDS86041.1', 'CCDS',
                Seq('ATGACGGGCAATGCCGGGGAGTGGTGCCTCATGGAAAGCGACCCCGGGGTCTTCACCGAGCTCATTAAAGGATTCGGTTGCCGAGGAGCCCAAGTAGAAGAAATATGGAGTTTAGAGCCTGAGAATTTTGAAAAATTAAAGCCAGTTCATGGGTTAATTTTTCTTTTCAAGTGGCAGCCAGGAGAAGAACCAGCAGGCTCTGTGGTTCAGGACTCCCGACTTGACACGATATTTTTTGCTAAGCAGGTAATTAATAATGCTTGTGCTACTCAAGCCATAGTGAGTGTGTTACTGAACTGTACCCACCAGGATGTCCATTTAGGCGAGACATTATCAGAGTTTAAAGAATTTTCACAAAGTTTTGATGCAGCTATGAAAGGCTTGGCACTGAGCAATTCAGATGTGATTCGACAAGTACACAACAGTTTCGCCAGACAGCAAATGTTTGAATTTGATACGAAGACATCAGCAAAAGAAGAAGATGCTTTTCACTTTGTCAGTTATGTTCCTGTTAATGGGAGACTGTATGAATTAGATGGATTAAGAGAAGGACCGATTGATTTAGGTGCATGCAATCAAGATGATTGGATCAGTGCAGTAAGGCCTGTCATAGAAAAAAGGATACAAAAAGACGGGTTTTCACCATGTTGCCCAGGCTGGTCTCAGACTCCTGAGCTCAAGCCATCCGCCTGCCTCGACCTCCCAAAGTGGTACAGTGAAGGTGAAATTCGATTTAATTTAATGGCCATTGTGTCTGACAGAAAAATGATATATGAGCAGAAGATAGCAGAGTTACAAAGACAACTTGCAGAGGAACCCATGGATACAGATCAAGGTAATAGTATGTTAAGTGCTATTCAGTCAGAAGTTGCCAAAAATCAGATGCTTATTGAAGAAGAAGTACAGAAATTAAAAAGATACAAGATTGAGAATATCAGAAGGAAGCATAATTATCTGCCTTTCATTATGGAATTGTTAAAGACTTTAGCAGAACACCAGCAGTTAATACCACTAGTAGAAAAGGCAAAAGAAAAACAGAACGCAAAGAAAGCTCAGGAAACCAAATGA'
                    ),
                Seq('MTGNAGEWCLMESDPGVFTELIKGFGCRGAQVEEIWSLEPENFEKLKPVHGLIFLFKWQPGEEPAGSVVQDSRLDTIFFAKQVINNACATQAIVSVLLNCTHQDVHLGETLSEFKEFSQSFDAAMKGLALSNSDVIRQVHNSFARQQMFEFDTKTSAKEEDAFHFVSYVPVNGRLYELDGLREGPIDLGACNQDDWISAVRPVIEKRIQKDGFSPCCPGWSQTPELKPSACLDLPKWYSEGEIRFNLMAIVSDRKMIYEQKIAELQRQLAEEPMDTDQGNSMLSAIQSEVAKNQMLIEEEVQKLKRYKIENIRRKHNYLPFIMELLKTLAEHQQLIPLVEKAKEKQNAKKAQETK'
                    ))
        ]

        # Identifiers to request, grouped by source database.
        ids = {
            'EMBL': ['CR456855.1', 'DQ917642.1'],
            'RefSeq': ['NM_001270952.1'],
            'CCDS': ['CCDS73586.1', 'CCDS86041.1']
        }
        formatter = dba.UrlFormatter()
        # Collect the (database, url) queries for all databases in one list.
        queries = []
        for database, id_list in ids.items():
            queries += formatter.format(database, id_list)
        loop = asyncio.get_event_loop()
        fetcher = dba.Entry_fetcher()
        entries = loop.run_until_complete(fetcher.fetch_all(queries))
        splitter = dba.EntrySplitter()
        entries = splitter.split(entries)
        # NOTE(review): this closes the loop returned by get_event_loop();
        # code that later obtains the same loop object cannot run on it.
        # Confirm no other test depends on that loop after this one.
        loop.close()
        parser = dba.DnaParser()
        res = parser.parse(entries)
        for item in res:
            # assertIn names the unexpected item on failure, unlike
            # assertTrue(item in exp), which only says "False is not true".
            self.assertIn(item, exp)
        self.assertEqual(len(exp), len(res))
 def test_EMBL_format_list_of_ids(self):
     """Formatting a list of EMBL ids gives one nuccore efetch URL with the ids comma-joined.

     The expected URL embeds ``self.email``, read from the test instance.
     """
     accessions = ['CR456855.1', 'GBYX01232236.1']
     joined_ids = ','.join(accessions)
     expected = [(
         'EMBL',
         f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id={joined_ids}&retmode=xml&rettype=gb&email={self.email}&tool=ccd.rhpc.nki.nl'
     )]
     url_formatter = dba.UrlFormatter()
     actual = url_formatter.format('EMBL', accessions)
     self.assertEqual(actual, expected)
 def test_uniprot_no_format(self):
     """A Uniprot request with no format, or an unsupported one, raises ValueError."""
     url_formatter = dba.UrlFormatter()
     # Case 1: the format_ argument is omitted entirely.
     self.assertRaises(ValueError, url_formatter.format, 'Uniprot', 'q9uj41')
     # Case 2: format_ is given but is not a supported value.
     self.assertRaises(
         ValueError,
         url_formatter.format, 'Uniprot', 'q9uj41', format_='notvalid')
 def test_RefSeq_format_list_of_ids(self):
     """Formatting a list of RefSeq ids gives one nuccore efetch URL with the ids comma-joined.

     The expected URL embeds ``self.email``, read from the test instance.
     """
     accessions = ['NM_001270952.1', 'NM_006002.4']
     joined_ids = ','.join(accessions)
     expected = [(
         'RefSeq',
         f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id={joined_ids}&retmode=xml&rettype=gb&email={self.email}&tool=ccd.rhpc.nki.nl'
     )]
     url_formatter = dba.UrlFormatter()
     actual = url_formatter.format('RefSeq', accessions)
     self.assertEqual(actual, expected)
 def test_uniprot_all_formats(self):
     """Each of 'xml', 'fasta' and 'html' gives the matching single-entry Uniprot URL.

     For every format the expected result is one ``('Uniprot', url)`` pair
     whose URL ends in ``<id>.<format>``.
     """
     id_ = 'q9uj41'
     formatter = dba.UrlFormatter()
     for format_ in ['xml', 'fasta', 'html']:
         # subTest reports every failing format separately; a bare loop
         # would stop at the first failure and hide the rest.
         with self.subTest(format_=format_):
             url = f'https://www.uniprot.org/uniprot/{id_}.{format_}'
             exp = [('Uniprot', url)]
             res = formatter.format('Uniprot', id_, format_=format_)
             self.assertEqual(res, exp)
 def test_RefSeq_single_id(self):
     """A single RefSeq id passed as a bare string gives one nuccore efetch URL.

     The expected URL embeds ``self.email``, read from the test instance.
     """
     accession = 'NM_001270952.1'
     expected_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id={accession}&retmode=xml&rettype=gb&email={self.email}&tool=ccd.rhpc.nki.nl'
     expected = [('RefSeq', expected_url)]
     url_formatter = dba.UrlFormatter()
     actual = url_formatter.format('RefSeq', accession)
     self.assertEqual(actual, expected)
 def test_Gene_format_single_id(self):
     """A single Gene id passed as a bare string gives one Gene efetch URL.

     The expected URL embeds ``self.email``, read from the test instance.
     """
     gene_id = '7347'
     expected_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=Gene&id={gene_id}&retmode=xml&email={self.email}&tool=ccd.rhpc.nki.nl'
     expected = [('Gene', expected_url)]
     url_formatter = dba.UrlFormatter()
     actual = url_formatter.format('Gene', gene_id)
     self.assertEqual(actual, expected)
 def test_CCDS_single_id(self):
     """A single CCDS id passed as a bare string gives one CcdsBrowse URL."""
     ccds_id = 'CCDS73586.1'
     expected_url = f'https://www.ncbi.nlm.nih.gov/projects/CCDS/CcdsBrowse.cgi?REQUEST=CCDS&DATA={ccds_id}&ORGANISM=0&BUILDS=CURRENTBUILDS'
     expected = [('CCDS', expected_url)]
     url_formatter = dba.UrlFormatter()
     actual = url_formatter.format('CCDS', ccds_id)
     self.assertEqual(actual, expected)
 def test_CCDS_format_list_of_ids(self):
     """A list of CCDS ids gives one CcdsBrowse URL per id, in input order."""
     ccds_ids = ['CCDS73586.1', 'CCDS86041.1']
     # One ('CCDS', url) pair is expected for each id; the ids are not joined.
     expected = [
         (
             'CCDS',
             f'https://www.ncbi.nlm.nih.gov/projects/CCDS/CcdsBrowse.cgi?REQUEST=CCDS&DATA={ccds_id}&ORGANISM=0&BUILDS=CURRENTBUILDS'
         )
         for ccds_id in ccds_ids
     ]
     url_formatter = dba.UrlFormatter()
     actual = url_formatter.format('CCDS', ccds_ids)
     self.assertEqual(actual, expected)
 def test_correct_functioning(self):
     """Fetch Gene entry 27342 and check the cross-references GeneParser extracts.

     The query is built with ``dba.UrlFormatter``, fetched through
     ``dba.Entry_fetcher.fetch_all`` on the asyncio event loop, parsed as XML
     with BeautifulSoup and handed to ``GeneParser.get_crossreferences``.
     The result must hold exactly the expected CCDS ids, in any order.
     """
     loop = asyncio.get_event_loop()
     parser = dba.GeneParser()
     uf = dba.UrlFormatter()
     fetcher = dba.Entry_fetcher()
     query = uf.format('Gene', '27342')
     res = loop.run_until_complete(fetcher.fetch_all(query))
     # The second element of the first fetched item is parsed as the XML
     # document.
     xml_soup = BeautifulSoup(res[0][1], 'xml')
     exp = ['CCDS69308.1', 'CCDS5535.1', 'CCDS75610.1']
     # list.sort() sorts in place and returns None, so comparing two .sort()
     # results compared None with None and could never fail. sorted()
     # returns the ordered lists, so the contents are actually compared.
     self.assertEqual(sorted(exp),
                      sorted(parser.get_crossreferences(xml_soup)))
 def test_uniprot_batch(self):
     """Each ``batch_<format>`` request for a list of Uniprot ids gives one uploadlists URL.

     The ids appear space-separated in the ``query`` parameter of the expected
     URL, and the plain format name (without the ``batch_`` prefix) appears in
     its ``format`` parameter.
     """
     formatter = dba.UrlFormatter()
     id_list = ['q9uj41', 'Q15287', 'Q96RL1']
     formats = ['xml', 'html', 'fasta']  # the 'batch_' prefix is added below
     for fmt in formats:
         # subTest reports every failing format separately; a bare loop
         # would stop at the first failure and hide the rest.
         with self.subTest(format_=fmt):
             exp = [(
                 'Uniprot',
                 f'https://www.uniprot.org/uploadlists?query=q9uj41 Q15287 Q96RL1&from=ACC+ID&to=ACC&format={fmt}'
             )]
             res = formatter.format('Uniprot', id_list,
                                    format_=f'batch_{fmt}')
             self.assertEqual(res, exp)
 def test_format_for_summary(self):
     """``format_for_summary`` gives an esummary URL for a list of ids and for a single id.

     Both results are tagged 'Summary' and point at the nuccore database; the
     list of ids is comma-joined in the URL.
     """
     embl_db, embl_ids = 'EMBL', ['CR456855.1', 'DQ917642.1']  # several ids
     refseq_db, refseq_id = 'RefSeq', 'NM_001270952.1'  # one bare id
     expected = [
         (
             'Summary',
             'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=nuccore&id=CR456855.1,DQ917642.1'
         ),
         (
             'Summary',
             'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=nuccore&id=NM_001270952.1'
         ),
     ]
     url_formatter = dba.UrlFormatter()
     # The second result is appended in place onto the first.
     actual = url_formatter.format_for_summary(embl_db, embl_ids)
     actual += url_formatter.format_for_summary(refseq_db, refseq_id)
     self.assertEqual(expected, actual)
 def _fetch_data(self, database, id_list):
     """Format a query for ``id_list`` in ``database`` and run the fetch to completion.

     Returns whatever ``dba.Entry_fetcher.fetch_all`` produces once the event
     loop has finished running it.
     """
     event_loop = asyncio.get_event_loop()
     url_formatter = dba.UrlFormatter()
     entry_fetcher = dba.Entry_fetcher()
     pending_fetch = entry_fetcher.fetch_all(
         url_formatter.format(database, id_list))
     return event_loop.run_until_complete(pending_fetch)
 def test_invalid_database(self):
     """An unknown database name raises ValueError from both formatting entry points."""
     url_formatter = dba.UrlFormatter()
     unknown_db = 'NotADatabase'
     # Regular URL formatting.
     self.assertRaises(
         ValueError, url_formatter.format, unknown_db, ['id1', 'id2'])
     # Summary URL formatting.
     self.assertRaises(
         ValueError,
         url_formatter.format_for_summary, unknown_db, ['id1', 'id2'])