예제 #1
0
    def test_EMBL_CCDS_RefSeq(self):
        '''
        End-to-end check: format queries for EMBL, RefSeq and CCDS ids, fetch
        the entries, split them, parse them and compare the parsed coding
        sequences against the expected records below.

        Every parsed item must be one of the expected CodingSequence objects
        and the number of parsed items must equal the number expected.
        NOTE(review): fetching presumably goes over the network - confirm
        before running this in an offline environment.
        '''
        exp = [
            CodingSequence(
                'CR456855', 'EMBL',
                Seq(
                    'ATGGAGGGTCAACGCTGGCTGCCGCTGGAGGCCAATCCCGAGGTCACCAACCAGTTTCTTAAACAATTAGGTCTACATCCTAACTGGCAATTCGTTGATGTATATGGAATGGATCCTGAACTCCTTAGCATGGTACCAAGACCAGTCTGTGCAGTCTTACTTCTCTTTCCTATTACAGAAAAGTATGAAGTATTCAGAACAGAAGAGGAAGAAAAAATAAAATCTCAGGGACAAGATGTTACATCATCAGTATATTTCATGAAGCAAACAATCAGCAATGCCTGTGGAACAATTGGACTGATTCATGCTATTGCAAACAATAAAGACAAGATGCACTTTGAATCTGGATCAACCTTGAAAAAATTCCTGGAGGAATCTGTGTCAATGAGCCCTGAAGAACGAGCCAGATACCTGGAGAACTATGATGCCATCCGAGTTACTCATGAGACCAGTGCCCATGAAGGTCAGACTGAGGCACCAAGTATAGATGAGAAAGTAGATCTTCATTTTATTGCATTAGTTCATGTAGATGGGCATCTCTATGAATTAGATGGGCGGAAGCCATTTCCAATTAACCATGGTGAAACTAGTGATGAAACTTTATTAGAGGATGCCATAGAAGTTTGCAAGAAGTTTATGGAGCGCGACCCTGATGAACTAAGATTTAATGCGATTGCTCTTTCTGCAGCTTAA',
                    IUPACUnambiguousDNA()),
                Seq(
                    'MEGQRWLPLEANPEVTNQFLKQLGLHPNWQFVDVYGMDPELLSMVPRPVCAVLLLFPITEKYEVFRTEEEEKIKSQGQDVTSSVYFMKQTISNACGTIGLIHAIANNKDKMHFESGSTLKKFLEESVSMSPEERARYLENYDAIRVTHETSAHEGQTEAPSIDEKVDLHFIALVHVDGHLYELDGRKPFPINHGETSDETLLEDAIEVCKKFMERDPDELRFNAIALSAA',
                    ExtendedIUPACProtein())),
            CodingSequence(
                'DQ917642', 'EMBL',
                Seq(
                    'ATGACGGGCAATGCCGGGGAGTGGTGCCTCATGGAAAGCGACCCCGGAGTCTTCACCGAGCTCATTAAAGGATTCGGTTGCCGAGGAGCCCAAGTAGAAGAAATATGGAGTTTAGAGCCTGAGAATTTTGAAAAATTAAAGCCAGTTCATGGGCTGATTTTTCTTTTCAAGTGGCAGCCCGGAGAAGAACCAGCAGGCTCTGTGGTTCAGGACTCCCGACTTGACACGATATTTTTTGCCAAGCAGGTAATTAATAATGCTTGTGCTACTCAAGCCATAGTAAGTGTGTTATTGAACTGTACCCATCAGGATGTCCATTTAGGAGAGACATTGTCAGAGTTTAAGGAATTCTCACAAAGTTTTGATGCAGCTATGAAAGGTTTGGCCCTGAGTAATTCGGATGTGATTCGCCAAGTACACAACAGTTTCGCCAGACAGCAAATGTTTGAATTTGATGCAAAGACATCAGCAAAAGAAGAAGATGCTTTTCACTTTGTCAGTTACGTTCCTGTGAATGGAAGACTGTACGAATTAGATGGATTAAGAGAAGGACCGATCGATTTAGGTGCATGCAATCAAGATGACTGGATCAGCGCAGTGAGGCCAGTCATAGAAAAAAGGATACAAAAGTACAGTGAAGGTGAAATTCGATTTAACTTAATGGCCATTGTGTCTGACAGGAAAATGATATATGAACAGAAGATAGCAGAGTTACAAAGACAGCTTGCTGAGGAGGAACCCATGGATACAGATCAGGGTAGTAACATGTTAAGTGCTATTCAGTCAGAAGTTGCCAAAAATCAGATGCTTATTGAAGAAGAAGTACAGAAATTAAAAAGATATAAGATTGAAAACATCAGAAGGAAGCATAATTATCTGCCTTTCATTATGGAACTGTTAAAGACTTTAGCAGAACACCAGCAGTTAATACCTCTCGTAGAAAAGGCAAAAGAAAAACAGAATGCGAAGAAGGCACAGGAAACCAAATGA',
                    IUPACUnambiguousDNA()),
                Seq(
                    'MTGNAGEWCLMESDPGVFTELIKGFGCRGAQVEEIWSLEPENFEKLKPVHGLIFLFKWQPGEEPAGSVVQDSRLDTIFFAKQVINNACATQAIVSVLLNCTHQDVHLGETLSEFKEFSQSFDAAMKGLALSNSDVIRQVHNSFARQQMFEFDAKTSAKEEDAFHFVSYVPVNGRLYELDGLREGPIDLGACNQDDWISAVRPVIEKRIQKYSEGEIRFNLMAIVSDRKMIYEQKIAELQRQLAEEEPMDTDQGSNMLSAIQSEVAKNQMLIEEEVQKLKRYKIENIRRKHNYLPFIMELLKTLAEHQQLIPLVEKAKEKQNAKKAQETK',
                    ExtendedIUPACProtein())),
            CodingSequence(
                'NM_001270952', 'RefSeq',
                Seq(
                    'ATGGATCCTGAACTCCTTAGCATGGTACCAAGACCAGTCTGTGCAGTCTTACTTCTCTTTCCTATTACAGAAAAGTATGAAGTATTCAGAACAGAAGAGGAAGAAAAAATAAAATCTCAGGGACAAGATGTTACATCATCAGTATATTTCATGAAGCAAACAATCAGCAATGCCTGTGGAACAATTGGACTGATTCATGCTATTGCAAACAATAAAGACAAGATGCACTTTGAATCTGGATCAACCTTGAAAAAATTCCTGGAGGAATCTGTGTCAATGAGCCCTGAAGAACGAGCCAGATACCTGGAGAACTATGATGCCATCCGAGTTACTCATGAGACCAGTGCCCATGAAGGTCAGACTGAGGCACCAAGTATAGATGAGAAAGTAGATCTTCATTTTATTGCATTAGTTCATGTAGATGGGCATCTCTATGAATTAGATGGGCGGAAGCCATTTCCAATTAACCATGGTGAAACTAGTGATGAAACTTTATTAGAGGATGCCATAGAAGTTTGCAAGAAGTTTATGGAGCGCGACCCTGATGAACTAAGATTTAATGCGATTGCTCTTTCTGCAGCATAG',
                    IUPACUnambiguousDNA()),
                Seq(
                    'MDPELLSMVPRPVCAVLLLFPITEKYEVFRTEEEEKIKSQGQDVTSSVYFMKQTISNACGTIGLIHAIANNKDKMHFESGSTLKKFLEESVSMSPEERARYLENYDAIRVTHETSAHEGQTEAPSIDEKVDLHFIALVHVDGHLYELDGRKPFPINHGETSDETLLEDAIEVCKKFMERDPDELRFNAIALSAA',
                    ExtendedIUPACProtein())),
            CodingSequence(
                'CCDS73586.1', 'CCDS',
                Seq('ATGGATCCTGAACTCCTTAGCATGGTACCAAGACCAGTCTGTGCAGTCTTACTTCTCTTTCCTATTACAGAAAAGTATGAAGTATTCAGAACAGAAGAGGAAGAAAAAATAAAATCTCAGGGACAAGATGTTACATCATCAGTATATTTCATGAAGCAAACAATCAGCAATGCCTGTGGAACAATTGGACTGATTCATGCTATTGCAAACAATAAAGACAAGATGCACTTTGAATCTGGATCAACCTTGAAAAAATTCCTGGAGGAATCTGTGTCAATGAGCCCTGAAGAACGAGCCAGATACCTGGAGAACTATGATGCCATCCGAGTTACTCATGAGACCAGTGCCCATGAAGGTCAGACTGAGGCACCAAGTATAGATGAGAAAGTAGATCTTCATTTTATTGCATTAGTTCATGTAGATGGGCATCTCTATGAATTAGATGGGCGGAAGCCATTTCCAATTAACCATGGTGAAACTAGTGATGAAACTTTATTAGAGGATGCCATAGAAGTTTGCAAGAAGTTTATGGAGCGCGACCCTGATGAACTAAGATTTAATGCGATTGCTCTTTCTGCAGCATAG'
                    ),
                Seq('MDPELLSMVPRPVCAVLLLFPITEKYEVFRTEEEEKIKSQGQDVTSSVYFMKQTISNACGTIGLIHAIANNKDKMHFESGSTLKKFLEESVSMSPEERARYLENYDAIRVTHETSAHEGQTEAPSIDEKVDLHFIALVHVDGHLYELDGRKPFPINHGETSDETLLEDAIEVCKKFMERDPDELRFNAIALSAA'
                    )),
            CodingSequence(
                'CCDS86041.1', 'CCDS',
                Seq('ATGACGGGCAATGCCGGGGAGTGGTGCCTCATGGAAAGCGACCCCGGGGTCTTCACCGAGCTCATTAAAGGATTCGGTTGCCGAGGAGCCCAAGTAGAAGAAATATGGAGTTTAGAGCCTGAGAATTTTGAAAAATTAAAGCCAGTTCATGGGTTAATTTTTCTTTTCAAGTGGCAGCCAGGAGAAGAACCAGCAGGCTCTGTGGTTCAGGACTCCCGACTTGACACGATATTTTTTGCTAAGCAGGTAATTAATAATGCTTGTGCTACTCAAGCCATAGTGAGTGTGTTACTGAACTGTACCCACCAGGATGTCCATTTAGGCGAGACATTATCAGAGTTTAAAGAATTTTCACAAAGTTTTGATGCAGCTATGAAAGGCTTGGCACTGAGCAATTCAGATGTGATTCGACAAGTACACAACAGTTTCGCCAGACAGCAAATGTTTGAATTTGATACGAAGACATCAGCAAAAGAAGAAGATGCTTTTCACTTTGTCAGTTATGTTCCTGTTAATGGGAGACTGTATGAATTAGATGGATTAAGAGAAGGACCGATTGATTTAGGTGCATGCAATCAAGATGATTGGATCAGTGCAGTAAGGCCTGTCATAGAAAAAAGGATACAAAAAGACGGGTTTTCACCATGTTGCCCAGGCTGGTCTCAGACTCCTGAGCTCAAGCCATCCGCCTGCCTCGACCTCCCAAAGTGGTACAGTGAAGGTGAAATTCGATTTAATTTAATGGCCATTGTGTCTGACAGAAAAATGATATATGAGCAGAAGATAGCAGAGTTACAAAGACAACTTGCAGAGGAACCCATGGATACAGATCAAGGTAATAGTATGTTAAGTGCTATTCAGTCAGAAGTTGCCAAAAATCAGATGCTTATTGAAGAAGAAGTACAGAAATTAAAAAGATACAAGATTGAGAATATCAGAAGGAAGCATAATTATCTGCCTTTCATTATGGAATTGTTAAAGACTTTAGCAGAACACCAGCAGTTAATACCACTAGTAGAAAAGGCAAAAGAAAAACAGAACGCAAAGAAAGCTCAGGAAACCAAATGA'
                    ),
                Seq('MTGNAGEWCLMESDPGVFTELIKGFGCRGAQVEEIWSLEPENFEKLKPVHGLIFLFKWQPGEEPAGSVVQDSRLDTIFFAKQVINNACATQAIVSVLLNCTHQDVHLGETLSEFKEFSQSFDAAMKGLALSNSDVIRQVHNSFARQQMFEFDTKTSAKEEDAFHFVSYVPVNGRLYELDGLREGPIDLGACNQDDWISAVRPVIEKRIQKDGFSPCCPGWSQTPELKPSACLDLPKWYSEGEIRFNLMAIVSDRKMIYEQKIAELQRQLAEEPMDTDQGNSMLSAIQSEVAKNQMLIEEEVQKLKRYKIENIRRKHNYLPFIMELLKTLAEHQQLIPLVEKAKEKQNAKKAQETK'
                    ))
        ]

        ids = {
            'EMBL': ['CR456855.1', 'DQ917642.1'],
            'RefSeq': ['NM_001270952.1'],
            'CCDS': ['CCDS73586.1', 'CCDS86041.1']
        }
        formatter = dba.UrlFormatter()
        queries = []
        for database, id_list in ids.items():
            queries += formatter.format(database, id_list)
        # Use a loop created for this test: the default loop may already have
        # been closed by another test, and this test closes its loop when done.
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        try:
            fetcher = dba.Entry_fetcher()
            entries = loop.run_until_complete(fetcher.fetch_all(queries))
            splitter = dba.EntrySplitter()
            entries = splitter.split(entries)
        finally:
            # close the loop even when fetching or splitting fails
            loop.close()
        parser = dba.DnaParser()
        res = parser.parse(entries)
        for item in res:
            # assertIn reports the offending item when the check fails
            self.assertIn(item, exp)
        self.assertEqual(len(exp), len(res))
 def test_bact_non_standard_start_codon(self):
     '''
     Fetch EMBL entry AH002539, run it through parse_non_eukaryotes and
     check that the expected coding sequence is among the results.

     Going by the test name, the expected CDS uses a non-standard start
     codon; the expected sequences come from eftu1_ecoli_dna_seq and
     eftu1_ecoli_protein_seq defined elsewhere.
     '''
     source_db = 'EMBL'
     accessions = ['AH002539']
     expected_cds = CodingSequence(
         accessions[0],
         source_db,
         eftu1_ecoli_dna_seq,
         eftu1_ecoli_protein_seq,
     )
     fetched = self._fetch_data(source_db, accessions)
     raw_entry = fetched[0][1]
     parsed = self.parser.parse_non_eukaryotes(source_db, raw_entry)
     # membership rather than equality: the entry yields several CDS
     self.assertIn(expected_cds, parsed)
 def test_euk_correct_functioning(self):
     '''
     Fetch EMBL entry AJ250042, parse its XML and check that the parser
     returns exactly the expected coding sequence.

     The expected sequences come from q9uj41_2_dna_seq and
     q9uj41_2_protein_seq defined elsewhere.
     '''
     source_db = 'EMBL'
     accessions = ['AJ250042']
     expected_cds = CodingSequence(
         accessions[0],
         source_db,
         q9uj41_2_dna_seq,
         q9uj41_2_protein_seq,
     )
     fetched = self._fetch_data(source_db, accessions)
     raw_entry = fetched[0][1]
     soup = BeautifulSoup(raw_entry, 'xml')
     parsed = self.parser.parse(source_db, soup)
     self.assertEqual(parsed, expected_cds)
 def parse(self, database, xml_soup):
     '''
     Extract the coding sequence (CDS) annotated in a Genbank XML entry.

     Input:
     database -- database name, passed through unchanged to the result
     xml_soup -- Beautifulsoup(xml) of Genbank entry
     Output:
     CodingSequence(entry_id, database, DNA sequence, Protein sequence),
     built from the first feature whose key is 'CDS'
     Raises NotAnORF if the molecule type is neither mRNA nor cDNA, if the
     CDS location is partial ('<' / '>'), is not a single contiguous
     interval or cannot be read as integers, or if the located DNA does not
     start with ATG, end with a stop codon and have a length divisible by 3
     Raises SequenceNotFoundError if the entry has no features, no CDS
     feature or no sequence element
     '''
     moltype = xml_soup.find_all('GBSeq_moltype')[0].text.strip()
     if moltype not in ('mRNA', 'cDNA'):
         raise NotAnORF
     id_ = xml_soup.find_all('GBSeq_locus')[0].text.strip()
     features = xml_soup.find_all('GBFeature')
     if not features:  # entirely unannotated entry - ultra rare
         raise SequenceNotFoundError
     # only the first CDS feature is considered
     for f in features:
         if f.GBFeature_key.text.strip() == 'CDS':
             loc = f.find_all('GBFeature_location')[0].text
             break
     else:  # no CDS feature at all - not sure this ever happens
         raise SequenceNotFoundError
     if loc.startswith('join('):
         # sometimes the format is join(start..pos2,pos3..stop);
         # it is only accepted when pos2 == pos3, i.e. the two intervals
         # form one contiguous stretch. Positions are kept in their
         # original order so start and stop cannot get swapped.
         positions = []
         for part in loc[5:-1].split('..'):
             positions.extend(part.split(','))
         if len(positions) != 4 or positions[1] != positions[2]:
             raise NotAnORF
         start, stop = positions[0], positions[3]
     else:
         # mostly the feature location is simply encoded as start..stop
         try:
             start, stop = loc.split('..')
         except ValueError as err:  # zero or several '..' in the location
             raise NotAnORF from err
     if '<' in start or '>' in stop:  # start or stop codon not known
         raise NotAnORF
     try:
         start, stop = int(start), int(stop)
     except ValueError as err:  # e.g. complement(...) or other operators
         raise NotAnORF from err
     try:
         dna_seq = xml_soup.GBSeq_sequence.text.strip().upper()
     except AttributeError as err:  # entry carries no sequence element
         raise SequenceNotFoundError from err
     orf = dna_seq[start-1:stop]  # location is 1-based and inclusive
     # explicit checks instead of assert, which is stripped under python -O
     is_orf = (orf.startswith('ATG')
               and orf.endswith(('TAA', 'TGA', 'TAG'))
               and len(orf) % 3 == 0)
     if not is_orf:
         raise NotAnORF
     cds = Seq(orf, IUPACUnambiguousDNA())
     # translate(cds=True) may still reject the sequence; an error raised
     # there is not handled here and propagates to the caller
     return CodingSequence(id_, database, cds, cds.translate(cds=True))
 def parse(self, html_soup):
     '''
     Extract id, DNA sequence and protein sequence from a CCDS report page.

     Input: BeautifulSoup(html) of the CCDS report
     Output:
     CodingSequence(id, 'CCDS', DNA sequence, Protein sequence)
     Raises IndexError if no "Report for CCDS..." title string is found
     Raises AssertionError if the protein sequence on the page differs from
     the translation of the DNA sequence on the page
     '''
     # title: Report for CCDS[id].[version] (current version)
     # .[version] is optional, and so is " (current version)"
     titlematcher = re.compile(r'Report for CCDS[0-9]*(?:\.[0-9]*)?(?:\ \(current version\))?')
     title = html_soup.find_all(string=titlematcher)[0]  # find() does not take kwargs
     # the dot is escaped so that an unversioned id does not swallow the
     # character following it (e.g. the space before "(current version)")
     idmatcher = r'CCDS[0-9]*(?:\.[0-9]+)?'
     id_ = re.search(idmatcher, title).group(0)
     # sequence residues sit in <span> elements with ids n<number> / p<number>
     nucleotides = html_soup.find_all('span', {'id':re.compile('n[0-9]+')})
     aminoacids = html_soup.find_all('span', {'id':re.compile('p[0-9]+')})
     dna_seq = Seq(''.join([nt.text for nt in nucleotides]),
                   IUPACUnambiguousDNA())
     aa_seq = Seq(''.join([aa.text for aa in aminoacids]), ExtendedIUPACProtein())
     # explicit raise instead of assert so the check survives python -O;
     # the exception type is kept for callers that rely on it
     if not (aa_seq == dna_seq.translate(cds=True)):
         raise AssertionError(
             'protein sequence of %s does not match its translated DNA' % id_)
     return CodingSequence(id_, 'CCDS', dna_seq, aa_seq)
 def parse_non_eukaryotes(self, database, xml):
     '''
     Extract every usable coding sequence (CDS) from a Genbank XML entry.

     Input:
     database -- database name, passed through unchanged to the results
     xml -- XML text of the entry (parsed here with BeautifulSoup)
     Output:
     list of CodingSequence(entry_id, database, DNA sequence, Protein
     sequence), one per CDS feature that could be located and translated
     with codon table 11
     Raises SequenceNotFoundError if the entry has no features, or has a
     CDS feature but no sequence element
     Raises NotAnORF if no CDS feature yields a translatable sequence
     '''
     xml_soup = BeautifulSoup(xml, 'xml')
     id_ = str(xml_soup.find_all('GBSeq_primary-accession')[0].text).strip()
     features = xml_soup.find_all('GBFeature')
     if not features:  # entirely unannotated entry - ultra rare
         raise SequenceNotFoundError
     # unlike eukaryotes, there are usually multiple CDS per entry, and they
     # might be on complementary strands...
     # Both strands are built at most once, on first use, instead of once
     # per CDS feature.
     sense_strand = None
     antisense_strand = None
     coding_sequences = []
     for f in features:
         if f.GBFeature_key.text.strip() != 'CDS':
             continue
         if sense_strand is None:
             try:
                 sense_strand = Seq(xml_soup.GBSeq_sequence.text.strip().upper(),
                                    IUPACUnambiguousDNA())
             except AttributeError:  # entry does not actually have a normal sequence (e.g. HOPD_ECOLX)
                 raise SequenceNotFoundError
         loc = f.find_all('GBFeature_location')[0].text
         try:
             start, stop = loc.split('..')
         except ValueError:  # zero or several '..', e.g. a multi-interval join
             continue
         if '<' in start or '>' in stop:  # start or stop codon not known
             continue
         try:
             if 'COMPLEMENT(' in start.upper():  # cds on other strand
                 # complement([start]..[stop])
                 first = int(start.split('(')[-1])
                 last = int(stop.replace(')', ''))
             else:  # cds on sense strand
                 first, last = int(start), int(stop)
                 template = sense_strand
                 start, stop = first, last
         except ValueError:  # some other abstruse way of indicating starts and stops
             continue
         if 'COMPLEMENT(' in start.upper() if isinstance(start, str) else False:
             # reverse complement dna and remap the coordinates onto it
             if antisense_strand is None:
                 antisense_strand = sense_strand.reverse_complement()
             template = antisense_strand
             start = len(template) - last + 1
             stop = len(template) - first + 1
         orf = template[start-1:stop]  # location is 1-based and inclusive
         try:
             protein_seq = orf.translate(table=11, cds=True)  # note that we use bacterial codon table
         except TranslationError:
             continue  # not a good CDS
         coding_sequences.append(CodingSequence(id_, database, orf, protein_seq))
     if not coding_sequences:  # not sure this ever happens
         raise NotAnORF
     return coding_sequences