def test_EMBL_CCDS_RefSeq(self):
    '''
    End-to-end check of the format -> fetch -> split -> parse pipeline for
    a mixed batch of EMBL, RefSeq and CCDS identifiers.

    Every parsed item must be one of the expected CodingSequence objects,
    and the number of parsed items must equal the number expected.
    '''
    exp = [
        CodingSequence(
            'CR456855', 'EMBL',
            Seq('ATGGAGGGTCAACGCTGGCTGCCGCTGGAGGCCAATCCCGAGGTCACCAACCAGTTTCTTAAACAATTAGGTCTACATCCTAACTGGCAATTCGTTGATGTATATGGAATGGATCCTGAACTCCTTAGCATGGTACCAAGACCAGTCTGTGCAGTCTTACTTCTCTTTCCTATTACAGAAAAGTATGAAGTATTCAGAACAGAAGAGGAAGAAAAAATAAAATCTCAGGGACAAGATGTTACATCATCAGTATATTTCATGAAGCAAACAATCAGCAATGCCTGTGGAACAATTGGACTGATTCATGCTATTGCAAACAATAAAGACAAGATGCACTTTGAATCTGGATCAACCTTGAAAAAATTCCTGGAGGAATCTGTGTCAATGAGCCCTGAAGAACGAGCCAGATACCTGGAGAACTATGATGCCATCCGAGTTACTCATGAGACCAGTGCCCATGAAGGTCAGACTGAGGCACCAAGTATAGATGAGAAAGTAGATCTTCATTTTATTGCATTAGTTCATGTAGATGGGCATCTCTATGAATTAGATGGGCGGAAGCCATTTCCAATTAACCATGGTGAAACTAGTGATGAAACTTTATTAGAGGATGCCATAGAAGTTTGCAAGAAGTTTATGGAGCGCGACCCTGATGAACTAAGATTTAATGCGATTGCTCTTTCTGCAGCTTAA', IUPACUnambiguousDNA()),
            Seq('MEGQRWLPLEANPEVTNQFLKQLGLHPNWQFVDVYGMDPELLSMVPRPVCAVLLLFPITEKYEVFRTEEEEKIKSQGQDVTSSVYFMKQTISNACGTIGLIHAIANNKDKMHFESGSTLKKFLEESVSMSPEERARYLENYDAIRVTHETSAHEGQTEAPSIDEKVDLHFIALVHVDGHLYELDGRKPFPINHGETSDETLLEDAIEVCKKFMERDPDELRFNAIALSAA', ExtendedIUPACProtein())),
        CodingSequence(
            'DQ917642', 'EMBL',
            Seq('ATGACGGGCAATGCCGGGGAGTGGTGCCTCATGGAAAGCGACCCCGGAGTCTTCACCGAGCTCATTAAAGGATTCGGTTGCCGAGGAGCCCAAGTAGAAGAAATATGGAGTTTAGAGCCTGAGAATTTTGAAAAATTAAAGCCAGTTCATGGGCTGATTTTTCTTTTCAAGTGGCAGCCCGGAGAAGAACCAGCAGGCTCTGTGGTTCAGGACTCCCGACTTGACACGATATTTTTTGCCAAGCAGGTAATTAATAATGCTTGTGCTACTCAAGCCATAGTAAGTGTGTTATTGAACTGTACCCATCAGGATGTCCATTTAGGAGAGACATTGTCAGAGTTTAAGGAATTCTCACAAAGTTTTGATGCAGCTATGAAAGGTTTGGCCCTGAGTAATTCGGATGTGATTCGCCAAGTACACAACAGTTTCGCCAGACAGCAAATGTTTGAATTTGATGCAAAGACATCAGCAAAAGAAGAAGATGCTTTTCACTTTGTCAGTTACGTTCCTGTGAATGGAAGACTGTACGAATTAGATGGATTAAGAGAAGGACCGATCGATTTAGGTGCATGCAATCAAGATGACTGGATCAGCGCAGTGAGGCCAGTCATAGAAAAAAGGATACAAAAGTACAGTGAAGGTGAAATTCGATTTAACTTAATGGCCATTGTGTCTGACAGGAAAATGATATATGAACAGAAGATAGCAGAGTTACAAAGACAGCTTGCTGAGGAGGAACCCATGGATACAGATCAGGGTAGTAACATGTTAAGTGCTATTCAGTCAGAAGTTGCCAAAAATCAGATGCTTATTGAAGAAGAAGTACAGAAATTAAAAAGATATAAGATTGAAAACATCAGAAGGAAGCATAATTATCTGCCTTTCATTATGGAACTGTTAAAGACTTTAGCAGAACACCAGCAGTTAATACCTCTCGTAGAAAAGGCAAAAGAAAAACAGAATGCGAAGAAGGCACAGGAAACCAAATGA', IUPACUnambiguousDNA()),
            Seq('MTGNAGEWCLMESDPGVFTELIKGFGCRGAQVEEIWSLEPENFEKLKPVHGLIFLFKWQPGEEPAGSVVQDSRLDTIFFAKQVINNACATQAIVSVLLNCTHQDVHLGETLSEFKEFSQSFDAAMKGLALSNSDVIRQVHNSFARQQMFEFDAKTSAKEEDAFHFVSYVPVNGRLYELDGLREGPIDLGACNQDDWISAVRPVIEKRIQKYSEGEIRFNLMAIVSDRKMIYEQKIAELQRQLAEEEPMDTDQGSNMLSAIQSEVAKNQMLIEEEVQKLKRYKIENIRRKHNYLPFIMELLKTLAEHQQLIPLVEKAKEKQNAKKAQETK', ExtendedIUPACProtein())),
        CodingSequence(
            'NM_001270952', 'RefSeq',
            Seq('ATGGATCCTGAACTCCTTAGCATGGTACCAAGACCAGTCTGTGCAGTCTTACTTCTCTTTCCTATTACAGAAAAGTATGAAGTATTCAGAACAGAAGAGGAAGAAAAAATAAAATCTCAGGGACAAGATGTTACATCATCAGTATATTTCATGAAGCAAACAATCAGCAATGCCTGTGGAACAATTGGACTGATTCATGCTATTGCAAACAATAAAGACAAGATGCACTTTGAATCTGGATCAACCTTGAAAAAATTCCTGGAGGAATCTGTGTCAATGAGCCCTGAAGAACGAGCCAGATACCTGGAGAACTATGATGCCATCCGAGTTACTCATGAGACCAGTGCCCATGAAGGTCAGACTGAGGCACCAAGTATAGATGAGAAAGTAGATCTTCATTTTATTGCATTAGTTCATGTAGATGGGCATCTCTATGAATTAGATGGGCGGAAGCCATTTCCAATTAACCATGGTGAAACTAGTGATGAAACTTTATTAGAGGATGCCATAGAAGTTTGCAAGAAGTTTATGGAGCGCGACCCTGATGAACTAAGATTTAATGCGATTGCTCTTTCTGCAGCATAG', IUPACUnambiguousDNA()),
            Seq('MDPELLSMVPRPVCAVLLLFPITEKYEVFRTEEEEKIKSQGQDVTSSVYFMKQTISNACGTIGLIHAIANNKDKMHFESGSTLKKFLEESVSMSPEERARYLENYDAIRVTHETSAHEGQTEAPSIDEKVDLHFIALVHVDGHLYELDGRKPFPINHGETSDETLLEDAIEVCKKFMERDPDELRFNAIALSAA', ExtendedIUPACProtein())),
        # The CCDS expectations are built without an explicit alphabet.
        CodingSequence(
            'CCDS73586.1', 'CCDS',
            Seq('ATGGATCCTGAACTCCTTAGCATGGTACCAAGACCAGTCTGTGCAGTCTTACTTCTCTTTCCTATTACAGAAAAGTATGAAGTATTCAGAACAGAAGAGGAAGAAAAAATAAAATCTCAGGGACAAGATGTTACATCATCAGTATATTTCATGAAGCAAACAATCAGCAATGCCTGTGGAACAATTGGACTGATTCATGCTATTGCAAACAATAAAGACAAGATGCACTTTGAATCTGGATCAACCTTGAAAAAATTCCTGGAGGAATCTGTGTCAATGAGCCCTGAAGAACGAGCCAGATACCTGGAGAACTATGATGCCATCCGAGTTACTCATGAGACCAGTGCCCATGAAGGTCAGACTGAGGCACCAAGTATAGATGAGAAAGTAGATCTTCATTTTATTGCATTAGTTCATGTAGATGGGCATCTCTATGAATTAGATGGGCGGAAGCCATTTCCAATTAACCATGGTGAAACTAGTGATGAAACTTTATTAGAGGATGCCATAGAAGTTTGCAAGAAGTTTATGGAGCGCGACCCTGATGAACTAAGATTTAATGCGATTGCTCTTTCTGCAGCATAG'),
            Seq('MDPELLSMVPRPVCAVLLLFPITEKYEVFRTEEEEKIKSQGQDVTSSVYFMKQTISNACGTIGLIHAIANNKDKMHFESGSTLKKFLEESVSMSPEERARYLENYDAIRVTHETSAHEGQTEAPSIDEKVDLHFIALVHVDGHLYELDGRKPFPINHGETSDETLLEDAIEVCKKFMERDPDELRFNAIALSAA')),
        CodingSequence(
            'CCDS86041.1', 'CCDS',
            Seq('ATGACGGGCAATGCCGGGGAGTGGTGCCTCATGGAAAGCGACCCCGGGGTCTTCACCGAGCTCATTAAAGGATTCGGTTGCCGAGGAGCCCAAGTAGAAGAAATATGGAGTTTAGAGCCTGAGAATTTTGAAAAATTAAAGCCAGTTCATGGGTTAATTTTTCTTTTCAAGTGGCAGCCAGGAGAAGAACCAGCAGGCTCTGTGGTTCAGGACTCCCGACTTGACACGATATTTTTTGCTAAGCAGGTAATTAATAATGCTTGTGCTACTCAAGCCATAGTGAGTGTGTTACTGAACTGTACCCACCAGGATGTCCATTTAGGCGAGACATTATCAGAGTTTAAAGAATTTTCACAAAGTTTTGATGCAGCTATGAAAGGCTTGGCACTGAGCAATTCAGATGTGATTCGACAAGTACACAACAGTTTCGCCAGACAGCAAATGTTTGAATTTGATACGAAGACATCAGCAAAAGAAGAAGATGCTTTTCACTTTGTCAGTTATGTTCCTGTTAATGGGAGACTGTATGAATTAGATGGATTAAGAGAAGGACCGATTGATTTAGGTGCATGCAATCAAGATGATTGGATCAGTGCAGTAAGGCCTGTCATAGAAAAAAGGATACAAAAAGACGGGTTTTCACCATGTTGCCCAGGCTGGTCTCAGACTCCTGAGCTCAAGCCATCCGCCTGCCTCGACCTCCCAAAGTGGTACAGTGAAGGTGAAATTCGATTTAATTTAATGGCCATTGTGTCTGACAGAAAAATGATATATGAGCAGAAGATAGCAGAGTTACAAAGACAACTTGCAGAGGAACCCATGGATACAGATCAAGGTAATAGTATGTTAAGTGCTATTCAGTCAGAAGTTGCCAAAAATCAGATGCTTATTGAAGAAGAAGTACAGAAATTAAAAAGATACAAGATTGAGAATATCAGAAGGAAGCATAATTATCTGCCTTTCATTATGGAATTGTTAAAGACTTTAGCAGAACACCAGCAGTTAATACCACTAGTAGAAAAGGCAAAAGAAAAACAGAACGCAAAGAAAGCTCAGGAAACCAAATGA'),
            Seq('MTGNAGEWCLMESDPGVFTELIKGFGCRGAQVEEIWSLEPENFEKLKPVHGLIFLFKWQPGEEPAGSVVQDSRLDTIFFAKQVINNACATQAIVSVLLNCTHQDVHLGETLSEFKEFSQSFDAAMKGLALSNSDVIRQVHNSFARQQMFEFDTKTSAKEEDAFHFVSYVPVNGRLYELDGLREGPIDLGACNQDDWISAVRPVIEKRIQKDGFSPCCPGWSQTPELKPSACLDLPKWYSEGEIRFNLMAIVSDRKMIYEQKIAELQRQLAEEPMDTDQGNSMLSAIQSEVAKNQMLIEEEVQKLKRYKIENIRRKHNYLPFIMELLKTLAEHQQLIPLVEKAKEKQNAKKAQETK')),
    ]

    ids = {
        'EMBL': ['CR456855.1', 'DQ917642.1'],
        'RefSeq': ['NM_001270952.1'],
        'CCDS': ['CCDS73586.1', 'CCDS86041.1'],
    }

    # Build one flat list of queries across all databases.
    formatter = dba.UrlFormatter()
    queries = []
    for database, id_list in ids.items():
        queries += formatter.format(database, id_list)

    loop = asyncio.get_event_loop()
    try:
        fetcher = dba.Entry_fetcher()
        entries = loop.run_until_complete(fetcher.fetch_all(queries))
        splitter = dba.EntrySplitter()
        entries = splitter.split(entries)
    finally:
        # Close the loop even when fetching or splitting fails, so a failed
        # run does not leave an open event loop behind.
        loop.close()

    parser = dba.DnaParser()
    res = parser.parse(entries)

    for item in res:
        # assertIn reports the offending item on failure.
        self.assertIn(item, exp)
    self.assertEqual(len(exp), len(res))
def test_bact_non_standard_start_codon(self):
    '''
    The expected EF-Tu coding sequence must be among the coding sequences
    that parse_non_eukaryotes returns for EMBL entry AH002539.
    '''
    database = 'EMBL'
    id_list = ['AH002539']
    accession = id_list[0]

    exp = CodingSequence(
        accession,
        database,
        eftu1_ecoli_dna_seq,
        eftu1_ecoli_protein_seq,
    )

    fetched = self._fetch_data(database, id_list)
    entry = fetched[0][1]
    res = self.parser.parse_non_eukaryotes(database, entry)

    # others will be there - multiple cds, hence a membership check only
    self.assertIn(exp, res)
def test_euk_correct_functioning(self):
    '''
    Parsing EMBL entry AJ250042 must yield exactly the expected
    CodingSequence.
    '''
    database = 'EMBL'
    id_list = ['AJ250042']
    accession = id_list[0]

    exp = CodingSequence(
        accession,
        database,
        q9uj41_2_dna_seq,
        q9uj41_2_protein_seq,
    )

    fetched = self._fetch_data(database, id_list)
    entry = fetched[0][1]
    xml_soup = BeautifulSoup(entry, 'xml')
    res = self.parser.parse(database, xml_soup)

    self.assertEqual(res, exp)
def parse(self, database, xml_soup):
    '''
    Extract the coding sequence of a Genbank entry.

    Input:  database - label stored on the returned CodingSequence
            xml_soup - Beautifulsoup(xml) of Genbank entry
    Output: CodingSequence(entry_id, database, DNA sequence, Protein sequence)

    Only the first CDS feature of the entry is considered.

    Raises NotAnORF if the entry is neither mRNA nor cDNA, if the CDS
    location is partial ('<' / '>'), cannot be parsed, or if the located
    DNA is not ATG...stop with a length divisible by three.
    Raises SequenceNotFoundError if the entry has no features or no CDS.
    NOTE(review): translate(cds=True) presumably raises Biopython's
    TranslationError for an invalid CDS; that is not caught here.
    '''
    moltype = xml_soup.find_all('GBSeq_moltype')[0].text.strip()
    if moltype not in ('mRNA', 'cDNA'):
        raise NotAnORF

    id_ = xml_soup.find_all('GBSeq_locus')[0].text.strip()

    features = xml_soup.find_all('GBFeature')
    if not features:
        # entirely unannotated entry - ultra rare
        raise SequenceNotFoundError

    # Location of the first CDS feature; later CDS features are ignored.
    loc = None
    for f in features:
        if f.GBFeature_key.text.strip() == 'CDS':
            loc = f.find_all('GBFeature_location')[0].text.strip()
            break
    if loc is None:
        # not sure this ever happens
        raise SequenceNotFoundError

    if loc.startswith('join('):
        # sometimes format is join(start..pos2,pos3..stop);
        # if pos2 != pos3, or more intervals are indicated, we ignore this
        # entry. The intervals are parsed in order: deduplicating through
        # a set would lose the ordering needed to tell start from stop.
        intervals = [part.split('..') for part in loc[5:-1].split(',')]
        if len(intervals) != 2 or any(len(iv) != 2 for iv in intervals):
            raise NotAnORF
        (start, pos2), (pos3, stop) = intervals
        if pos2.strip() != pos3.strip():
            raise NotAnORF
    else:
        # mostly feature location is simply encoded as start..stop
        bounds = loc.split('..')
        if len(bounds) != 2:
            raise NotAnORF
        start, stop = bounds

    if '<' in start or '>' in stop:
        # start or stop codon not known
        raise NotAnORF
    try:
        start, stop = int(start), int(stop)
    except ValueError as err:
        # some other way of writing a location (e.g. complement(...))
        raise NotAnORF from err

    dna_seq = xml_soup.GBSeq_sequence.text.strip().upper()
    orf = dna_seq[start-1:stop]  # locations are 1-based and inclusive
    # Explicit checks instead of assert, which is stripped under -O.
    is_orf = (
        orf.startswith('ATG')
        and orf.endswith(('TAA', 'TGA', 'TAG'))
        and len(orf) % 3 == 0
    )
    if not is_orf:
        raise NotAnORF

    cds = Seq(orf, IUPACUnambiguousDNA())
    return CodingSequence(id_, database, cds, cds.translate(cds=True))
def parse(self, html_soup):
    '''
    Extract id, DNA and protein sequence from a CCDS report page.

    Input:  html_soup - Beautifulsoup of the CCDS report html
    Output: CodingSequence(ccds_id, 'CCDS', DNA sequence, Protein sequence)

    Raises AssertionError if the protein sequence on the page is not the
    CDS translation of the DNA sequence on the page.
    NOTE(review): a page without a matching title string fails with
    IndexError on the [0] lookup, as before.
    '''
    # title: Report for CCDS[id].[version] (current version)
    # .[version] is optional.
    # " (current version)" might not be present
    titlematcher = re.compile(
        r'Report for CCDS[0-9]*(?:\.[0-9]*)?(?:\ \(current version\))?')
    title = html_soup.find_all(string=titlematcher)[0]

    # The dot is escaped so that an id without a version does not swallow
    # whatever character follows the digits.
    idmatcher = r'CCDS[0-9]*(?:\.[0-9]*)?'
    id_ = re.search(idmatcher, title).group(0)

    # The sequence residues are spread over <span id="n..."> (nucleotides)
    # and <span id="p..."> (amino acids) elements.
    nucleotides = html_soup.find_all('span', {'id': re.compile('n[0-9]+')})
    aminoacids = html_soup.find_all('span', {'id': re.compile('p[0-9]+')})

    dna_seq = Seq(''.join(nt.text for nt in nucleotides),
                  IUPACUnambiguousDNA())
    aa_seq = Seq(''.join(aa.text for aa in aminoacids),
                 ExtendedIUPACProtein())

    # Raised explicitly (same exception type as the former assert) so the
    # check also runs when Python is started with -O.
    if not (aa_seq == dna_seq.translate(cds=True)):
        raise AssertionError(
            'protein sequence does not match translated DNA for ' + id_)

    return CodingSequence(id_, 'CCDS', dna_seq, aa_seq)
def parse_non_eukaryotes(self, database, xml):
    '''
    Extract all translatable coding sequences of a Genbank xml entry.

    Input:  database - label stored on every returned CodingSequence
            xml      - Genbank entry as xml text
    Output: list of CodingSequence, one per usable CDS feature

    CDS features with partial ('<' / '>') or unparseable locations, or
    whose DNA is not a valid CDS under codon table 11, are skipped.

    Raises SequenceNotFoundError if the entry has no features, or has a
    CDS feature but no sequence.
    Raises NotAnORF if no CDS feature yields a coding sequence.
    '''
    xml_soup = BeautifulSoup(xml, 'xml')
    id_ = str(xml_soup.find_all('GBSeq_primary-accession')[0].text).strip()

    features = xml_soup.find_all('GBFeature')
    if not features:
        # entirely unannotated entry - ultra rare
        raise SequenceNotFoundError

    # unlike eukaryotes, there are usually multiple CDS per entry, and they
    # might be on complementary strands...
    # Both strands are built at most once, on first use, instead of once
    # per CDS feature.
    sense_strand = None
    antisense_strand = None
    coding_sequences = []

    for f in features:
        if f.GBFeature_key.text.strip() != 'CDS':
            continue

        if sense_strand is None:
            try:
                sense_strand = Seq(xml_soup.GBSeq_sequence.text.strip().upper(),
                                   IUPACUnambiguousDNA())
            except AttributeError:
                # entry does not actually have a normal sequence
                # (e.g. HOPD_ECOLX)
                raise SequenceNotFoundError

        loc = f.find_all('GBFeature_location')[0].text
        bounds = loc.split('..')
        if len(bounds) != 2:
            # join(...), single positions, etc.: skip this feature instead
            # of failing the whole entry on the unpack
            continue
        start, stop = bounds
        if '<' in start or '>' in stop:
            # start or stop codon not known
            continue

        on_complement = 'COMPLEMENT(' in start.upper()
        try:
            if on_complement:
                # complement([start]..[stop])
                first = int(start.split('(')[-1])
                last = int(stop.replace(')', ''))
            else:
                first, last = int(start), int(stop)
        except ValueError:
            # some other abstruse way of indicating starts and stops
            continue

        if on_complement:
            if antisense_strand is None:
                antisense_strand = sense_strand.reverse_complement()
            template = antisense_strand
            # remap the 1-based inclusive coordinates onto the reverse
            # complement
            first, last = (len(template) - last + 1,
                           len(template) - first + 1)
        else:
            template = sense_strand

        orf = template[first-1:last]
        try:
            # note that we use bacterial codon table
            protein_seq = orf.translate(table=11, cds=True)
        except TranslationError:
            continue  # not a good CDS
        coding_sequences.append(
            CodingSequence(id_, database, orf, protein_seq))

    if not coding_sequences:
        # not sure this ever happens
        raise NotAnORF
    return coding_sequences