def read_single_with_titles(filename, alphabet):
    """Parse ``filename`` as FASTA and return its one and only record.

    The file is opened in the default text mode and handed to
    ``FastaIterator`` together with ``alphabet`` and the module-level
    ``title_to_ids`` object (passed as the third positional argument).

    Raises StopIteration if the parser yields no record at all, and
    AssertionError if it yields more than one record.
    """
    # The with-block guarantees the handle is closed, even if parsing fails.
    with open(filename) as handle:
        iterator = FastaIterator(handle, alphabet, title_to_ids)
        # The next() builtin works on Python 2.6+ and Python 3 alike.
        record = next(iterator)
        try:
            second = next(iterator)
        except StopIteration:
            # Expected outcome: there is no second record.
            second = None
    assert record is not None and second is None
    return record
def PairedFastaQualIterator(fasta_handle, qual_handle,
                            alphabet=single_letter_alphabet, title2ids=None):
    """Iterate over matched FASTA and QUAL files as SeqRecord objects.

    For example, consider this short QUAL file::

        >EAS54_6_R1_2_1_413_324
        26 26 18 26 26 26 26 26 26 26 26 26 26 26 26 22 26 26 26 26
        26 26 26 23 23
        >EAS54_6_R1_2_1_540_792
        26 26 26 26 26 26 26 26 26 26 26 22 26 26 26 26 26 12 26 26
        26 18 26 23 18
        >EAS54_6_R1_2_1_443_348
        26 26 26 26 26 26 26 26 26 26 26 24 26 22 26 26 13 22 26 18
        24 18 18 18 18

    And a matching FASTA file::

        >EAS54_6_R1_2_1_413_324
        CCCTTCTTGTCTTCAGCGTTTCTCC
        >EAS54_6_R1_2_1_540_792
        TTGGCAGGCCAAGGCCGATGGATCA
        >EAS54_6_R1_2_1_443_348
        GTTGCTTCTGGCGTGGGTGGGGGGG

    You can parse these separately using Bio.SeqIO with the "qual" and
    "fasta" formats, but then you'll get a group of SeqRecord objects with
    no sequence, and a matching group with the sequence but not the
    qualities. Because it only deals with one input file handle, Bio.SeqIO
    can't be used to read the two files together - but this function can!
    For example,

    >>> rec_iter = PairedFastaQualIterator(open("Quality/example.fasta", "rU"),
    ...                                    open("Quality/example.qual", "rU"))
    >>> for record in rec_iter :
    ...     print record.id, record.seq
    EAS54_6_R1_2_1_413_324 CCCTTCTTGTCTTCAGCGTTTCTCC
    EAS54_6_R1_2_1_540_792 TTGGCAGGCCAAGGCCGATGGATCA
    EAS54_6_R1_2_1_443_348 GTTGCTTCTGGCGTGGGTGGGGGGG

    As with the FASTQ or QUAL parsers, if you want to look at the qualities,
    they are in each record's per-letter-annotation dictionary as a simple
    list of integers:

    >>> print record.letter_annotations["phred_quality"]
    [26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 24, 26, 22, 26, 26, 13, 22, 26, 18, 24, 18, 18, 18, 18]

    If you have access to data as a FASTQ format file, using that directly
    would be simpler and more straight forward.
    Note that you can easily use this function to convert paired FASTA and
    QUAL files into FASTQ files:

    >>> from Bio import SeqIO
    >>> rec_iter = PairedFastaQualIterator(open("Quality/example.fasta", "rU"),
    ...                                    open("Quality/example.qual", "rU"))
    >>> out_handle = open("Quality/temp.fastq", "w")
    >>> SeqIO.write(rec_iter, out_handle, "fastq")
    3
    >>> out_handle.close()

    And don't forget to clean up the temp file if you don't need it anymore:

    >>> import os
    >>> os.remove("Quality/temp.fastq")

    Arguments:
     - fasta_handle - handle parsed with FastaIterator.
     - qual_handle - handle parsed with QualPhredIterator.
     - alphabet - passed unchanged to both parsers.
     - title2ids - passed unchanged to both parsers.

    Each record yielded is the one parsed from the FASTA handle, with the
    "phred_quality" list of the corresponding QUAL record stored in its
    letter_annotations dictionary.

    A ValueError is raised if one file runs out of records before the
    other, if the identifiers of a pair of records differ, or if the
    sequence length and the number of quality scores disagree.
    """
    # Function-level import, as in the original code (presumably to avoid
    # a circular import between the SeqIO modules - TODO confirm).
    from Bio.SeqIO.FastaIO import FastaIterator
    fasta_iter = FastaIterator(fasta_handle, alphabet=alphabet,
                               title2ids=title2ids)
    qual_iter = QualPhredIterator(qual_handle, alphabet=alphabet,
                                  title2ids=title2ids)

    #Using zip(...) would create a list loading everything into memory!
    #It would also not catch any extra records found in only one file.
    while True:
        # Advance both parsers in lock step; None marks an exhausted file.
        try:
            f_rec = next(fasta_iter)
        except StopIteration:
            f_rec = None
        try:
            q_rec = next(qual_iter)
        except StopIteration:
            q_rec = None
        if f_rec is None and q_rec is None:
            #End of both files
            break
        if f_rec is None:
            # FASTA ran out first, so the surplus record is in the QUAL file.
            raise ValueError("QUAL file has more entries than the FASTA file.")
        if q_rec is None:
            # QUAL ran out first, so the surplus record is in the FASTA file.
            raise ValueError("FASTA file has more entries than the QUAL file.")
        if f_rec.id != q_rec.id:
            raise ValueError("FASTA and QUAL entries do not match (%s vs %s)."
                             % (f_rec.id, q_rec.id))
        qualities = q_rec.letter_annotations["phred_quality"]
        if len(f_rec) != len(qualities):
            raise ValueError("Sequence length and number of quality scores disagree for %s"
                             % f_rec.id)
        #Merge the data....
        f_rec.letter_annotations["phred_quality"] = qualities
        yield f_rec
def PairedFastaQualIterator(fasta_handle, qual_handle,
                            alphabet=single_letter_alphabet, title2ids=None):
    """Iterate over matched FASTA and QUAL files as SeqRecord objects.

    For example, consider this short QUAL file::

        >EAS54_6_R1_2_1_413_324
        26 26 18 26 26 26 26 26 26 26 26 26 26 26 26 22 26 26 26 26
        26 26 26 23 23
        >EAS54_6_R1_2_1_540_792
        26 26 26 26 26 26 26 26 26 26 26 22 26 26 26 26 26 12 26 26
        26 18 26 23 18
        >EAS54_6_R1_2_1_443_348
        26 26 26 26 26 26 26 26 26 26 26 24 26 22 26 26 13 22 26 18
        24 18 18 18 18

    And a matching FASTA file::

        >EAS54_6_R1_2_1_413_324
        CCCTTCTTGTCTTCAGCGTTTCTCC
        >EAS54_6_R1_2_1_540_792
        TTGGCAGGCCAAGGCCGATGGATCA
        >EAS54_6_R1_2_1_443_348
        GTTGCTTCTGGCGTGGGTGGGGGGG

    You can parse these separately using Bio.SeqIO with the "qual" and
    "fasta" formats, but then you'll get a group of SeqRecord objects with
    no sequence, and a matching group with the sequence but not the
    qualities. Because it only deals with one input file handle, Bio.SeqIO
    can't be used to read the two files together - but this function can!
    For example,

    >>> rec_iter = PairedFastaQualIterator(open("Quality/example.fasta", "rU"),
    ...                                    open("Quality/example.qual", "rU"))
    >>> for record in rec_iter :
    ...     print record.id, record.seq
    EAS54_6_R1_2_1_413_324 CCCTTCTTGTCTTCAGCGTTTCTCC
    EAS54_6_R1_2_1_540_792 TTGGCAGGCCAAGGCCGATGGATCA
    EAS54_6_R1_2_1_443_348 GTTGCTTCTGGCGTGGGTGGGGGGG

    As with the FASTQ or QUAL parsers, if you want to look at the qualities,
    they are in each record's per-letter-annotation dictionary as a simple
    list of integers:

    >>> print record.letter_annotations["phred_quality"]
    [26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 24, 26, 22, 26, 26, 13, 22, 26, 18, 24, 18, 18, 18, 18]

    If you have access to data as a FASTQ format file, using that directly
    would be simpler and more straight forward.
    Note that you can easily use this function to convert paired FASTA and
    QUAL files into FASTQ files:

    >>> from Bio import SeqIO
    >>> rec_iter = PairedFastaQualIterator(open("Quality/example.fasta", "rU"),
    ...                                    open("Quality/example.qual", "rU"))
    >>> out_handle = open("Quality/temp.fastq", "w")
    >>> SeqIO.write(rec_iter, out_handle, "fastq")
    3
    >>> out_handle.close()

    And don't forget to clean up the temp file if you don't need it anymore:

    >>> import os
    >>> os.remove("Quality/temp.fastq")

    Arguments:
     - fasta_handle - handle parsed with FastaIterator.
     - qual_handle - handle parsed with QualPhredIterator.
     - alphabet - passed unchanged to both parsers.
     - title2ids - passed unchanged to both parsers.

    Each record yielded is the one parsed from the FASTA handle, with the
    "phred_quality" list of the corresponding QUAL record stored in its
    letter_annotations dictionary.

    A ValueError is raised if one file runs out of records before the
    other, if the identifiers of a pair of records differ, or if the
    sequence length and the number of quality scores disagree.
    """
    # Function-level import, as in the original code (presumably to avoid
    # a circular import between the SeqIO modules - TODO confirm).
    from Bio.SeqIO.FastaIO import FastaIterator
    fasta_iter = FastaIterator(fasta_handle, alphabet=alphabet,
                               title2ids=title2ids)
    qual_iter = QualPhredIterator(qual_handle, alphabet=alphabet,
                                  title2ids=title2ids)

    #Using zip(...) would create a list loading everything into memory!
    #It would also not catch any extra records found in only one file.
    while True:
        # Advance both parsers in lock step; None marks an exhausted file.
        try:
            f_rec = next(fasta_iter)
        except StopIteration:
            f_rec = None
        try:
            q_rec = next(qual_iter)
        except StopIteration:
            q_rec = None
        if f_rec is None and q_rec is None:
            #End of both files
            break
        if f_rec is None:
            # FASTA ran out first, so the surplus record is in the QUAL file.
            raise ValueError("QUAL file has more entries than the FASTA file.")
        if q_rec is None:
            # QUAL ran out first, so the surplus record is in the FASTA file.
            raise ValueError("FASTA file has more entries than the QUAL file.")
        if f_rec.id != q_rec.id:
            raise ValueError("FASTA and QUAL entries do not match (%s vs %s)."
                             % (f_rec.id, q_rec.id))
        qualities = q_rec.letter_annotations["phred_quality"]
        if len(f_rec) != len(qualities):
            raise ValueError("Sequence length and number of quality scores disagree for %s"
                             % f_rec.id)
        #Merge the data....
        f_rec.letter_annotations["phred_quality"] = qualities
        yield f_rec