def sff_filter(in_file, out_file, iterator_filter, inter):
    """Filter the reads of one SFF file into a new SFF file.

    Arguments:
     - in_file - path of the SFF file to read (opened in binary mode).
     - out_file - path of the SFF file to write (opened in binary mode).
     - iterator_filter - callable applied to the record iterator; whatever
       it returns is what gets written.  When ``inter`` is true it is given
       ``pair(SffIterator(...))`` instead and its output is flattened with
       ``itertools.chain.from_iterable`` before writing.
     - inter - true to treat the input as interlaced (see above).

    Returns the value reported by ``SffWriter.write_file``; when ``inter``
    is true that value is halved (as an integer) so it counts pairs.

    Raises AssertionError if an odd number of records is written in
    interlaced mode.
    """
    try:
        from Bio.SeqIO.SffIO import ReadRocheXmlManifest
    except ImportError:
        # Prior to Biopython 1.56 this was a private function
        from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest
    with open(in_file, "rb") as in_handle:
        try:
            manifest = ReadRocheXmlManifest(in_handle)
        except ValueError:
            # No usable manifest; write the output without one.
            manifest = None
        # Start again after getting the manifest (one rewind is enough,
        # nothing below touches in_handle before the iterator does).
        in_handle.seek(0)
        with open(out_file, "wb") as out_handle:
            writer = SffWriter(out_handle, xml=manifest)
            if inter:
                from itertools import chain

                count = writer.write_file(
                    chain.from_iterable(
                        iterator_filter(pair(SffIterator(in_handle)))
                    )
                )
                assert count % 2 == 0, "Odd number of records? %i" % count
                # Floor division: "/=" would turn the count into a float
                # under Python 3's true division.
                count //= 2
            else:
                count = writer.write_file(
                    iterator_filter(SffIterator(in_handle))
                )
    return count
def test_read(self):
    """Compare SFF records (raw and trimmed) with the FASTA and QUAL files."""
    sff_path = "Roche/E3MFGYR02_random_10_reads.sff"
    with open(sff_path, "rb") as handle:
        sff = list(SffIterator(handle))
    with open(sff_path, "rb") as handle:
        sff_trim = list(SffIterator(handle, trim=True))

    # Reference sequences and qualities, parsed in the same order as before.
    fasta_no_trim = list(
        SeqIO.parse("Roche/E3MFGYR02_random_10_reads_no_trim.fasta", "fasta")
    )
    qual_no_trim = list(
        SeqIO.parse("Roche/E3MFGYR02_random_10_reads_no_trim.qual", "qual")
    )
    fasta_trim = list(SeqIO.parse("Roche/E3MFGYR02_random_10_reads.fasta", "fasta"))
    qual_trim = list(SeqIO.parse("Roche/E3MFGYR02_random_10_reads.qual", "qual"))

    rows = zip(sff, sff_trim, fasta_no_trim, qual_no_trim, fasta_trim, qual_trim)
    for raw, trimmed, raw_fasta, raw_qual, trim_fasta, trim_qual in rows:
        # Untrimmed record: one shared id, same sequence, same qualities.
        raw_ids = {raw.id, raw_fasta.id, raw_qual.id}
        self.assertEqual(len(raw_ids), 1)
        self.assertEqual(raw.seq, raw_fasta.seq)
        self.assertEqual(
            raw.letter_annotations["phred_quality"],
            raw_qual.letter_annotations["phred_quality"],
        )
        # Trimmed record: id also matches the untrimmed one.
        trimmed_ids = {raw.id, trimmed.id, trim_fasta.id, trim_qual.id}
        self.assertEqual(len(trimmed_ids), 1)
        self.assertEqual(trimmed.seq, trim_fasta.seq)
        self.assertEqual(
            trimmed.letter_annotations["phred_quality"],
            trim_qual.letter_annotations["phred_quality"],
        )
def test_both_ways(self):
    """Check the Roche index and the slow index agree with each other.

    The record count from ``SffIterator`` must match too, both from the
    file handle and from an in-memory ``BytesIO`` copy.  On non-Windows
    Python 2 the same checks are repeated with text-mode handles.
    """
    filename = "Roche/E3MFGYR02_random_10_reads.sff"

    with open(filename, "rb") as handle:
        index1 = sorted(_sff_read_roche_index(handle))
    with open(filename, "rb") as handle:
        index2 = sorted(_sff_do_slow_index(handle))
    self.assertEqual(index1, index2)

    with open(filename, "rb") as handle:
        records = list(SffIterator(handle))
        self.assertEqual(len(index1), len(records))
    with open(filename, "rb") as handle:
        in_memory = BytesIO(handle.read())
        self.assertEqual(len(index1), len(list(SffIterator(in_memory))))

    # The text-mode checks only apply to Python 2 away from Windows.
    if sys.platform == "win32" or sys.version_info[0] >= 3:
        return

    # Can be lazy and treat as binary...
    with open(filename, "r") as handle:
        records = list(SffIterator(handle))
        self.assertEqual(len(index1), len(records))
    with open(filename) as handle:
        index2 = sorted(_sff_read_roche_index(handle))
    self.assertEqual(index1, index2)
    with open(filename, "r") as handle:
        index2 = sorted(_sff_do_slow_index(handle))
    self.assertEqual(index1, index2)
    with open(filename, "r") as handle:
        records = list(SffIterator(handle))
        self.assertEqual(len(index1), len(records))
    with open(filename, "r") as handle:
        in_memory = BytesIO(handle.read())
        self.assertEqual(len(index1), len(list(SffIterator(in_memory))))
def test_both_ways(self):
    """Check the Roche index and the slow index agree with each other.

    The number of index entries must also equal the number of records
    yielded by ``SffIterator``, read from the file and from a ``BytesIO``.
    """
    filename = "Roche/E3MFGYR02_random_10_reads.sff"

    with open(filename, "rb") as handle:
        index1 = sorted(_sff_read_roche_index(handle))
    with open(filename, "rb") as handle:
        index2 = sorted(_sff_do_slow_index(handle))
    self.assertEqual(index1, index2)

    with open(filename, "rb") as handle:
        records = list(SffIterator(handle))
        self.assertEqual(len(index1), len(records))
    with open(filename, "rb") as handle:
        in_memory = BytesIO(handle.read())
        self.assertEqual(len(index1), len(list(SffIterator(in_memory))))
def sff_filter(in_file, pos_file, neg_file, wanted):
    """Split an SFF file into reads whose cleaned id is, or is not, wanted.

    Arguments:
     - in_file - path of the SFF file to read (opened in binary mode).
     - pos_file - output path for records where ``clean_name(rec.id)`` is
       in ``wanted``, or None to skip this output.
     - neg_file - output path for records where ``clean_name(rec.id)`` is
       not in ``wanted``, or None to skip this output.
     - wanted - container of identifiers tested with ``in``.

    Returns a tuple ``(pos_count, neg_count)`` of the values reported by
    ``SffWriter.write_file``; a count stays 0 when its output was skipped.

    Calls ``sys.exit`` with a message if ``SffIterator``/``SffWriter``
    cannot be imported from Biopython.
    """
    try:
        from Bio.SeqIO.SffIO import SffIterator, SffWriter
    except ImportError:
        sys.exit("SFF filtering requires Biopython 1.54 or later")

    try:
        from Bio.SeqIO.SffIO import ReadRocheXmlManifest
    except ImportError:
        # Prior to Biopython 1.56 this was a private function
        from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest

    pos_count = neg_count = 0
    # Context managers ensure the handles are closed even if reading,
    # filtering or writing raises part-way through.
    with open(in_file, "rb") as in_handle:  # must be binary mode!
        try:
            manifest = ReadRocheXmlManifest(in_handle)
        except ValueError:
            manifest = None

        # This makes two passes through the SFF file, which isn't so
        # efficient, but this makes the code simple.
        if pos_file is not None:
            with open(pos_file, "wb") as out_handle:
                writer = SffWriter(out_handle, xml=manifest)
                in_handle.seek(0)  # start again after getting manifest
                pos_count = writer.write_file(
                    rec
                    for rec in SffIterator(in_handle)
                    if clean_name(rec.id) in wanted
                )
        if neg_file is not None:
            with open(neg_file, "wb") as out_handle:
                writer = SffWriter(out_handle, xml=manifest)
                in_handle.seek(0)  # start again
                neg_count = writer.write_file(
                    rec
                    for rec in SffIterator(in_handle)
                    if clean_name(rec.id) not in wanted
                )

    # At the time of writing, Galaxy doesn't show SFF file read counts,
    # so it is useful to put them in stdout and thus shown in job info.
    return pos_count, neg_count
def test_read_wrong(self):
    """Check the ValueError cases for the Roche/greek.sff example.

    Reading the XML manifest must raise ValueError.  A first full pass
    with ``SffIterator`` must complete, and a second pass over the same,
    already consumed, handle must raise ValueError.
    """
    path = "Roche/greek.sff"

    with open(path, "rb") as handle:
        with self.assertRaises(ValueError):
            ReadRocheXmlManifest(handle)

    with open(path, "rb") as handle:
        # First pass: just exhaust the iterator.
        for _record in SffIterator(handle):
            pass

        # Second pass on the same handle, touching each record's id.
        with self.assertRaises(ValueError):
            for record in SffIterator(handle):
                _ = record.id
class TestAlternativeIndexes(unittest.TestCase):
    """Compare variant SFF files against the reference ten-read file.

    The reference records are loaded once, when the class body runs, and
    every test compares another file's records with them.
    """

    filename = "Roche/E3MFGYR02_random_10_reads.sff"
    with open(filename, "rb") as handle:
        sff = list(SffIterator(handle))

    def check_same(self, new_sff):
        """Assert new_sff has the reference ids and sequences, in order."""
        self.assertEqual(len(self.sff), len(new_sff))
        for expected, actual in zip(self.sff, new_sff):
            self.assertEqual(expected.id, actual.id)
            self.assertEqual(expected.seq, actual.seq)

    def _check_file(self, path):
        """Load every record from the SFF file at path and compare it."""
        with open(path, "rb") as handle:
            records = list(SffIterator(handle))
        self.check_same(records)

    def test_alt_index_at_end(self):
        """Compare E3MFGYR02_alt_index_at_end.sff with the reference."""
        self._check_file("Roche/E3MFGYR02_alt_index_at_end.sff")

    def test_alt_index_at_start(self):
        """Compare E3MFGYR02_alt_index_at_start.sff with the reference."""
        self._check_file("Roche/E3MFGYR02_alt_index_at_start.sff")

    def test_alt_index_in_middle(self):
        """Compare E3MFGYR02_alt_index_in_middle.sff with the reference."""
        self._check_file("Roche/E3MFGYR02_alt_index_in_middle.sff")

    def test_index_at_start(self):
        """Compare E3MFGYR02_index_at_start.sff with the reference."""
        self._check_file("Roche/E3MFGYR02_index_at_start.sff")

    def test_index_in_middle(self):
        """Compare E3MFGYR02_index_in_middle.sff with the reference."""
        self._check_file("Roche/E3MFGYR02_index_in_middle.sff")

    def test_trim(self):
        """Check trimming keeps the number of records and their ids."""
        with open(self.filename, "rb") as handle:
            trimmed = list(SffIterator(handle, trim=True))
        self.assertEqual(len(self.sff), len(trimmed))
        for expected, actual in zip(self.sff, trimmed):
            self.assertEqual(expected.id, actual.id)
def test_write(self):
    """Check writing the parsed records reproduces the original file.

    The output is produced twice, from a list and from an iterator, and
    both must match each other and the bytes of the source file.
    """
    filename = "Roche/E3MFGYR02_random_10_reads.sff"
    with open(filename, "rb") as handle:
        metadata = ReadRocheXmlManifest(handle)
    with open(filename, "rb") as handle:
        sff = list(SffIterator(handle))

    # First write: records supplied as a list.
    list_output = BytesIO()
    list_writer = SffWriter(list_output, xml=metadata)
    list_writer.write_file(sff)
    data = list_output.getvalue()

    # And again with an iterator...
    iter_output = BytesIO()
    iter_writer = SffWriter(iter_output, xml=metadata)
    iter_writer.write_file(iter(sff))
    self.assertEqual(data, iter_output.getvalue())

    # Check 100% identical to the original:
    with open(filename, "rb") as handle:
        original = handle.read()
    self.assertEqual(len(data), len(original))
    self.assertEqual(data, original)
    del data
def sff_filter(in_file, out_file, iterator_filter):
    """Filter the reads of one SFF file into a new SFF file.

    Arguments:
     - in_file - path of the SFF file to read (opened in binary mode).
     - out_file - path of the SFF file to write (opened in binary mode).
     - iterator_filter - callable given ``SffIterator(in_handle)``; whatever
       it returns is passed to ``SffWriter.write_file``.

    Returns the value reported by ``SffWriter.write_file``.

    ``stop_err`` is called with a message if ``SffIterator``/``SffWriter``
    cannot be imported (presumably it aborts the run - confirm against its
    definition).
    """
    try:
        from Bio.SeqIO.SffIO import SffIterator, SffWriter
    except ImportError:
        stop_err("SFF filtering requires Biopython 1.54 or later")
    try:
        from Bio.SeqIO.SffIO import ReadRocheXmlManifest
    except ImportError:
        # Prior to Biopython 1.56 this was a private function
        from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest
    with open(in_file, "rb") as in_handle:
        try:
            manifest = ReadRocheXmlManifest(in_handle)
        except ValueError:
            # No usable manifest; write the output without one.
            manifest = None
        # Start again after getting the manifest; a single rewind suffices
        # because nothing else moves in_handle before it is iterated.
        in_handle.seek(0)
        with open(out_file, "wb") as out_handle:
            writer = SffWriter(out_handle, xml=manifest)
            count = writer.write_file(iterator_filter(SffIterator(in_handle)))
    return count
elif keep_negatives: if len(seq) >= min_len: negs += 1 yield record else: short_neg += 1 in_handle = open(in_file, "rb") try: manifest = ReadRocheXmlManifest(in_handle) except ValueError: manifest = None in_handle.seek(0) out_handle = open(out_file, "wb") writer = SffWriter(out_handle, xml=manifest) writer.write_file(process(SffIterator(in_handle))) #End of SFF code elif seq_format.lower().startswith("fastq"): in_handle = open(in_file, "rU") out_handle = open(out_file, "w") reader = fastqReader(in_handle) writer = fastqWriter(out_handle) if forward: for record in reader: seq = record.sequence.upper() result = primer.search(seq) if result: #Forward primer, take everything after it cut = result.end() record.sequence = seq[cut:] if len(record.sequence) >= min_len:
def fileiter(handle):
    """Iterate over every SFF record in handle, reading each record's id."""
    records = SffIterator(handle)
    for record in records:
        # Only the attribute access matters; the value is discarded.
        _ = record.id
def test_trim(self):
    """Check trimming keeps the number of records and their ids."""
    with open(self.filename, "rb") as handle:
        trimmed = list(SffIterator(handle, trim=True))
    self.assertEqual(len(self.sff), len(trimmed))
    for expected, actual in zip(self.sff, trimmed):
        self.assertEqual(expected.id, actual.id)
try: from Bio.SeqIO.SffIO import ReadRocheXmlManifest except ImportError: # Prior to Biopython 1.56 this was a private function from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest in_handle = open(in_file, "rb") # must be binary mode! try: manifest = ReadRocheXmlManifest(in_handle) except ValueError: manifest = None out_handle = open(out_file, "wb") writer = SffWriter(out_handle, xml=manifest) in_handle.seek(0) # start again after getting manifest count = writer.write_file(rename_seqrecords(SffIterator(in_handle), rename)) out_handle.close() in_handle.close() else: # Use Galaxy for FASTA, QUAL or FASTQ if seq_format.lower() in ["fasta", "csfasta"] or seq_format.lower().startswith( "qual" ): from galaxy_utils.sequence.fasta import fastaReader, fastaWriter reader = fastaReader(open(in_file, "rU")) writer = fastaWriter(open(out_file, "w")) marker = ">" elif seq_format.lower().startswith("fastq"): from galaxy_utils.sequence.fastq import fastqReader, fastqWriter
def test_index_in_middle(self):
    """Compare E3MFGYR02_index_in_middle.sff with the reference records."""
    path = "Roche/E3MFGYR02_index_in_middle.sff"
    with open(path, "rb") as handle:
        records = list(SffIterator(handle))
    self.check_same(records)
def test_alt_index_at_start(self):
    """Compare E3MFGYR02_alt_index_at_start.sff with the reference records."""
    path = "Roche/E3MFGYR02_alt_index_at_start.sff"
    with open(path, "rb") as handle:
        records = list(SffIterator(handle))
    self.check_same(records)
except ImportError: #Prior to Biopython 1.56 this was a private function from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest in_handle = open(in_file, "rb") #must be binary mode! try: manifest = ReadRocheXmlManifest(in_handle) except ValueError: manifest = None #This makes two passes though the SFF file with isn't so efficient, #but this makes the code simple. pos_count = neg_count = 0 if out_positive_file is not None: out_handle = open(out_positive_file, "wb") writer = SffWriter(out_handle, xml=manifest) in_handle.seek(0) #start again after getting manifest pos_count = writer.write_file(rec for rec in SffIterator(in_handle) if clean_name(rec.id) in ids) out_handle.close() if out_negative_file is not None: out_handle = open(out_negative_file, "wb") writer = SffWriter(out_handle, xml=manifest) in_handle.seek(0) #start again neg_count = writer.write_file(rec for rec in SffIterator(in_handle) if clean_name(rec.id) not in ids) out_handle.close() #And we're done in_handle.close() #At the time of writing, Galaxy doesn't show SFF file read counts, #so it is useful to put them in stdout and thus shown in job info. print "%i with and %i without specified IDs" % (pos_count, neg_count) elif seq_format.lower() == "fasta":
# Ugly code to make test files... index = ".diy1.00This is a fake index block (DIY = Do It Yourself), which is allowed under the SFF standard.\0" padding = len(index) % 8 if padding: padding = 8 - padding index += chr(0) * padding assert len(index) % 8 == 0 # Ugly bit of code to make a fake index at start index = ".diy1.00This is a fake index block (DIY = Do It Yourself), which is allowed under the SFF standard.\0" padding = len(index) % 8 if padding: padding = 8 - padding index += chr(0) * padding with open("Roche/E3MFGYR02_random_10_reads.sff", "rb") as handle: records = list(SffIterator(handle)) with open("Roche/E3MFGYR02_alt_index_at_start.sff", "w") as out_handle: w = SffWriter(out_handle, index=False, xml=None) # Fake the header... w._number_of_reads = len(records) w._index_start = 0 w._index_length = 0 w._key_sequence = records[0].annotations["flow_key"] w._flow_chars = records[0].annotations["flow_chars"] w._number_of_flows_per_read = len(w._flow_chars) w.write_header() w._index_start = out_handle.tell() w._index_length = len(index) out_handle.seek(0) w.write_header() # this time with index info w.handle.write(index)
if __name__ == "__main__": runner = unittest.TextTestRunner(verbosity=2) unittest.main(testRunner=runner) if False: # Ugly code to make test files... index = ".diy1.00This is a fake index block (DIY = Do It Yourself), which is allowed under the SFF standard.\0" padding = len(index) % 8 if padding: padding = 8 - padding index += chr(0) * padding assert len(index) % 8 == 0 # Ugly bit of code to make a fake index at start records = list( SffIterator(open("Roche/E3MFGYR02_random_10_reads.sff", "rb"))) out_handle = open("Roche/E3MFGYR02_alt_index_at_start.sff", "w") index = ".diy1.00This is a fake index block (DIY = Do It Yourself), which is allowed under the SFF standard.\0" padding = len(index) % 8 if padding: padding = 8 - padding index += chr(0) * padding w = SffWriter(out_handle, index=False, xml=None) # Fake the header... w._number_of_reads = len(records) w._index_start = 0 w._index_length = 0 w._key_sequence = records[0].annotations["flow_key"] w._flow_chars = records[0].annotations["flow_chars"] w._number_of_flows_per_read = len(w._flow_chars) w.write_header()