def sff_filter(in_file, out_file, iterator_filter, inter):
    """Write the records of an SFF file that survive ``iterator_filter``.

    The Roche XML manifest of ``in_file`` (if one can be read) is copied to
    ``out_file``.

    Args:
        in_file: path of the SFF file to read (opened in binary mode).
        out_file: path of the SFF file to write (opened in binary mode).
        iterator_filter: callable applied to the record iterator. When
            ``inter`` is false it receives ``SffIterator(in_handle)`` and
            must return an iterable of records. When ``inter`` is true it
            receives ``pair(SffIterator(in_handle))`` and must return an
            iterable of iterables of records (they are flattened with
            ``itertools.chain.from_iterable``).
        inter: true when the reads should be processed as pairs.

    Returns:
        The number of records written, or - when ``inter`` is true - half
        that number (presumably the number of pairs; ``pair`` is defined
        elsewhere, confirm its contract there). Always an ``int``.

    Raises:
        AssertionError: if ``inter`` is true and an odd number of records
            was written.
    """
    try:
        from Bio.SeqIO.SffIO import ReadRocheXmlManifest
    except ImportError:
        # Prior to Biopython 1.56 this was a private function
        from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest

    with open(in_file, "rb") as in_handle:
        try:
            manifest = ReadRocheXmlManifest(in_handle)
        except ValueError:
            # No usable manifest, write the output without one.
            manifest = None
        with open(out_file, "wb") as out_handle:
            writer = SffWriter(out_handle, xml=manifest)
            in_handle.seek(0)  # start again after getting manifest
            if inter:
                from itertools import chain

                count = writer.write_file(
                    chain.from_iterable(
                        iterator_filter(pair(SffIterator(in_handle)))
                    )
                )
                assert count % 2 == 0, "Odd number of records? %i" % count
                # Floor division keeps the result an int on Python 3 too
                # (plain "/" would turn it into a float there).
                count //= 2
            else:
                count = writer.write_file(
                    iterator_filter(SffIterator(in_handle))
                )
    return count
def sff_filter(in_file, out_file, iterator_filter, inter):
    """Write the records of an SFF file that survive ``iterator_filter``.

    The Roche XML manifest of ``in_file`` (if one can be read) is copied to
    ``out_file``.

    Args:
        in_file: path of the SFF file to read (opened in binary mode).
        out_file: path of the SFF file to write (opened in binary mode).
        iterator_filter: callable applied to the record iterator. When
            ``inter`` is false it receives ``SffIterator(in_handle)`` and
            must return an iterable of records. When ``inter`` is true it
            receives ``pair(SffIterator(in_handle))`` and must return an
            iterable of iterables of records (they are flattened with
            ``itertools.chain.from_iterable``).
        inter: true when the reads should be processed as pairs.

    Returns:
        The number of records written, or - when ``inter`` is true - half
        that number (presumably the number of pairs; ``pair`` is defined
        elsewhere, confirm its contract there). Always an ``int``.

    Raises:
        AssertionError: if ``inter`` is true and an odd number of records
            was written.
    """
    try:
        from Bio.SeqIO.SffIO import SffIterator, SffWriter
    except ImportError:
        sys_exit("SFF filtering requires Biopython 1.54 or later")
    try:
        from Bio.SeqIO.SffIO import ReadRocheXmlManifest
    except ImportError:
        # Prior to Biopython 1.56 this was a private function
        from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest

    with open(in_file, "rb") as in_handle:
        try:
            manifest = ReadRocheXmlManifest(in_handle)
        except ValueError:
            # No usable manifest, write the output without one.
            manifest = None
        with open(out_file, "wb") as out_handle:
            writer = SffWriter(out_handle, xml=manifest)
            in_handle.seek(0)  # start again after getting manifest
            if inter:
                from itertools import chain

                count = writer.write_file(
                    chain.from_iterable(
                        iterator_filter(pair(SffIterator(in_handle)))
                    )
                )
                assert count % 2 == 0, "Odd number of records? %i" % count
                # Floor division keeps the result an int on Python 3 too
                # (plain "/" would turn it into a float there).
                count //= 2
            else:
                count = writer.write_file(
                    iterator_filter(SffIterator(in_handle))
                )
    return count
def run(self, proc_name=None):
    """Write the reads matching this barcode to ``<id_str>.sff``.

    ``proc_name`` is stored on the instance before any read is processed.
    If writing raises ``ValueError`` (no reads for the barcode) the output
    file is removed again.
    """
    out_path = self.id_str + '.sff'
    try:
        with open(out_path, 'wb') as out_handle:
            self.proc_name = proc_name
            self.sff_file = SffWriter(out_handle)
            matching_reads = self.reads_for_barcode(self.reads_sff)
            self.sff_file.write_file(matching_reads)
            summary = "%s reads of %s matched %s" % (
                self._matched_reads, self._processed, self.id_str)
            logger.info(summary)
    except ValueError:
        # No reads for barcode so remove the temporary file
        os.unlink(out_path)
def test_no_index(self):
    """An SFF file written with ``index=False`` round-trips and has no index."""
    # Building the no-index SFF file in memory also exercises this
    # code path of SffWriter.
    originals = list(SeqIO.parse(BytesIO(self.good), "sff"))
    expected_total = len(originals)
    with BytesIO() as buffer:
        written = SffWriter(buffer, index=False).write_file(originals)
        self.assertEqual(written, expected_total)

        # Re-read what was just written and compare record by record.
        buffer.seek(0)
        reparsed = list(SeqIO.parse(buffer, "sff"))
        self.assertEqual(expected_total, len(reparsed))
        for before, after in zip(originals, reparsed):
            self.assertEqual(before.id, after.id)

        # Looking for the Roche index must now fail with a clear message.
        buffer.seek(0)
        with self.assertRaises(ValueError) as caught:
            _sff_find_roche_index(buffer)
        message = str(caught.exception)
        self.assertEqual(message, "No index present in this SFF file")
def sff_filter(in_file, pos_file, neg_file, wanted):
    """SFF filter.

    Split the reads of ``in_file`` according to whether ``clean_name(rec.id)``
    is in ``wanted``.

    Args:
        in_file: path of the SFF file to read.
        pos_file: path for the reads whose cleaned name is in ``wanted``,
            or ``None`` to skip that output.
        neg_file: path for the reads whose cleaned name is not in
            ``wanted``, or ``None`` to skip that output.
        wanted: container of identifiers, tested with ``in``.

    Returns:
        Tuple ``(pos_count, neg_count)`` of records written to each output;
        the count is 0 for an output that was not requested.
    """
    try:
        from Bio.SeqIO.SffIO import SffIterator, SffWriter
    except ImportError:
        sys.exit("SFF filtering requires Biopython 1.54 or later")
    try:
        from Bio.SeqIO.SffIO import ReadRocheXmlManifest
    except ImportError:
        # Prior to Biopython 1.56 this was a private function
        from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest

    pos_count = neg_count = 0
    # Context managers guarantee the handles are closed even if reading
    # or writing fails part way through.
    with open(in_file, "rb") as in_handle:  # must be binary mode!
        try:
            manifest = ReadRocheXmlManifest(in_handle)
        except ValueError:
            # No usable manifest, write the outputs without one.
            manifest = None

        # This makes two passes through the SFF file, which isn't so
        # efficient, but this makes the code simple.
        if pos_file is not None:
            with open(pos_file, "wb") as out_handle:
                writer = SffWriter(out_handle, xml=manifest)
                in_handle.seek(0)  # start again after getting manifest
                pos_count = writer.write_file(
                    rec
                    for rec in SffIterator(in_handle)
                    if clean_name(rec.id) in wanted
                )
        if neg_file is not None:
            with open(neg_file, "wb") as out_handle:
                writer = SffWriter(out_handle, xml=manifest)
                in_handle.seek(0)  # start again
                neg_count = writer.write_file(
                    rec
                    for rec in SffIterator(in_handle)
                    if clean_name(rec.id) not in wanted
                )

    # At the time of writing, Galaxy doesn't show SFF file read counts,
    # so it is useful to put them in stdout and thus shown in job info.
    return pos_count, neg_count
def test_no_index(self):
    """Check that an SFF file written with ``index=False`` has no Roche index.

    The file is built in memory from ``self.good``, re-parsed to confirm the
    records survive the round trip, and then ``_sff_find_roche_index`` is
    expected to raise ``ValueError`` with a specific message.
    """
    # Does a lot of work to create a no-index SFF file
    # (in the process checking this bit of SffWriter works)
    records = list(SeqIO.parse(BytesIO(self.good), "sff"))
    with BytesIO() as handle:
        writer = SffWriter(handle, index=False)
        count = writer.write_file(records)
        self.assertEqual(count, len(records))
        handle.seek(0)
        new = list(SeqIO.parse(handle, "sff"))
        self.assertEqual(len(records), len(new))
        for a, b in zip(records, new):
            self.assertEqual(a.id, b.id)
        handle.seek(0)
        # assertRaises fails the test by itself when nothing is raised, so
        # no hand-written "else: fail" branch is needed.
        with self.assertRaises(ValueError) as cm:
            _sff_find_roche_index(handle)
        self.assertEqual(str(cm.exception), "No index present in this SFF file")
def run(self, proc_name=None):
    """Write the reads matching this barcode to ``<id_str>.sff``.

    ``proc_name`` is stored on the instance before any read is processed.
    If writing raises ``ValueError`` (no reads for the barcode) the output
    file is removed again.
    """
    out_path = self.id_str + '.sff'
    try:
        with open(out_path, 'wb') as out_handle:
            self.proc_name = proc_name
            self.sff_file = SffWriter(out_handle)
            matching_reads = self.reads_for_barcode(self.reads_sff)
            self.sff_file.write_file(matching_reads)
            summary = "%s reads of %s matched %s" % (
                self._matched_reads, self._processed, self.id_str)
            logger.info(summary)
    except ValueError:
        # No reads for barcode so remove the temporary file
        os.unlink(out_path)
def sff_filter(in_file, out_file, iterator_filter):
    """Write the records of an SFF file that survive ``iterator_filter``.

    The Roche XML manifest of ``in_file`` (if one can be read) is copied to
    ``out_file``.

    Args:
        in_file: path of the SFF file to read (opened in binary mode).
        out_file: path of the SFF file to write (opened in binary mode).
        iterator_filter: callable that receives ``SffIterator(in_handle)``
            and returns the iterable of records to write.

    Returns:
        The value returned by ``SffWriter.write_file`` (the record count).
    """
    try:
        from Bio.SeqIO.SffIO import SffIterator, SffWriter
    except ImportError:
        stop_err("SFF filtering requires Biopython 1.54 or later")
    try:
        from Bio.SeqIO.SffIO import ReadRocheXmlManifest
    except ImportError:
        # Prior to Biopython 1.56 this was a private function
        from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest

    with open(in_file, "rb") as in_handle:
        try:
            manifest = ReadRocheXmlManifest(in_handle)
        except ValueError:
            # No usable manifest, write the output without one.
            manifest = None
        in_handle.seek(0)  # start again after getting manifest
        with open(out_file, "wb") as out_handle:
            writer = SffWriter(out_handle, xml=manifest)
            count = writer.write_file(iterator_filter(SffIterator(in_handle)))
    return count
def test_write(self):
    """SffWriter output is byte-identical to the file the records came from.

    The records are written once from a list and once from an iterator;
    both outputs must match each other and the original file.
    """
    sff_path = "Roche/E3MFGYR02_random_10_reads.sff"
    with open(sff_path, "rb") as source:
        manifest = ReadRocheXmlManifest(source)
    with open(sff_path, "rb") as source:
        reads = list(SffIterator(source))

    # First pass: hand the writer a list.
    list_buffer = BytesIO()
    SffWriter(list_buffer, xml=manifest).write_file(reads)
    from_list = list_buffer.getvalue()

    # And again with an iterator...
    iter_buffer = BytesIO()
    SffWriter(iter_buffer, xml=manifest).write_file(iter(reads))
    self.assertEqual(from_list, iter_buffer.getvalue())

    # Check 100% identical to the original:
    with open(sff_path, "rb") as source:
        expected = source.read()
    self.assertEqual(len(from_list), len(expected))
    self.assertEqual(from_list, expected)
def sff_filter(in_file, pos_file, neg_file, wanted):
    """SFF filter.

    Split the reads of ``in_file`` according to whether ``clean_name(rec.id)``
    is in ``wanted``.

    Args:
        in_file: path of the SFF file to read.
        pos_file: path for the reads whose cleaned name is in ``wanted``,
            or ``None`` to skip that output.
        neg_file: path for the reads whose cleaned name is not in
            ``wanted``, or ``None`` to skip that output.
        wanted: container of identifiers, tested with ``in``.

    Returns:
        Tuple ``(pos_count, neg_count)`` of records written to each output;
        the count is 0 for an output that was not requested.
    """
    try:
        from Bio.SeqIO.SffIO import SffIterator, SffWriter
    except ImportError:
        sys.exit("SFF filtering requires Biopython 1.54 or later")
    try:
        from Bio.SeqIO.SffIO import ReadRocheXmlManifest
    except ImportError:
        # Prior to Biopython 1.56 this was a private function
        from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest

    pos_count = neg_count = 0
    # Context managers guarantee the handles are closed even if reading
    # or writing fails part way through.
    with open(in_file, "rb") as in_handle:  # must be binary mode!
        try:
            manifest = ReadRocheXmlManifest(in_handle)
        except ValueError:
            # No usable manifest, write the outputs without one.
            manifest = None

        # This makes two passes through the SFF file, which isn't so
        # efficient, but this makes the code simple.
        if pos_file is not None:
            with open(pos_file, "wb") as out_handle:
                writer = SffWriter(out_handle, xml=manifest)
                in_handle.seek(0)  # start again after getting manifest
                pos_count = writer.write_file(
                    rec
                    for rec in SffIterator(in_handle)
                    if clean_name(rec.id) in wanted
                )
        if neg_file is not None:
            with open(neg_file, "wb") as out_handle:
                writer = SffWriter(out_handle, xml=manifest)
                in_handle.seek(0)  # start again
                neg_count = writer.write_file(
                    rec
                    for rec in SffIterator(in_handle)
                    if clean_name(rec.id) not in wanted
                )

    # At the time of writing, Galaxy doesn't show SFF file read counts,
    # so it is useful to put them in stdout and thus shown in job info.
    return pos_count, neg_count
class PGMBarcode(object):
    """
    Represents a barcode from IonTorrent

    An instance holds the barcode description, selects the reads of an SFF
    file whose barcode region equals ``sequence`` and writes them to
    ``<id_str>.sff``.
    """

    def __init__(self, *args, **kwargs):
        """
        All settings are read from keyword arguments; positional arguments
        are accepted but not used.

        Required keywords: id_str, type, sequence, floworder, index,
        annotation, adapter, score_mode, score_cutoff, sfffilepath, max_num.
        A missing keyword raises KeyError.

        max_num is compared against the running count of processed reads;
        the string 'All' disables that limit.
        """
        self.id_str = kwargs['id_str']
        self.type = kwargs['type']
        self.sequence = kwargs['sequence']
        self.floworder = kwargs['floworder']
        self.index = kwargs['index']
        self.annotation = kwargs['annotation']
        self.adapter = kwargs['adapter']
        self.score_mode = kwargs['score_mode']
        self.score_cutoff = kwargs['score_cutoff']
        # Set by run(): the SffWriter in use and the worker name for logs
        self.sff_file = None
        self.proc_name = None
        self.reads_sff = kwargs['sfffilepath']
        self.max_num = kwargs['max_num']
        # Counters updated while reads_for_barcode() is being consumed
        self._processed = 0
        self._matched_reads = 0

    def _readMatches(self, read):
        """
        read - Bio.Seq record representing a read from sff file

        Returns True when the read's barcode region equals this barcode's
        sequence (case-insensitive).
        """
        return self.sequence.lower() == self._getReadBarcode(read)

    def _getReadBarcode(self, read):
        """
        Returns the lower-cased barcode for a given read which should be
        between the flow_key and adapter sequence
        """
        # The barcode starts right after the flow key ...
        start = len(read.annotations['flow_key'])
        # ... and ends where the adapter (which precedes the left clip
        # position) begins.
        end = read.annotations['clip_adapter_left'] - len(self.adapter)
        seq = str(read.seq)
        return seq[start:end].lower()

    def reads_for_barcode(self, reads_file):
        """
        Generator method returning only reads for the barcode this
        class instance is setup for

        Updates _processed and _matched_reads as it is consumed.
        """
        for read in SeqIO.parse(reads_file, 'sff'):
            # Quit if max_num is reached
            if self.max_num != 'All' and self._processed == self.max_num:
                break
            if self._readMatches(read):
                # Arguments are passed separately so the message is only
                # formatted when debug logging is actually enabled.
                logger.debug("%s: %s Matched Read %s",
                             self.proc_name, self.id_str, read.id)
                self._matched_reads += 1
                yield read
            self._processed += 1

    def run(self, proc_name=None):
        """
        Write the matching reads of reads_sff to '<id_str>.sff'.

        If writing raises ValueError (no reads for the barcode) the output
        file is removed again.
        """
        sffpath = self.id_str + '.sff'
        try:
            with open(sffpath, 'wb') as fh:
                self.proc_name = proc_name
                self.sff_file = SffWriter(fh)
                self.sff_file.write_file(self.reads_for_barcode(
                    self.reads_sff))
                logger.info("%s reads of %s matched %s",
                            self._matched_reads, self._processed, self.id_str)
        except ValueError:
            # No reads for barcode so remove the temporary file
            os.unlink(sffpath)
if padding: padding = 8 - padding index += chr(0) * padding assert len(index) % 8 == 0 # Ugly bit of code to make a fake index at start records = list(SffIterator( open("Roche/E3MFGYR02_random_10_reads.sff", "rb"))) out_handle = open( "Roche/E3MFGYR02_alt_index_at_start.sff", "w") index = ".diy1.00This is a fake index block (DIY = Do It Yourself), which is allowed under the SFF standard.\0" padding = len(index) % 8 if padding: padding = 8 - padding index += chr(0) * padding w = SffWriter(out_handle, index=False, xml=None) # Fake the header... w._number_of_reads = len(records) w._index_start = 0 w._index_length = 0 w._key_sequence = records[0].annotations["flow_key"] w._flow_chars = records[0].annotations["flow_chars"] w._number_of_flows_per_read = len(w._flow_chars) w.write_header() w._index_start = out_handle.tell() w._index_length = len(index) out_handle.seek(0) w.write_header() # this time with index info w.handle.write(index) for record in records: w.write_record(record)
def _collect_ids(tabular_file, columns):
    """Return the set of values found in ``columns`` of a tab separated file.

    Lines starting with "#" are ignored. ``columns`` holds zero-based
    column indexes.
    """
    ids = set()
    # "rU" (universal newlines) is kept because rstrip("\n") below relies on it.
    with open(tabular_file, "rU") as handle:
        for line in handle:
            if line.startswith("#"):
                # Ignore comments
                continue
            parts = line.rstrip("\n").split("\t")
            for col in columns:
                ids.add(parts[col])
    return ids


def _filter_sff_by_ids(in_file, ids, out_positive_file, out_negative_file):
    """Write SFF reads with / without an ID in ``ids``; return both counts.

    An output named "-" is skipped and its count is reported as 0.
    """
    try:
        from Bio.SeqIO.SffIO import SffIterator, SffWriter
    except ImportError:
        stop_err("Requires Biopython 1.54 or later")
    try:
        from Bio.SeqIO.SffIO import ReadRocheXmlManifest
    except ImportError:
        # Prior to Biopython 1.56 this was a private function
        from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest

    pos_count = neg_count = 0
    with open(in_file, "rb") as in_handle:  # must be binary mode!
        try:
            manifest = ReadRocheXmlManifest(in_handle)
        except ValueError:
            manifest = None
        # This makes two passes through the SFF file, which isn't so
        # efficient, but this makes the code simple.
        if out_positive_file != "-":
            with open(out_positive_file, "wb") as out_handle:
                writer = SffWriter(out_handle, xml=manifest)
                in_handle.seek(0)  # start again after getting manifest
                pos_count = writer.write_file(
                    rec for rec in SffIterator(in_handle) if rec.id in ids)
        if out_negative_file != "-":
            with open(out_negative_file, "wb") as out_handle:
                writer = SffWriter(out_handle, xml=manifest)
                in_handle.seek(0)  # start again
                neg_count = writer.write_file(
                    rec for rec in SffIterator(in_handle) if rec.id not in ids)
    return pos_count, neg_count


def _filter_galaxy_records(reader, writer_class, ids, out_positive_file,
                           out_negative_file, label):
    """Split the records of a Galaxy FASTA/FASTQ reader by identifier.

    Records whose identifier is in ``ids`` go to ``out_positive_file``, all
    other records (including those without an identifier) go to
    ``out_negative_file``. An output named "-" is not written. ``label``
    ("FASTA" or "FASTQ") is only used in the progress message.
    """
    want_positive = out_positive_file != "-"
    want_negative = out_negative_file != "-"
    if want_positive and want_negative:
        print("Generating two %s files" % label)
    elif want_positive:
        print("Generating matching %s file" % label)
    elif want_negative:
        print("Generating non-matching %s file" % label)
    else:
        # Nothing requested, nothing to do.
        return

    positive_writer = negative_writer = None
    try:
        if want_positive:
            positive_writer = writer_class(open(out_positive_file, "w"))
        if want_negative:
            negative_writer = writer_class(open(out_negative_file, "w"))
        for record in reader:
            # The [1:] is because the Galaxy readers leave the leading
            # > (FASTA) or @ (FASTQ) on the identifier.
            if record.identifier and record.identifier.split()[0][1:] in ids:
                if positive_writer is not None:
                    positive_writer.write(record)
            elif negative_writer is not None:
                negative_writer.write(record)
    finally:
        # Close whatever was opened, even if filtering failed half way.
        for writer in (positive_writer, negative_writer):
            if writer is not None:
                writer.close()


def main():
    """Filter a sequence file by the identifiers listed in a tabular file.

    Expects exactly six command line arguments: the tabular file, a comma
    separated list of one-based column numbers, the sequence file, its
    format ("sff", "fasta" or "fastq..."), the output file for matching
    records and the output file for non-matching records. Either output
    may be "-" to skip it, but not both. Problems are reported through
    ``stop_err``.
    """
    # Parse Command Line
    try:
        (tabular_file, cols_arg, in_file, seq_format,
         out_positive_file, out_negative_file) = sys.argv[1:]
    except ValueError:
        stop_err("Expected six arguments, got %i:\n%s"
                 % (len(sys.argv) - 1, " ".join(sys.argv)))
    try:
        columns = [int(arg) - 1 for arg in cols_arg.split(",")]
    except ValueError:
        stop_err("Expected list of columns (comma separated integers), got %s"
                 % cols_arg)
    if out_positive_file == "-" and out_negative_file == "-":
        stop_err("Neither output file requested")

    # Read tabular file and record all specified identifiers
    ids = _collect_ids(tabular_file, columns)
    if len(columns) > 1:
        print("Using %i IDs from %i columns of tabular file"
              % (len(ids), len(columns)))
    else:
        print("Using %i IDs from tabular file" % (len(ids)))

    file_format = seq_format.lower()
    if file_format == "sff":
        # Now write filtered SFF file based on IDs from BLAST file
        pos_count, neg_count = _filter_sff_by_ids(
            in_file, ids, out_positive_file, out_negative_file)
        # At the time of writing, Galaxy doesn't show SFF file read counts,
        # so it is useful to put them in stdout and thus shown in job info.
        if out_positive_file != "-" and out_negative_file != "-":
            print("%i with and %i without specified IDs"
                  % (pos_count, neg_count))
        elif out_positive_file != "-":
            print("%i with specified IDs" % pos_count)
        elif out_negative_file != "-":
            print("%i without specified IDs" % neg_count)
    elif file_format == "fasta":
        # Write filtered FASTA file based on IDs from tabular file
        in_handle = open(in_file, "rU")
        try:
            _filter_galaxy_records(fastaReader(in_handle), fastaWriter, ids,
                                   out_positive_file, out_negative_file,
                                   "FASTA")
        finally:
            in_handle.close()
    elif file_format.startswith("fastq"):
        # Write filtered FASTQ file based on IDs from tabular file
        from galaxy_utils.sequence.fastq import fastqReader, fastqWriter
        in_handle = open(in_file, "rU")
        try:
            _filter_galaxy_records(fastqReader(in_handle), fastqWriter, ids,
                                   out_positive_file, out_negative_file,
                                   "FASTQ")
        finally:
            in_handle.close()
    else:
        stop_err("Unsupported file type %r" % seq_format)
try: from Bio.SeqIO.SffIO import ReadRocheXmlManifest except ImportError: #Prior to Biopython 1.56 this was a private function from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest in_handle = open(in_file, "rb") #must be binary mode! try: manifest = ReadRocheXmlManifest(in_handle) except ValueError: manifest = None in_handle.close() out_handle = open(out_file, "wb") writer = SffWriter(out_handle, xml=manifest) count = 0 #This does have the overhead of parsing into SeqRecord objects, #but doing the header and index at the low level is too fidly. iterator = (records[name] for name in parse_ids(tabular_file, column)) try: count = writer.write_file(iterator) except KeyError, err: out_handle.close() if name not in records: stop_err("Identifier %r not found in sequence file" % name) else: raise err out_handle.close() else: #Avoid overhead of parsing into SeqRecord objects,
except ImportError: sys.exit("Requires Biopython 1.54 or later") try: from Bio.SeqIO.SffIO import ReadRocheXmlManifest except ImportError: #Prior to Biopython 1.56 this was a private function from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest in_handle = open(in_file, "rb") #must be binary mode! try: manifest = ReadRocheXmlManifest(in_handle) except ValueError: manifest = None out_handle = open(out_file, "wb") writer = SffWriter(out_handle, xml=manifest) in_handle.seek(0) #start again after getting manifest count = writer.write_file(rename_seqrecords(SffIterator(in_handle), rename)) out_handle.close() in_handle.close() else: #Use Galaxy for FASTA, QUAL or FASTQ if seq_format.lower() in ["fasta", "csfasta"] \ or seq_format.lower().startswith("qual"): from galaxy_utils.sequence.fasta import fastaReader, fastaWriter reader = fastaReader(open(in_file, "rU")) writer = fastaWriter(open(out_file, "w")) marker = ">" elif seq_format.lower().startswith("fastq"): from galaxy_utils.sequence.fastq import fastqReader, fastqWriter reader = fastqReader(open(in_file, "rU"))
try: from Bio.SeqIO.SffIO import ReadRocheXmlManifest except ImportError: #Prior to Biopython 1.56 this was a private function from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest in_handle = open(in_file, "rb") #must be binary mode! try: manifest = ReadRocheXmlManifest(in_handle) except ValueError: manifest = None #This makes two passes though the SFF file with isn't so efficient, #but this makes the code simple. pos_count = neg_count = 0 if out_positive_file != "-": out_handle = open(out_positive_file, "wb") writer = SffWriter(out_handle, xml=manifest) in_handle.seek(0) #start again after getting manifest pos_count = writer.write_file(rec for rec in SffIterator(in_handle) if rec.id in ids) out_handle.close() if out_negative_file != "-": out_handle = open(out_negative_file, "wb") writer = SffWriter(out_handle, xml=manifest) in_handle.seek(0) #start again neg_count = writer.write_file(rec for rec in SffIterator(in_handle) if rec.id not in ids) out_handle.close() #And we're done in_handle.close() #At the time of writing, Galaxy doesn't show SFF file read counts, #so it is useful to put them in stdout and thus shown in job info. print "%i with and %i without specified IDs" % (pos_count, neg_count) elif seq_format.lower()=="fasta":
try: from Bio.SeqIO.SffIO import ReadRocheXmlManifest except ImportError: #Prior to Biopython 1.56 this was a private function from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest in_handle = open(in_file, "rb") #must be binary mode! try: manifest = ReadRocheXmlManifest(in_handle) except ValueError: manifest = None #This makes two passes though the SFF file with isn't so efficient, #but this makes the code simple. pos_count = neg_count = 0 if out_positive_file is not None: out_handle = open(out_positive_file, "wb") writer = SffWriter(out_handle, xml=manifest) in_handle.seek(0) #start again after getting manifest pos_count = writer.write_file(rec for rec in SffIterator(in_handle) if clean_name(rec.id) in ids) out_handle.close() if out_negative_file is not None: out_handle = open(out_negative_file, "wb") writer = SffWriter(out_handle, xml=manifest) in_handle.seek(0) #start again neg_count = writer.write_file(rec for rec in SffIterator(in_handle) if clean_name(rec.id) not in ids) out_handle.close() #And we're done in_handle.close() #At the time of writing, Galaxy doesn't show SFF file read counts, #so it is useful to put them in stdout and thus shown in job info. print "%i with and %i without specified IDs" % (pos_count, neg_count) elif seq_format.lower()=="fasta":
class PGMBarcode(object):
    """
    Represents a barcode from IonTorrent

    An instance holds the barcode description, selects the reads of an SFF
    file whose barcode region equals ``sequence`` and writes them to
    ``<id_str>.sff``.
    """

    def __init__(self, *args, **kwargs):
        """
        All settings are read from keyword arguments; positional arguments
        are accepted but not used.

        Required keywords: id_str, type, sequence, floworder, index,
        annotation, adapter, score_mode, score_cutoff, sfffilepath, max_num.
        A missing keyword raises KeyError.

        max_num is compared against the running count of processed reads;
        the string 'All' disables that limit.
        """
        self.id_str = kwargs['id_str']
        self.type = kwargs['type']
        self.sequence = kwargs['sequence']
        self.floworder = kwargs['floworder']
        self.index = kwargs['index']
        self.annotation = kwargs['annotation']
        self.adapter = kwargs['adapter']
        self.score_mode = kwargs['score_mode']
        self.score_cutoff = kwargs['score_cutoff']
        # Set by run(): the SffWriter in use and the worker name for logs
        self.sff_file = None
        self.proc_name = None
        self.reads_sff = kwargs['sfffilepath']
        self.max_num = kwargs['max_num']
        # Counters updated while reads_for_barcode() is being consumed
        self._processed = 0
        self._matched_reads = 0

    def _readMatches(self, read):
        """
        read - Bio.Seq record representing a read from sff file

        Returns True when the read's barcode region equals this barcode's
        sequence (case-insensitive).
        """
        return self.sequence.lower() == self._getReadBarcode(read)

    def _getReadBarcode(self, read):
        """
        Returns the lower-cased barcode for a given read which should be
        between the flow_key and adapter sequence
        """
        # The barcode starts right after the flow key ...
        start = len(read.annotations['flow_key'])
        # ... and ends where the adapter (which precedes the left clip
        # position) begins.
        end = read.annotations['clip_adapter_left'] - len(self.adapter)
        seq = str(read.seq)
        return seq[start:end].lower()

    def reads_for_barcode(self, reads_file):
        """
        Generator method returning only reads for the barcode this
        class instance is setup for

        Updates _processed and _matched_reads as it is consumed.
        """
        for read in SeqIO.parse(reads_file, 'sff'):
            # Quit if max_num is reached
            if self.max_num != 'All' and self._processed == self.max_num:
                break
            if self._readMatches(read):
                # Arguments are passed separately so the message is only
                # formatted when debug logging is actually enabled.
                logger.debug("%s: %s Matched Read %s",
                             self.proc_name, self.id_str, read.id)
                self._matched_reads += 1
                yield read
            self._processed += 1

    def run(self, proc_name=None):
        """
        Write the matching reads of reads_sff to '<id_str>.sff'.

        If writing raises ValueError (no reads for the barcode) the output
        file is removed again.
        """
        sffpath = self.id_str + '.sff'
        try:
            with open(sffpath, 'wb') as fh:
                self.proc_name = proc_name
                self.sff_file = SffWriter(fh)
                self.sff_file.write_file(self.reads_for_barcode(
                    self.reads_sff))
                logger.info("%s reads of %s matched %s",
                            self._matched_reads, self._processed, self.id_str)
        except ValueError:
            # No reads for barcode so remove the temporary file
            os.unlink(sffpath)
try: from Bio.SeqIO.SffIO import ReadRocheXmlManifest except ImportError: # Prior to Biopython 1.56 this was a private function from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest in_handle = open(in_file, "rb") # must be binary mode! try: manifest = ReadRocheXmlManifest(in_handle) except ValueError: manifest = None in_handle.close() out_handle = open(out_file, "wb") writer = SffWriter(out_handle, xml=manifest) count = 0 # This does have the overhead of parsing into SeqRecord objects, # but doing the header and index at the low level is too fidly. name = None # We want the variable to leak from the iterator's scope... iterator = (records[name] for name in parse_ids(tabular_file, column)) try: count = writer.write_file(iterator) except KeyError: out_handle.close() if name not in records: sys.exit("Identifier %r not found in sequence file" % name) else: raise out_handle.close() else:
padding = len(index) % 8 if padding: padding = 8 - padding index += chr(0) * padding assert len(index) % 8 == 0 # Ugly bit of code to make a fake index at start index = ".diy1.00This is a fake index block (DIY = Do It Yourself), which is allowed under the SFF standard.\0" padding = len(index) % 8 if padding: padding = 8 - padding index += chr(0) * padding with open("Roche/E3MFGYR02_random_10_reads.sff", "rb") as handle: records = list(SffIterator(handle)) with open("Roche/E3MFGYR02_alt_index_at_start.sff", "w") as out_handle: w = SffWriter(out_handle, index=False, xml=None) # Fake the header... w._number_of_reads = len(records) w._index_start = 0 w._index_length = 0 w._key_sequence = records[0].annotations["flow_key"] w._flow_chars = records[0].annotations["flow_chars"] w._number_of_flows_per_read = len(w._flow_chars) w.write_header() w._index_start = out_handle.tell() w._index_length = len(index) out_handle.seek(0) w.write_header() # this time with index info w.handle.write(index) for record in records: w.write_record(record)
except ImportError: sys.exit("Requires Biopython 1.54 or later") try: from Bio.SeqIO.SffIO import ReadRocheXmlManifest except ImportError: # Prior to Biopython 1.56 this was a private function from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest in_handle = open(in_file, "rb") # must be binary mode! try: manifest = ReadRocheXmlManifest(in_handle) except ValueError: manifest = None out_handle = open(out_file, "wb") writer = SffWriter(out_handle, xml=manifest) in_handle.seek(0) # start again after getting manifest count = writer.write_file(rename_seqrecords(SffIterator(in_handle), rename)) out_handle.close() in_handle.close() else: # Use Galaxy for FASTA, QUAL or FASTQ if seq_format.lower() in ["fasta", "csfasta"] or seq_format.lower().startswith( "qual" ): from galaxy_utils.sequence.fasta import fastaReader, fastaWriter reader = fastaReader(open(in_file, "rU")) writer = fastaWriter(open(out_file, "w")) marker = ">" elif seq_format.lower().startswith("fastq"):
short_clipped += 1 elif keep_negatives: if len(seq) >= min_len: negs += 1 yield record else: short_neg += 1 in_handle = open(in_file, "rb") try: manifest = ReadRocheXmlManifest(in_handle) except ValueError: manifest = None in_handle.seek(0) out_handle = open(out_file, "wb") writer = SffWriter(out_handle, xml=manifest) writer.write_file(process(SffIterator(in_handle))) #End of SFF code elif seq_format.lower().startswith("fastq"): in_handle = open(in_file, "rU") out_handle = open(out_file, "w") reader = fastqReader(in_handle) writer = fastqWriter(out_handle) if forward: for record in reader: seq = record.sequence.upper() result = primer.search(seq) if result: #Forward primer, take everything after it cut = result.end() record.sequence = seq[cut:]
try: from Bio.SeqIO.SffIO import ReadRocheXmlManifest except ImportError: #Prior to Biopython 1.56 this was a private function from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest in_handle = open(in_file, "rb") #must be binary mode! try: manifest = ReadRocheXmlManifest(in_handle) except ValueError: manifest = None #This makes two passes though the SFF file with isn't so efficient, #but this makes the code simple. pos_count = neg_count = 0 if out_positive_file is not None: out_handle = open(out_positive_file, "wb") writer = SffWriter(out_handle, xml=manifest) in_handle.seek(0) #start again after getting manifest pos_count = writer.write_file(rec for rec in SffIterator(in_handle) if clean_name(rec.id) in ids) out_handle.close() if out_negative_file is not None: out_handle = open(out_negative_file, "wb") writer = SffWriter(out_handle, xml=manifest) in_handle.seek(0) #start again neg_count = writer.write_file(rec for rec in SffIterator(in_handle) if clean_name(rec.id) not in ids) out_handle.close() #And we're done in_handle.close() #At the time of writing, Galaxy doesn't show SFF file read counts, #so it is useful to put them in stdout and thus shown in job info.