def splitFasta(inpFasta, maxChunkSize, outBase=None, sfxSep=None, openWriter=None, lineLen=80, verbose=True):
    """Split input multi-FASTA file into multiple files of fixed size.

    This is a generator: nothing is read or written until it is iterated.
    After each record is written to the current chunk, a tuple
    (sequence id, sequence length) is yielded.

    @param inpFasta One of: an object with a records() method (used as is
    and not closed here); a file-like object (has read()) or a string, which
    is wrapped into a FastaReader that is closed when iteration finishes;
    or any other iterable of record objects. Each record must provide
    header(), getId() and sequence().
    @param maxChunkSize Target total sequence length per chunk. The length
    of the next sequence is approximated by the length of the last one
    written, so a chunk is closed as soon as the accumulated length plus the
    last sequence length reaches or exceeds this value.
    @param outBase Prefix of output names for the default writer
    (default is an empty string). Must be None if openWriter is given.
    @param sfxSep Separator between outBase and the zero-padded four-digit
    chunk index (default is '_'). Must be None if openWriter is given.
    @param openWriter Optional callable that takes the chunk index and
    returns a writer object with record(header,sequence) and close()
    methods. If None, a FastaWriter is created for each chunk.
    @param lineLen Line length passed to the default FastaWriter.
    @param verbose If true, print a message each time a chunk is opened.

    Chunk 0 is always opened, even when the input has no records. Every
    following chunk is opened only when there is a record to put into it,
    so that no empty trailing chunk is created when the last record happens
    to fill up its chunk.
    """
    assert (outBase is None and sfxSep is None) or openWriter is None,\
            "Incompatible argument combination"
    if outBase is None:
        outBase = ""
    if sfxSep is None:
        sfxSep = '_'
    inpClose = False
    if hasattr(inpFasta,"records"):
        records = inpFasta.records()
    elif hasattr(inpFasta,"read") or is_string(inpFasta):
        inpFasta = FastaReader(inpFasta)
        # we created the reader, so we are responsible for closing it
        inpClose = True
        records = inpFasta.records()
    else:
        records = inpFasta
    try:
        if openWriter is None:
            def _open_new_out(iChunk):
                return FastaWriter(outBase+'%s%04d'%(sfxSep,iChunk,),
                        lineLen=lineLen,mode="w")
        else:
            _open_new_out = openWriter

        def _start_chunk(iChunk):
            # open the writer for chunk iChunk and report it if requested
            writer = _open_new_out(iChunk)
            if verbose:
                # single parenthesised argument prints the same text under
                # both the print statement and the print function
                print("Writing chunk %i of target size %i" % (iChunk,maxChunkSize))
            return writer

        iChunk = 0
        chunkSize = 0
        out = _start_chunk(iChunk)
        try:
            for rec in records:
                if out is None:
                    # previous chunk was filled up; start the next one only
                    # now that we know there is a record to write into it
                    iChunk += 1
                    out = _start_chunk(iChunk)
                hdr = rec.header()
                idSeq = rec.getId()
                seq = rec.sequence()
                out.record(hdr,seq)
                lenSeq = len(seq)
                yield (idSeq,lenSeq)
                chunkSize += lenSeq
                # we approximate the next seq length by the last one
                if chunkSize + lenSeq >= maxChunkSize:
                    # drop our reference first so that a failing close()
                    # is not attempted again in the finally clause
                    full, out = out, None
                    full.close()
                    chunkSize = 0
        finally:
            if out is not None:
                out.close()
    finally:
        if inpClose:
            inpFasta.close()
def __init__(self,infile):
    """Ctor.
    @param infile Either a string with a file name, or an iterator that
    returns lines (each line should be terminated with a new line), e.g. a
    file object. The iterator may also be a filter that reads another
    FastaReader object, transforms the records and emits them as lines."""
    # A file name is opened here with openCompressed(); in that case this
    # object records that it owns the resulting stream.
    openedHere = False
    if is_string(infile):
        infile = openCompressed(infile,'r')
        openedHere = True
    self.infile = infile
    self.ownInfile = openedHere
    # Parsing state flag, initially off.
    # NOTE(review): exact meaning is defined by the methods that use it.
    self.freshHdr = False
    # Running statistics, all starting from zero.
    # NOTE(review): presumably updated while records are read - confirm.
    self.seqTotal = 0
    self.symbolsTotal = 0
    self.maxLineLen = 0