def FastqPairedIterator(read1, read2):
    """Open the two mate files and return their handles as a pair.

    When both arguments are equal the file is opened only once and the very
    same handle is returned twice.
    """
    p1fp = must_open(read1)
    # Share the single handle when both names refer to the same file.
    p2fp = p1fp if read1 == read2 else must_open(read2)
    return p1fp, p2fp
def splitread(args):
    """
    %prog splitread fastqfile

    Split fastqfile into two read fastqfiles, cut in the middle.
    """
    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    parser = OptionParser(splitread.__doc__)
    parser.add_option(
        "-n", dest="n", default=76, type="int", help="Split at N-th base position"
    )
    parser.add_option(
        "--rc",
        default=False,
        action="store_true",
        help="Reverse complement second read",
    )
    opts, args = parser.parse_args(args)

    if len(args) != 1:
        sys.exit(not parser.print_help())

    pairsfastq = args[0]

    # Output names are built from the input basename up to its first dot.
    prefix = op.basename(pairsfastq).split(".")[0]
    fq1, fq2 = prefix + ".1.fastq", prefix + ".2.fastq"
    fw1 = must_open(fq1, "w")
    fw2 = must_open(fq2, "w")
    fp = must_open(pairsfastq)

    split_at = opts.n
    # Reads shorter than 8/5 of the split position are reported and skipped.
    min_length = split_at * 8 / 5
    for name, seq, qual in FastqGeneralIterator(fp):
        seqlen = len(seq)
        if seqlen < min_length:
            logging.error("Skipping read {0}, length={1}".format(name, seqlen))
            continue

        header = "@" + name
        first = FastqLite(header, seq[:split_at], qual[:split_at])
        second = FastqLite(header, seq[split_at:], qual[split_at:])
        if opts.rc:
            second.rc()

        print(first, file=fw1)
        print(second, file=fw2)

    logging.debug("Reads split into `{0},{1}`".format(fq1, fq2))
    fw1.close()
    fw2.close()
def pairinplace(args):
    """
    %prog pairinplace bulk.fastq

    Pair up the records in bulk.fastq by comparing the names for adjancent
    records. If they match, print to bulk.pairs.fastq, else print to
    bulk.frags.fastq.
    """
    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    from jcvi.utils.iter import pairwise

    p = OptionParser(pairinplace.__doc__)
    p.add_option("-r", dest="rclip", default=1, type="int",
                 help="pair ID is derived from rstrip N chars [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastqfile, = args
    base = op.basename(fastqfile).split(".")[0]

    frags = base + ".frags.fastq"
    pairs = base + ".pairs.fastq"
    # Mirror the compression of the input in the output names.
    if fastqfile.endswith(".gz"):
        frags += ".gz"
        pairs += ".gz"

    fragsfw = must_open(frags, "w")
    pairsfw = must_open(pairs, "w")

    N = opts.rclip

    def strip_name(name):
        # The original `lambda x: x[:-N] if N else str` returned the `str`
        # type itself when N == 0; return the unchanged name instead.
        return name[:-N] if N else name

    fh_iter = iter_fastq(fastqfile, key=strip_name)
    skipflag = False  # controls the iterator skip
    a = None  # stays None when the input holds no records at all
    for a, b in pairwise(fh_iter):
        if b is None:  # hit the eof
            break

        if skipflag:
            # `a` was already written as the second mate of the previous pair
            skipflag = False
            continue

        # NOTE(review): compares the `.id` attribute of the records; confirm
        # that the record class exposes it (other code in this file uses `.name`).
        if a.id == b.id:
            print(a, file=pairsfw)
            print(b, file=pairsfw)
            skipflag = True
        else:
            print(a, file=fragsfw)

    # don't forget the last one, when b is None
    if a is not None and not skipflag:
        print(a, file=fragsfw)

    # Flush and release both outputs before reporting success.
    fragsfw.close()
    pairsfw.close()

    logging.debug("Reads paired into `%s` and `%s`" % (pairs, frags))
def suffix(args):
    """
    %prog suffix fastqfile CAG

    Filter reads based on suffix.
    """
    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    from jcvi.utils.cbook import percentage

    p = OptionParser(suffix.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastqfile, sf = args
    fw = must_open(opts.outfile, "w")
    nreads = nselected = 0
    for rec in iter_fastq(fastqfile):
        # iter_fastq ends with a None sentinel; test for it before counting so
        # the sentinel is not tallied as a read.
        if rec is None:
            break
        nreads += 1
        if rec.seq.endswith(sf):
            print(rec, file=fw)
            nselected += 1

    if nreads:
        logging.debug("Selected reads with suffix {0}: {1}".format(
            sf, percentage(nselected, nreads)))
    else:
        # presumably percentage() divides by the total; skip it for empty input
        logging.debug("No reads found in `{0}`".format(fastqfile))
def uniq(args):
    """
    %prog uniq fastqfile

    Retain only first instance of duplicate reads. Duplicate is defined as
    having the same read name.
    """
    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    p = OptionParser(uniq.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastqfile, = args
    fw = must_open(opts.outfile, "w")
    nduplicates = nreads = 0
    seen = set()  # read names already written
    for rec in iter_fastq(fastqfile):
        # iter_fastq ends with a None sentinel; test for it before counting so
        # the sentinel is not tallied as a read.
        if rec is None:
            break
        nreads += 1
        name = rec.name
        if name in seen:
            nduplicates += 1
            continue
        seen.add(name)
        print(rec, file=fw)

    if nreads:
        logging.debug("Removed duplicate reads: {}".format(
            percentage(nduplicates, nreads)))
    else:
        # presumably percentage() divides by the total; skip it for empty input
        logging.debug("No reads found in `{0}`".format(fastqfile))
def uniq(args):
    """
    %prog uniq fastqfile

    Retain only first instance of duplicate reads. Duplicate is defined as
    having the same read name.
    """
    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    p = OptionParser(uniq.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastqfile, = args
    fw = must_open(opts.outfile, "w")
    nduplicates = nreads = 0
    seen = set()  # read names already written
    for rec in iter_fastq(fastqfile):
        # iter_fastq ends with a None sentinel; test for it before counting so
        # the sentinel is not tallied as a read.
        if rec is None:
            break
        nreads += 1
        name = rec.name
        if name in seen:
            nduplicates += 1
            continue
        seen.add(name)
        # print() call instead of the Python 2 `print >> fw, rec` statement
        print(rec, file=fw)

    if nreads:
        logging.debug("Removed duplicate reads: {}".format(
            percentage(nduplicates, nreads)))
    else:
        # presumably percentage() divides by the total; skip it for empty input
        logging.debug("No reads found in `{0}`".format(fastqfile))
def suffix(args):
    """
    %prog suffix fastqfile CAG

    Filter reads based on suffix.
    """
    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    p = OptionParser(suffix.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastqfile, sf = args
    fw = must_open(opts.outfile, "w")
    nreads = nselected = 0
    for rec in iter_fastq(fastqfile):
        # iter_fastq ends with a None sentinel; test for it before counting so
        # the sentinel is not tallied as a read.
        if rec is None:
            break
        nreads += 1
        if rec.seq.endswith(sf):
            # print() call instead of the Python 2 `print >> fw, rec` statement
            print(rec, file=fw)
            nselected += 1

    if nreads:
        logging.debug("Selected reads with suffix {0}: {1}".format(
            sf, percentage(nselected, nreads)))
    else:
        # presumably percentage() divides by the total; skip it for empty input
        logging.debug("No reads found in `{0}`".format(fastqfile))
def catread(args):
    """
    %prog catread fastqfile1 fastqfile2

    Concatenate paired end reads into one. Useful for example to do single-end
    mapping and perform filtering on the whole read pair level.
    """
    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    p = OptionParser(catread.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    r1, r2 = args
    p1fp, p2fp = FastqPairedIterator(r1, r2)
    outfile = pairspf((r1, r2)) + ".cat.fastq"
    fw = must_open(outfile, "w")
    while True:
        # Each record is consumed as a chunk of four lines.
        a = list(islice(p1fp, 4))
        if not a:
            break
        atitle, aseq, _, aqual = a
        # Raises ValueError if the second input runs out before the first.
        btitle, bseq, _, bqual = list(islice(p2fp, 4))
        # Keep the first mate's title; join sequences and qualities end to end.
        merged = "\n".join((atitle.strip(), aseq.strip() + bseq.strip(),
                            "+", aqual.strip() + bqual.strip()))
        print(merged, file=fw)

    # Flush the concatenated reads to disk.
    fw.close()
def splitread(args):
    """
    %prog splitread fastqfile

    Split fastqfile into two read fastqfiles, cut in the middle.
    """
    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    p = OptionParser(splitread.__doc__)
    p.add_option("-n", dest="n", default=76, type="int",
                 help="Split at N-th base position [default: %default]")
    p.add_option("--rc", default=False, action="store_true",
                 help="Reverse complement second read [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    pairsfastq, = args

    # Output names are built from the input basename up to its first dot.
    base = op.basename(pairsfastq).split(".")[0]
    fq1 = base + ".1.fastq"
    fq2 = base + ".2.fastq"
    fw1 = must_open(fq1, "w")
    fw2 = must_open(fq2, "w")

    fp = must_open(pairsfastq)
    n = opts.n

    for name, seq, qual in FastqGeneralIterator(fp):
        name = "@" + name
        # Everything before position n goes to read 1, the rest to read 2.
        rec1 = FastqLite(name, seq[:n], qual[:n])
        rec2 = FastqLite(name, seq[n:], qual[n:])
        if opts.rc:
            rec2.rc()

        # print() calls instead of the Python 2 `print >> fh, rec` statements
        print(rec1, file=fw1)
        print(rec2, file=fw2)

    logging.debug("Reads split into `{0},{1}`".format(fq1, fq2))
    fw1.close()
    fw2.close()
def splitread(args):
    """
    %prog splitread fastqfile

    Split fastqfile into two read fastqfiles, cut in the middle.
    """
    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    parser = OptionParser(splitread.__doc__)
    parser.add_option(
        "-n",
        dest="n",
        default=76,
        type="int",
        help="Split at N-th base position [default: %default]",
    )
    parser.add_option(
        "--rc",
        default=False,
        action="store_true",
        help="Reverse complement second read [default: %default]",
    )
    opts, args = parser.parse_args(args)

    if len(args) != 1:
        sys.exit(not parser.print_help())

    pairsfastq = args[0]

    # Output names are built from the input basename up to its first dot.
    stem = op.basename(pairsfastq).split(".")[0]
    fq1 = stem + ".1.fastq"
    fq2 = stem + ".2.fastq"
    out1 = must_open(fq1, "w")
    out2 = must_open(fq2, "w")
    reads = must_open(pairsfastq)

    cut = opts.n
    # Reads shorter than 8/5 of the cut position are reported and skipped.
    shortest_allowed = cut * 8 / 5
    for name, seq, qual in FastqGeneralIterator(reads):
        if not len(seq) >= shortest_allowed:
            logging.error("Skipping read {0}, length={1}".format(name, len(seq)))
            continue

        title = "@" + name
        left = FastqLite(title, seq[:cut], qual[:cut])
        right = FastqLite(title, seq[cut:], qual[cut:])
        if opts.rc:
            right.rc()

        print(left, file=out1)
        print(right, file=out2)

    logging.debug("Reads split into `{0},{1}`".format(fq1, fq2))
    out1.close()
    out2.close()
def shuffle(args):
    """
    %prog shuffle p1.fastq p2.fastq

    Shuffle pairs into interleaved format.
    """
    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    parser = OptionParser(shuffle.__doc__)
    parser.set_tag()
    opts, args = parser.parse_args(args)

    if len(args) != 2:
        sys.exit(not parser.print_help())

    p1, p2 = args
    pairsfastq = pairspf((p1, p2)) + ".fastq"
    tag = opts.tag

    p1fp = must_open(p1)
    p2fp = must_open(p2)
    pairsfw = must_open(pairsfastq, "w")

    nreads = 0
    # iter(callable, sentinel): keep pulling 4-line chunks from the first file
    # until an empty chunk signals that it is exhausted.
    for rec1 in iter(lambda: list(islice(p1fp, 4)), []):
        rec2 = list(islice(p2fp, 4))
        if tag:
            # Both mates get the first mate's header, suffixed with /1 and /2.
            name = rec1[0].rstrip()
            rec1[0] = name + "/1\n"
            rec2[0] = name + "/2\n"

        pairsfw.writelines(rec1)
        pairsfw.writelines(rec2)
        nreads += 2

    pairsfw.close()

    # Tagging appends two characters ("/1" or "/2") to every read header.
    extra = 0
    if tag:
        extra = nreads * 2
    checkShuffleSizes(p1, p2, pairsfastq, extra=extra)

    logging.debug(
        "File `{0}` verified after writing {1} reads.".format(pairsfastq, nreads)
    )
    return pairsfastq
def shuffle(args):
    """
    %prog shuffle p1.fastq p2.fastq

    Shuffle pairs into interleaved format.
    """
    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    parser = OptionParser(shuffle.__doc__)
    parser.set_tag()
    opts, args = parser.parse_args(args)

    if len(args) != 2:
        sys.exit(not parser.print_help())

    p1, p2 = args
    pairsfastq = pairspf((p1, p2)) + ".fastq"
    tag = opts.tag

    first_fh = must_open(p1)
    second_fh = must_open(p2)
    out_fh = must_open(pairsfastq, "w")

    nreads = 0
    while True:
        # One record is a chunk of four consecutive lines.
        mate1 = list(islice(first_fh, 4))
        if len(mate1) == 0:
            break
        mate2 = list(islice(second_fh, 4))

        if tag:
            # Both mates get the first mate's header, suffixed with /1 and /2.
            header = mate1[0].rstrip()
            mate1[0] = header + "/1\n"
            mate2[0] = header + "/2\n"

        out_fh.writelines(mate1)
        out_fh.writelines(mate2)
        nreads += 2

    out_fh.close()

    # Tagging appends two characters ("/1" or "/2") to every read header.
    if tag:
        extra = nreads * 2
    else:
        extra = 0
    checkShuffleSizes(p1, p2, pairsfastq, extra=extra)

    message = "File `{0}` verified after writing {1} reads.".format(pairsfastq, nreads)
    logging.debug(message)
    return pairsfastq
def shuffle(args):
    """
    %prog shuffle p1.fastq p2.fastq pairs.fastq

    Shuffle pairs into interleaved format, using `shuffleSequences_fastq.pl`.
    """
    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    # The unused `from itertools import izip` was dropped: izip does not exist
    # on Python 3, so the import alone made this function fail there.
    p = OptionParser(shuffle.__doc__)
    p.add_option("--tag", dest="tag", default=False, action="store_true",
                 help="add tag (/1, /2) to the read name")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    p1, p2, pairsfastq = args
    tag = opts.tag

    p1fp = must_open(p1)
    p2fp = must_open(p2)
    pairsfw = must_open(pairsfastq, "w")
    nreads = 0
    while True:
        # One record is a chunk of four consecutive lines.
        a = list(islice(p1fp, 4))
        if not a:
            break

        b = list(islice(p2fp, 4))
        if tag:
            # Both mates get the first mate's header, suffixed with /1 and /2.
            name = a[0].rstrip()
            a[0] = name + "/1\n"
            b[0] = name + "/2\n"

        pairsfw.writelines(a)
        pairsfw.writelines(b)
        nreads += 2

    pairsfw.close()
    # Tagging appends two characters ("/1" or "/2") to every read header.
    extra = nreads * 2 if tag else 0
    checkShuffleSizes(p1, p2, pairsfastq, extra=extra)

    logging.debug("File sizes verified after writing {0} reads.".format(nreads))
def iter_fastq(filename, offset=0, key=None):
    """Yield FastqRecord objects from a path or an open handle.

    `filename` may be a string, which is opened with must_open, or any other
    object, which is used directly as the handle. `offset` and `key` are
    forwarded unchanged to FastqRecord. Iteration stops at the first record
    whose `name` is falsy, after which a single None is yielded as an
    end-of-input sentinel.
    """
    if not isinstance(filename, str):
        fh = filename
    else:
        logging.debug("Read file `{0}`".format(filename))
        fh = must_open(filename)

    rec = FastqRecord(fh, offset=offset, key=key)
    while rec.name:
        yield rec
        # The next record is only read once the consumer asks for it.
        rec = FastqRecord(fh, offset=offset, key=key)

    yield None  # sentinel
def split_barcode(t):
    """Write the reads that start with one barcode to `<outdir>/<barcode.id>.fastq`.

    `t` is a 4-tuple `(barcode, excludebarcode, outdir, inputfile)`. A read is
    kept when its sequence starts with `barcode.seq` and does not start with
    the `.seq` of any entry in `excludebarcode`. The barcode is trimmed from
    both sequence and quality before the read is written.
    """
    barcode, excludebarcode, outdir, inputfile = t
    trim = len(barcode.seq)

    fp = must_open(inputfile)
    outfastq = op.join(outdir, barcode.id + ".fastq")
    # `with` guarantees the output is flushed and closed even on error.
    with open(outfastq, "w") as fw:
        for title, seq, qual in FastqGeneralIterator(fp):
            if seq[:trim] != barcode.seq:
                continue
            hasexclude = any(seq.startswith(x.seq) for x in excludebarcode)
            if hasexclude:
                continue
            # print() call instead of the Python 2 `print >> fw, ...` statement
            print("@{0}\n{1}\n+\n{2}".format(title, seq[trim:], qual[trim:]),
                  file=fw)
def split_barcode(t):
    """Write the reads that start with one barcode to `<outdir>/<barcode.id>.fastq`.

    `t` is a 5-tuple `(barcode, excludebarcode, site, outdir, inputfile)`. A
    read is kept when its sequence starts with `barcode.seq`, does not start
    with the `.seq` of any entry in `excludebarcode`, and, once the barcode is
    trimmed off, starts with one of the strings in `site`. Trimmed sequence
    and quality are written.
    """
    barcode, excludebarcode, site, outdir, inputfile = t
    trim = len(barcode.seq)

    fp = must_open(inputfile)
    outfastq = op.join(outdir, barcode.id + ".fastq")
    # `with` guarantees the output is flushed and closed even on error.
    with open(outfastq, "w") as fw:
        for title, seq, qual in FastqGeneralIterator(fp):
            if seq[:trim] != barcode.seq:
                continue
            hasexclude = any(seq.startswith(x.seq) for x in excludebarcode)
            if hasexclude:
                continue
            # The site check is applied to the sequence after barcode removal.
            seq = seq[trim:]
            hassite = any(seq.startswith(x) for x in site)
            if not hassite:
                continue
            # print() call instead of the Python 2 `print >> fw, ...` statement
            print("@{0}\n{1}\n+\n{2}".format(title, seq, qual[trim:]), file=fw)
def pairinplace(args):
    """
    %prog pairinplace bulk.fastq

    Pair up the records in bulk.fastq by comparing the names for adjancent
    records. If they match, print to bulk.pairs.fastq, else print to
    bulk.frags.fastq.
    """
    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    from jcvi.utils.iter import pairwise

    p = OptionParser(pairinplace.__doc__)
    # presumably these register the options read below as opts.rclip / opts.tag
    p.set_rclip()
    p.set_tag()
    p.add_option("--base",
                 help="Base name for the output files [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastqfile, = args
    base = opts.base or op.basename(fastqfile).split(".")[0]

    frags = base + ".frags.fastq"
    pairs = base + ".pairs.fastq"
    # Mirror the compression of the input in the output names.
    if fastqfile.endswith(".gz"):
        frags += ".gz"
        pairs += ".gz"

    fragsfw = must_open(frags, "w")
    pairsfw = must_open(pairs, "w")

    N = opts.rclip
    tag = opts.tag

    # Names are compared after dropping their last N characters (if N is set).
    strip_name = (lambda x: x[:-N]) if N else None

    fh_iter = iter_fastq(fastqfile, key=strip_name)
    skipflag = False  # controls the iterator skip
    a = None  # stays None when the input holds no records at all
    for a, b in pairwise(fh_iter):
        if b is None:  # hit the eof
            break

        if skipflag:
            # `a` was already written as the second mate of the previous pair
            skipflag = False
            continue

        if a.name == b.name:
            if tag:
                a.name += "/1"
                b.name += "/2"
            print(a, file=pairsfw)
            print(b, file=pairsfw)
            skipflag = True
        else:
            print(a, file=fragsfw)

    # don't forget the last one, when b is None
    if a is not None and not skipflag:
        print(a, file=fragsfw)

    # Flush and release both outputs so that callers using the returned path
    # see the complete file.
    fragsfw.close()
    pairsfw.close()

    logging.debug("Reads paired into `%s` and `%s`" % (pairs, frags))
    return pairs