def fasta(args):
    """
    %prog fasta fastafile

    Convert reads formatted as FASTA file, and convert to CA frg file. If .qual
    file is found, then use it, otherwise just make a fake qual file. Mates are
    assumed as adjacent sequence records (i.e. /1, /2, /1, /2 ...) unless a
    matefile is given.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so developer-facing notes live in comments rather than in the docstring.
    #
    # `args` is the list of command-line tokens for this sub-command. The
    # function returns None; it prints the help and exits when the number of
    # positional arguments is not exactly one.
    from jcvi.formats.fasta import clean, make_qual

    p = OptionParser(fasta.__doc__)
    p.add_option(
        "--clean",
        default=False,
        action="store_true",
        help="Clean up irregular chars in seq",
    )
    p.add_option("--matefile", help="Matepairs file")
    p.add_option(
        "--maxreadlen",
        default=262143,
        type="int",
        help="Maximum read length allowed",
    )
    p.add_option(
        "--minreadlen",
        default=1000,
        type="int",
        help="Minimum read length allowed",
    )
    p.add_option(
        "--sequential",
        default=False,
        action="store_true",
        help="Overwrite read name (e.g. long Pacbio name)",
    )
    p.set_size()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (fastafile,) = args
    maxreadlen = opts.maxreadlen
    minreadlen = opts.minreadlen

    # A positive --maxreadlen enables the over-length check; 0 disables it.
    if maxreadlen > 0:
        needs_split = False
        records = Fasta(fastafile, lazy=True)
        for seqid, size in records.itersizes_ordered():
            if size > maxreadlen:
                logging.debug(
                    "Sequence {0} (size={1}) longer than max read len {2}".format(
                        seqid, size, maxreadlen
                    )
                )
                needs_split = True
                break

        if needs_split:
            # Convert every piece separately; --maxreadlen=0 stops the pieces
            # from being checked (and split) again.
            # NOTE(review): only --maxreadlen is forwarded, so the pieces are
            # processed with the defaults of every other option.
            for piece in split_fastafile(fastafile, maxreadlen=maxreadlen):
                fasta([piece, "--maxreadlen=0"])
            return

    plate = op.basename(fastafile).split(".")[0]

    # A non-zero insert size marks the library as mated.
    mated = opts.size != 0
    mean, sv = get_mean_sv(opts.size)

    if mated:
        # Floor division keeps the label an integer ("Sanger2Kb-") on
        # Python 3 as well; true division would yield "Sanger2.0Kb-".
        libname = "Sanger{0}Kb-".format(opts.size // 1000) + plate
    else:
        libname = plate

    frgfile = libname + ".frg"

    if opts.clean:
        cleanfasta = fastafile.rsplit(".", 1)[0] + ".clean.fasta"
        if need_update(fastafile, cleanfasta):
            clean([fastafile, "--canonical", "-o", cleanfasta])
        fastafile = cleanfasta

    if mated:
        # Mated libraries are delegated to the external converter script.
        qualfile = make_qual(fastafile, score=21)
        if opts.matefile:
            matefile = opts.matefile
            # NOTE(review): this check disappears under `python -O`.
            assert op.exists(matefile)
        else:
            matefile = make_matepairs(fastafile)

        cmd = "convert-fasta-to-v2.pl"
        cmd += " -l {0} -s {1} -q {2} ".format(libname, fastafile, qualfile)
        cmd += "-mean {0} -stddev {1} -m {2} ".format(mean, sv, matefile)
        sh(cmd, outfile=frgfile)
        return

    # Unmated reads: write the frg file directly.
    written = discarded = 0
    fw = must_open(frgfile, "w")
    try:
        # Plain write() behaves the same on Python 2 and 3, unlike the
        # Python-2-only `print >> fw` statement.
        fw.write(headerTemplate.format(libID=libname) + "\n")
        for fragID, seq in parse_fasta(fastafile):
            if len(seq) < minreadlen:
                discarded += 1
                continue
            written += 1

            if opts.sequential:
                # Replace the read name with a library-prefixed running number.
                fragID = libname + str(100000000 + written)

            emitFragment(fw, fragID, libname, seq)
    finally:
        # Release the handle even if parsing or emitting fails midway.
        fw.close()

    logging.debug(
        "A total of {0} fragments written to `{1}` ({2} discarded).".format(
            written, frgfile, discarded
        )
    )
def fasta(args):
    """
    %prog fasta fastafile

    Convert reads formatted as FASTA file, and convert to CA frg file. If .qual
    file is found, then use it, otherwise just make a fake qual file. Mates are
    assumed as adjacent sequence records (i.e. /1, /2, /1, /2 ...) unless a
    matefile is given.
    """
    # NOTE: the docstring above doubles as the OptionParser usage text, so
    # maintainer notes are kept in comments instead of the docstring.
    #
    # `args` is the list of command-line tokens for this sub-command. Nothing
    # is returned; the help is printed and the process exits unless exactly
    # one positional argument (the FASTA file) is supplied.
    from jcvi.formats.fasta import clean, make_qual

    p = OptionParser(fasta.__doc__)
    p.add_option(
        "--clean",
        default=False,
        action="store_true",
        help="Clean up irregular chars in seq",
    )
    p.add_option("--matefile", help="Matepairs file")
    p.add_option(
        "--maxreadlen",
        default=262143,
        type="int",
        help="Maximum read length allowed",
    )
    p.add_option(
        "--minreadlen",
        default=1000,
        type="int",
        help="Minimum read length allowed",
    )
    p.add_option(
        "--sequential",
        default=False,
        action="store_true",
        help="Overwrite read name (e.g. long Pacbio name)",
    )
    p.set_size()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (fastafile,) = args
    maxreadlen = opts.maxreadlen
    minreadlen = opts.minreadlen

    # The over-length scan only runs for a positive --maxreadlen.
    if maxreadlen > 0:
        too_long = False
        index = Fasta(fastafile, lazy=True)
        for seqid, size in index.itersizes_ordered():
            if size > maxreadlen:
                logging.debug(
                    "Sequence {0} (size={1}) longer than max read len {2}".format(
                        seqid, size, maxreadlen
                    )
                )
                too_long = True
                break

        if too_long:
            # Process each split file on its own, with the length check
            # switched off (--maxreadlen=0) so it is not split again.
            # NOTE(review): no other option is passed down to the recursive
            # calls; they run with default --size/--clean/--minreadlen etc.
            for chunk in split_fastafile(fastafile, maxreadlen=maxreadlen):
                fasta([chunk, "--maxreadlen=0"])
            return

    plate = op.basename(fastafile).split(".")[0]

    # Insert size of 0 means "unmated".
    mated = opts.size != 0
    mean, sv = get_mean_sv(opts.size)

    if mated:
        # `//` rather than `/`: under Python 3 true division would turn the
        # library name (and the output file name) into "Sanger2.0Kb-...".
        libname = "Sanger{0}Kb-".format(opts.size // 1000) + plate
    else:
        libname = plate

    frgfile = libname + ".frg"

    if opts.clean:
        cleanfasta = fastafile.rsplit(".", 1)[0] + ".clean.fasta"
        if need_update(fastafile, cleanfasta):
            clean([fastafile, "--canonical", "-o", cleanfasta])
        fastafile = cleanfasta

    if mated:
        # Mated input goes through the external conversion script.
        qualfile = make_qual(fastafile, score=21)
        if opts.matefile:
            matefile = opts.matefile
            # NOTE(review): assertion-based validation is skipped with -O.
            assert op.exists(matefile)
        else:
            matefile = make_matepairs(fastafile)

        cmd = "convert-fasta-to-v2.pl"
        cmd += " -l {0} -s {1} -q {2} ".format(libname, fastafile, qualfile)
        cmd += "-mean {0} -stddev {1} -m {2} ".format(mean, sv, matefile)
        sh(cmd, outfile=frgfile)
        return

    # Unmated input: emit the frg records ourselves.
    kept = dropped = 0
    fw = must_open(frgfile, "w")
    try:
        print(headerTemplate.format(libID=libname), file=fw)
        for fragID, seq in parse_fasta(fastafile):
            if len(seq) < minreadlen:
                dropped += 1
                continue
            kept += 1

            if opts.sequential:
                # Swap the original read name for a numbered, library-prefixed one.
                fragID = libname + str(100000000 + kept)

            emitFragment(fw, fragID, libname, seq)
    finally:
        # Always close the output, including when a record fails to emit.
        fw.close()

    logging.debug(
        "A total of {0} fragments written to `{1}` ({2} discarded).".format(
            kept, frgfile, dropped
        )
    )
def fasta(args):
    """
    %prog fasta fastafile

    Convert reads formatted as FASTA file, and convert to CA frg file. If .qual
    file is found, then use it, otherwise just make a fake qual file. Mates are
    assumed as adjacent sequence records (i.e. /1, /2, /1, /2 ...) unless a
    matefile is given.
    """
    # NOTE: the docstring is passed to OptionParser as the usage message, so
    # implementation notes are written as comments to keep --help unchanged.
    #
    # `args` is the list of command-line tokens for this sub-command. The
    # function returns None and exits after printing the help when it does
    # not receive exactly one positional argument.
    from jcvi.formats.fasta import clean, make_qual

    p = OptionParser(fasta.__doc__)
    p.add_option("-m", dest="matefile", default=None, help="matepairs file")
    p.add_option(
        "--maxreadlen",
        default=32000,
        type="int",
        help="Maximum read length allowed [default: %default]",
    )
    p.set_size()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (fastafile,) = args
    maxreadlen = opts.maxreadlen
    records = Fasta(fastafile, lazy=True)

    # A positive --maxreadlen turns on the over-length scan.
    if maxreadlen > 0:
        needs_split = False
        for seqid, size in records.itersizes_ordered():
            if size > maxreadlen:
                logging.debug(
                    "Sequence {0} (size={1}) longer than max read len {2}".format(
                        seqid, size, maxreadlen
                    )
                )
                needs_split = True
                break

        if needs_split:
            # Re-run on every split file with the scan disabled so the pieces
            # are converted as they are.
            # NOTE(review): -m and the size option are not forwarded here.
            for piece in split_fastafile(fastafile, maxreadlen=maxreadlen):
                fasta([piece, "--maxreadlen=0"])
            return

    plate = op.basename(fastafile).split(".")[0]

    # Non-zero insert size => mated library.
    mated = opts.size != 0
    mean, sv = get_mean_sv(opts.size)

    if mated:
        # Floor division keeps the name integral ("Sanger2Kb-") on Python 3;
        # it is identical to `/` for ints on Python 2.
        libname = "Sanger{0}Kb-".format(opts.size // 1000) + plate
    else:
        libname = "SangerFrags-" + plate

    frgfile = libname + ".frg"

    # Always work from the cleaned copy, regenerating it only when needed.
    cleanfasta = fastafile.rsplit(".", 1)[0] + ".clean.fasta"
    if need_update(fastafile, cleanfasta):
        clean([fastafile, "--canonical", "-o", cleanfasta])
    fastafile = cleanfasta

    qualfile = make_qual(fastafile, score=21)
    if mated:
        if opts.matefile:
            matefile = opts.matefile
            # NOTE(review): this existence check is dropped under `python -O`.
            assert op.exists(matefile)
        else:
            matefile = make_matepairs(fastafile)

    cmd = "convert-fasta-to-v2.pl"
    cmd += " -l {0} -s {1} -q {2} ".format(libname, fastafile, qualfile)
    if mated:
        # Mate statistics and the mate file are only passed for mated libraries.
        cmd += "-mean {0} -stddev {1} -m {2} ".format(mean, sv, matefile)

    sh(cmd, outfile=frgfile)