def align(args): """ %prog align database.fasta read1.fq read2.fq Wrapper for `gsnap` single-end or paired-end, depending on the number of args. """ from jcvi.formats.fasta import join from jcvi.formats.fastq import guessoffset from jcvi.projects.tgbs import snp p = OptionParser(align.__doc__) p.add_option("--join", default=False, action="store_true", help="Join sequences with padded 50Ns") p.add_option("--rnaseq", default=False, action="store_true", help="Input is RNA-seq reads, turn splicing on") p.add_option("--snp", default=False, action="store_true", help="Call SNPs after GSNAP") p.set_cpus() opts, args = p.parse_args(args) if len(args) == 2: logging.debug("Single-end alignment") elif len(args) == 3: logging.debug("Paired-end alignment") else: sys.exit(not p.print_help()) dbfile, readfile = args[0:2] if opts.join: dbfile = join([dbfile, "--gapsize=50", "--newid=chr1"]) assert op.exists(dbfile) and op.exists(readfile) prefix = get_prefix(readfile, dbfile) logfile = prefix + ".log" gsnapfile = prefix + ".gsnap" if not need_update((dbfile, readfile), gsnapfile): logging.error("`{0}` exists. `gsnap` already run.".format(gsnapfile)) else: dbdir, dbname = check_index(dbfile) cmd = "gsnap -D {0} -d {1}".format(dbdir, dbname) cmd += " -B 5 -m 0.1 -i 2 -n 3" # memory, mismatch, indel penalty, nhits if opts.rnaseq: cmd += " -N 1" cmd += " -t {0}".format(opts.cpus) cmd += " --gmap-mode none --nofails" if readfile.endswith(".gz"): cmd += " --gunzip" try: offset = "sanger" if guessoffset([readfile]) == 33 else "illumina" cmd += " --quality-protocol {0}".format(offset) except AssertionError: pass cmd += " " + " ".join(args[1:]) sh(cmd, outfile=gsnapfile, errfile=logfile) if opts.snp: snp([gsnapfile, "--cpus={0}".format(opts.cpus)]) return gsnapfile, logfile
def assemble(args): """ %prog assemble sffdir Assemble each BAC separately using newbler. """ from jcvi.formats.fasta import join p = OptionParser(assemble.__doc__) p.add_option("--overwrite", default=False, action="store_true", help="Overwrite the separate BAC assembly [default: %default]") opts, args = p.parse_args(args) if len(args) != 1: sys.exit(not p.print_help()) sffdir, = args asmdir = "newbler" fastadir = "fasta" mkdir(asmdir, overwrite=opts.overwrite) mkdir(fastadir, overwrite=opts.overwrite) cmd = "runAssembly -cpu 8 -o {0} {1}" for sffile in glob("{0}/*.sff".format(sffdir)): pf = op.basename(sffile).split(".")[1] pf = pf.lower() outdir = op.join(asmdir, pf) if op.exists(outdir): logging.debug("`{0}` exists. Ignored.".format(outdir)) continue acmd = cmd.format(outdir, sffile) sh(acmd) ctgfile = op.join(outdir, "454LargeContigs.fna") if not op.exists(ctgfile): # newbler failure logging.error("File `{0}` not found (newbler failure).".\ format(ctgfile)) continue outfile = op.join(fastadir, "{0}.fasta".format(pf)) newidopt = "--newid={0}".format(pf) minctgsizeopt = "--minctgsize=200" join([ctgfile, outfile, newidopt, minctgsizeopt])
def scaffold(args): """ %prog scaffold ctgfasta reads1.fasta mapping1.bed reads2.fasta mapping2.bed ... Run BAMBUS on set of contigs, reads and read mappings. """ from jcvi.formats.base import FileMerger from jcvi.formats.bed import mates from jcvi.formats.contig import frombed from jcvi.formats.fasta import join from jcvi.utils.iter import grouper p = OptionParser(scaffold.__doc__) p.add_option("--conf", help="BAMBUS configuration file [default: %default]") p.add_option( "--prefix", default=False, action="store_true", help="Only keep links between IDs with same prefix [default: %default]" ) opts, args = p.parse_args(args) nargs = len(args) if nargs < 3 or nargs % 2 != 1: sys.exit(not p.print_help()) ctgfasta = args[0] duos = list(grouper(2, args[1:])) trios = [] for fastafile, bedfile in duos: prefix = bedfile.rsplit(".", 1)[0] matefile = prefix + ".mates" matebedfile = matefile + ".bed" if need_update(bedfile, [matefile, matebedfile]): matesopt = [bedfile, "--lib", "--nointra"] if opts.prefix: matesopt += ["--prefix"] matefile, matebedfile = mates(matesopt) trios.append((fastafile, matebedfile, matefile)) # Merge the readfasta, bedfile and matefile bbfasta, bbbed, bbmate = "bambus.reads.fasta", "bambus.bed", "bambus.mates" for files, outfile in zip(zip(*trios), (bbfasta, bbbed, bbmate)): FileMerger(files, outfile=outfile).merge(checkexists=True) ctgfile = "bambus.contig" idsfile = "bambus.ids" frombedInputs = [bbbed, ctgfasta, bbfasta] if need_update(frombedInputs, ctgfile): frombed(frombedInputs) inputfasta = "bambus.contigs.fasta" singletonfasta = "bambus.singletons.fasta" cmd = "faSomeRecords {0} {1} ".format(ctgfasta, idsfile) sh(cmd + inputfasta) sh(cmd + singletonfasta + " -exclude") # Run bambus prefix = "bambus" cmd = "goBambus -c {0} -m {1} -o {2}".format(ctgfile, bbmate, prefix) if opts.conf: cmd += " -C {0}".format(opts.conf) sh(cmd) cmd = "untangle -e {0}.evidence.xml -s {0}.out.xml -o {0}.untangle.xml".\ format(prefix) sh(cmd) final = "final" cmd = "printScaff -e {0}.evidence.xml -s {0}.untangle.xml -l {0}.lib " \ "-merge -detail -oo -sum -o {1}".format(prefix, final) sh(cmd) oofile = final + ".oo" join([inputfasta, "--oo={0}".format(oofile)])
def scaffold(args): """ %prog scaffold ctgfasta reads1.fasta mapping1.bed reads2.fasta mapping2.bed ... Run BAMBUS on set of contigs, reads and read mappings. """ from jcvi.formats.base import FileMerger from jcvi.formats.bed import mates from jcvi.formats.contig import frombed from jcvi.formats.fasta import join from jcvi.utils.iter import grouper p = OptionParser(scaffold.__doc__) p.set_rclip(rclip=1) p.add_option("--conf", help="BAMBUS configuration file [default: %default]") p.add_option("--prefix", default=False, action="store_true", help="Only keep links between IDs with same prefix [default: %default]") opts, args = p.parse_args(args) nargs = len(args) if nargs < 3 or nargs % 2 != 1: sys.exit(not p.print_help()) rclip = opts.rclip ctgfasta = args[0] duos = list(grouper(args[1:], 2)) trios = [] for fastafile, bedfile in duos: prefix = bedfile.rsplit(".", 1)[0] matefile = prefix + ".mates" matebedfile = matefile + ".bed" if need_update(bedfile, [matefile, matebedfile]): matesopt = [bedfile, "--lib", "--nointra", "--rclip={0}".format(rclip), "--cutoff={0}".format(opts.cutoff)] if opts.prefix: matesopt += ["--prefix"] matefile, matebedfile = mates(matesopt) trios.append((fastafile, matebedfile, matefile)) # Merge the readfasta, bedfile and matefile bbfasta, bbbed, bbmate = "bambus.reads.fasta", "bambus.bed", "bambus.mates" for files, outfile in zip(zip(*trios), (bbfasta, bbbed, bbmate)): FileMerger(files, outfile=outfile).merge(checkexists=True) ctgfile = "bambus.contig" idsfile = "bambus.ids" frombedInputs = [bbbed, ctgfasta, bbfasta] if need_update(frombedInputs, ctgfile): frombed(frombedInputs) inputfasta = "bambus.contigs.fasta" singletonfasta = "bambus.singletons.fasta" cmd = "faSomeRecords {0} {1} ".format(ctgfasta, idsfile) sh(cmd + inputfasta) sh(cmd + singletonfasta + " -exclude") # Run bambus prefix = "bambus" cmd = "goBambus -c {0} -m {1} -o {2}".format(ctgfile, bbmate, prefix) if opts.conf: cmd += " -C {0}".format(opts.conf) sh(cmd) cmd = "untangle -e {0}.evidence.xml -s {0}.out.xml -o {0}.untangle.xml".\ format(prefix) sh(cmd) final = "final" cmd = "printScaff -e {0}.evidence.xml -s {0}.untangle.xml -l {0}.lib " \ "-merge -detail -oo -sum -o {1}".format(prefix, final) sh(cmd) oofile = final + ".oo" join([inputfasta, "--oo={0}".format(oofile)])