def align(args):
    """
    %prog align database.fasta read1.fq read2.fq

    Wrapper for `gsnap` single-end or paired-end, depending on the number of args.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so its wording is part of the program output and is kept as-is.
    #
    # args: command-line style list: database FASTA, then one read file
    #       (single-end) or two read files (paired-end), plus options.
    # Returns the tuple (gsnapfile, logfile).
    from jcvi.formats.fasta import join
    from jcvi.formats.fastq import guessoffset
    from jcvi.projects.tgbs import snp

    parser = OptionParser(align.__doc__)
    parser.add_option(
        "--join",
        default=False,
        action="store_true",
        help="Join sequences with padded 50Ns",
    )
    parser.add_option(
        "--rnaseq",
        default=False,
        action="store_true",
        help="Input is RNA-seq reads, turn splicing on",
    )
    parser.add_option(
        "--snp",
        default=False,
        action="store_true",
        help="Call SNPs after GSNAP",
    )
    parser.set_cpus()
    opts, args = parser.parse_args(args)

    # Two positionals => single-end, three => paired-end, anything else => help.
    nargs = len(args)
    if nargs not in (2, 3):
        sys.exit(not parser.print_help())
    logging.debug("Single-end alignment" if nargs == 2 else "Paired-end alignment")

    dbfile, readfile = args[0:2]
    if opts.join:
        # Replace the database with the joined version produced by `join`.
        dbfile = join([dbfile, "--gapsize=50", "--newid=chr1"])

    assert op.exists(dbfile) and op.exists(readfile)
    prefix = get_prefix(readfile, dbfile)
    logfile = prefix + ".log"
    gsnapfile = prefix + ".gsnap"

    if need_update((dbfile, readfile), gsnapfile):
        dbdir, dbname = check_index(dbfile)
        # Each fragment carries its own leading space; they are concatenated
        # verbatim to form the final command line.
        fragments = [
            "gsnap -D {0} -d {1}".format(dbdir, dbname),
            " -B 5 -m 0.1 -i 2 -n 3",  # memory, mismatch, indel penalty, nhits
        ]
        if opts.rnaseq:
            fragments.append(" -N 1")
        fragments.append(" -t {0}".format(opts.cpus))
        fragments.append(" --gmap-mode none --nofails")
        if readfile.endswith(".gz"):
            fragments.append(" --gunzip")
        try:
            offset = "sanger" if guessoffset([readfile]) == 33 else "illumina"
            fragments.append(" --quality-protocol {0}".format(offset))
        except AssertionError:
            # Best effort: leave the quality protocol unspecified when the
            # offset cannot be guessed.
            pass
        fragments.append(" " + " ".join(args[1:]))
        sh("".join(fragments), outfile=gsnapfile, errfile=logfile)
    else:
        logging.error("`{0}` exists. `gsnap` already run.".format(gsnapfile))

    if opts.snp:
        snp([gsnapfile, "--cpus={0}".format(opts.cpus)])

    return gsnapfile, logfile
def gmap(args):
    """
    %prog gmap database.fasta fastafile

    Wrapper for `gmap`.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so its wording is kept as-is.
    #
    # args: command-line style list: database FASTA, query FASTA, options.
    # Returns the tuple (gmapfile, logfile).
    parser = OptionParser(gmap.__doc__)
    parser.add_option(
        "--cross", default=False, action="store_true", help="Cross-species alignment"
    )
    parser.add_option(
        "--npaths",
        default=0,
        type="int",
        help="Maximum number of paths to show."
        " If set to 0, prints two paths if chimera"
        " detected, else one.",
    )
    parser.set_cpus()
    opts, args = parser.parse_args(args)

    if len(args) != 2:
        sys.exit(not parser.print_help())

    dbfile, fastafile = args
    assert op.exists(dbfile) and op.exists(fastafile)

    prefix = get_prefix(fastafile, dbfile)
    logfile = prefix + ".log"
    gmapfile = prefix + ".gmap.gff3"

    # Guard clause: nothing to do when the output is already up to date.
    if not need_update((dbfile, fastafile), gmapfile):
        logging.error("`{0}` exists. `gmap` already run.".format(gmapfile))
        return gmapfile, logfile

    dbdir, dbname = check_index(dbfile)
    command = "gmap -D {0} -d {1}".format(dbdir, dbname)
    command += " -f 2 --intronlength=100000"  # Output format 2
    command += " -t {0}".format(opts.cpus)
    command += " --npaths {0}".format(opts.npaths)
    if opts.cross:
        command += " --cross-species"
    command += " " + fastafile
    sh(command, outfile=gmapfile, errfile=logfile)

    return gmapfile, logfile
def gmap(args):
    """
    %prog gmap database.fasta fastafile

    Wrapper for `gmap`.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so its wording is kept as-is.
    #
    # args: command-line style list: database FASTA, query FASTA, options.
    # Returns the tuple (gmapfile, logfile).
    optparser = OptionParser(gmap.__doc__)
    optparser.add_option(
        "--cross",
        default=False,
        action="store_true",
        help="Cross-species alignment",
    )
    npaths_help = (
        "Maximum number of paths to show."
        " If set to 0, prints two paths if chimera"
        " detected, else one."
    )
    optparser.add_option("--npaths", default=0, type="int", help=npaths_help)
    optparser.set_cpus()
    opts, args = optparser.parse_args(args)

    if len(args) != 2:
        sys.exit(not optparser.print_help())

    dbfile, fastafile = args
    assert op.exists(dbfile) and op.exists(fastafile)

    prefix = get_prefix(fastafile, dbfile)
    logfile = prefix + ".log"
    gmapfile = prefix + ".gmap.gff3"

    if need_update((dbfile, fastafile), gmapfile):
        dbdir, dbname = check_index(dbfile)
        # Fragments keep their own leading space and are concatenated as-is.
        fragments = [
            "gmap -D {0} -d {1}".format(dbdir, dbname),
            " -f 2 --intronlength=100000",  # Output format 2
            " -t {0}".format(opts.cpus),
            " --npaths {0}".format(opts.npaths),
        ]
        if opts.cross:
            fragments.append(" --cross-species")
        fragments.append(" " + fastafile)
        sh("".join(fragments), outfile=gmapfile, errfile=logfile)
    else:
        logging.error("`{0}` exists. `gmap` already run.".format(gmapfile))

    return gmapfile, logfile
def snpflow(args):
    """
    %prog snpflow trimmed reference.fasta

    Run SNP calling pipeline until allele_counts are generated. This includes
    generation of native files, SNP_Het file. Speedup for fragmented genomes
    are also supported.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text.
    #
    # args: command-line style list: the `trimmed` reads location and the
    #       reference FASTA, plus options.
    # Nothing is executed here apart from creating directories (and writing
    # `speedup.sh` in supercat mode): every step is registered as a rule on a
    # MakeManager, which is written out at the end.
    p = OptionParser(snpflow.__doc__)
    p.set_fastq_names()
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    trimmed, ref = args
    nseqs = len(Fasta(ref))
    # References with 1000 or more sequences go through the "supercat" path.
    supercat = nseqs >= 1000
    if supercat:
        logging.debug("Total seqs in ref: {0} (supercat={1})".format(nseqs, supercat))

    reads, samples = scan_read_files(trimmed, opts.names)

    # Set up directory structure
    nativedir, countsdir = "native", "allele_counts"
    for d in (nativedir, countsdir):
        mkdir(d)

    mm = MakeManager()

    # Step 0 - index database
    db = op.join(*check_index(ref, supercat=supercat, go=False))
    cmd = "python -m jcvi.apps.gmap index {0}".format(ref)
    if supercat:
        cmd += " --supercat"
        coordsfile = db + ".coords"
        supercatfile = ref.rsplit(".", 1)[0] + ".supercat.fasta"
        mm.add(ref, (db, coordsfile), cmd)
    else:
        mm.add(ref, db, cmd)

    # Step 1 - GSNAP alignment and conversion to native file
    allnatives = []
    allsamstats = []
    gmapdb = supercatfile if supercat else ref
    for f in reads:
        prefix = get_prefix(f, ref)
        gsnapfile = op.join(nativedir, prefix + ".gsnap")
        nativefile = op.join(nativedir, prefix + ".unique.native")
        samstatsfile = op.join(nativedir, prefix + ".unique.sam.stats")

        cmd = "python -m jcvi.apps.gmap align {0} {1}".format(gmapdb, f)
        cmd += " --outdir={0} --native --cpus=1".format(nativedir)
        mm.add((f, db), nativefile, cmd)

        cmd = "python -m jcvi.apps.gmap bam {0} {1} --cpus=1".format(gsnapfile, gmapdb)
        mm.add(nativefile, samstatsfile, cmd)

        allnatives.append(nativefile)
        allsamstats.append(samstatsfile)

    # Step 2 - call SNP discovery
    if supercat:
        nativeconverted = nativedir + "-converted"
        mkdir(nativeconverted)
        allnativesc = [op.join(nativeconverted, op.basename(x)) for x in allnatives]
        cmd = "tGBS-Convert_Pseudo_Genome_NATIVE_Coordinates.pl"
        cmd += " -i {0}/*.native -o {1}".format(nativedir, nativeconverted)
        cmd += " -c {0}".format(coordsfile)
        cmds = ["rm -rf {0}".format(nativeconverted), cmd]
        mm.add(allnatives + [coordsfile], allnativesc, cmds)

        runfile = "speedup.sh"
        write_file(runfile, speedupsh.format(nativeconverted, opts.cpus))
        # From here on, all per-sample files live in the converted directory.
        nativedir = nativeconverted
        allsnps = [op.join(nativedir, "{0}.SNPs_Het.txt".format(x)) for x in samples]
        mm.add(allnativesc, allsnps, "./{0}".format(runfile))
    else:
        for s in samples:
            snpfile = op.join(nativedir, "{0}.SNPs_Het.txt".format(s))
            cmd = "SNP_Discovery-short.pl"
            cmd += " -native {0}/{1}.*unique.native".format(nativedir, s)
            cmd += " -o {0} -a 2 -ac 0.3 -c 0.8".format(snpfile)
            # Native files whose basename starts with "<sample>." belong to s.
            flist = [x for x in allnatives if op.basename(x).split(".")[0] == s]
            mm.add(flist, snpfile, cmd)

    # Step 3 - generate equal file
    allsnps = [op.join(nativedir, "{0}.SNPs_Het.txt".format(x)) for x in samples]
    for s in samples:
        equalfile = op.join(nativedir, "{0}.equal".format(s))
        cmd = "extract_reference_alleles.pl"
        cmd += " --native {0}/{1}.*unique.native".format(nativedir, s)
        cmd += " --genotype {0}/{1}.SNPs_Het.txt".format(nativedir, s)
        cmd += " --allgenotypes {0}/*.SNPs_Het.txt".format(nativedir)
        cmd += " --fasta {0} --output {1}".format(ref, equalfile)
        mm.add(allsnps, equalfile, cmd)

    # Step 4 - generate snp matrix
    allequals = [op.join(nativedir, "{0}.equal".format(x)) for x in samples]
    matrix = "snps.matrix.txt"
    cmd = "generate_matrix.pl"
    cmd += " --tables {0}/*SNPs_Het.txt --equal {0}/*equal".format(nativedir)
    cmd += " --fasta {0} --output {1}".format(ref, matrix)
    mm.add(allsnps + allequals, matrix, cmd)

    # Step 5 - generate allele counts
    allcounts = []
    for s in samples:
        allele_counts = op.join(countsdir, "{0}.SNPs_Het.allele_counts".format(s))
        # Reuse `matrix` so the consumer always reads what Step 4 produced.
        cmd = "count_reads_per_allele.pl -m {0}".format(matrix)
        cmd += " -s {0} --native {1}/{0}.*unique.native".format(s, nativedir)
        cmd += " -o {0}".format(allele_counts)
        mm.add(matrix, allele_counts, cmd)
        allcounts.append(allele_counts)

    # Step 6 - generate raw snps
    rawsnps = "Genotyping.H3.txt"
    # NOTE(review): the option name was garbled as `--h**o` (which the shell
    # would glob-expand); restored to `--homo`, matching the "H3" output
    # name. Confirm against the SamplesGenotyping.pl usage.
    cmd = "/home/shared/scripts/delin/SamplesGenotyping.pl --homo 3"
    cmd += " -pf allele_counts -f {0} --outfile {1}".format(countsdir, rawsnps)
    cmds = ["rm -f {0}".format(rawsnps), cmd]
    mm.add(allcounts, rawsnps, cmds)

    # Step 7 - generate alignment report
    sam_summary = "sam.summary"
    cmd = "/home/shared/scripts/eddyyeh/alignment_stats.pl"
    cmd += " -f {0} -o {1}".format(" ".join(allsamstats), sam_summary)
    mm.add(allsamstats, sam_summary, cmd)

    native_summary = "native.summary"
    cmd = "/home/shared/scripts/eddyyeh/alignment_stats.pl"
    cmd += " -n {0} -o {1}".format(" ".join(allnatives), native_summary)
    mm.add(allnatives, native_summary, cmd)

    mm.write()
def align(args):
    """
    %prog align database.fasta read1.fq read2.fq

    Wrapper for `gsnap` single-end or paired-end, depending on the number of args.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so its wording is kept as-is.
    #
    # args: command-line style list: database FASTA, then one read file
    #       (single-end) or two read files (paired-end), plus options.
    # Returns the tuple (gsnapfile, logfile).
    from jcvi.formats.fastq import guessoffset

    parser = OptionParser(align.__doc__)
    parser.add_option(
        "--rnaseq",
        default=False,
        action="store_true",
        help="Input is RNA-seq reads, turn splicing on",
    )
    parser.add_option(
        "--native",
        default=False,
        action="store_true",
        help="Convert GSNAP output to NATIVE format",
    )
    parser.set_home("eddyyeh")
    parser.set_outdir()
    parser.set_cpus()
    opts, args = parser.parse_args(args)

    # Two positionals => single-end, three => paired-end, anything else => help.
    nargs = len(args)
    if nargs not in (2, 3):
        sys.exit(not parser.print_help())
    logging.debug("Single-end alignment" if nargs == 2 else "Paired-end alignment")

    dbfile, readfile = args[:2]
    outdir = opts.outdir
    assert op.exists(dbfile) and op.exists(readfile)

    prefix = get_prefix(readfile, dbfile)
    logfile = op.join(outdir, prefix + ".log")
    gsnapfile = op.join(outdir, prefix + ".gsnap")
    # Same path as the gsnap output with its last extension swapped.
    nativefile = gsnapfile.rsplit(".", 1)[0] + ".unique.native"

    if need_update((dbfile, readfile), gsnapfile):
        dbdir, dbname = check_index(dbfile)
        # Each fragment carries its own leading space; they are concatenated
        # verbatim to form the final command line.
        fragments = [
            "gsnap -D {0} -d {1}".format(dbdir, dbname),
            " -B 5 -m 0.1 -i 2 -n 3",  # memory, mismatch, indel penalty, nhits
        ]
        if opts.rnaseq:
            fragments.append(" -N 1")
        fragments.append(" -t {0}".format(opts.cpus))
        fragments.append(" --gmap-mode none --nofails")
        if readfile.endswith(".gz"):
            fragments.append(" --gunzip")
        try:
            offset = "sanger" if guessoffset([readfile]) == 33 else "illumina"
            fragments.append(" --quality-protocol {0}".format(offset))
        except AssertionError:
            # Best effort: leave the quality protocol unspecified when the
            # offset cannot be guessed.
            pass
        fragments.append(" " + " ".join(args[1:]))
        sh("".join(fragments), outfile=gsnapfile, errfile=logfile)
    else:
        logging.error("`{0}` exists. `gsnap` already run.".format(gsnapfile))

    if opts.native:
        converter_home = opts.eddyyeh_home
        if need_update(gsnapfile, nativefile):
            convert = op.join(converter_home, "convert2native.pl")
            convert += " --gsnap {0} -o {1}".format(gsnapfile, nativefile)
            convert += " -proc {0}".format(opts.cpus)
            sh(convert)

    return gsnapfile, logfile
def align(args):
    """
    %prog align database.fasta read1.fq [read2.fq]

    Wrapper for `bowtie2` single-end or paired-end, depending on the number of args.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so its wording is kept as-is.
    #
    # args: command-line style list: database FASTA, then one read file
    #       (single-end) or two read files (paired-end), plus options.
    # Returns the tuple (samfile, logfile); returns early, without running
    # bowtie2, when `need_update` reports the output as current.
    from jcvi.formats.fastq import guessoffset

    p = OptionParser(align.__doc__)
    p.set_firstN(firstN=0)
    p.add_option(
        "--full",
        default=False,
        action="store_true",
        help="Enforce end-to-end alignment [default: local]",
    )
    p.add_option(
        "--reorder",
        default=False,
        action="store_true",
        help="Keep the input read order [default: %default]",
    )
    p.set_cutoff(cutoff=800)
    p.set_mateorientation(mateorientation="+-")
    p.set_sam_options(bowtie=True)
    opts, args = p.parse_args(args)

    extra = opts.extra
    mo = opts.mateorientation
    if mo == "+-":
        orientation = ""
    elif mo == "-+":
        orientation = "--rf"
    else:
        orientation = "--ff"
    if orientation:
        # Keep a separator between user-supplied extra options and the
        # orientation flag; plain concatenation would fuse them into one
        # token (e.g. "-k 5--rf").
        extra = "{0} {1}".format(extra, orientation) if extra else orientation

    PE = True
    if len(args) == 2:
        logging.debug("Single-end alignment")
        PE = False
    elif len(args) == 3:
        logging.debug("Paired-end alignment")
    else:
        sys.exit(not p.print_help())

    firstN = opts.firstN
    mapped = opts.mapped
    unmapped = opts.unmapped
    gl = "--end-to-end" if opts.full else "--local"

    dbfile, readfile = args[0:2]
    dbfile = get_abs_path(dbfile)
    safile = check_index(dbfile)
    prefix = get_prefix(readfile, dbfile)
    samfile, mapped, unmapped = get_samfile(
        readfile, dbfile, bowtie=True, mapped=mapped, unmapped=unmapped, bam=opts.bam
    )
    logfile = prefix + ".log"
    offset = guessoffset([readfile])

    if not need_update(safile, samfile):
        logging.error("`{0}` exists. `bowtie2` already run.".format(samfile))
        return samfile, logfile

    cmd = "bowtie2 -x {0}".format(dbfile)
    if PE:
        r1, r2 = args[1:3]
        cmd += " -1 {0} -2 {1}".format(r1, r2)
        cmd += " --maxins {0}".format(opts.cutoff)
        mtag, utag = "--al-conc", "--un-conc"
    else:
        cmd += " -U {0}".format(readfile)
        mtag, utag = "--al", "--un"

    if mapped:
        cmd += " {0} {1}".format(mtag, mapped)
    if unmapped:
        cmd += " {0} {1}".format(utag, unmapped)

    if firstN:
        cmd += " --upto {0}".format(firstN)
    cmd += " -p {0}".format(opts.cpus)
    cmd += " --phred{0}".format(offset)
    cmd += " {0}".format(gl)
    if opts.reorder:
        cmd += " --reorder"

    cmd += " {0}".format(extra)
    # Finally the log
    cmd += " 2> {0}".format(logfile)

    cmd = output_bam(cmd, samfile)
    sh(cmd)

    # Echo the log to stderr. `sys.stderr.write` behaves the same on
    # Python 2 and 3 (the old `print >>sys.stderr` form fails on Python 3),
    # and the `with` block closes the file handle.
    with open(logfile) as fp:
        sys.stderr.write(fp.read() + "\n")

    return samfile, logfile
def align(args):
    """
    %prog align database.fasta read1.fq [read2.fq]

    Wrapper for `bowtie2` single-end or paired-end, depending on the number of args.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so its wording is kept as-is.
    #
    # args: command-line style list: database FASTA, then one read file
    #       (single-end) or two read files (paired-end), plus options.
    # Returns the tuple (samfile, logfile); samfile is "/dev/null" when
    # --null is given. Returns early, without running bowtie2, when
    # `need_update` reports the output as current.
    from jcvi.formats.fastq import guessoffset

    p = OptionParser(align.__doc__)
    p.set_firstN(firstN=0)
    p.add_option(
        "--full",
        default=False,
        action="store_true",
        help="Enforce end-to-end alignment [default: local]",
    )
    p.add_option(
        "--reorder",
        default=False,
        action="store_true",
        help="Keep the input read order [default: %default]",
    )
    p.add_option(
        "--null",
        default=False,
        action="store_true",
        help="Do not write to SAM/BAM output",
    )
    p.add_option(
        "--fasta", default=False, action="store_true", help="Query reads are FASTA"
    )
    p.set_cutoff(cutoff=800)
    p.set_mateorientation(mateorientation="+-")
    p.set_sam_options(bowtie=True)
    opts, args = p.parse_args(args)

    extra = opts.extra
    mo = opts.mateorientation
    if mo == "+-":
        orientation = ""
    elif mo == "-+":
        orientation = "--rf"
    else:
        orientation = "--ff"
    if orientation:
        # Keep a separator between user-supplied extra options and the
        # orientation flag; plain concatenation would fuse them into one
        # token (e.g. "-k 5--rf").
        extra = "{0} {1}".format(extra, orientation) if extra else orientation

    PE = True
    if len(args) == 2:
        logging.debug("Single-end alignment")
        PE = False
    elif len(args) == 3:
        logging.debug("Paired-end alignment")
    else:
        sys.exit(not p.print_help())

    firstN = opts.firstN
    mapped = opts.mapped
    unmapped = opts.unmapped
    fasta = opts.fasta
    gl = "--end-to-end" if opts.full else "--local"

    dbfile, readfile = args[0:2]
    dbfile = check_index(dbfile)
    prefix = get_prefix(readfile, dbfile)
    samfile, mapped, unmapped = get_samfile(
        readfile, dbfile, bowtie=True, mapped=mapped, unmapped=unmapped, bam=opts.bam
    )
    logfile = prefix + ".log"
    # The quality offset is only guessed (and only used) for non-FASTA reads.
    if not fasta:
        offset = guessoffset([readfile])

    if not need_update(dbfile, samfile):
        logging.error("`{0}` exists. `bowtie2` already run.".format(samfile))
        return samfile, logfile

    cmd = "bowtie2 -x {0}".format(dbfile)
    if PE:
        r1, r2 = args[1:3]
        cmd += " -1 {0} -2 {1}".format(r1, r2)
        cmd += " --maxins {0}".format(opts.cutoff)
        mtag, utag = "--al-conc", "--un-conc"
    else:
        cmd += " -U {0}".format(readfile)
        mtag, utag = "--al", "--un"

    if mapped:
        cmd += " {0} {1}".format(mtag, mapped)
    if unmapped:
        cmd += " {0} {1}".format(utag, unmapped)

    if firstN:
        cmd += " --upto {0}".format(firstN)
    cmd += " -p {0}".format(opts.cpus)
    if fasta:
        cmd += " -f"
    else:
        cmd += " --phred{0}".format(offset)
    cmd += " {0}".format(gl)
    if opts.reorder:
        cmd += " --reorder"

    cmd += " {0}".format(extra)
    # Finally the log
    cmd += " 2> {0}".format(logfile)

    if opts.null:
        samfile = "/dev/null"

    cmd = output_bam(cmd, samfile)
    sh(cmd)

    # Echo the log to stderr; the `with` block closes the file handle.
    with open(logfile) as fp:
        print(fp.read(), file=sys.stderr)

    return samfile, logfile